xref: /freebsd/sys/kern/vfs_vnops.c (revision 3e0efd2ec4fcb4cd68fb8ccf8aea6fc6151c454b)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/fcntl.h>
43 #include <sys/file.h>
44 #include <sys/kdb.h>
45 #include <sys/stat.h>
46 #include <sys/priv.h>
47 #include <sys/proc.h>
48 #include <sys/limits.h>
49 #include <sys/lock.h>
50 #include <sys/mount.h>
51 #include <sys/mutex.h>
52 #include <sys/namei.h>
53 #include <sys/vnode.h>
54 #include <sys/bio.h>
55 #include <sys/buf.h>
56 #include <sys/filio.h>
57 #include <sys/resourcevar.h>
58 #include <sys/sx.h>
59 #include <sys/sysctl.h>
60 #include <sys/ttycom.h>
61 #include <sys/conf.h>
62 #include <sys/syslog.h>
63 #include <sys/unistd.h>
64 
65 #include <security/audit/audit.h>
66 #include <security/mac/mac_framework.h>
67 
68 #include <vm/vm.h>
69 #include <vm/vm_extern.h>
70 #include <vm/pmap.h>
71 #include <vm/vm_map.h>
72 #include <vm/vm_object.h>
73 #include <vm/vm_page.h>
74 
75 static fo_rdwr_t	vn_read;
76 static fo_rdwr_t	vn_write;
77 static fo_rdwr_t	vn_io_fault;
78 static fo_truncate_t	vn_truncate;
79 static fo_ioctl_t	vn_ioctl;
80 static fo_poll_t	vn_poll;
81 static fo_kqfilter_t	vn_kqfilter;
82 static fo_stat_t	vn_statfile;
83 static fo_close_t	vn_closefile;
84 
85 struct fileops vnops = {
86 	.fo_read = vn_io_fault,
87 	.fo_write = vn_io_fault,
88 	.fo_truncate = vn_truncate,
89 	.fo_ioctl = vn_ioctl,
90 	.fo_poll = vn_poll,
91 	.fo_kqfilter = vn_kqfilter,
92 	.fo_stat = vn_statfile,
93 	.fo_close = vn_closefile,
94 	.fo_chmod = vn_chmod,
95 	.fo_chown = vn_chown,
96 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
97 };
98 
99 int
100 vn_open(ndp, flagp, cmode, fp)
101 	struct nameidata *ndp;
102 	int *flagp, cmode;
103 	struct file *fp;
104 {
105 	struct thread *td = ndp->ni_cnd.cn_thread;
106 
107 	return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
108 }
109 
110 /*
111  * Common code for vnode open operations.
112  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
113  *
114  * Note that this does NOT free nameidata for the successful case,
115  * due to the NDINIT being done elsewhere.
116  */
117 int
118 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
119     struct ucred *cred, struct file *fp)
120 {
121 	struct vnode *vp;
122 	struct mount *mp;
123 	struct thread *td = ndp->ni_cnd.cn_thread;
124 	struct vattr vat;
125 	struct vattr *vap = &vat;
126 	int fmode, error;
127 	accmode_t accmode;
128 	int vfslocked, mpsafe;
129 
130 	mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
131 restart:
132 	vfslocked = 0;
133 	fmode = *flagp;
134 	if (fmode & O_CREAT) {
135 		ndp->ni_cnd.cn_nameiop = CREATE;
136 		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
137 		    MPSAFE;
138 		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
139 			ndp->ni_cnd.cn_flags |= FOLLOW;
140 		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
141 			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
142 		bwillwrite();
143 		if ((error = namei(ndp)) != 0)
144 			return (error);
145 		vfslocked = NDHASGIANT(ndp);
146 		if (!mpsafe)
147 			ndp->ni_cnd.cn_flags &= ~MPSAFE;
148 		if (ndp->ni_vp == NULL) {
149 			VATTR_NULL(vap);
150 			vap->va_type = VREG;
151 			vap->va_mode = cmode;
152 			if (fmode & O_EXCL)
153 				vap->va_vaflags |= VA_EXCLUSIVE;
154 			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
155 				NDFREE(ndp, NDF_ONLY_PNBUF);
156 				vput(ndp->ni_dvp);
157 				VFS_UNLOCK_GIANT(vfslocked);
158 				if ((error = vn_start_write(NULL, &mp,
159 				    V_XSLEEP | PCATCH)) != 0)
160 					return (error);
161 				goto restart;
162 			}
163 #ifdef MAC
164 			error = mac_vnode_check_create(cred, ndp->ni_dvp,
165 			    &ndp->ni_cnd, vap);
166 			if (error == 0)
167 #endif
168 				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
169 						   &ndp->ni_cnd, vap);
170 			vput(ndp->ni_dvp);
171 			vn_finished_write(mp);
172 			if (error) {
173 				VFS_UNLOCK_GIANT(vfslocked);
174 				NDFREE(ndp, NDF_ONLY_PNBUF);
175 				return (error);
176 			}
177 			fmode &= ~O_TRUNC;
178 			vp = ndp->ni_vp;
179 		} else {
180 			if (ndp->ni_dvp == ndp->ni_vp)
181 				vrele(ndp->ni_dvp);
182 			else
183 				vput(ndp->ni_dvp);
184 			ndp->ni_dvp = NULL;
185 			vp = ndp->ni_vp;
186 			if (fmode & O_EXCL) {
187 				error = EEXIST;
188 				goto bad;
189 			}
190 			fmode &= ~O_CREAT;
191 		}
192 	} else {
193 		ndp->ni_cnd.cn_nameiop = LOOKUP;
194 		ndp->ni_cnd.cn_flags = ISOPEN |
195 		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
196 		    LOCKLEAF | MPSAFE;
197 		if (!(fmode & FWRITE))
198 			ndp->ni_cnd.cn_flags |= LOCKSHARED;
199 		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
200 			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
201 		if ((error = namei(ndp)) != 0)
202 			return (error);
203 		if (!mpsafe)
204 			ndp->ni_cnd.cn_flags &= ~MPSAFE;
205 		vfslocked = NDHASGIANT(ndp);
206 		vp = ndp->ni_vp;
207 	}
208 	if (vp->v_type == VLNK) {
209 		error = EMLINK;
210 		goto bad;
211 	}
212 	if (vp->v_type == VSOCK) {
213 		error = EOPNOTSUPP;
214 		goto bad;
215 	}
216 	if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
217 		error = ENOTDIR;
218 		goto bad;
219 	}
220 	accmode = 0;
221 	if (fmode & (FWRITE | O_TRUNC)) {
222 		if (vp->v_type == VDIR) {
223 			error = EISDIR;
224 			goto bad;
225 		}
226 		accmode |= VWRITE;
227 	}
228 	if (fmode & FREAD)
229 		accmode |= VREAD;
230 	if (fmode & FEXEC)
231 		accmode |= VEXEC;
232 	if ((fmode & O_APPEND) && (fmode & FWRITE))
233 		accmode |= VAPPEND;
234 #ifdef MAC
235 	error = mac_vnode_check_open(cred, vp, accmode);
236 	if (error)
237 		goto bad;
238 #endif
239 	if ((fmode & O_CREAT) == 0) {
240 		if (accmode & VWRITE) {
241 			error = vn_writechk(vp);
242 			if (error)
243 				goto bad;
244 		}
245 		if (accmode) {
246 			error = VOP_ACCESS(vp, accmode, cred, td);
247 			if (error)
248 				goto bad;
249 		}
250 	}
251 	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
252 		goto bad;
253 
254 	if (fmode & FWRITE) {
255 		vp->v_writecount++;
256 		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
257 		    __func__, vp, vp->v_writecount);
258 	}
259 	*flagp = fmode;
260 	ASSERT_VOP_LOCKED(vp, "vn_open_cred");
261 	if (!mpsafe)
262 		VFS_UNLOCK_GIANT(vfslocked);
263 	return (0);
264 bad:
265 	NDFREE(ndp, NDF_ONLY_PNBUF);
266 	vput(vp);
267 	VFS_UNLOCK_GIANT(vfslocked);
268 	*flagp = fmode;
269 	ndp->ni_vp = NULL;
270 	return (error);
271 }
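
/*
 * Illustrative sketch (not part of this file): a typical in-kernel open
 * of a file by path, assuming thread context `td' and a path string in
 * kernel memory.  The path, flags and mode below are placeholder values,
 * and Giant handling (MPSAFE/VFS_LOCK_GIANT) is omitted for brevity.
 *
 *	struct nameidata nd;
 *	int flags, error;
 *
 *	flags = FREAD;
 *	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/a/path", td);
 *	error = vn_open(&nd, &flags, 0, NULL);
 *	if (error != 0)
 *		return (error);
 *	NDFREE(&nd, NDF_ONLY_PNBUF);
 *	... nd.ni_vp is returned locked; use it, then ...
 *	VOP_UNLOCK(nd.ni_vp, 0);
 *	... and eventually ...
 *	vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
 */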
272 
273 /*
274  * Check for write permissions on the specified vnode.
275  * Prototype text segments cannot be written.
276  */
277 int
278 vn_writechk(vp)
279 	register struct vnode *vp;
280 {
281 
282 	ASSERT_VOP_LOCKED(vp, "vn_writechk");
283 	/*
284 	 * If there's shared text associated with
285 	 * the vnode, try to free it up once.  If
286 	 * we fail, we can't allow writing.
287 	 */
288 	if (vp->v_vflag & VV_TEXT)
289 		return (ETXTBSY);
290 
291 	return (0);
292 }
293 
294 /*
295  * Vnode close call
296  */
297 int
298 vn_close(vp, flags, file_cred, td)
299 	register struct vnode *vp;
300 	int flags;
301 	struct ucred *file_cred;
302 	struct thread *td;
303 {
304 	struct mount *mp;
305 	int error, lock_flags;
306 
307 	if (!(flags & FWRITE) && vp->v_mount != NULL &&
308 	    vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED)
309 		lock_flags = LK_SHARED;
310 	else
311 		lock_flags = LK_EXCLUSIVE;
312 
313 	VFS_ASSERT_GIANT(vp->v_mount);
314 
315 	vn_start_write(vp, &mp, V_WAIT);
316 	vn_lock(vp, lock_flags | LK_RETRY);
317 	if (flags & FWRITE) {
318 		VNASSERT(vp->v_writecount > 0, vp,
319 		    ("vn_close: negative writecount"));
320 		vp->v_writecount--;
321 		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
322 		    __func__, vp, vp->v_writecount);
323 	}
324 	error = VOP_CLOSE(vp, flags, file_cred, td);
325 	vput(vp);
326 	vn_finished_write(mp);
327 	return (error);
328 }
329 
330 /*
331  * Heuristic to detect sequential operation.
332  */
333 static int
334 sequential_heuristic(struct uio *uio, struct file *fp)
335 {
336 
337 	if (atomic_load_acq_int(&(fp->f_flag)) & FRDAHEAD)
338 		return (fp->f_seqcount << IO_SEQSHIFT);
339 
340 	/*
341 	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
342 	 * that the first I/O is normally considered to be slightly
343 	 * sequential.  Seeking to offset 0 doesn't change sequentiality
344 	 * unless previous seeks have reduced f_seqcount to 0, in which
345 	 * case offset 0 is not special.
346 	 */
347 	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
348 	    uio->uio_offset == fp->f_nextoff) {
349 		/*
350 		 * f_seqcount is in units of fixed-size blocks so that it
351 		 * depends mainly on the amount of sequential I/O and not
352 		 * much on the number of sequential I/O's.  The fixed size
353 		 * of 16384 is hard-coded here since it is (not quite) just
354 		 * a magic size that works well here.  This size is more
355 		 * closely related to the best I/O size for real disks than
356 		 * to any block size used by software.
357 		 */
358 		fp->f_seqcount += howmany(uio->uio_resid, 16384);
359 		if (fp->f_seqcount > IO_SEQMAX)
360 			fp->f_seqcount = IO_SEQMAX;
361 		return (fp->f_seqcount << IO_SEQSHIFT);
362 	}
363 
364 	/* Not sequential.  Quickly draw down sequentiality. */
365 	if (fp->f_seqcount > 1)
366 		fp->f_seqcount = 1;
367 	else
368 		fp->f_seqcount = 0;
369 	return (0);
370 }
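
/*
 * Worked example (illustrative only): a process issuing back-to-back
 * 64KB reads at consecutive offsets adds howmany(65536, 16384) == 4 to
 * f_seqcount on every call, until it saturates at IO_SEQMAX.  The value
 * handed back to the caller is f_seqcount shifted left by IO_SEQSHIFT,
 * which VOP_READ/VOP_WRITE implementations may treat as a read-ahead
 * hint encoded in the ioflag argument.
 */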
371 
372 /*
373  * Package up an I/O request on a vnode into a uio and do it.
374  */
375 int
376 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
377     enum uio_seg segflg, int ioflg, struct ucred *active_cred,
378     struct ucred *file_cred, ssize_t *aresid, struct thread *td)
379 {
380 	struct uio auio;
381 	struct iovec aiov;
382 	struct mount *mp;
383 	struct ucred *cred;
384 	void *rl_cookie;
385 	int error, lock_flags;
386 
387 	VFS_ASSERT_GIANT(vp->v_mount);
388 
389 	auio.uio_iov = &aiov;
390 	auio.uio_iovcnt = 1;
391 	aiov.iov_base = base;
392 	aiov.iov_len = len;
393 	auio.uio_resid = len;
394 	auio.uio_offset = offset;
395 	auio.uio_segflg = segflg;
396 	auio.uio_rw = rw;
397 	auio.uio_td = td;
398 	error = 0;
399 
400 	if ((ioflg & IO_NODELOCKED) == 0) {
401 		if (rw == UIO_READ) {
402 			rl_cookie = vn_rangelock_rlock(vp, offset,
403 			    offset + len);
404 		} else {
405 			rl_cookie = vn_rangelock_wlock(vp, offset,
406 			    offset + len);
407 		}
408 		mp = NULL;
409 		if (rw == UIO_WRITE) {
410 			if (vp->v_type != VCHR &&
411 			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
412 			    != 0)
413 				goto out;
414 			if (MNT_SHARED_WRITES(mp) ||
415 			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
416 				lock_flags = LK_SHARED;
417 			else
418 				lock_flags = LK_EXCLUSIVE;
419 		} else
420 			lock_flags = LK_SHARED;
421 		vn_lock(vp, lock_flags | LK_RETRY);
422 	} else
423 		rl_cookie = NULL;
424 
425 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
426 #ifdef MAC
427 	if ((ioflg & IO_NOMACCHECK) == 0) {
428 		if (rw == UIO_READ)
429 			error = mac_vnode_check_read(active_cred, file_cred,
430 			    vp);
431 		else
432 			error = mac_vnode_check_write(active_cred, file_cred,
433 			    vp);
434 	}
435 #endif
436 	if (error == 0) {
437 		if (file_cred != NULL)
438 			cred = file_cred;
439 		else
440 			cred = active_cred;
441 		if (rw == UIO_READ)
442 			error = VOP_READ(vp, &auio, ioflg, cred);
443 		else
444 			error = VOP_WRITE(vp, &auio, ioflg, cred);
445 	}
446 	if (aresid)
447 		*aresid = auio.uio_resid;
448 	else
449 		if (auio.uio_resid && error == 0)
450 			error = EIO;
451 	if ((ioflg & IO_NODELOCKED) == 0) {
452 		VOP_UNLOCK(vp, 0);
453 		if (mp != NULL)
454 			vn_finished_write(mp);
455 	}
456  out:
457 	if (rl_cookie != NULL)
458 		vn_rangelock_unlock(vp, rl_cookie);
459 	return (error);
460 }
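
/*
 * Illustrative sketch (not part of this file): reading a 512-byte header
 * from a vnode that the caller already holds locked, authorizing with
 * the thread credential; the buffer size and offset are example values.
 *
 *	char buf[512];
 *	ssize_t resid;
 *	int error;
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), (off_t)0,
 *	    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td);
 *	if (error == 0 && resid != 0)
 *		... short read, e.g. the file is smaller than 512 bytes ...
 */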
461 
462 /*
463  * Package up an I/O request on a vnode into a uio and do it.  The I/O
464  * request is split up into smaller chunks, and we try to avoid saturating
465  * the buffer cache while potentially holding a vnode locked, so we
466  * call bwillwrite() before calling vn_rdwr().  We also call kern_yield()
467  * to give other processes a chance to lock the vnode (either other processes
468  * core'ing the same binary, or unrelated processes scanning the directory).
469  */
470 int
471 vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
472     file_cred, aresid, td)
473 	enum uio_rw rw;
474 	struct vnode *vp;
475 	void *base;
476 	size_t len;
477 	off_t offset;
478 	enum uio_seg segflg;
479 	int ioflg;
480 	struct ucred *active_cred;
481 	struct ucred *file_cred;
482 	size_t *aresid;
483 	struct thread *td;
484 {
485 	int error = 0;
486 	ssize_t iaresid;
487 
488 	VFS_ASSERT_GIANT(vp->v_mount);
489 
490 	do {
491 		int chunk;
492 
493 		/*
494 		 * Force `offset' to a multiple of MAXBSIZE except possibly
495 		 * for the first chunk, so that filesystems only need to
496 		 * write full blocks except possibly for the first and last
497 		 * chunks.
498 		 */
499 		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
500 
501 		if (chunk > len)
502 			chunk = len;
503 		if (rw != UIO_READ && vp->v_type == VREG)
504 			bwillwrite();
505 		iaresid = 0;
506 		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
507 		    ioflg, active_cred, file_cred, &iaresid, td);
508 		len -= chunk;	/* aresid calc already includes length */
509 		if (error)
510 			break;
511 		offset += chunk;
512 		base = (char *)base + chunk;
513 		kern_yield(PRI_USER);
514 	} while (len);
515 	if (aresid)
516 		*aresid = len + iaresid;
517 	return (error);
518 }
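
/*
 * Worked example (illustrative only): with MAXBSIZE at its usual value
 * of 65536, a transfer starting at offset 1000 first issues a chunk of
 * 65536 - 1000 = 64536 bytes, which realigns the transfer; every later
 * chunk is then a full MAXBSIZE except possibly the final one.
 */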
519 
520 /*
521  * File table vnode read routine.
522  */
523 static int
524 vn_read(fp, uio, active_cred, flags, td)
525 	struct file *fp;
526 	struct uio *uio;
527 	struct ucred *active_cred;
528 	int flags;
529 	struct thread *td;
530 {
531 	struct vnode *vp;
532 	int error, ioflag;
533 	struct mtx *mtxp;
534 	int advice, vfslocked;
535 	off_t offset;
536 
537 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
538 	    uio->uio_td, td));
539 	mtxp = NULL;
540 	vp = fp->f_vnode;
541 	ioflag = 0;
542 	if (fp->f_flag & FNONBLOCK)
543 		ioflag |= IO_NDELAY;
544 	if (fp->f_flag & O_DIRECT)
545 		ioflag |= IO_DIRECT;
546 	advice = POSIX_FADV_NORMAL;
547 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
548 	/*
549 	 * According to McKusick the vn lock was protecting f_offset here.
550 	 * It is now protected by the FOFFSET_LOCKED flag.
551 	 */
552 	if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) {
553 		mtxp = mtx_pool_find(mtxpool_sleep, fp);
554 		mtx_lock(mtxp);
555 		if ((flags & FOF_OFFSET) == 0) {
556 			while (fp->f_vnread_flags & FOFFSET_LOCKED) {
557 				fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
558 				msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
559 				    "vnread offlock", 0);
560 			}
561 			fp->f_vnread_flags |= FOFFSET_LOCKED;
562 			uio->uio_offset = fp->f_offset;
563 		}
564 		if (fp->f_advice != NULL &&
565 		    uio->uio_offset >= fp->f_advice->fa_start &&
566 		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
567 			advice = fp->f_advice->fa_advice;
568 		mtx_unlock(mtxp);
569 	}
570 	vn_lock(vp, LK_SHARED | LK_RETRY);
571 
572 	switch (advice) {
573 	case POSIX_FADV_NORMAL:
574 	case POSIX_FADV_SEQUENTIAL:
575 	case POSIX_FADV_NOREUSE:
576 		ioflag |= sequential_heuristic(uio, fp);
577 		break;
578 	case POSIX_FADV_RANDOM:
579 		/* Disable read-ahead for random I/O. */
580 		break;
581 	}
582 	offset = uio->uio_offset;
583 
584 #ifdef MAC
585 	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
586 	if (error == 0)
587 #endif
588 		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
589 	if ((flags & FOF_OFFSET) == 0) {
590 		fp->f_offset = uio->uio_offset;
591 		mtx_lock(mtxp);
592 		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
593 			wakeup(&fp->f_vnread_flags);
594 		fp->f_vnread_flags = 0;
595 		mtx_unlock(mtxp);
596 	}
597 	fp->f_nextoff = uio->uio_offset;
598 	VOP_UNLOCK(vp, 0);
599 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
600 	    offset != uio->uio_offset)
601 		error = VOP_ADVISE(vp, offset, uio->uio_offset - 1,
602 		    POSIX_FADV_DONTNEED);
603 	VFS_UNLOCK_GIANT(vfslocked);
604 	return (error);
605 }
606 
607 /*
608  * File table vnode write routine.
609  */
610 static int
611 vn_write(fp, uio, active_cred, flags, td)
612 	struct file *fp;
613 	struct uio *uio;
614 	struct ucred *active_cred;
615 	int flags;
616 	struct thread *td;
617 {
618 	struct vnode *vp;
619 	struct mount *mp;
620 	int error, ioflag, lock_flags;
621 	struct mtx *mtxp;
622 	int advice, vfslocked;
623 
624 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
625 	    uio->uio_td, td));
626 	vp = fp->f_vnode;
627 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
628 	if (vp->v_type == VREG)
629 		bwillwrite();
630 	ioflag = IO_UNIT;
631 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
632 		ioflag |= IO_APPEND;
633 	if (fp->f_flag & FNONBLOCK)
634 		ioflag |= IO_NDELAY;
635 	if (fp->f_flag & O_DIRECT)
636 		ioflag |= IO_DIRECT;
637 	if ((fp->f_flag & O_FSYNC) ||
638 	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
639 		ioflag |= IO_SYNC;
640 	mp = NULL;
641 	if (vp->v_type != VCHR &&
642 	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
643 		goto unlock;
644 
645 	if ((MNT_SHARED_WRITES(mp) ||
646 	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
647 	    (flags & FOF_OFFSET) != 0) {
648 		lock_flags = LK_SHARED;
649 	} else {
650 		lock_flags = LK_EXCLUSIVE;
651 	}
652 
653 	vn_lock(vp, lock_flags | LK_RETRY);
654 	if ((flags & FOF_OFFSET) == 0)
655 		uio->uio_offset = fp->f_offset;
656 	advice = POSIX_FADV_NORMAL;
657 	if (fp->f_advice != NULL) {
658 		mtxp = mtx_pool_find(mtxpool_sleep, fp);
659 		mtx_lock(mtxp);
660 		if (fp->f_advice != NULL &&
661 		    uio->uio_offset >= fp->f_advice->fa_start &&
662 		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
663 			advice = fp->f_advice->fa_advice;
664 		mtx_unlock(mtxp);
665 	}
666 	switch (advice) {
667 	case POSIX_FADV_NORMAL:
668 	case POSIX_FADV_SEQUENTIAL:
669 		ioflag |= sequential_heuristic(uio, fp);
670 		break;
671 	case POSIX_FADV_RANDOM:
672 		/* XXX: Is this correct? */
673 		break;
674 	case POSIX_FADV_NOREUSE:
675 		/*
676 		 * Request the underlying FS to discard the buffers
677 		 * and pages after the I/O is complete.
678 		 */
679 		ioflag |= IO_DIRECT;
680 		break;
681 	}
682 
683 #ifdef MAC
684 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
685 	if (error == 0)
686 #endif
687 		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
688 	if ((flags & FOF_OFFSET) == 0)
689 		fp->f_offset = uio->uio_offset;
690 	fp->f_nextoff = uio->uio_offset;
691 	VOP_UNLOCK(vp, 0);
692 	if (vp->v_type != VCHR)
693 		vn_finished_write(mp);
694 unlock:
695 	VFS_UNLOCK_GIANT(vfslocked);
696 	return (error);
697 }
698 
699 static const int io_hold_cnt = 16;
700 static int vn_io_fault_enable = 1;
701 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
702     &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
703 static unsigned long vn_io_faults_cnt;
704 SYSCTL_LONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
705     &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
706 
707 /*
708  * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
709  * prevent the following deadlock:
710  *
711  * Assume that the thread A reads from the vnode vp1 into userspace
712  * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
713  * currently not resident, then system ends up with the call chain
714  *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
715  *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
716  * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
717  * If, at the same time, thread B reads from vnode vp2 into buffer buf2
718  * backed by the pages of vnode vp1, and some page in buf2 is not
719  * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
720  *
721  * To prevent the lock order reversal and deadlock, vn_io_fault() does
722  * not allow page faults to happen during VOP_READ() or VOP_WRITE().
723  * Instead, it first tries to do the whole range i/o with pagefaults
724  * disabled. If all pages in the i/o buffer are resident and mapped,
725  * the VOP will succeed (barring genuine filesystem errors).
726  * Otherwise, we get back EFAULT, and vn_io_fault() falls back to doing
727  * the i/o in chunks, with all pages in the chunk prefaulted and held
728  * using vm_fault_quick_hold_pages().
729  *
730  * Filesystems using this deadlock avoidance scheme should use the
731  * array of the held pages from uio, saved in the curthread->td_ma,
732  * instead of doing uiomove().  A helper function,
733  * vn_io_fault_uiomove(), converts a uiomove request into
734  * uiomove_fromphys() over the td_ma array.
735  *
736  * Since vnode locks do not cover the whole i/o anymore, rangelocks
737  * make the current i/o request atomic with respect to other i/os and
738  * truncations.
739  */
740 static int
741 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
742     int flags, struct thread *td)
743 {
744 	vm_page_t ma[io_hold_cnt + 2];
745 	struct uio *uio_clone, short_uio;
746 	struct iovec short_iovec[1];
747 	fo_rdwr_t *doio;
748 	struct vnode *vp;
749 	void *rl_cookie;
750 	struct mount *mp;
751 	vm_page_t *prev_td_ma;
752 	int cnt, error, save, saveheld, prev_td_ma_cnt;
753 	vm_offset_t addr, end;
754 	vm_prot_t prot;
755 	size_t len, resid;
756 	ssize_t adv;
757 
758 	if (uio->uio_rw == UIO_READ)
759 		doio = vn_read;
760 	else
761 		doio = vn_write;
762 	vp = fp->f_vnode;
763 	if (uio->uio_segflg != UIO_USERSPACE || vp->v_type != VREG ||
764 	    ((mp = vp->v_mount) != NULL &&
765 	    (mp->mnt_kern_flag & MNTK_NO_IOPF) == 0) ||
766 	    !vn_io_fault_enable)
767 		return (doio(fp, uio, active_cred, flags, td));
768 
769 	/*
770  * UFS follows the IO_UNIT directive and restores both
771  * uio_offset and uio_resid if an error is encountered during the
772  * operation.  But, since the iovec may already be advanced, the
773  * uio is still left in an inconsistent state.
774 	 *
775 	 * Cache a copy of the original uio, which is advanced to the redo
776 	 * point using UIO_NOCOPY below.
777 	 */
778 	uio_clone = cloneuio(uio);
779 	resid = uio->uio_resid;
780 
781 	short_uio.uio_segflg = UIO_USERSPACE;
782 	short_uio.uio_rw = uio->uio_rw;
783 	short_uio.uio_td = uio->uio_td;
784 
785 	if (uio->uio_rw == UIO_READ) {
786 		prot = VM_PROT_WRITE;
787 		rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
788 		    uio->uio_offset + uio->uio_resid);
789 	} else {
790 		prot = VM_PROT_READ;
791 		if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0)
792 			/* For appenders, punt and lock the whole range. */
793 			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
794 		else
795 			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
796 			    uio->uio_offset + uio->uio_resid);
797 	}
798 
799 	save = vm_fault_disable_pagefaults();
800 	error = doio(fp, uio, active_cred, flags, td);
801 	if (error != EFAULT)
802 		goto out;
803 
804 	atomic_add_long(&vn_io_faults_cnt, 1);
805 	uio_clone->uio_segflg = UIO_NOCOPY;
806 	uiomove(NULL, resid - uio->uio_resid, uio_clone);
807 	uio_clone->uio_segflg = uio->uio_segflg;
808 
809 	saveheld = curthread_pflags_set(TDP_UIOHELD);
810 	prev_td_ma = td->td_ma;
811 	prev_td_ma_cnt = td->td_ma_cnt;
812 
813 	while (uio_clone->uio_resid != 0) {
814 		len = uio_clone->uio_iov->iov_len;
815 		if (len == 0) {
816 			KASSERT(uio_clone->uio_iovcnt >= 1,
817 			    ("iovcnt underflow"));
818 			uio_clone->uio_iov++;
819 			uio_clone->uio_iovcnt--;
820 			continue;
821 		}
822 
823 		addr = (vm_offset_t)uio_clone->uio_iov->iov_base;
824 		end = round_page(addr + len);
825 		cnt = howmany(end - trunc_page(addr), PAGE_SIZE);
826 		/*
827 		 * A perfectly misaligned address and length could cause
828 		 * both the start and the end of the chunk to use a partial
829 		 * page.  The +2 accounts for such a situation.
830 		 */
831 		if (cnt > io_hold_cnt + 2) {
832 			len = io_hold_cnt * PAGE_SIZE;
833 			KASSERT(howmany(round_page(addr + len) -
834 			    trunc_page(addr), PAGE_SIZE) <= io_hold_cnt + 2,
835 			    ("cnt overflow"));
836 		}
837 		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
838 		    addr, len, prot, ma, io_hold_cnt + 2);
839 		if (cnt == -1) {
840 			error = EFAULT;
841 			break;
842 		}
843 		short_uio.uio_iov = &short_iovec[0];
844 		short_iovec[0].iov_base = (void *)addr;
845 		short_uio.uio_iovcnt = 1;
846 		short_uio.uio_resid = short_iovec[0].iov_len = len;
847 		short_uio.uio_offset = uio_clone->uio_offset;
848 		td->td_ma = ma;
849 		td->td_ma_cnt = cnt;
850 
851 		error = doio(fp, &short_uio, active_cred, flags, td);
852 		vm_page_unhold_pages(ma, cnt);
853 		adv = len - short_uio.uio_resid;
854 
855 		uio_clone->uio_iov->iov_base =
856 		    (char *)uio_clone->uio_iov->iov_base + adv;
857 		uio_clone->uio_iov->iov_len -= adv;
858 		uio_clone->uio_resid -= adv;
859 		uio_clone->uio_offset += adv;
860 
861 		uio->uio_resid -= adv;
862 		uio->uio_offset += adv;
863 
864 		if (error != 0 || adv == 0)
865 			break;
866 	}
867 	td->td_ma = prev_td_ma;
868 	td->td_ma_cnt = prev_td_ma_cnt;
869 	curthread_pflags_restore(saveheld);
870 out:
871 	vm_fault_enable_pagefaults(save);
872 	vn_rangelock_unlock(vp, rl_cookie);
873 	free(uio_clone, M_IOV);
874 	return (error);
875 }
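
/*
 * Illustrative sketch (not part of this file): a filesystem opts into
 * the vn_io_fault() scheme by setting MNTK_NO_IOPF on its mount point,
 * typically from its VFS_MOUNT implementation, and by copying data with
 * vn_io_fault_uiomove() in its VOP_READ/VOP_WRITE paths (see below).
 *
 *	MNT_ILOCK(mp);
 *	mp->mnt_kern_flag |= MNTK_NO_IOPF;
 *	MNT_IUNLOCK(mp);
 */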
876 
877 /*
878  * Helper function to perform the requested uiomove operation using
879  * the held pages for the uio->uio_iov[0].iov_base buffer instead of
880  * copyin/copyout.  Accessing the pages with uiomove_fromphys()
881  * instead of through iov_base prevents page faults that could occur
882  * due to pmap_collect() invalidating the mapping created by
883  * vm_fault_quick_hold_pages(), or due to the pageout daemon, page
884  * laundry or object cleanup revoking the write access from page mappings.
885  *
886  * Filesystems that specify MNTK_NO_IOPF shall use vn_io_fault_uiomove()
887  * instead of plain uiomove().
888  */
889 int
890 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
891 {
892 	struct uio transp_uio;
893 	struct iovec transp_iov[1];
894 	struct thread *td;
895 	size_t adv;
896 	int error, pgadv;
897 
898 	td = curthread;
899 	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
900 	    uio->uio_segflg != UIO_USERSPACE)
901 		return (uiomove(data, xfersize, uio));
902 
903 	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
904 	transp_iov[0].iov_base = data;
905 	transp_uio.uio_iov = &transp_iov[0];
906 	transp_uio.uio_iovcnt = 1;
907 	if (xfersize > uio->uio_resid)
908 		xfersize = uio->uio_resid;
909 	transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
910 	transp_uio.uio_offset = 0;
911 	transp_uio.uio_segflg = UIO_SYSSPACE;
912 	/*
913 	 * Since transp_iov points to data, and the td_ma page array
914 	 * corresponds to the original uio->uio_iov, we need to invert the
915 	 * direction of the i/o operation as passed to
916 	 * uiomove_fromphys().
917 	 */
918 	switch (uio->uio_rw) {
919 	case UIO_WRITE:
920 		transp_uio.uio_rw = UIO_READ;
921 		break;
922 	case UIO_READ:
923 		transp_uio.uio_rw = UIO_WRITE;
924 		break;
925 	}
926 	transp_uio.uio_td = uio->uio_td;
927 	error = uiomove_fromphys(td->td_ma,
928 	    ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
929 	    xfersize, &transp_uio);
930 	adv = xfersize - transp_uio.uio_resid;
931 	pgadv =
932 	    (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
933 	    (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
934 	td->td_ma += pgadv;
935 	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
936 	    pgadv));
937 	td->td_ma_cnt -= pgadv;
938 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
939 	uio->uio_iov->iov_len -= adv;
940 	uio->uio_resid -= adv;
941 	uio->uio_offset += adv;
942 	return (error);
943 }
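
/*
 * Illustrative sketch (not part of this file): inside such a
 * filesystem's VOP_READ, the copy out of a locked buffer `bp' would use
 * vn_io_fault_uiomove() instead of uiomove(), e.g.
 *
 *	error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset,
 *	    (int)xfersize, uio);
 *
 * where `blkoffset' and `xfersize' are hypothetical names for the
 * offset into the block and the copy size.  When the pages are not held
 * (TDP_UIOHELD is clear), the call degrades to a plain uiomove().
 */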
944 
945 /*
946  * File table truncate routine.
947  */
948 static int
949 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
950     struct thread *td)
951 {
952 	struct vattr vattr;
953 	struct mount *mp;
954 	struct vnode *vp;
955 	void *rl_cookie;
956 	int vfslocked;
957 	int error;
958 
959 	vp = fp->f_vnode;
960 
961 	/*
962 	 * Lock the whole range for truncation.  Otherwise split i/o
963 	 * might happen partly before and partly after the truncation.
964 	 */
965 	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
966 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
967 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
968 	if (error)
969 		goto out1;
970 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
971 	if (vp->v_type == VDIR) {
972 		error = EISDIR;
973 		goto out;
974 	}
975 #ifdef MAC
976 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
977 	if (error)
978 		goto out;
979 #endif
980 	error = vn_writechk(vp);
981 	if (error == 0) {
982 		VATTR_NULL(&vattr);
983 		vattr.va_size = length;
984 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
985 	}
986 out:
987 	VOP_UNLOCK(vp, 0);
988 	vn_finished_write(mp);
989 out1:
990 	VFS_UNLOCK_GIANT(vfslocked);
991 	vn_rangelock_unlock(vp, rl_cookie);
992 	return (error);
993 }
994 
995 /*
996  * File table vnode stat routine.
997  */
998 static int
999 vn_statfile(fp, sb, active_cred, td)
1000 	struct file *fp;
1001 	struct stat *sb;
1002 	struct ucred *active_cred;
1003 	struct thread *td;
1004 {
1005 	struct vnode *vp = fp->f_vnode;
1006 	int vfslocked;
1007 	int error;
1008 
1009 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1010 	vn_lock(vp, LK_SHARED | LK_RETRY);
1011 	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
1012 	VOP_UNLOCK(vp, 0);
1013 	VFS_UNLOCK_GIANT(vfslocked);
1014 
1015 	return (error);
1016 }
1017 
1018 /*
1019  * Stat a vnode; implementation for the stat syscall
1020  */
1021 int
1022 vn_stat(vp, sb, active_cred, file_cred, td)
1023 	struct vnode *vp;
1024 	register struct stat *sb;
1025 	struct ucred *active_cred;
1026 	struct ucred *file_cred;
1027 	struct thread *td;
1028 {
1029 	struct vattr vattr;
1030 	register struct vattr *vap;
1031 	int error;
1032 	u_short mode;
1033 
1034 #ifdef MAC
1035 	error = mac_vnode_check_stat(active_cred, file_cred, vp);
1036 	if (error)
1037 		return (error);
1038 #endif
1039 
1040 	vap = &vattr;
1041 
1042 	/*
1043 	 * Initialize defaults for new and unusual fields, so that file
1044 	 * systems which don't support these fields don't need to know
1045 	 * about them.
1046 	 */
1047 	vap->va_birthtime.tv_sec = -1;
1048 	vap->va_birthtime.tv_nsec = 0;
1049 	vap->va_fsid = VNOVAL;
1050 	vap->va_rdev = NODEV;
1051 
1052 	error = VOP_GETATTR(vp, vap, active_cred);
1053 	if (error)
1054 		return (error);
1055 
1056 	/*
1057 	 * Zero the spare stat fields
1058 	 */
1059 	bzero(sb, sizeof *sb);
1060 
1061 	/*
1062 	 * Copy from vattr table
1063 	 */
1064 	if (vap->va_fsid != VNOVAL)
1065 		sb->st_dev = vap->va_fsid;
1066 	else
1067 		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
1068 	sb->st_ino = vap->va_fileid;
1069 	mode = vap->va_mode;
1070 	switch (vap->va_type) {
1071 	case VREG:
1072 		mode |= S_IFREG;
1073 		break;
1074 	case VDIR:
1075 		mode |= S_IFDIR;
1076 		break;
1077 	case VBLK:
1078 		mode |= S_IFBLK;
1079 		break;
1080 	case VCHR:
1081 		mode |= S_IFCHR;
1082 		break;
1083 	case VLNK:
1084 		mode |= S_IFLNK;
1085 		break;
1086 	case VSOCK:
1087 		mode |= S_IFSOCK;
1088 		break;
1089 	case VFIFO:
1090 		mode |= S_IFIFO;
1091 		break;
1092 	default:
1093 		return (EBADF);
1094 	}
1095 	sb->st_mode = mode;
1096 	sb->st_nlink = vap->va_nlink;
1097 	sb->st_uid = vap->va_uid;
1098 	sb->st_gid = vap->va_gid;
1099 	sb->st_rdev = vap->va_rdev;
1100 	if (vap->va_size > OFF_MAX)
1101 		return (EOVERFLOW);
1102 	sb->st_size = vap->va_size;
1103 	sb->st_atim = vap->va_atime;
1104 	sb->st_mtim = vap->va_mtime;
1105 	sb->st_ctim = vap->va_ctime;
1106 	sb->st_birthtim = vap->va_birthtime;
1107 
1108         /*
1109 	 * According to www.opengroup.org, the meaning of st_blksize is
1110 	 *   "a filesystem-specific preferred I/O block size for this
1111 	 *    object.  In some filesystem types, this may vary from file
1112 	 *    to file"
1113 	 * Use minimum/default of PAGE_SIZE (e.g. for VCHR).
1114 	 */
1115 
1116 	sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
1117 
1118 	sb->st_flags = vap->va_flags;
1119 	if (priv_check(td, PRIV_VFS_GENERATION))
1120 		sb->st_gen = 0;
1121 	else
1122 		sb->st_gen = vap->va_gen;
1123 
1124 	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
1125 	return (0);
1126 }
1127 
1128 /*
1129  * File table vnode ioctl routine.
1130  */
1131 static int
1132 vn_ioctl(fp, com, data, active_cred, td)
1133 	struct file *fp;
1134 	u_long com;
1135 	void *data;
1136 	struct ucred *active_cred;
1137 	struct thread *td;
1138 {
1139 	struct vnode *vp = fp->f_vnode;
1140 	struct vattr vattr;
1141 	int vfslocked;
1142 	int error;
1143 
1144 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1145 	error = ENOTTY;
1146 	switch (vp->v_type) {
1147 	case VREG:
1148 	case VDIR:
1149 		if (com == FIONREAD) {
1150 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1151 			error = VOP_GETATTR(vp, &vattr, active_cred);
1152 			VOP_UNLOCK(vp, 0);
1153 			if (!error)
1154 				*(int *)data = vattr.va_size - fp->f_offset;
1155 		}
1156 		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
1157 			error = 0;
1158 		else
1159 			error = VOP_IOCTL(vp, com, data, fp->f_flag,
1160 			    active_cred, td);
1161 		break;
1162 
1163 	default:
1164 		break;
1165 	}
1166 	VFS_UNLOCK_GIANT(vfslocked);
1167 	return (error);
1168 }
1169 
1170 /*
1171  * File table vnode poll routine.
1172  */
1173 static int
1174 vn_poll(fp, events, active_cred, td)
1175 	struct file *fp;
1176 	int events;
1177 	struct ucred *active_cred;
1178 	struct thread *td;
1179 {
1180 	struct vnode *vp;
1181 	int vfslocked;
1182 	int error;
1183 
1184 	vp = fp->f_vnode;
1185 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1186 #ifdef MAC
1187 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1188 	error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
1189 	VOP_UNLOCK(vp, 0);
1190 	if (!error)
1191 #endif
1192 
1193 	error = VOP_POLL(vp, events, fp->f_cred, td);
1194 	VFS_UNLOCK_GIANT(vfslocked);
1195 	return (error);
1196 }
1197 
1198 /*
1199  * Acquire the requested lock and then check for validity.  LK_RETRY
1200  * permits vn_lock to return doomed vnodes.
1201  */
1202 int
1203 _vn_lock(struct vnode *vp, int flags, char *file, int line)
1204 {
1205 	int error;
1206 
1207 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
1208 	    ("vn_lock called with no locktype."));
1209 	do {
1210 #ifdef DEBUG_VFS_LOCKS
1211 		KASSERT(vp->v_holdcnt != 0,
1212 		    ("vn_lock %p: zero hold count", vp));
1213 #endif
1214 		error = VOP_LOCK1(vp, flags, file, line);
1215 		flags &= ~LK_INTERLOCK;	/* Interlock is always dropped. */
1216 		KASSERT((flags & LK_RETRY) == 0 || error == 0,
1217 		    ("LK_RETRY set with incompatible flags (0x%x) or an error occurred (%d)",
1218 		    flags, error));
1219 		/*
1220 		 * Callers specify LK_RETRY if they wish to get dead vnodes.
1221 		 * If RETRY is not set, we return ENOENT instead.
1222 		 */
1223 		if (error == 0 && vp->v_iflag & VI_DOOMED &&
1224 		    (flags & LK_RETRY) == 0) {
1225 			VOP_UNLOCK(vp, 0);
1226 			error = ENOENT;
1227 			break;
1228 		}
1229 	} while (flags & LK_RETRY && error != 0);
1230 	return (error);
1231 }
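
/*
 * Illustrative sketch (not part of this file): a caller that holds a
 * reference on `vp' and can tolerate a reclaimed vnode locks it without
 * LK_RETRY and handles the ENOENT return:
 *
 *	error = vn_lock(vp, LK_EXCLUSIVE);
 *	if (error != 0)
 *		return (error);		... vnode was doomed ...
 *	...
 *	VOP_UNLOCK(vp, 0);
 *
 * Callers that must obtain the lock even for a doomed vnode pass
 * LK_RETRY and, if it matters, check VI_DOOMED themselves.
 */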
1232 
1233 /*
1234  * File table vnode close routine.
1235  */
1236 static int
1237 vn_closefile(fp, td)
1238 	struct file *fp;
1239 	struct thread *td;
1240 {
1241 	struct vnode *vp;
1242 	struct flock lf;
1243 	int vfslocked;
1244 	int error;
1245 
1246 	vp = fp->f_vnode;
1247 
1248 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1249 	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
1250 		lf.l_whence = SEEK_SET;
1251 		lf.l_start = 0;
1252 		lf.l_len = 0;
1253 		lf.l_type = F_UNLCK;
1254 		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
1255 	}
1256 
1257 	fp->f_ops = &badfileops;
1258 
1259 	error = vn_close(vp, fp->f_flag, fp->f_cred, td);
1260 	VFS_UNLOCK_GIANT(vfslocked);
1261 	return (error);
1262 }
1263 
1264 /*
1265  * Prepare to start a filesystem write operation.  If the operation is
1266  * permitted, then we bump the count of operations in progress and
1267  * proceed. If a suspend request is in progress, we wait until the
1268  * suspension is over, and then proceed.
1269  */
1270 int
1271 vn_start_write(vp, mpp, flags)
1272 	struct vnode *vp;
1273 	struct mount **mpp;
1274 	int flags;
1275 {
1276 	struct mount *mp;
1277 	int error;
1278 
1279 	error = 0;
1280 	/*
1281 	 * If a vnode is provided, get and return the mount point to
1282 	 * which it will write.
1283 	 */
1284 	if (vp != NULL) {
1285 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1286 			*mpp = NULL;
1287 			if (error != EOPNOTSUPP)
1288 				return (error);
1289 			return (0);
1290 		}
1291 	}
1292 	if ((mp = *mpp) == NULL)
1293 		return (0);
1294 
1295 	/*
1296 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1297 	 * a vfs_ref().
1298 	 * If a vnode was not provided, we need to acquire a reference
1299 	 * on the provided mount point ourselves, in order to
1300 	 * emulate that vfs_ref().
1301 	 */
1302 	MNT_ILOCK(mp);
1303 	if (vp == NULL)
1304 		MNT_REF(mp);
1305 
1306 	/*
1307 	 * Check on status of suspension.
1308 	 */
1309 	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
1310 	    mp->mnt_susp_owner != curthread) {
1311 		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1312 			if (flags & V_NOWAIT) {
1313 				error = EWOULDBLOCK;
1314 				goto unlock;
1315 			}
1316 			error = msleep(&mp->mnt_flag, MNT_MTX(mp),
1317 			    (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
1318 			if (error)
1319 				goto unlock;
1320 		}
1321 	}
1322 	if (flags & V_XSLEEP)
1323 		goto unlock;
1324 	mp->mnt_writeopcount++;
1325 unlock:
1326 	if (error != 0 || (flags & V_XSLEEP) != 0)
1327 		MNT_REL(mp);
1328 	MNT_IUNLOCK(mp);
1329 	return (error);
1330 }
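
/*
 * Illustrative sketch (not part of this file): the canonical bracket
 * around a vnode modification, as used by vn_truncate() above:
 *
 *	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 *	if (error != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	... modify the vnode ...
 *	VOP_UNLOCK(vp, 0);
 *	vn_finished_write(mp);
 */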
1331 
1332 /*
1333  * Secondary suspension. Used by operations such as vop_inactive
1334  * routines that are needed by the higher level functions. These
1335  * are allowed to proceed until all the higher level functions have
1336  * completed (indicated by mnt_writeopcount dropping to zero). At that
1337  * time, these operations are halted until the suspension is over.
1338  */
1339 int
1340 vn_start_secondary_write(vp, mpp, flags)
1341 	struct vnode *vp;
1342 	struct mount **mpp;
1343 	int flags;
1344 {
1345 	struct mount *mp;
1346 	int error;
1347 
1348  retry:
1349 	if (vp != NULL) {
1350 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1351 			*mpp = NULL;
1352 			if (error != EOPNOTSUPP)
1353 				return (error);
1354 			return (0);
1355 		}
1356 	}
1357 	/*
1358 	 * If we are not suspended or have not yet reached suspended
1359 	 * mode, then let the operation proceed.
1360 	 */
1361 	if ((mp = *mpp) == NULL)
1362 		return (0);
1363 
1364 	/*
1365 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1366 	 * a vfs_ref().
1367 	 * If a vnode was not provided, we need to acquire a reference
1368 	 * on the provided mount point ourselves, in order to
1369 	 * emulate that vfs_ref().
1370 	 */
1371 	MNT_ILOCK(mp);
1372 	if (vp == NULL)
1373 		MNT_REF(mp);
1374 	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
1375 		mp->mnt_secondary_writes++;
1376 		mp->mnt_secondary_accwrites++;
1377 		MNT_IUNLOCK(mp);
1378 		return (0);
1379 	}
1380 	if (flags & V_NOWAIT) {
1381 		MNT_REL(mp);
1382 		MNT_IUNLOCK(mp);
1383 		return (EWOULDBLOCK);
1384 	}
1385 	/*
1386 	 * Wait for the suspension to finish.
1387 	 */
1388 	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
1389 		       (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
1390 	vfs_rel(mp);
1391 	if (error == 0)
1392 		goto retry;
1393 	return (error);
1394 }
1395 
1396 /*
1397  * Filesystem write operation has completed. If we are suspending and this
1398  * operation is the last one, notify the suspender that the suspension is
1399  * now in effect.
1400  */
1401 void
1402 vn_finished_write(mp)
1403 	struct mount *mp;
1404 {
1405 	if (mp == NULL)
1406 		return;
1407 	MNT_ILOCK(mp);
1408 	MNT_REL(mp);
1409 	mp->mnt_writeopcount--;
1410 	if (mp->mnt_writeopcount < 0)
1411 		panic("vn_finished_write: neg cnt");
1412 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1413 	    mp->mnt_writeopcount <= 0)
1414 		wakeup(&mp->mnt_writeopcount);
1415 	MNT_IUNLOCK(mp);
1416 }
1417 
1418 
1419 /*
1420  * Filesystem secondary write operation has completed. If we are
1421  * suspending and this operation is the last one, notify the suspender
1422  * that the suspension is now in effect.
1423  */
1424 void
1425 vn_finished_secondary_write(mp)
1426 	struct mount *mp;
1427 {
1428 	if (mp == NULL)
1429 		return;
1430 	MNT_ILOCK(mp);
1431 	MNT_REL(mp);
1432 	mp->mnt_secondary_writes--;
1433 	if (mp->mnt_secondary_writes < 0)
1434 		panic("vn_finished_secondary_write: neg cnt");
1435 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1436 	    mp->mnt_secondary_writes <= 0)
1437 		wakeup(&mp->mnt_secondary_writes);
1438 	MNT_IUNLOCK(mp);
1439 }
1440 
1441 
1442 
1443 /*
1444  * Request a filesystem to suspend write operations.
1445  */
1446 int
1447 vfs_write_suspend(mp)
1448 	struct mount *mp;
1449 {
1450 	int error;
1451 
1452 	MNT_ILOCK(mp);
1453 	if (mp->mnt_susp_owner == curthread) {
1454 		MNT_IUNLOCK(mp);
1455 		return (EALREADY);
1456 	}
1457 	while (mp->mnt_kern_flag & MNTK_SUSPEND)
1458 		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
1459 	mp->mnt_kern_flag |= MNTK_SUSPEND;
1460 	mp->mnt_susp_owner = curthread;
1461 	if (mp->mnt_writeopcount > 0)
1462 		(void) msleep(&mp->mnt_writeopcount,
1463 		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
1464 	else
1465 		MNT_IUNLOCK(mp);
1466 	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
1467 		vfs_write_resume(mp);
1468 	return (error);
1469 }
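
/*
 * Illustrative sketch (not part of this file): a consumer such as
 * snapshot creation brackets its work with the suspension calls and
 * resumes even when its own work fails:
 *
 *	error = vfs_write_suspend(mp);
 *	if (error != 0)
 *		return (error);
 *	... the filesystem is quiesced and synced; do the work ...
 *	vfs_write_resume(mp);
 */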
1470 
1471 /*
1472  * Request a filesystem to resume write operations.
1473  */
1474 void
1475 vfs_write_resume(mp)
1476 	struct mount *mp;
1477 {
1478 
1479 	MNT_ILOCK(mp);
1480 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1481 		KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
1482 		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
1483 				       MNTK_SUSPENDED);
1484 		mp->mnt_susp_owner = NULL;
1485 		wakeup(&mp->mnt_writeopcount);
1486 		wakeup(&mp->mnt_flag);
1487 		curthread->td_pflags &= ~TDP_IGNSUSP;
1488 		MNT_IUNLOCK(mp);
1489 		VFS_SUSP_CLEAN(mp);
1490 	} else
1491 		MNT_IUNLOCK(mp);
1492 }
1493 
1494 /*
1495  * Implement kqueues for files by translating them into vnode operations.
1496  */
1497 static int
1498 vn_kqfilter(struct file *fp, struct knote *kn)
1499 {
1500 	int vfslocked;
1501 	int error;
1502 
1503 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
1504 	error = VOP_KQFILTER(fp->f_vnode, kn);
1505 	VFS_UNLOCK_GIANT(vfslocked);
1506 
1507 	return (error);
1508 }
1509 
1510 /*
1511  * Simplified in-kernel wrapper calls for extended attribute access.
1512  * Both calls pass in a NULL credential, authorizing as "kernel" access.
1513  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
1514  */
1515 int
1516 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
1517     const char *attrname, int *buflen, char *buf, struct thread *td)
1518 {
1519 	struct uio	auio;
1520 	struct iovec	iov;
1521 	int	error;
1522 
1523 	iov.iov_len = *buflen;
1524 	iov.iov_base = buf;
1525 
1526 	auio.uio_iov = &iov;
1527 	auio.uio_iovcnt = 1;
1528 	auio.uio_rw = UIO_READ;
1529 	auio.uio_segflg = UIO_SYSSPACE;
1530 	auio.uio_td = td;
1531 	auio.uio_offset = 0;
1532 	auio.uio_resid = *buflen;
1533 
1534 	if ((ioflg & IO_NODELOCKED) == 0)
1535 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1536 
1537 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1538 
1539 	/* authorize attribute retrieval as kernel */
1540 	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
1541 	    td);
1542 
1543 	if ((ioflg & IO_NODELOCKED) == 0)
1544 		VOP_UNLOCK(vp, 0);
1545 
1546 	if (error == 0) {
1547 		*buflen = *buflen - auio.uio_resid;
1548 	}
1549 
1550 	return (error);
1551 }
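
/*
 * Illustrative sketch (not part of this file): fetching a system-
 * namespace attribute into a fixed-size buffer; "example.attr" is a
 * placeholder name and the vnode is assumed to be locked already
 * (hence IO_NODELOCKED).
 *
 *	char buf[128];
 *	int buflen, error;
 *
 *	buflen = sizeof(buf);
 *	error = vn_extattr_get(vp, IO_NODELOCKED,
 *	    EXTATTR_NAMESPACE_SYSTEM, "example.attr", &buflen, buf, td);
 *	if (error == 0)
 *		... buflen now holds the number of bytes returned ...
 */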
1552 
1553 /*
1554  * XXX failure mode if partially written?
1555  */
1556 int
1557 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
1558     const char *attrname, int buflen, char *buf, struct thread *td)
1559 {
1560 	struct uio	auio;
1561 	struct iovec	iov;
1562 	struct mount	*mp;
1563 	int	error;
1564 
1565 	iov.iov_len = buflen;
1566 	iov.iov_base = buf;
1567 
1568 	auio.uio_iov = &iov;
1569 	auio.uio_iovcnt = 1;
1570 	auio.uio_rw = UIO_WRITE;
1571 	auio.uio_segflg = UIO_SYSSPACE;
1572 	auio.uio_td = td;
1573 	auio.uio_offset = 0;
1574 	auio.uio_resid = buflen;
1575 
1576 	if ((ioflg & IO_NODELOCKED) == 0) {
1577 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1578 			return (error);
1579 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1580 	}
1581 
1582 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1583 
1584 	/* authorize attribute setting as kernel */
1585 	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
1586 
1587 	if ((ioflg & IO_NODELOCKED) == 0) {
1588 		vn_finished_write(mp);
1589 		VOP_UNLOCK(vp, 0);
1590 	}
1591 
1592 	return (error);
1593 }
1594 
1595 int
1596 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
1597     const char *attrname, struct thread *td)
1598 {
1599 	struct mount	*mp;
1600 	int	error;
1601 
1602 	if ((ioflg & IO_NODELOCKED) == 0) {
1603 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1604 			return (error);
1605 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1606 	}
1607 
1608 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1609 
1610 	/* authorize attribute removal as kernel */
1611 	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
1612 	if (error == EOPNOTSUPP)
1613 		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
1614 		    NULL, td);
1615 
1616 	if ((ioflg & IO_NODELOCKED) == 0) {
1617 		vn_finished_write(mp);
1618 		VOP_UNLOCK(vp, 0);
1619 	}
1620 
1621 	return (error);
1622 }
1623 
1624 int
1625 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
1626 {
1627 	struct mount *mp;
1628 	int ltype, error;
1629 
1630 	mp = vp->v_mount;
1631 	ltype = VOP_ISLOCKED(vp);
1632 	KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
1633 	    ("vn_vget_ino: vp not locked"));
1634 	error = vfs_busy(mp, MBF_NOWAIT);
1635 	if (error != 0) {
1636 		vfs_ref(mp);
1637 		VOP_UNLOCK(vp, 0);
1638 		error = vfs_busy(mp, 0);
1639 		vn_lock(vp, ltype | LK_RETRY);
1640 		vfs_rel(mp);
1641 		if (error != 0)
1642 			return (ENOENT);
1643 		if (vp->v_iflag & VI_DOOMED) {
1644 			vfs_unbusy(mp);
1645 			return (ENOENT);
1646 		}
1647 	}
1648 	VOP_UNLOCK(vp, 0);
1649 	error = VFS_VGET(mp, ino, lkflags, rvp);
1650 	vfs_unbusy(mp);
1651 	vn_lock(vp, ltype | LK_RETRY);
1652 	if (vp->v_iflag & VI_DOOMED) {
1653 		if (error == 0)
1654 			vput(*rvp);
1655 		error = ENOENT;
1656 	}
1657 	return (error);
1658 }
1659 
1660 int
1661 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
1662     const struct thread *td)
1663 {
1664 
1665 	if (vp->v_type != VREG || td == NULL)
1666 		return (0);
1667 	PROC_LOCK(td->td_proc);
1668 	if ((uoff_t)uio->uio_offset + uio->uio_resid >
1669 	    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
1670 		kern_psignal(td->td_proc, SIGXFSZ);
1671 		PROC_UNLOCK(td->td_proc);
1672 		return (EFBIG);
1673 	}
1674 	PROC_UNLOCK(td->td_proc);
1675 	return (0);
1676 }
1677 
1678 int
1679 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
1680     struct thread *td)
1681 {
1682 	struct vnode *vp;
1683 	int error, vfslocked;
1684 
1685 	vp = fp->f_vnode;
1686 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1687 #ifdef AUDIT
1688 	vn_lock(vp, LK_SHARED | LK_RETRY);
1689 	AUDIT_ARG_VNODE1(vp);
1690 	VOP_UNLOCK(vp, 0);
1691 #endif
1692 	error = setfmode(td, active_cred, vp, mode);
1693 	VFS_UNLOCK_GIANT(vfslocked);
1694 	return (error);
1695 }
1696 
1697 int
1698 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
1699     struct thread *td)
1700 {
1701 	struct vnode *vp;
1702 	int error, vfslocked;
1703 
1704 	vp = fp->f_vnode;
1705 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1706 #ifdef AUDIT
1707 	vn_lock(vp, LK_SHARED | LK_RETRY);
1708 	AUDIT_ARG_VNODE1(vp);
1709 	VOP_UNLOCK(vp, 0);
1710 #endif
1711 	error = setfown(td, active_cred, vp, uid, gid);
1712 	VFS_UNLOCK_GIANT(vfslocked);
1713 	return (error);
1714 }
1715 
1716 void
1717 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
1718 {
1719 	vm_object_t object;
1720 
1721 	if ((object = vp->v_object) == NULL)
1722 		return;
1723 	VM_OBJECT_LOCK(object);
1724 	vm_object_page_remove(object, start, end, 0);
1725 	VM_OBJECT_UNLOCK(object);
1726 }
1727 
1728 int
1729 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
1730 {
1731 	struct vattr va;
1732 	daddr_t bn, bnp;
1733 	uint64_t bsize;
1734 	off_t noff;
1735 	int error;
1736 
1737 	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
1738 	    ("Wrong command %lu", cmd));
1739 
1740 	if (vn_lock(vp, LK_SHARED) != 0)
1741 		return (EBADF);
1742 	if (vp->v_type != VREG) {
1743 		error = ENOTTY;
1744 		goto unlock;
1745 	}
1746 	error = VOP_GETATTR(vp, &va, cred);
1747 	if (error != 0)
1748 		goto unlock;
1749 	noff = *off;
1750 	if (noff >= va.va_size) {
1751 		error = ENXIO;
1752 		goto unlock;
1753 	}
1754 	bsize = vp->v_mount->mnt_stat.f_iosize;
1755 	for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) {
1756 		error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
1757 		if (error == EOPNOTSUPP) {
1758 			error = ENOTTY;
1759 			goto unlock;
1760 		}
1761 		if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
1762 		    (bnp != -1 && cmd == FIOSEEKDATA)) {
1763 			noff = bn * bsize;
1764 			if (noff < *off)
1765 				noff = *off;
1766 			goto unlock;
1767 		}
1768 	}
1769 	if (noff > va.va_size)
1770 		noff = va.va_size;
1771 	/* noff == va.va_size.  There is an implicit hole at the end of the file. */
1772 	if (cmd == FIOSEEKDATA)
1773 		error = ENXIO;
1774 unlock:
1775 	VOP_UNLOCK(vp, 0);
1776 	if (error == 0)
1777 		*off = noff;
1778 	return (error);
1779 }
1780