xref: /freebsd/sys/kern/vfs_vnops.c (revision 4c3a868d13c053ef173268cdfe1365978a282178)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * (c) UNIX System Laboratories, Inc.
7  * All or some portions of this file are derived from material licensed
8  * to the University of California by American Telephone and Telegraph
9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10  * the permission of UNIX System Laboratories, Inc.
11  *
12  * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
13  * Copyright (c) 2013, 2014 The FreeBSD Foundation
14  *
15  * Portions of this software were developed by Konstantin Belousov
16  * under sponsorship from the FreeBSD Foundation.
17  *
18  * Redistribution and use in source and binary forms, with or without
19  * modification, are permitted provided that the following conditions
20  * are met:
21  * 1. Redistributions of source code must retain the above copyright
22  *    notice, this list of conditions and the following disclaimer.
23  * 2. Redistributions in binary form must reproduce the above copyright
24  *    notice, this list of conditions and the following disclaimer in the
25  *    documentation and/or other materials provided with the distribution.
26  * 3. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  */
42 
43 #include "opt_hwpmc_hooks.h"
44 #include "opt_hwt_hooks.h"
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/buf.h>
49 #include <sys/disk.h>
50 #include <sys/dirent.h>
51 #include <sys/fail.h>
52 #include <sys/fcntl.h>
53 #include <sys/file.h>
54 #include <sys/filio.h>
55 #include <sys/inotify.h>
56 #include <sys/ktr.h>
57 #include <sys/ktrace.h>
58 #include <sys/limits.h>
59 #include <sys/lock.h>
60 #include <sys/mman.h>
61 #include <sys/mount.h>
62 #include <sys/mutex.h>
63 #include <sys/namei.h>
64 #include <sys/priv.h>
65 #include <sys/prng.h>
66 #include <sys/proc.h>
67 #include <sys/rwlock.h>
68 #include <sys/sleepqueue.h>
69 #include <sys/stat.h>
70 #include <sys/sysctl.h>
71 #include <sys/unistd.h>
72 #include <sys/user.h>
73 #include <sys/vnode.h>
74 
75 #include <security/audit/audit.h>
76 #include <security/mac/mac_framework.h>
77 
78 #include <vm/vm.h>
79 #include <vm/vm_extern.h>
80 #include <vm/pmap.h>
81 #include <vm/vm_map.h>
82 #include <vm/vm_object.h>
83 #include <vm/vm_page.h>
84 #include <vm/vm_pager.h>
85 #include <vm/vnode_pager.h>
86 
87 #ifdef HWPMC_HOOKS
88 #include <sys/pmckern.h>
89 #endif
90 
91 #ifdef HWT_HOOKS
92 #include <dev/hwt/hwt_hook.h>
93 #endif
94 
95 static fo_rdwr_t	vn_read;
96 static fo_rdwr_t	vn_write;
97 static fo_rdwr_t	vn_io_fault;
98 static fo_truncate_t	vn_truncate;
99 static fo_ioctl_t	vn_ioctl;
100 static fo_poll_t	vn_poll;
101 static fo_kqfilter_t	vn_kqfilter;
102 static fo_close_t	vn_closefile;
103 static fo_mmap_t	vn_mmap;
104 static fo_fallocate_t	vn_fallocate;
105 static fo_fspacectl_t	vn_fspacectl;
106 
107 const struct fileops vnops = {
108 	.fo_read = vn_io_fault,
109 	.fo_write = vn_io_fault,
110 	.fo_truncate = vn_truncate,
111 	.fo_ioctl = vn_ioctl,
112 	.fo_poll = vn_poll,
113 	.fo_kqfilter = vn_kqfilter,
114 	.fo_stat = vn_statfile,
115 	.fo_close = vn_closefile,
116 	.fo_chmod = vn_chmod,
117 	.fo_chown = vn_chown,
118 	.fo_sendfile = vn_sendfile,
119 	.fo_seek = vn_seek,
120 	.fo_fill_kinfo = vn_fill_kinfo,
121 	.fo_mmap = vn_mmap,
122 	.fo_fallocate = vn_fallocate,
123 	.fo_fspacectl = vn_fspacectl,
124 	.fo_cmp = vn_cmp,
125 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
126 };
127 
128 const u_int io_hold_cnt = 16;
129 static int vn_io_fault_enable = 1;
130 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN,
131     &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
132 static int vn_io_fault_prefault = 0;
133 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN,
134     &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
135 static int vn_io_pgcache_read_enable = 1;
136 SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN,
137     &vn_io_pgcache_read_enable, 0,
138     "Enable copying from page cache for reads, avoiding fs");
139 static u_long vn_io_faults_cnt;
140 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
141     &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
142 
143 static int vfs_allow_read_dir = 0;
144 SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW,
145     &vfs_allow_read_dir, 0,
146     "Enable read(2) of directory by root for filesystems that support it");
147 
148 /*
149  * Returns true if vn_io_fault mode of handling the i/o request should
150  * be used.
151  */
152 static bool
153 do_vn_io_fault(struct vnode *vp, struct uio *uio)
154 {
155 	struct mount *mp;
156 
157 	return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
158 	    (mp = vp->v_mount) != NULL &&
159 	    (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
160 }
161 
162 /*
163  * Structure used to pass arguments to vn_io_fault1(), to do either
164  * file- or vnode-based I/O calls.
165  */
166 struct vn_io_fault_args {
167 	enum {
168 		VN_IO_FAULT_FOP,
169 		VN_IO_FAULT_VOP
170 	} kind;
171 	struct ucred *cred;
172 	int flags;
173 	union {
174 		struct fop_args_tag {
175 			struct file *fp;
176 			fo_rdwr_t *doio;
177 		} fop_args;
178 		struct vop_args_tag {
179 			struct vnode *vp;
180 		} vop_args;
181 	} args;
182 };
183 
184 static int vn_io_fault1(struct vnode *vp, struct uio *uio,
185     struct vn_io_fault_args *args, struct thread *td);
186 
187 int
188 vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp)
189 {
190 	struct thread *td = curthread;
191 
192 	return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
193 }
194 
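/*
 * Translate open(2) mode bits and VN_OPEN_* flags into namei(9)
 * cn_flags for the lookup.
 */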
195 static uint64_t
196 open2nameif(int fmode, u_int vn_open_flags, uint64_t cn_flags)
197 {
198 	uint64_t res;
199 
200 	res = ISOPEN | LOCKLEAF | cn_flags;
201 	if ((fmode & O_RESOLVE_BENEATH) != 0)
202 		res |= RBENEATH;
203 	if ((fmode & O_EMPTY_PATH) != 0)
204 		res |= EMPTYPATH;
205 	if ((fmode & FREAD) != 0)
206 		res |= OPENREAD;
207 	if ((fmode & FWRITE) != 0)
208 		res |= OPENWRITE;
209 	if ((fmode & O_NAMEDATTR) != 0)
210 		res |= OPENNAMED | CREATENAMED;
211 	if ((fmode & O_NOFOLLOW) != 0)
212 		res &= ~FOLLOW;
213 	if ((vn_open_flags & VN_OPEN_NOAUDIT) == 0)
214 		res |= AUDITVNODE1;
215 	else
216 		res &= ~AUDITVNODE1;
217 	if ((vn_open_flags & VN_OPEN_NOCAPCHECK) != 0)
218 		res |= NOCAPCHECK;
219 	if ((vn_open_flags & VN_OPEN_WANTIOCTLCAPS) != 0)
220 		res |= WANTIOCTLCAPS;
221 
222 	return (res);
223 }
224 
225 /*
226  * For the O_NAMEDATTR case, check for a valid use of it.
227  */
228 static int
229 vfs_check_namedattr(struct vnode *vp)
230 {
231 	int error;
232 	short irflag;
233 
234 	error = 0;
235 	irflag = vn_irflag_read(vp);
236 	if ((vp->v_mount->mnt_flag & MNT_NAMEDATTR) == 0 ||
237 	    ((irflag & VIRF_NAMEDATTR) != 0 && vp->v_type != VREG))
238 		error = EINVAL;
239 	else if ((irflag & (VIRF_NAMEDDIR | VIRF_NAMEDATTR)) == 0)
240 		error = ENOATTR;
241 	return (error);
242 }
243 
244 /*
245  * Common code for vnode open operations via a name lookup.
246  * Lookup the vnode and invoke VOP_CREATE if needed.
247  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
248  *
249  * Note that this does NOT free nameidata for the successful case,
250  * due to the NDINIT being done elsewhere.
251  */
252 int
253 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
254     struct ucred *cred, struct file *fp)
255 {
256 	struct vnode *vp;
257 	struct mount *mp;
258 	struct vattr vat;
259 	struct vattr *vap = &vat;
260 	int fmode, error;
261 	bool first_open;
262 
263 restart:
264 	first_open = false;
265 	fmode = *flagp;
266 	if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
267 	    O_EXCL | O_DIRECTORY) ||
268 	    (fmode & (O_CREAT | O_EMPTY_PATH)) == (O_CREAT | O_EMPTY_PATH))
269 		return (EINVAL);
270 	else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
271 		ndp->ni_cnd.cn_nameiop = CREATE;
272 		ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags,
273 		    ndp->ni_cnd.cn_flags);
274 
275 		/*
276 		 * Set NOCACHE to avoid flushing the cache when
277 		 * rolling in many files at once.
278 		 *
279 		 * Set NC_KEEPPOSENTRY to keep positive entries if they already
280 		 * exist despite NOCACHE.
281 		 */
282 		ndp->ni_cnd.cn_flags |= LOCKPARENT | NOCACHE | NC_KEEPPOSENTRY;
283 		if ((fmode & O_EXCL) != 0)
284 			ndp->ni_cnd.cn_flags &= ~FOLLOW;
285 		if ((vn_open_flags & VN_OPEN_INVFS) == 0)
286 			bwillwrite();
287 		if ((error = namei(ndp)) != 0)
288 			return (error);
289 		if (ndp->ni_vp == NULL) {
290 			if ((fmode & O_NAMEDATTR) != 0 &&
291 			    (ndp->ni_dvp->v_mount->mnt_flag & MNT_NAMEDATTR) ==
292 			    0) {
293 				error = EINVAL;
294 				vp = ndp->ni_dvp;
295 				ndp->ni_dvp = NULL;
296 				goto bad;
297 			}
298 			VATTR_NULL(vap);
299 			vap->va_type = VREG;
300 			vap->va_mode = cmode;
301 			if (fmode & O_EXCL)
302 				vap->va_vaflags |= VA_EXCLUSIVE;
303 			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
304 				NDFREE_PNBUF(ndp);
305 				vput(ndp->ni_dvp);
306 				if ((error = vn_start_write(NULL, &mp,
307 				    V_XSLEEP | V_PCATCH)) != 0)
308 					return (error);
309 				NDREINIT(ndp);
310 				goto restart;
311 			}
312 			if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0 ||
313 			    (vn_irflag_read(ndp->ni_dvp) & VIRF_INOTIFY) != 0)
314 				ndp->ni_cnd.cn_flags |= MAKEENTRY;
315 #ifdef MAC
316 			error = mac_vnode_check_create(cred, ndp->ni_dvp,
317 			    &ndp->ni_cnd, vap);
318 			if (error == 0)
319 #endif
320 				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
321 				    &ndp->ni_cnd, vap);
322 			vp = ndp->ni_vp;
323 			if (error == 0 && (fmode & O_EXCL) != 0 &&
324 			    (fmode & (O_EXLOCK | O_SHLOCK)) != 0) {
325 				VI_LOCK(vp);
326 				vp->v_iflag |= VI_FOPENING;
327 				VI_UNLOCK(vp);
328 				first_open = true;
329 			}
330 			VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &vp : NULL,
331 			    false);
332 			vn_finished_write(mp);
333 			if (error) {
334 				NDFREE_PNBUF(ndp);
335 				if (error == ERELOOKUP) {
336 					NDREINIT(ndp);
337 					goto restart;
338 				}
339 				return (error);
340 			}
341 			fmode &= ~O_TRUNC;
342 		} else {
343 			if (ndp->ni_dvp == ndp->ni_vp)
344 				vrele(ndp->ni_dvp);
345 			else
346 				vput(ndp->ni_dvp);
347 			ndp->ni_dvp = NULL;
348 			vp = ndp->ni_vp;
349 			if (fmode & O_EXCL) {
350 				error = EEXIST;
351 				goto bad;
352 			}
353 			if ((fmode & O_NAMEDATTR) != 0) {
354 				error = vfs_check_namedattr(vp);
355 				if (error != 0)
356 					goto bad;
357 			} else if (vp->v_type == VDIR) {
358 				error = EISDIR;
359 				goto bad;
360 			}
361 			fmode &= ~O_CREAT;
362 		}
363 	} else {
364 		ndp->ni_cnd.cn_nameiop = LOOKUP;
365 		ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags,
366 		    ndp->ni_cnd.cn_flags);
367 		if ((fmode & FWRITE) == 0)
368 			ndp->ni_cnd.cn_flags |= LOCKSHARED;
369 		if ((error = namei(ndp)) != 0)
370 			return (error);
371 		vp = ndp->ni_vp;
372 		if ((fmode & O_NAMEDATTR) != 0) {
373 			error = vfs_check_namedattr(vp);
374 			if (error != 0)
375 				goto bad;
376 		}
377 	}
378 	error = vn_open_vnode(vp, fmode, cred, curthread, fp);
379 	if (first_open) {
380 		VI_LOCK(vp);
381 		vp->v_iflag &= ~VI_FOPENING;
382 		wakeup(vp);
383 		VI_UNLOCK(vp);
384 	}
385 	if (error)
386 		goto bad;
387 	*flagp = fmode;
388 	return (0);
389 bad:
390 	NDFREE_PNBUF(ndp);
391 	vput(vp);
392 	*flagp = fmode;
393 	ndp->ni_vp = NULL;
394 	return (error);
395 }
396 
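/*
 * Apply the whole-file advisory lock requested by O_EXLOCK/O_SHLOCK,
 * temporarily dropping the vnode lock around VOP_ADVLOCK().
 */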
397 static int
398 vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp)
399 {
400 	struct flock lf;
401 	int error, lock_flags, type;
402 
403 	ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock");
404 	if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0)
405 		return (0);
406 	KASSERT(fp != NULL, ("open with flock requires fp"));
407 	if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE)
408 		return (EOPNOTSUPP);
409 
410 	lock_flags = VOP_ISLOCKED(vp);
411 	VOP_UNLOCK(vp);
412 
413 	lf.l_whence = SEEK_SET;
414 	lf.l_start = 0;
415 	lf.l_len = 0;
416 	lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK;
417 	type = F_FLOCK;
418 	if ((fmode & FNONBLOCK) == 0)
419 		type |= F_WAIT;
420 	if ((fmode & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
421 		type |= F_FIRSTOPEN;
422 	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
423 	if (error == 0)
424 		fp->f_flag |= FHASLOCK;
425 
426 	vn_lock(vp, lock_flags | LK_RETRY);
427 	return (error);
428 }
429 
430 /*
431  * Common code for vnode open operations once a vnode is located.
432  * Check permissions, and call the VOP_OPEN routine.
433  */
434 int
435 vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
436     struct thread *td, struct file *fp)
437 {
438 	accmode_t accmode;
439 	int error;
440 
441 	KASSERT((fmode & O_PATH) == 0 || (fmode & O_ACCMODE) == 0,
442 	    ("%s: O_PATH and O_ACCMODE are mutually exclusive", __func__));
443 
444 	if (vp->v_type == VLNK) {
445 		if ((fmode & O_PATH) == 0 || (fmode & FEXEC) != 0)
446 			return (EMLINK);
447 	}
448 	if (vp->v_type != VDIR && fmode & O_DIRECTORY)
449 		return (ENOTDIR);
450 
451 	accmode = 0;
452 	if ((fmode & O_PATH) == 0) {
453 		if (vp->v_type == VSOCK)
454 			return (EOPNOTSUPP);
455 		if ((fmode & (FWRITE | O_TRUNC)) != 0) {
456 			if (vp->v_type == VDIR)
457 				return (EISDIR);
458 			accmode |= VWRITE;
459 		}
460 		if ((fmode & FREAD) != 0)
461 			accmode |= VREAD;
462 		if ((fmode & O_APPEND) && (fmode & FWRITE))
463 			accmode |= VAPPEND;
464 #ifdef MAC
465 		if ((fmode & O_CREAT) != 0)
466 			accmode |= VCREAT;
467 #endif
468 	}
469 	if ((fmode & FEXEC) != 0)
470 		accmode |= VEXEC;
471 #ifdef MAC
472 	if ((fmode & O_VERIFY) != 0)
473 		accmode |= VVERIFY;
474 	error = mac_vnode_check_open(cred, vp, accmode);
475 	if (error != 0)
476 		return (error);
477 
478 	accmode &= ~(VCREAT | VVERIFY);
479 #endif
480 	if ((fmode & O_CREAT) == 0 && accmode != 0) {
481 		error = VOP_ACCESS(vp, accmode, cred, td);
482 		if (error != 0)
483 			return (error);
484 	}
485 	if ((fmode & O_PATH) != 0) {
486 		if (vp->v_type != VFIFO && vp->v_type != VSOCK &&
487 		    VOP_ACCESS(vp, VREAD, cred, td) == 0)
488 			fp->f_flag |= FKQALLOWED;
489 		INOTIFY(vp, IN_OPEN);
490 		return (0);
491 	}
492 
493 	if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
494 		vn_lock(vp, LK_UPGRADE | LK_RETRY);
495 	error = VOP_OPEN(vp, fmode, cred, td, fp);
496 	if (error != 0)
497 		return (error);
498 
499 	error = vn_open_vnode_advlock(vp, fmode, fp);
500 	if (error == 0 && (fmode & FWRITE) != 0) {
501 		error = VOP_ADD_WRITECOUNT(vp, 1);
502 		if (error == 0) {
503 			CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
504 			     __func__, vp, vp->v_writecount);
505 		}
506 	}
507 
508 	/*
509 	 * Error from advlock or VOP_ADD_WRITECOUNT() still requires
510 	 * calling VOP_CLOSE() to pair with earlier VOP_OPEN().
511 	 */
512 	if (error != 0) {
513 		if (fp != NULL) {
514 			/*
515 			 * Arrange the call by having fdrop() to use
516 			 * vn_closefile().  This is to satisfy
517 			 * filesystems like devfs or tmpfs, which
518 			 * override fo_close().
519 			 */
520 			fp->f_flag |= FOPENFAILED;
521 			fp->f_vnode = vp;
522 			if (fp->f_ops == &badfileops) {
523 				fp->f_type = DTYPE_VNODE;
524 				fp->f_ops = &vnops;
525 			}
526 			vref(vp);
527 		} else {
528 			/*
529 			 * If there is no fp, due to kernel-mode open,
530 			 * we can call VOP_CLOSE() now.
531 			 */
532 			if ((vp->v_type == VFIFO ||
533 			    !MNT_EXTENDED_SHARED(vp->v_mount)) &&
534 			    VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
535 				vn_lock(vp, LK_UPGRADE | LK_RETRY);
536 			(void)VOP_CLOSE(vp, fmode & (FREAD | FWRITE | FEXEC),
537 			    cred, td);
538 		}
539 	}
540 
541 	ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
542 	return (error);
543 
544 }
545 
546 /*
547  * Check for write permissions on the specified vnode.
548  * Prototype text segments cannot be written.
549  * It is racy.
550  */
551 int
552 vn_writechk(struct vnode *vp)
553 {
554 
555 	ASSERT_VOP_LOCKED(vp, "vn_writechk");
556 	/*
557 	 * If there's shared text associated with
558 	 * the vnode, try to free it up once.  If
559 	 * we fail, we can't allow writing.
560 	 */
561 	if (VOP_IS_TEXT(vp))
562 		return (ETXTBSY);
563 
564 	return (0);
565 }
566 
567 /*
568  * Vnode close call
569  */
570 static int
571 vn_close1(struct vnode *vp, int flags, struct ucred *file_cred,
572     struct thread *td, bool keep_ref)
573 {
574 	struct mount *mp;
575 	int error, lock_flags;
576 
577 	lock_flags = vp->v_type != VFIFO && MNT_EXTENDED_SHARED(vp->v_mount) ?
578 	    LK_SHARED : LK_EXCLUSIVE;
579 
580 	vn_start_write(vp, &mp, V_WAIT);
581 	vn_lock(vp, lock_flags | LK_RETRY);
582 	AUDIT_ARG_VNODE1(vp);
583 	if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
584 		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
585 		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
586 		    __func__, vp, vp->v_writecount);
587 	}
588 	error = VOP_CLOSE(vp, flags, file_cred, td);
589 	if (keep_ref)
590 		VOP_UNLOCK(vp);
591 	else
592 		vput(vp);
593 	vn_finished_write(mp);
594 	return (error);
595 }
596 
597 int
598 vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
599     struct thread *td)
600 {
601 
602 	return (vn_close1(vp, flags, file_cred, td, false));
603 }
604 
605 /*
606  * Heuristic to detect sequential operation.
607  */
608 static int
609 sequential_heuristic(struct uio *uio, struct file *fp)
610 {
611 	enum uio_rw rw;
612 
613 	ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
614 
615 	rw = uio->uio_rw;
616 	if (fp->f_flag & FRDAHEAD)
617 		return (fp->f_seqcount[rw] << IO_SEQSHIFT);
618 
619 	/*
620 	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
621 	 * that the first I/O is normally considered to be slightly
622 	 * sequential.  Seeking to offset 0 doesn't change sequentiality
623 	 * unless previous seeks have reduced f_seqcount to 0, in which
624 	 * case offset 0 is not special.
625 	 */
626 	if ((uio->uio_offset == 0 && fp->f_seqcount[rw] > 0) ||
627 	    uio->uio_offset == fp->f_nextoff[rw]) {
628 		/*
629 		 * f_seqcount is in units of fixed-size blocks so that it
630 		 * depends mainly on the amount of sequential I/O and not
631 		 * much on the number of sequential I/O's.  The fixed size
632 		 * of 16384 is hard-coded here since it is (not quite) just
633 		 * a magic size that works well here.  This size is more
634 		 * closely related to the best I/O size for real disks than
635 		 * to any block size used by software.
636 		 */
637 		if (uio->uio_resid >= IO_SEQMAX * 16384)
638 			fp->f_seqcount[rw] = IO_SEQMAX;
639 		else {
640 			fp->f_seqcount[rw] += howmany(uio->uio_resid, 16384);
641 			if (fp->f_seqcount[rw] > IO_SEQMAX)
642 				fp->f_seqcount[rw] = IO_SEQMAX;
643 		}
644 		return (fp->f_seqcount[rw] << IO_SEQSHIFT);
645 	}
646 
647 	/* Not sequential.  Quickly draw-down sequentiality. */
648 	if (fp->f_seqcount[rw] > 1)
649 		fp->f_seqcount[rw] = 1;
650 	else
651 		fp->f_seqcount[rw] = 0;
652 	return (0);
653 }
654 
655 /*
656  * Package up an I/O request on a vnode into a uio and do it.
657  */
658 int
659 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
660     enum uio_seg segflg, int ioflg, struct ucred *active_cred,
661     struct ucred *file_cred, ssize_t *aresid, struct thread *td)
662 {
663 	struct uio auio;
664 	struct iovec aiov;
665 	struct mount *mp;
666 	struct ucred *cred;
667 	void *rl_cookie;
668 	struct vn_io_fault_args args;
669 	int error, lock_flags;
670 
671 	if (offset < 0 && vp->v_type != VCHR)
672 		return (EINVAL);
673 	auio.uio_iov = &aiov;
674 	auio.uio_iovcnt = 1;
675 	aiov.iov_base = base;
676 	aiov.iov_len = len;
677 	auio.uio_resid = len;
678 	auio.uio_offset = offset;
679 	auio.uio_segflg = segflg;
680 	auio.uio_rw = rw;
681 	auio.uio_td = td;
682 	error = 0;
683 
684 	if ((ioflg & IO_NODELOCKED) == 0) {
685 		if ((ioflg & IO_RANGELOCKED) == 0) {
686 			if (rw == UIO_READ) {
687 				rl_cookie = vn_rangelock_rlock(vp, offset,
688 				    offset + len);
689 			} else if ((ioflg & IO_APPEND) != 0) {
690 				rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
691 			} else {
692 				rl_cookie = vn_rangelock_wlock(vp, offset,
693 				    offset + len);
694 			}
695 		} else
696 			rl_cookie = NULL;
697 		mp = NULL;
698 		if (rw == UIO_WRITE) {
699 			if (vp->v_type != VCHR &&
700 			    (error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH))
701 			    != 0)
702 				goto out;
703 			lock_flags = vn_lktype_write(mp, vp);
704 		} else
705 			lock_flags = LK_SHARED;
706 		vn_lock(vp, lock_flags | LK_RETRY);
707 	} else
708 		rl_cookie = NULL;
709 
710 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
711 #ifdef MAC
712 	if ((ioflg & IO_NOMACCHECK) == 0) {
713 		if (rw == UIO_READ)
714 			error = mac_vnode_check_read(active_cred, file_cred,
715 			    vp);
716 		else
717 			error = mac_vnode_check_write(active_cred, file_cred,
718 			    vp);
719 	}
720 #endif
721 	if (error == 0) {
722 		if (file_cred != NULL)
723 			cred = file_cred;
724 		else
725 			cred = active_cred;
726 		if (do_vn_io_fault(vp, &auio)) {
727 			args.kind = VN_IO_FAULT_VOP;
728 			args.cred = cred;
729 			args.flags = ioflg;
730 			args.args.vop_args.vp = vp;
731 			error = vn_io_fault1(vp, &auio, &args, td);
732 		} else if (rw == UIO_READ) {
733 			error = VOP_READ(vp, &auio, ioflg, cred);
734 		} else /* if (rw == UIO_WRITE) */ {
735 			error = VOP_WRITE(vp, &auio, ioflg, cred);
736 		}
737 	}
738 	if (aresid)
739 		*aresid = auio.uio_resid;
740 	else
741 		if (auio.uio_resid && error == 0)
742 			error = EIO;
743 	if ((ioflg & IO_NODELOCKED) == 0) {
744 		VOP_UNLOCK(vp);
745 		if (mp != NULL)
746 			vn_finished_write(mp);
747 	}
748  out:
749 	if (rl_cookie != NULL)
750 		vn_rangelock_unlock(vp, rl_cookie);
751 	return (error);
752 }
753 
754 /*
755  * Package up an I/O request on a vnode into a uio and do it.  The I/O
756  * request is split up into smaller chunks and we try to avoid saturating
757  * the buffer cache while potentially holding a vnode locked, so we
758  * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
759  * to give other processes a chance to lock the vnode (either other processes
760  * core'ing the same binary, or unrelated processes scanning the directory).
761  */
762 int
763 vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len,
764     off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred,
765     struct ucred *file_cred, size_t *aresid, struct thread *td)
766 {
767 	int error = 0;
768 	ssize_t iaresid;
769 
770 	do {
771 		int chunk;
772 
773 		/*
774 		 * Force `offset' to a multiple of MAXBSIZE except possibly
775 		 * for the first chunk, so that filesystems only need to
776 		 * write full blocks except possibly for the first and last
777 		 * chunks.
778 		 */
779 		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
780 
781 		if (chunk > len)
782 			chunk = len;
783 		if (rw != UIO_READ && vp->v_type == VREG)
784 			bwillwrite();
785 		iaresid = 0;
786 		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
787 		    ioflg, active_cred, file_cred, &iaresid, td);
788 		len -= chunk;	/* aresid calc already includes length */
789 		if (error)
790 			break;
791 		offset += chunk;
792 		base = (char *)base + chunk;
793 		kern_yield(PRI_USER);
794 	} while (len);
795 	if (aresid)
796 		*aresid = len + iaresid;
797 	return (error);
798 }
799 
800 #if OFF_MAX <= LONG_MAX
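/*
 * Acquire one of the f_vflags bit locks.  The fast path is a single
 * atomic cmpset; contended acquisitions sleep on the flags word.
 */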
801 static void
802 file_v_lock(struct file *fp, short lock_bit, short lock_wait_bit)
803 {
804 	short *flagsp;
805 	short state;
806 
807 	flagsp = &fp->f_vflags;
808 	state = atomic_load_16(flagsp);
809 	if ((state & lock_bit) == 0 &&
810 	    atomic_cmpset_acq_16(flagsp, state, state | lock_bit))
811 		return;
812 
813 	sleepq_lock(flagsp);
814 	state = atomic_load_16(flagsp);
815 	for (;;) {
816 		if ((state & lock_bit) == 0) {
817 			if (!atomic_fcmpset_acq_16(flagsp, &state,
818 			    state | lock_bit))
819 				continue;
820 			break;
821 		}
822 		if ((state & lock_wait_bit) == 0) {
823 			if (!atomic_fcmpset_acq_16(flagsp, &state,
824 			    state | lock_wait_bit))
825 				continue;
826 		}
827 		DROP_GIANT();
828 		sleepq_add(flagsp, NULL, "vofflock", 0, 0);
829 		sleepq_wait(flagsp, PRI_MAX_KERN);
830 		PICKUP_GIANT();
831 		sleepq_lock(flagsp);
832 		state = atomic_load_16(flagsp);
833 	}
834 	sleepq_release(flagsp);
835 }
836 
837 static void
838 file_v_unlock(struct file *fp, short lock_bit, short lock_wait_bit)
839 {
840 	short *flagsp;
841 	short state;
842 
843 	flagsp = &fp->f_vflags;
844 	state = atomic_load_16(flagsp);
845 	if ((state & lock_wait_bit) == 0 &&
846 	    atomic_cmpset_rel_16(flagsp, state, state & ~lock_bit))
847 		return;
848 
849 	sleepq_lock(flagsp);
850 	MPASS((*flagsp & lock_bit) != 0);
851 	MPASS((*flagsp & lock_wait_bit) != 0);
852 	atomic_clear_16(flagsp, lock_bit | lock_wait_bit);
853 	sleepq_broadcast(flagsp, SLEEPQ_SLEEP, 0, 0);
854 	sleepq_release(flagsp);
855 }
856 
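/*
 * Lock the file offset (unless FOF_NOLOCK) and return its current value.
 */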
857 off_t
858 foffset_lock(struct file *fp, int flags)
859 {
860 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
861 
862 	if ((flags & FOF_NOLOCK) == 0) {
863 		file_v_lock(fp, FILE_V_FOFFSET_LOCKED,
864 		    FILE_V_FOFFSET_LOCK_WAITING);
865 	}
866 
867 	/*
868 	 * According to McKusick the vn lock was protecting f_offset here.
869 	 * It is now protected by the FOFFSET_LOCKED flag.
870 	 */
871 	return (atomic_load_long(&fp->f_offset));
872 }
873 
874 void
875 foffset_unlock(struct file *fp, off_t val, int flags)
876 {
877 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
878 
879 	if ((flags & FOF_NOUPDATE) == 0)
880 		atomic_store_long(&fp->f_offset, val);
881 	if ((flags & FOF_NEXTOFF_R) != 0)
882 		fp->f_nextoff[UIO_READ] = val;
883 	if ((flags & FOF_NEXTOFF_W) != 0)
884 		fp->f_nextoff[UIO_WRITE] = val;
885 
886 	if ((flags & FOF_NOLOCK) == 0) {
887 		file_v_unlock(fp, FILE_V_FOFFSET_LOCKED,
888 		    FILE_V_FOFFSET_LOCK_WAITING);
889 	}
890 }
891 
892 static off_t
893 foffset_read(struct file *fp)
894 {
895 
896 	return (atomic_load_long(&fp->f_offset));
897 }
898 
899 void
900 fsetfl_lock(struct file *fp)
901 {
902 	file_v_lock(fp, FILE_V_SETFL_LOCKED, FILE_V_SETFL_LOCK_WAITING);
903 }
904 
905 void
906 fsetfl_unlock(struct file *fp)
907 {
908 	file_v_unlock(fp, FILE_V_SETFL_LOCKED, FILE_V_SETFL_LOCK_WAITING);
909 }
910 
911 #else	/* OFF_MAX <= LONG_MAX */
912 
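/*
 * Pool-mutex protected variant of the f_vflags bit locks, used when
 * off_t is wider than long and f_offset cannot be accessed atomically.
 * The caller must hold the pool mutex.
 */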
913 static void
914 file_v_lock_mtxp(struct file *fp, struct mtx *mtxp, short lock_bit,
915     short lock_wait_bit)
916 {
917 	mtx_assert(mtxp, MA_OWNED);
918 
919 	while ((fp->f_vflags & lock_bit) != 0) {
920 		fp->f_vflags |= lock_wait_bit;
921 		msleep(&fp->f_vflags, mtxp, PRI_MAX_KERN,
922 		    "vofflock", 0);
923 	}
924 	fp->f_vflags |= lock_bit;
925 }
926 
927 static void
928 file_v_unlock_mtxp(struct file *fp, struct mtx *mtxp, short lock_bit,
929     short lock_wait_bit)
930 {
931 	mtx_assert(mtxp, MA_OWNED);
932 
933 	KASSERT((fp->f_vflags & lock_bit) != 0, ("Lost lock_bit"));
934 	if ((fp->f_vflags & lock_wait_bit) != 0)
935 		wakeup(&fp->f_vflags);
936 	fp->f_vflags &= ~(lock_bit | lock_wait_bit);
937 }
938 
939 off_t
940 foffset_lock(struct file *fp, int flags)
941 {
942 	struct mtx *mtxp;
943 	off_t res;
944 
945 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
946 
947 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
948 	mtx_lock(mtxp);
949 	if ((flags & FOF_NOLOCK) == 0) {
950 		file_v_lock_mtxp(fp, mtxp, FILE_V_FOFFSET_LOCKED,
951 		    FILE_V_FOFFSET_LOCK_WAITING);
952 	}
953 	res = fp->f_offset;
954 	mtx_unlock(mtxp);
955 	return (res);
956 }
957 
958 void
959 foffset_unlock(struct file *fp, off_t val, int flags)
960 {
961 	struct mtx *mtxp;
962 
963 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
964 
965 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
966 	mtx_lock(mtxp);
967 	if ((flags & FOF_NOUPDATE) == 0)
968 		fp->f_offset = val;
969 	if ((flags & FOF_NEXTOFF_R) != 0)
970 		fp->f_nextoff[UIO_READ] = val;
971 	if ((flags & FOF_NEXTOFF_W) != 0)
972 		fp->f_nextoff[UIO_WRITE] = val;
973 	if ((flags & FOF_NOLOCK) == 0) {
974 		file_v_unlock_mtxp(fp, mtxp, FILE_V_FOFFSET_LOCKED,
975 		    FILE_V_FOFFSET_LOCK_WAITING);
976 	}
977 	mtx_unlock(mtxp);
978 }
979 
980 static off_t
981 foffset_read(struct file *fp)
982 {
983 
984 	return (foffset_lock(fp, FOF_NOLOCK));
985 }
986 
987 void
988 fsetfl_lock(struct file *fp)
989 {
990 	struct mtx *mtxp;
991 
992 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
993 	mtx_lock(mtxp);
994 	file_v_lock_mtxp(fp, mtxp, FILE_V_SETFL_LOCKED,
995 	    FILE_V_SETFL_LOCK_WAITING);
996 	mtx_unlock(mtxp);
997 }
998 
999 void
1000 fsetfl_unlock(struct file *fp)
1001 {
1002 	struct mtx *mtxp;
1003 
1004 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
1005 	mtx_lock(mtxp);
1006 	file_v_unlock_mtxp(fp, mtxp, FILE_V_SETFL_LOCKED,
1007 	    FILE_V_SETFL_LOCK_WAITING);
1008 	mtx_unlock(mtxp);
1009 }
1010 #endif
1011 
1012 void
1013 foffset_lock_pair(struct file *fp1, off_t *off1p, struct file *fp2, off_t *off2p,
1014     int flags)
1015 {
1016 	KASSERT(fp1 != fp2, ("foffset_lock_pair: fp1 == fp2"));
1017 
1018 	/* Lock in a consistent order to avoid deadlock. */
1019 	if ((uintptr_t)fp1 > (uintptr_t)fp2) {
1020 		struct file *tmpfp;
1021 		off_t *tmpoffp;
1022 
1023 		tmpfp = fp1, fp1 = fp2, fp2 = tmpfp;
1024 		tmpoffp = off1p, off1p = off2p, off2p = tmpoffp;
1025 	}
1026 	if (fp1 != NULL)
1027 		*off1p = foffset_lock(fp1, flags);
1028 	if (fp2 != NULL)
1029 		*off2p = foffset_lock(fp2, flags);
1030 }
1031 
1032 void
1033 foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
1034 {
1035 
1036 	if ((flags & FOF_OFFSET) == 0)
1037 		uio->uio_offset = foffset_lock(fp, flags);
1038 }
1039 
1040 void
1041 foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
1042 {
1043 
1044 	if ((flags & FOF_OFFSET) == 0)
1045 		foffset_unlock(fp, uio->uio_offset, flags);
1046 }
1047 
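/*
 * Return the posix_fadvise(2) advice covering this i/o range, or
 * POSIX_FADV_NORMAL if no advice was recorded for it.
 */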
1048 static int
1049 get_advice(struct file *fp, struct uio *uio)
1050 {
1051 	struct mtx *mtxp;
1052 	int ret;
1053 
1054 	ret = POSIX_FADV_NORMAL;
1055 	if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG)
1056 		return (ret);
1057 
1058 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
1059 	mtx_lock(mtxp);
1060 	if (fp->f_advice != NULL &&
1061 	    uio->uio_offset >= fp->f_advice->fa_start &&
1062 	    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
1063 		ret = fp->f_advice->fa_advice;
1064 	mtx_unlock(mtxp);
1065 	return (ret);
1066 }
1067 
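/*
 * Derive IO_* flags for a write from the file flags and the mount
 * options (O_DIRECT, O_FSYNC/MNT_SYNCHRONOUS, O_DSYNC).
 */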
1068 static int
1069 get_write_ioflag(struct file *fp)
1070 {
1071 	int ioflag;
1072 	struct mount *mp;
1073 	struct vnode *vp;
1074 
1075 	ioflag = 0;
1076 	vp = fp->f_vnode;
1077 	mp = atomic_load_ptr(&vp->v_mount);
1078 
1079 	if ((fp->f_flag & O_DIRECT) != 0)
1080 		ioflag |= IO_DIRECT;
1081 
1082 	if ((fp->f_flag & O_FSYNC) != 0 ||
1083 	    (mp != NULL && (mp->mnt_flag & MNT_SYNCHRONOUS) != 0))
1084 		ioflag |= IO_SYNC;
1085 
1086 	/*
1087 	 * For O_DSYNC we set both IO_SYNC and IO_DATASYNC, so that VOP_WRITE()
1088 	 * or VOP_DEALLOCATE() implementations that don't understand IO_DATASYNC
1089 	 * fall back to full O_SYNC behavior.
1090 	 */
1091 	if ((fp->f_flag & O_DSYNC) != 0)
1092 		ioflag |= IO_SYNC | IO_DATASYNC;
1093 
1094 	return (ioflag);
1095 }
1096 
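/*
 * Try to satisfy a read directly from the vnode's VM object, bypassing
 * the filesystem.  Returns EJUSTRETURN when the caller must fall back
 * to VOP_READ().
 */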
1097 int
1098 vn_read_from_obj(struct vnode *vp, struct uio *uio)
1099 {
1100 	vm_object_t obj;
1101 	vm_page_t ma[io_hold_cnt + 2];
1102 	off_t off, vsz;
1103 	ssize_t resid;
1104 	int error, i, j;
1105 
1106 	MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2));
1107 	obj = atomic_load_ptr(&vp->v_object);
1108 	if (obj == NULL)
1109 		return (EJUSTRETURN);
1110 
1111 	/*
1112 	 * Depends on type stability of vm_objects.
1113 	 */
1114 	vm_object_pip_add(obj, 1);
1115 	if ((obj->flags & OBJ_DEAD) != 0) {
1116 		/*
1117 		 * Note that object might be already reused from the
1118 		 * vnode, and the OBJ_DEAD flag cleared.  This is fine,
1119 		 * we recheck for DOOMED vnode state after all pages
1120 		 * are busied, and retract then.
1121 		 *
1122 		 * But we check for OBJ_DEAD to ensure that we do not
1123 		 * busy pages while vm_object_terminate_pages()
1124 		 * processes the queue.
1125 		 */
1126 		error = EJUSTRETURN;
1127 		goto out_pip;
1128 	}
1129 
1130 	resid = uio->uio_resid;
1131 	off = uio->uio_offset;
1132 	for (i = 0; resid > 0; i++) {
1133 		MPASS(i < io_hold_cnt + 2);
1134 		ma[i] = vm_page_grab_unlocked(obj, atop(off),
1135 		    VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
1136 		    VM_ALLOC_NOWAIT);
1137 		if (ma[i] == NULL)
1138 			break;
1139 
1140 		/*
1141 		 * Skip invalid pages.  Valid mask can be partial only
1142 		 * at EOF, and we clip later.
1143 		 */
1144 		if (vm_page_none_valid(ma[i])) {
1145 			vm_page_sunbusy(ma[i]);
1146 			break;
1147 		}
1148 
1149 		resid -= PAGE_SIZE;
1150 		off += PAGE_SIZE;
1151 	}
1152 	if (i == 0) {
1153 		error = EJUSTRETURN;
1154 		goto out_pip;
1155 	}
1156 
1157 	/*
1158 	 * Check VIRF_DOOMED after we busied our pages.  Since
1159  * vgonel() terminates the vnode's vm_object, it cannot
1160 	 * process past pages busied by us.
1161 	 */
1162 	if (VN_IS_DOOMED(vp)) {
1163 		error = EJUSTRETURN;
1164 		goto out;
1165 	}
1166 
1167 	resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1);
1168 	if (resid > uio->uio_resid)
1169 		resid = uio->uio_resid;
1170 
1171 	/*
1172 	 * Unlocked read of vnp_size is safe because truncation cannot
1173 	 * pass busied page.  But we load vnp_size into a local
1174 	 * variable so that possible concurrent extension does not
1175 	 * break calculation.
1176 	 */
1177 #if defined(__powerpc__) && !defined(__powerpc64__)
1178 	vsz = obj->un_pager.vnp.vnp_size;
1179 #else
1180 	vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size);
1181 #endif
1182 	if (uio->uio_offset >= vsz) {
1183 		error = EJUSTRETURN;
1184 		goto out;
1185 	}
1186 	if (uio->uio_offset + resid > vsz)
1187 		resid = vsz - uio->uio_offset;
1188 
1189 	error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio);
1190 
1191 out:
1192 	for (j = 0; j < i; j++) {
1193 		if (error == 0)
1194 			vm_page_reference(ma[j]);
1195 		vm_page_sunbusy(ma[j]);
1196 	}
1197 out_pip:
1198 	vm_object_pip_wakeup(obj);
1199 	if (error != 0)
1200 		return (error);
1201 	return (uio->uio_resid == 0 ? 0 : EJUSTRETURN);
1202 }
1203 
1204 /*
1205  * File table vnode read routine.
1206  */
1207 static int
1208 vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
1209     struct thread *td)
1210 {
1211 	struct vnode *vp;
1212 	off_t orig_offset;
1213 	int error, ioflag;
1214 	int advice;
1215 
1216 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
1217 	    uio->uio_td, td));
1218 	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
1219 	vp = fp->f_vnode;
1220 	ioflag = 0;
1221 	if (fp->f_flag & FNONBLOCK)
1222 		ioflag |= IO_NDELAY;
1223 	if (fp->f_flag & O_DIRECT)
1224 		ioflag |= IO_DIRECT;
1225 
1226 	/*
1227 	 * Try to read from page cache.  VIRF_DOOMED check is racy but
1228 	 * allows us to avoid unneeded work outright.
1229 	 */
1230 	if (vn_io_pgcache_read_enable && !mac_vnode_check_read_enabled() &&
1231 	    (vn_irflag_read(vp) & (VIRF_DOOMED | VIRF_PGREAD)) == VIRF_PGREAD) {
1232 		error = VOP_READ_PGCACHE(vp, uio, ioflag, fp->f_cred);
1233 		if (error == 0) {
1234 			fp->f_nextoff[UIO_READ] = uio->uio_offset;
1235 			return (0);
1236 		}
1237 		if (error != EJUSTRETURN)
1238 			return (error);
1239 	}
1240 
1241 	advice = get_advice(fp, uio);
1242 	vn_lock(vp, LK_SHARED | LK_RETRY);
1243 
1244 	switch (advice) {
1245 	case POSIX_FADV_NORMAL:
1246 	case POSIX_FADV_SEQUENTIAL:
1247 	case POSIX_FADV_NOREUSE:
1248 		ioflag |= sequential_heuristic(uio, fp);
1249 		break;
1250 	case POSIX_FADV_RANDOM:
1251 		/* Disable read-ahead for random I/O. */
1252 		break;
1253 	}
1254 	orig_offset = uio->uio_offset;
1255 
1256 #ifdef MAC
1257 	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
1258 	if (error == 0)
1259 #endif
1260 		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
1261 	fp->f_nextoff[UIO_READ] = uio->uio_offset;
1262 	VOP_UNLOCK(vp);
1263 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
1264 	    orig_offset != uio->uio_offset)
1265 		/*
1266 		 * Use POSIX_FADV_DONTNEED to flush pages and buffers
1267 		 * for the backing file after a POSIX_FADV_NOREUSE
1268 		 * read(2).
1269 		 */
1270 		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
1271 		    POSIX_FADV_DONTNEED);
1272 	return (error);
1273 }
1274 
1275 /*
1276  * File table vnode write routine.
1277  */
1278 static int
1279 vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
1280     struct thread *td)
1281 {
1282 	struct vnode *vp;
1283 	struct mount *mp;
1284 	off_t orig_offset;
1285 	int error, ioflag;
1286 	int advice;
1287 	bool need_finished_write;
1288 
1289 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
1290 	    uio->uio_td, td));
1291 	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
1292 	vp = fp->f_vnode;
1293 	if (vp->v_type == VREG)
1294 		bwillwrite();
1295 	ioflag = IO_UNIT;
1296 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND) != 0)
1297 		ioflag |= IO_APPEND;
1298 	if ((fp->f_flag & FNONBLOCK) != 0)
1299 		ioflag |= IO_NDELAY;
1300 	ioflag |= get_write_ioflag(fp);
1301 
1302 	mp = NULL;
1303 	need_finished_write = false;
1304 	if (vp->v_type != VCHR) {
1305 		error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
1306 		if (error != 0)
1307 			goto unlock;
1308 		need_finished_write = true;
1309 	}
1310 
1311 	advice = get_advice(fp, uio);
1312 
1313 	vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
1314 	switch (advice) {
1315 	case POSIX_FADV_NORMAL:
1316 	case POSIX_FADV_SEQUENTIAL:
1317 	case POSIX_FADV_NOREUSE:
1318 		ioflag |= sequential_heuristic(uio, fp);
1319 		break;
1320 	case POSIX_FADV_RANDOM:
1321 		/* XXX: Is this correct? */
1322 		break;
1323 	}
1324 	orig_offset = uio->uio_offset;
1325 
1326 #ifdef MAC
1327 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1328 	if (error == 0)
1329 #endif
1330 		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
1331 	fp->f_nextoff[UIO_WRITE] = uio->uio_offset;
1332 	VOP_UNLOCK(vp);
1333 	if (need_finished_write)
1334 		vn_finished_write(mp);
1335 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
1336 	    orig_offset != uio->uio_offset)
1337 		/*
1338 		 * Use POSIX_FADV_DONTNEED to flush pages and buffers
1339 		 * for the backing file after a POSIX_FADV_NOREUSE
1340 		 * write(2).
1341 		 */
1342 		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
1343 		    POSIX_FADV_DONTNEED);
1344 unlock:
1345 	return (error);
1346 }
1347 
1348 /*
1349  * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
1350  * prevent the following deadlock:
1351  *
1352  * Assume that the thread A reads from the vnode vp1 into userspace
1353  * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
1354  * currently not resident, then system ends up with the call chain
1355  *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
1356  *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
1357  * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
1358  * If, at the same time, thread B reads from vnode vp2 into buffer buf2
1359  * backed by the pages of vnode vp1, and some page in buf2 is not
1360  * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
1361  *
1362  * To prevent the lock order reversal and deadlock, vn_io_fault() does
1363  * not allow page faults to happen during VOP_READ() or VOP_WRITE().
1364  * Instead, it first tries to do the whole range i/o with pagefaults
1365  * disabled. If all pages in the i/o buffer are resident and mapped,
1366  * VOP will succeed (ignoring the genuine filesystem errors).
1367  * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
1368  * i/o in chunks, with all pages in the chunk prefaulted and held
1369  * using vm_fault_quick_hold_pages().
1370  *
1371  * Filesystems using this deadlock avoidance scheme should use the
1372  * array of the held pages from uio, saved in the curthread->td_ma,
1373  * instead of doing uiomove().  A helper function
1374  * vn_io_fault_uiomove() converts uiomove request into
1375  * uiomove_fromphys() over td_ma array.
1376  *
1377  * Since vnode locks do not cover the whole i/o anymore, rangelocks
1378  * make the current i/o request atomic with respect to other i/os and
1379  * truncations.
1380  */
1381 
1382 /*
1383  * Decode vn_io_fault_args and perform the corresponding i/o.
1384  */
1385 static int
1386 vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
1387     struct thread *td)
1388 {
1389 	int error, save;
1390 
1391 	error = 0;
1392 	save = vm_fault_disable_pagefaults();
1393 	switch (args->kind) {
1394 	case VN_IO_FAULT_FOP:
1395 		error = (args->args.fop_args.doio)(args->args.fop_args.fp,
1396 		    uio, args->cred, args->flags, td);
1397 		break;
1398 	case VN_IO_FAULT_VOP:
1399 		switch (uio->uio_rw) {
1400 		case UIO_READ:
1401 			error = VOP_READ(args->args.vop_args.vp, uio,
1402 			    args->flags, args->cred);
1403 			break;
1404 		case UIO_WRITE:
1405 			error = VOP_WRITE(args->args.vop_args.vp, uio,
1406 			    args->flags, args->cred);
1407 			break;
1408 		}
1409 		break;
1410 	default:
1411 		panic("vn_io_fault_doio: unknown kind of io %d %d",
1412 		    args->kind, uio->uio_rw);
1413 	}
1414 	vm_fault_enable_pagefaults(save);
1415 	return (error);
1416 }
1417 
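/*
 * Fault in one byte of the user buffer.  For reads, the byte is written
 * back so that the page is also mapped for write access.
 */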
1418 static int
1419 vn_io_fault_touch(char *base, const struct uio *uio)
1420 {
1421 	int r;
1422 
1423 	r = fubyte(base);
1424 	if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
1425 		return (EFAULT);
1426 	return (0);
1427 }
1428 
1429 static int
1430 vn_io_fault_prefault_user(const struct uio *uio)
1431 {
1432 	char *base;
1433 	const struct iovec *iov;
1434 	size_t len;
1435 	ssize_t resid;
1436 	int error, i;
1437 
1438 	KASSERT(uio->uio_segflg == UIO_USERSPACE,
1439 	    ("vn_io_fault_prefault userspace"));
1440 
1441 	error = i = 0;
1442 	iov = uio->uio_iov;
1443 	resid = uio->uio_resid;
1444 	base = iov->iov_base;
1445 	len = iov->iov_len;
1446 	while (resid > 0) {
1447 		error = vn_io_fault_touch(base, uio);
1448 		if (error != 0)
1449 			break;
1450 		if (len < PAGE_SIZE) {
1451 			if (len != 0) {
1452 				error = vn_io_fault_touch(base + len - 1, uio);
1453 				if (error != 0)
1454 					break;
1455 				resid -= len;
1456 			}
1457 			if (++i >= uio->uio_iovcnt)
1458 				break;
1459 			iov = uio->uio_iov + i;
1460 			base = iov->iov_base;
1461 			len = iov->iov_len;
1462 		} else {
1463 			len -= PAGE_SIZE;
1464 			base += PAGE_SIZE;
1465 			resid -= PAGE_SIZE;
1466 		}
1467 	}
1468 	return (error);
1469 }
1470 
1471 /*
1472  * Common code for vn_io_fault(), agnostic to the kind of i/o request.
1473  * Uses vn_io_fault_doio() to make the call to an actual i/o function.
1474  * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
1475  * into args and call vn_io_fault1() to handle faults during the user
1476  * mode buffer accesses.
1477  */
1478 static int
1479 vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
1480     struct thread *td)
1481 {
1482 	vm_page_t ma[io_hold_cnt + 2];
1483 	struct uio *uio_clone, short_uio;
1484 	struct iovec short_iovec[1];
1485 	vm_page_t *prev_td_ma;
1486 	vm_prot_t prot;
1487 	vm_offset_t addr, end;
1488 	size_t len, resid;
1489 	ssize_t adv;
1490 	int error, cnt, saveheld, prev_td_ma_cnt;
1491 
1492 	if (vn_io_fault_prefault) {
1493 		error = vn_io_fault_prefault_user(uio);
1494 		if (error != 0)
1495 			return (error); /* Or ignore ? */
1496 	}
1497 
1498 	prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
1499 
1500 	/*
1501 	 * The UFS follows IO_UNIT directive and replays back both
1502 	 * uio_offset and uio_resid if an error is encountered during the
1503 	 * operation.  But, since the iovec may be already advanced,
1504 	 * uio is still in an inconsistent state.
1505 	 *
1506 	 * Cache a copy of the original uio, which is advanced to the redo
1507 	 * point using UIO_NOCOPY below.
1508 	 */
1509 	uio_clone = cloneuio(uio);
1510 	resid = uio->uio_resid;
1511 
1512 	short_uio.uio_segflg = UIO_USERSPACE;
1513 	short_uio.uio_rw = uio->uio_rw;
1514 	short_uio.uio_td = uio->uio_td;
1515 
1516 	error = vn_io_fault_doio(args, uio, td);
1517 	if (error != EFAULT)
1518 		goto out;
1519 
1520 	atomic_add_long(&vn_io_faults_cnt, 1);
1521 	uio_clone->uio_segflg = UIO_NOCOPY;
1522 	uiomove(NULL, resid - uio->uio_resid, uio_clone);
1523 	uio_clone->uio_segflg = uio->uio_segflg;
1524 
1525 	saveheld = curthread_pflags_set(TDP_UIOHELD);
1526 	prev_td_ma = td->td_ma;
1527 	prev_td_ma_cnt = td->td_ma_cnt;
1528 
1529 	while (uio_clone->uio_resid != 0) {
1530 		len = uio_clone->uio_iov->iov_len;
1531 		if (len == 0) {
1532 			KASSERT(uio_clone->uio_iovcnt >= 1,
1533 			    ("iovcnt underflow"));
1534 			uio_clone->uio_iov++;
1535 			uio_clone->uio_iovcnt--;
1536 			continue;
1537 		}
1538 		if (len > ptoa(io_hold_cnt))
1539 			len = ptoa(io_hold_cnt);
1540 		addr = (uintptr_t)uio_clone->uio_iov->iov_base;
1541 		end = round_page(addr + len);
1542 		if (end < addr) {
1543 			error = EFAULT;
1544 			break;
1545 		}
1546 		/*
1547 		 * A perfectly misaligned address and length could cause
1548 		 * both the start and the end of the chunk to use partial
1549 		 * page.  +2 accounts for such a situation.
1550 		 */
1551 		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
1552 		    addr, len, prot, ma, io_hold_cnt + 2);
1553 		if (cnt == -1) {
1554 			error = EFAULT;
1555 			break;
1556 		}
1557 		short_uio.uio_iov = &short_iovec[0];
1558 		short_iovec[0].iov_base = (void *)addr;
1559 		short_uio.uio_iovcnt = 1;
1560 		short_uio.uio_resid = short_iovec[0].iov_len = len;
1561 		short_uio.uio_offset = uio_clone->uio_offset;
1562 		td->td_ma = ma;
1563 		td->td_ma_cnt = cnt;
1564 
1565 		error = vn_io_fault_doio(args, &short_uio, td);
1566 		vm_page_unhold_pages(ma, cnt);
1567 		adv = len - short_uio.uio_resid;
1568 
1569 		uio_clone->uio_iov->iov_base =
1570 		    (char *)uio_clone->uio_iov->iov_base + adv;
1571 		uio_clone->uio_iov->iov_len -= adv;
1572 		uio_clone->uio_resid -= adv;
1573 		uio_clone->uio_offset += adv;
1574 
1575 		uio->uio_resid -= adv;
1576 		uio->uio_offset += adv;
1577 
1578 		if (error != 0 || adv == 0)
1579 			break;
1580 	}
1581 	td->td_ma = prev_td_ma;
1582 	td->td_ma_cnt = prev_td_ma_cnt;
1583 	curthread_pflags_restore(saveheld);
1584 out:
1585 	freeuio(uio_clone);
1586 	return (error);
1587 }
1588 
1589 static int
1590 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
1591     int flags, struct thread *td)
1592 {
1593 	fo_rdwr_t *doio;
1594 	struct vnode *vp;
1595 	void *rl_cookie;
1596 	struct vn_io_fault_args args;
1597 	int error;
1598 	bool do_io_fault, do_rangelock;
1599 
1600 	doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
1601 	vp = fp->f_vnode;
1602 
1603 	/*
1604 	 * The ability to read(2) on a directory has historically been
1605 	 * allowed for all users, but this can and has been the source of
1606 	 * at least one security issue in the past.  As such, it is now hidden
1607 	 * away behind a sysctl for those that actually need it to use it, and
1608 	 * restricted to root when it's turned on to make it relatively safe to
1609 	 * leave on for longer sessions of need.
1610 	 */
1611 	if (vp->v_type == VDIR) {
1612 		KASSERT(uio->uio_rw == UIO_READ,
1613 		    ("illegal write attempted on a directory"));
1614 		if (!vfs_allow_read_dir)
1615 			return (EISDIR);
1616 		if ((error = priv_check(td, PRIV_VFS_READ_DIR)) != 0)
1617 			return (EISDIR);
1618 	}
1619 
1620 	do_io_fault = do_vn_io_fault(vp, uio);
1621 	do_rangelock = do_io_fault || (vn_irflag_read(vp) & VIRF_PGREAD) != 0;
1622 	foffset_lock_uio(fp, uio, flags);
1623 	if (do_rangelock) {
1624 		if (uio->uio_rw == UIO_READ) {
1625 			rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
1626 			    uio->uio_offset + uio->uio_resid);
1627 		} else if ((fp->f_flag & O_APPEND) != 0 ||
1628 		    (flags & FOF_OFFSET) == 0) {
1629 			/* For appenders, punt and lock the whole range. */
1630 			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1631 		} else {
1632 			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
1633 			    uio->uio_offset + uio->uio_resid);
1634 		}
1635 	}
1636 	if (do_io_fault) {
1637 		args.kind = VN_IO_FAULT_FOP;
1638 		args.args.fop_args.fp = fp;
1639 		args.args.fop_args.doio = doio;
1640 		args.cred = active_cred;
1641 		args.flags = flags | FOF_OFFSET;
1642 		error = vn_io_fault1(vp, uio, &args, td);
1643 	} else {
1644 		error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
1645 	}
1646 	if (do_rangelock)
1647 		vn_rangelock_unlock(vp, rl_cookie);
1648 	foffset_unlock_uio(fp, uio, flags);
1649 	return (error);
1650 }
1651 
1652 /*
1653  * Helper function to perform the requested uiomove operation using
1654  * the held pages for io->uio_iov[0].iov_base buffer instead of
1655  * copyin/copyout.  Access to the pages with uiomove_fromphys()
1656  * instead of iov_base prevents page faults that could occur due to
1657  * pmap_collect() invalidating the mapping created by
1658  * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
1659  * object cleanup revoking the write access from page mappings.
1660  *
1661  * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
1662  * instead of plain uiomove().
1663  */
1664 int
1665 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
1666 {
1667 	struct uio transp_uio;
1668 	struct iovec transp_iov[1];
1669 	struct thread *td;
1670 	size_t adv;
1671 	int error, pgadv;
1672 
1673 	td = curthread;
1674 	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1675 	    uio->uio_segflg != UIO_USERSPACE)
1676 		return (uiomove(data, xfersize, uio));
1677 
1678 	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1679 	transp_iov[0].iov_base = data;
1680 	transp_uio.uio_iov = &transp_iov[0];
1681 	transp_uio.uio_iovcnt = 1;
1682 	if (xfersize > uio->uio_resid)
1683 		xfersize = uio->uio_resid;
1684 	transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
1685 	transp_uio.uio_offset = 0;
1686 	transp_uio.uio_segflg = UIO_SYSSPACE;
1687 	/*
1688 	 * Since transp_iov points to data, and td_ma page array
1689 	 * corresponds to original uio->uio_iov, we need to invert the
1690 	 * direction of the i/o operation as passed to
1691 	 * uiomove_fromphys().
1692 	 */
1693 	switch (uio->uio_rw) {
1694 	case UIO_WRITE:
1695 		transp_uio.uio_rw = UIO_READ;
1696 		break;
1697 	case UIO_READ:
1698 		transp_uio.uio_rw = UIO_WRITE;
1699 		break;
1700 	}
1701 	transp_uio.uio_td = uio->uio_td;
1702 	error = uiomove_fromphys(td->td_ma,
1703 	    ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
1704 	    xfersize, &transp_uio);
1705 	adv = xfersize - transp_uio.uio_resid;
1706 	pgadv =
1707 	    (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
1708 	    (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
1709 	td->td_ma += pgadv;
1710 	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1711 	    pgadv));
1712 	td->td_ma_cnt -= pgadv;
1713 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
1714 	uio->uio_iov->iov_len -= adv;
1715 	uio->uio_resid -= adv;
1716 	uio->uio_offset += adv;
1717 	return (error);
1718 }
1719 
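/*
 * Variant of vn_io_fault_uiomove() that moves data between the held
 * td_ma pages and the caller-supplied page array with pmap_copy_pages().
 */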
1720 int
1721 vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
1722     struct uio *uio)
1723 {
1724 	struct thread *td;
1725 	vm_offset_t iov_base;
1726 	int cnt, pgadv;
1727 
1728 	td = curthread;
1729 	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1730 	    uio->uio_segflg != UIO_USERSPACE)
1731 		return (uiomove_fromphys(ma, offset, xfersize, uio));
1732 
1733 	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1734 	cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
1735 	iov_base = (vm_offset_t)uio->uio_iov->iov_base;
1736 	switch (uio->uio_rw) {
1737 	case UIO_WRITE:
1738 		pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
1739 		    offset, cnt);
1740 		break;
1741 	case UIO_READ:
1742 		pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
1743 		    cnt);
1744 		break;
1745 	}
1746 	pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
1747 	td->td_ma += pgadv;
1748 	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1749 	    pgadv));
1750 	td->td_ma_cnt -= pgadv;
1751 	uio->uio_iov->iov_base = (char *)(iov_base + cnt);
1752 	uio->uio_iov->iov_len -= cnt;
1753 	uio->uio_resid -= cnt;
1754 	uio->uio_offset += cnt;
1755 	return (0);
1756 }
1757 
1758 /*
1759  * File table truncate routine.
1760  */
1761 static int
1762 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
1763     struct thread *td)
1764 {
1765 	struct mount *mp;
1766 	struct vnode *vp;
1767 	void *rl_cookie;
1768 	int error;
1769 
1770 	vp = fp->f_vnode;
1771 
1772 retry:
1773 	/*
1774 	 * Lock the whole range for truncation.  Otherwise split i/o
1775 	 * might happen partly before and partly after the truncation.
1776 	 */
1777 	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1778 	error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
1779 	if (error)
1780 		goto out1;
1781 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1782 	AUDIT_ARG_VNODE1(vp);
1783 	if (vp->v_type == VDIR) {
1784 		error = EISDIR;
1785 		goto out;
1786 	}
1787 #ifdef MAC
1788 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1789 	if (error)
1790 		goto out;
1791 #endif
1792 	error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0,
1793 	    fp->f_cred);
1794 out:
1795 	VOP_UNLOCK(vp);
1796 	vn_finished_write(mp);
1797 out1:
1798 	vn_rangelock_unlock(vp, rl_cookie);
1799 	if (error == ERELOOKUP)
1800 		goto retry;
1801 	return (error);
1802 }
1803 
1804 /*
1805  * Truncate a file that is already locked.
1806  */
1807 int
1808 vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
1809     struct ucred *cred)
1810 {
1811 	struct vattr vattr;
1812 	int error;
1813 
1814 	error = VOP_ADD_WRITECOUNT(vp, 1);
1815 	if (error == 0) {
1816 		VATTR_NULL(&vattr);
1817 		vattr.va_size = length;
1818 		if (sync)
1819 			vattr.va_vaflags |= VA_SYNC;
1820 		error = VOP_SETATTR(vp, &vattr, cred);
1821 		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
1822 		if (error == 0)
1823 			INOTIFY(vp, IN_MODIFY);
1824 	}
1825 	return (error);
1826 }
1827 
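#if 0
/*
 * Editorial sketch, not part of the original file: truncating a vnode to
 * zero length from code that already holds the exclusive vnode lock and a
 * vn_start_write() reference, as vn_truncate() above arranges.  The
 * function name "example_zero_length" is hypothetical.
 */
static int
example_zero_length(struct vnode *vp, struct ucred *cred)
{
	ASSERT_VOP_ELOCKED(vp, "example_zero_length");
	/* Request a synchronous size update (as with O_FSYNC). */
	return (vn_truncate_locked(vp, 0, true, cred));
}
#endif
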
1828 /*
1829  * File table vnode stat routine.
1830  */
1831 int
1832 vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred)
1833 {
1834 	struct vnode *vp = fp->f_vnode;
1835 	int error;
1836 
1837 	vn_lock(vp, LK_SHARED | LK_RETRY);
1838 	error = VOP_STAT(vp, sb, active_cred, fp->f_cred);
1839 	VOP_UNLOCK(vp);
1840 
1841 	return (error);
1842 }
1843 
1844 /*
1845  * File table vnode ioctl routine.
1846  */
1847 static int
1848 vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
1849     struct thread *td)
1850 {
1851 	struct vnode *vp;
1852 	struct fiobmap2_arg *bmarg;
1853 	off_t size;
1854 	int error;
1855 
1856 	vp = fp->f_vnode;
1857 	switch (vp->v_type) {
1858 	case VDIR:
1859 	case VREG:
1860 		switch (com) {
1861 		case FIONREAD:
1862 			error = vn_getsize(vp, &size, active_cred);
1863 			if (error == 0)
1864 				*(int *)data = size - fp->f_offset;
1865 			return (error);
1866 		case FIOBMAP2:
1867 			bmarg = (struct fiobmap2_arg *)data;
1868 			vn_lock(vp, LK_SHARED | LK_RETRY);
1869 #ifdef MAC
1870 			error = mac_vnode_check_read(active_cred, fp->f_cred,
1871 			    vp);
1872 			if (error == 0)
1873 #endif
1874 				error = VOP_BMAP(vp, bmarg->bn, NULL,
1875 				    &bmarg->bn, &bmarg->runp, &bmarg->runb);
1876 			VOP_UNLOCK(vp);
1877 			return (error);
1878 		case FIONBIO:
1879 		case FIOASYNC:
1880 			return (0);
1881 		default:
1882 			return (VOP_IOCTL(vp, com, data, fp->f_flag,
1883 			    active_cred, td));
1884 		}
1885 		break;
1886 	case VCHR:
1887 		return (VOP_IOCTL(vp, com, data, fp->f_flag,
1888 		    active_cred, td));
1889 	default:
1890 		return (ENOTTY);
1891 	}
1892 }
1893 
1894 /*
1895  * File table vnode poll routine.
1896  */
1897 static int
1898 vn_poll(struct file *fp, int events, struct ucred *active_cred,
1899     struct thread *td)
1900 {
1901 	struct vnode *vp;
1902 	int error;
1903 
1904 	vp = fp->f_vnode;
1905 #if defined(MAC) || defined(AUDIT)
1906 	if (AUDITING_TD(td) || mac_vnode_check_poll_enabled()) {
1907 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1908 		AUDIT_ARG_VNODE1(vp);
1909 		error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
1910 		VOP_UNLOCK(vp);
1911 		if (error != 0)
1912 			return (error);
1913 	}
1914 #endif
1915 	error = VOP_POLL(vp, events, fp->f_cred, td);
1916 	return (error);
1917 }
1918 
1919 /*
1920  * Acquire the requested lock and then check for validity.  LK_RETRY
1921  * permits vn_lock to return doomed vnodes.
1922  */
1923 static int __noinline
1924 _vn_lock_fallback(struct vnode *vp, int flags, const char *file, int line,
1925     int error)
1926 {
1927 
1928 	KASSERT((flags & LK_RETRY) == 0 || error == 0,
1929 	    ("vn_lock: error %d incompatible with flags %#x", error, flags));
1930 
1931 	if (error == 0)
1932 		VNASSERT(VN_IS_DOOMED(vp), vp, ("vnode not doomed"));
1933 
1934 	if ((flags & LK_RETRY) == 0) {
1935 		if (error == 0) {
1936 			VOP_UNLOCK(vp);
1937 			error = ENOENT;
1938 		}
1939 		return (error);
1940 	}
1941 
1942 	/*
1943 	 * LK_RETRY case.
1944 	 *
1945 	 * Nothing to do if we got the lock.
1946 	 */
1947 	if (error == 0)
1948 		return (0);
1949 
1950 	/*
1951 	 * Interlock was dropped by the call in _vn_lock.
1952 	 */
1953 	flags &= ~LK_INTERLOCK;
1954 	do {
1955 		error = VOP_LOCK1(vp, flags, file, line);
1956 	} while (error != 0);
1957 	return (0);
1958 }
1959 
1960 int
1961 _vn_lock(struct vnode *vp, int flags, const char *file, int line)
1962 {
1963 	int error;
1964 
1965 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
1966 	    ("vn_lock: no locktype (%d passed)", flags));
1967 	VNPASS(vp->v_holdcnt > 0, vp);
1968 	error = VOP_LOCK1(vp, flags, file, line);
1969 	if (__predict_false(error != 0 || VN_IS_DOOMED(vp)))
1970 		return (_vn_lock_fallback(vp, flags, file, line, error));
1971 	return (0);
1972 }
1973 
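#if 0
/*
 * Editorial sketch, not part of the original file: the two common
 * vn_lock() idioms.  Without LK_RETRY the caller must handle ENOENT,
 * returned when the vnode was doomed; with LK_RETRY the lock always
 * succeeds and the caller checks VN_IS_DOOMED() itself if it cares.
 * The caller must already hold a reference on the vnode.  The function
 * name "example_lock_shared" is hypothetical.
 */
static int
example_lock_shared(struct vnode *vp)
{
	int error;

	error = vn_lock(vp, LK_SHARED);
	if (error != 0)
		return (error);		/* ENOENT: vnode was doomed. */
	/* ... read-only VOP calls on vp ... */
	VOP_UNLOCK(vp);
	return (0);
}
#endif
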
1974 /*
1975  * File table vnode close routine.
1976  */
1977 static int
1978 vn_closefile(struct file *fp, struct thread *td)
1979 {
1980 	struct vnode *vp;
1981 	struct flock lf;
1982 	int error;
1983 	bool ref;
1984 
1985 	vp = fp->f_vnode;
1986 	fp->f_ops = &badfileops;
1987 	ref = (fp->f_flag & FHASLOCK) != 0;
1988 
1989 	error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref);
1990 
1991 	if (__predict_false(ref)) {
1992 		lf.l_whence = SEEK_SET;
1993 		lf.l_start = 0;
1994 		lf.l_len = 0;
1995 		lf.l_type = F_UNLCK;
1996 		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
1997 		vrele(vp);
1998 	}
1999 	return (error);
2000 }
2001 
2002 /*
2003  * Preparing to start a filesystem write operation. If the operation is
2004  * permitted, then we bump the count of operations in progress and
2005  * proceed. If a suspend request is in progress, we wait until the
2006  * suspension is over, and then proceed.
2007  */
2008 static int
2009 vn_start_write_refed(struct mount *mp, int flags, bool mplocked)
2010 {
2011 	struct mount_pcpu *mpcpu;
2012 	int error, mflags;
2013 
2014 	if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 &&
2015 	    vfs_op_thread_enter(mp, mpcpu)) {
2016 		MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
2017 		vfs_mp_count_add_pcpu(mpcpu, writeopcount, 1);
2018 		vfs_op_thread_exit(mp, mpcpu);
2019 		return (0);
2020 	}
2021 
2022 	if (mplocked)
2023 		mtx_assert(MNT_MTX(mp), MA_OWNED);
2024 	else
2025 		MNT_ILOCK(mp);
2026 
2027 	error = 0;
2028 
2029 	/*
2030 	 * Check on status of suspension.
2031 	 */
2032 	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
2033 	    mp->mnt_susp_owner != curthread) {
2034 		mflags = 0;
2035 		if ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0) {
2036 			if (flags & V_PCATCH)
2037 				mflags |= PCATCH;
2038 		}
2039 		mflags |= PRI_MAX_KERN;
2040 		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
2041 			if ((flags & V_NOWAIT) != 0) {
2042 				error = EWOULDBLOCK;
2043 				goto unlock;
2044 			}
2045 			error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
2046 			    "suspfs", 0);
2047 			if (error != 0)
2048 				goto unlock;
2049 		}
2050 	}
2051 	if ((flags & V_XSLEEP) != 0)
2052 		goto unlock;
2053 	mp->mnt_writeopcount++;
2054 unlock:
2055 	if (error != 0 || (flags & V_XSLEEP) != 0)
2056 		MNT_REL(mp);
2057 	MNT_IUNLOCK(mp);
2058 	return (error);
2059 }
2060 
2061 int
2062 vn_start_write(struct vnode *vp, struct mount **mpp, int flags)
2063 {
2064 	struct mount *mp;
2065 	int error;
2066 
2067 	KASSERT((flags & ~V_VALID_FLAGS) == 0,
2068 	    ("%s: invalid flags passed %d\n", __func__, flags));
2069 
2070 	error = 0;
2071 	/*
2072 	 * If a vnode is provided, get and return the mount point to
2073 	 * which it will write.
2074 	 */
2075 	if (vp != NULL) {
2076 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
2077 			*mpp = NULL;
2078 			if (error != EOPNOTSUPP)
2079 				return (error);
2080 			return (0);
2081 		}
2082 	}
2083 	if ((mp = *mpp) == NULL)
2084 		return (0);
2085 
2086 	/*
2087 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
2088 	 * a vfs_ref().
2089 	 * If a vnode was not provided, we need to acquire a refcount
2090 	 * on the provided mount point ourselves, in order to emulate
2091 	 * a vfs_ref().
2092 	 */
2093 	if (vp == NULL)
2094 		vfs_ref(mp);
2095 
2096 	error = vn_start_write_refed(mp, flags, false);
2097 	if (error != 0 && (flags & V_NOWAIT) == 0)
2098 		*mpp = NULL;
2099 	return (error);
2100 }
2101 
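#if 0
/*
 * Editorial sketch, not part of the original file: the canonical bracket
 * around a vnode-modifying operation, mirroring what vn_truncate() above
 * does.  The function name "example_modify" is hypothetical.
 */
static int
example_modify(struct vnode *vp)
{
	struct mount *mp;
	int error;

	error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
	if (error != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* ... VOP calls that modify vp ... */
	VOP_UNLOCK(vp);
	vn_finished_write(mp);
	return (0);
}
#endif
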
2102 /*
2103  * Secondary suspension. Used by operations such as vop_inactive
2104  * routines that are needed by the higher level functions. These
2105  * are allowed to proceed until all the higher level functions have
2106  * completed (indicated by mnt_writeopcount dropping to zero). At that
2107  * time, these operations are halted until the suspension is over.
2108  */
2109 int
2110 vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags)
2111 {
2112 	struct mount *mp;
2113 	int error, mflags;
2114 
2115 	KASSERT((flags & (~V_VALID_FLAGS | V_XSLEEP)) == 0,
2116 	    ("%s: invalid flags passed %d\n", __func__, flags));
2117 
2118  retry:
2119 	if (vp != NULL) {
2120 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
2121 			*mpp = NULL;
2122 			if (error != EOPNOTSUPP)
2123 				return (error);
2124 			return (0);
2125 		}
2126 	}
2127 	/*
2128 	 * If we are not suspended or have not yet reached suspended
2129 	 * mode, then let the operation proceed.
2130 	 */
2131 	if ((mp = *mpp) == NULL)
2132 		return (0);
2133 
2134 	/*
2135 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
2136 	 * a vfs_ref().
2137 	 * If a vnode was not provided, we need to acquire a refcount
2138 	 * on the provided mount point ourselves, in order to emulate
2139 	 * a vfs_ref().
2140 	 */
2141 	MNT_ILOCK(mp);
2142 	if (vp == NULL)
2143 		MNT_REF(mp);
2144 	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
2145 		mp->mnt_secondary_writes++;
2146 		mp->mnt_secondary_accwrites++;
2147 		MNT_IUNLOCK(mp);
2148 		return (0);
2149 	}
2150 	if ((flags & V_NOWAIT) != 0) {
2151 		MNT_REL(mp);
2152 		MNT_IUNLOCK(mp);
2153 		*mpp = NULL;
2154 		return (EWOULDBLOCK);
2155 	}
2156 	/*
2157 	 * Wait for the suspension to finish.
2158 	 */
2159 	mflags = 0;
2160 	if ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0) {
2161 		if ((flags & V_PCATCH) != 0)
2162 			mflags |= PCATCH;
2163 	}
2164 	mflags |= PRI_MAX_KERN | PDROP;
2165 	error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, "suspfs", 0);
2166 	vfs_rel(mp);
2167 	if (error == 0)
2168 		goto retry;
2169 	*mpp = NULL;
2170 	return (error);
2171 }
2172 
2173 /*
2174  * Filesystem write operation has completed. If we are suspending and this
2175  * operation is the last one, notify the suspender that the suspension is
2176  * now in effect.
2177  */
2178 void
2179 vn_finished_write(struct mount *mp)
2180 {
2181 	struct mount_pcpu *mpcpu;
2182 	int c;
2183 
2184 	if (mp == NULL)
2185 		return;
2186 
2187 	if (vfs_op_thread_enter(mp, mpcpu)) {
2188 		vfs_mp_count_sub_pcpu(mpcpu, writeopcount, 1);
2189 		vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
2190 		vfs_op_thread_exit(mp, mpcpu);
2191 		return;
2192 	}
2193 
2194 	MNT_ILOCK(mp);
2195 	vfs_assert_mount_counters(mp);
2196 	MNT_REL(mp);
2197 	c = --mp->mnt_writeopcount;
2198 	if (mp->mnt_vfs_ops == 0) {
2199 		MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
2200 		MNT_IUNLOCK(mp);
2201 		return;
2202 	}
2203 	if (c < 0)
2204 		vfs_dump_mount_counters(mp);
2205 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0)
2206 		wakeup(&mp->mnt_writeopcount);
2207 	MNT_IUNLOCK(mp);
2208 }
2209 
2210 /*
2211  * Filesystem secondary write operation has completed. If we are
2212  * suspending and this operation is the last one, notify the suspender
2213  * that the suspension is now in effect.
2214  */
2215 void
2216 vn_finished_secondary_write(struct mount *mp)
2217 {
2218 	if (mp == NULL)
2219 		return;
2220 	MNT_ILOCK(mp);
2221 	MNT_REL(mp);
2222 	mp->mnt_secondary_writes--;
2223 	if (mp->mnt_secondary_writes < 0)
2224 		panic("vn_finished_secondary_write: neg cnt");
2225 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
2226 	    mp->mnt_secondary_writes <= 0)
2227 		wakeup(&mp->mnt_secondary_writes);
2228 	MNT_IUNLOCK(mp);
2229 }
2230 
2231 /*
2232  * Request a filesystem to suspend write operations.
2233  */
2234 int
2235 vfs_write_suspend(struct mount *mp, int flags)
2236 {
2237 	int error;
2238 
2239 	vfs_op_enter(mp);
2240 
2241 	MNT_ILOCK(mp);
2242 	vfs_assert_mount_counters(mp);
2243 	if (mp->mnt_susp_owner == curthread) {
2244 		vfs_op_exit_locked(mp);
2245 		MNT_IUNLOCK(mp);
2246 		return (EALREADY);
2247 	}
2248 	while (mp->mnt_kern_flag & MNTK_SUSPEND)
2249 		msleep(&mp->mnt_flag, MNT_MTX(mp), PRI_MAX_KERN, "wsuspfs", 0);
2250 
2251 	/*
2252 	 * Unmount holds a write reference on the mount point.  If we
2253 	 * own a busy reference and drain for writers, we deadlock with
2254 	 * the reference draining in the unmount path.  Callers of
2255 	 * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if a
2256 	 * vfs_busy() reference is owned and the caller is not in the
2257 	 * unmount context.
2258 	 */
2259 	if ((flags & VS_SKIP_UNMOUNT) != 0 &&
2260 	    (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
2261 		vfs_op_exit_locked(mp);
2262 		MNT_IUNLOCK(mp);
2263 		return (EBUSY);
2264 	}
2265 
2266 	mp->mnt_kern_flag |= MNTK_SUSPEND;
2267 	mp->mnt_susp_owner = curthread;
2268 	if (mp->mnt_writeopcount > 0)
2269 		(void) msleep(&mp->mnt_writeopcount,
2270 		    MNT_MTX(mp), PRI_MAX_KERN | PDROP, "suspwt", 0);
2271 	else
2272 		MNT_IUNLOCK(mp);
2273 	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) {
2274 		vfs_write_resume(mp, 0);
2275 		/* vfs_write_resume does vfs_op_exit() for us */
2276 	}
2277 	return (error);
2278 }
2279 
2280 /*
2281  * Request a filesystem to resume write operations.
2282  */
2283 void
2284 vfs_write_resume(struct mount *mp, int flags)
2285 {
2286 
2287 	MNT_ILOCK(mp);
2288 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
2289 		KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
2290 		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
2291 				       MNTK_SUSPENDED);
2292 		mp->mnt_susp_owner = NULL;
2293 		wakeup(&mp->mnt_writeopcount);
2294 		wakeup(&mp->mnt_flag);
2295 		curthread->td_pflags &= ~TDP_IGNSUSP;
2296 		if ((flags & VR_START_WRITE) != 0) {
2297 			MNT_REF(mp);
2298 			mp->mnt_writeopcount++;
2299 		}
2300 		MNT_IUNLOCK(mp);
2301 		if ((flags & VR_NO_SUSPCLR) == 0)
2302 			VFS_SUSP_CLEAN(mp);
2303 		vfs_op_exit(mp);
2304 	} else if ((flags & VR_START_WRITE) != 0) {
2305 		MNT_REF(mp);
2306 		vn_start_write_refed(mp, 0, true);
2307 	} else {
2308 		MNT_IUNLOCK(mp);
2309 	}
2310 }
2311 
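#if 0
/*
 * Editorial sketch, not part of the original file: suspending and
 * resuming writes around an operation that needs a write-quiescent
 * filesystem (e.g. taking a snapshot).  VS_SKIP_UNMOUNT is passed
 * because this hypothetical caller owns a vfs_busy() reference and is
 * not in the unmount path.  "example_quiesced_op" is hypothetical.
 */
static int
example_quiesced_op(struct mount *mp)
{
	int error;

	error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT);
	if (error != 0)
		return (error);
	/* ... no new writes can start until the resume below ... */
	vfs_write_resume(mp, 0);
	return (0);
}
#endif
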
2312 /*
2313  * Helper loop around vfs_write_suspend() for filesystem unmount VFS
2314  * methods.
2315  */
2316 int
2317 vfs_write_suspend_umnt(struct mount *mp)
2318 {
2319 	int error;
2320 
2321 	KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
2322 	    ("vfs_write_suspend_umnt: recursed"));
2323 
2324 	/* dounmount() already called vn_start_write(). */
2325 	for (;;) {
2326 		vn_finished_write(mp);
2327 		error = vfs_write_suspend(mp, 0);
2328 		if (error != 0) {
2329 			vn_start_write(NULL, &mp, V_WAIT);
2330 			return (error);
2331 		}
2332 		MNT_ILOCK(mp);
2333 		if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
2334 			break;
2335 		MNT_IUNLOCK(mp);
2336 		vn_start_write(NULL, &mp, V_WAIT);
2337 	}
2338 	mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
2339 	wakeup(&mp->mnt_flag);
2340 	MNT_IUNLOCK(mp);
2341 	curthread->td_pflags |= TDP_IGNSUSP;
2342 	return (0);
2343 }
2344 
2345 /*
2346  * Implement kqueues for files by translating them to a vnode operation.
2347  */
2348 static int
2349 vn_kqfilter(struct file *fp, struct knote *kn)
2350 {
2351 
2352 	return (VOP_KQFILTER(fp->f_vnode, kn));
2353 }
2354 
2355 int
2356 vn_kqfilter_opath(struct file *fp, struct knote *kn)
2357 {
2358 	if ((fp->f_flag & FKQALLOWED) == 0)
2359 		return (EBADF);
2360 	return (vn_kqfilter(fp, kn));
2361 }
2362 
2363 /*
2364  * Simplified in-kernel wrapper calls for extended attribute access.
2365  * Both calls pass in a NULL credential, authorizing as "kernel" access.
2366  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
2367  */
2368 int
2369 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
2370     const char *attrname, int *buflen, char *buf, struct thread *td)
2371 {
2372 	struct uio	auio;
2373 	struct iovec	iov;
2374 	int	error;
2375 
2376 	iov.iov_len = *buflen;
2377 	iov.iov_base = buf;
2378 
2379 	auio.uio_iov = &iov;
2380 	auio.uio_iovcnt = 1;
2381 	auio.uio_rw = UIO_READ;
2382 	auio.uio_segflg = UIO_SYSSPACE;
2383 	auio.uio_td = td;
2384 	auio.uio_offset = 0;
2385 	auio.uio_resid = *buflen;
2386 
2387 	if ((ioflg & IO_NODELOCKED) == 0)
2388 		vn_lock(vp, LK_SHARED | LK_RETRY);
2389 
2390 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2391 
2392 	/* authorize attribute retrieval as kernel */
2393 	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
2394 	    td);
2395 
2396 	if ((ioflg & IO_NODELOCKED) == 0)
2397 		VOP_UNLOCK(vp);
2398 
2399 	if (error == 0) {
2400 		*buflen = *buflen - auio.uio_resid;
2401 	}
2402 
2403 	return (error);
2404 }
2405 
2406 /*
2407  * XXX failure mode if partially written?
2408  */
2409 int
2410 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
2411     const char *attrname, int buflen, char *buf, struct thread *td)
2412 {
2413 	struct uio	auio;
2414 	struct iovec	iov;
2415 	struct mount	*mp;
2416 	int	error;
2417 
2418 	iov.iov_len = buflen;
2419 	iov.iov_base = buf;
2420 
2421 	auio.uio_iov = &iov;
2422 	auio.uio_iovcnt = 1;
2423 	auio.uio_rw = UIO_WRITE;
2424 	auio.uio_segflg = UIO_SYSSPACE;
2425 	auio.uio_td = td;
2426 	auio.uio_offset = 0;
2427 	auio.uio_resid = buflen;
2428 
2429 	if ((ioflg & IO_NODELOCKED) == 0) {
2430 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2431 			return (error);
2432 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2433 	}
2434 
2435 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2436 
2437 	/* authorize attribute setting as kernel */
2438 	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
2439 
2440 	if ((ioflg & IO_NODELOCKED) == 0) {
2441 		vn_finished_write(mp);
2442 		VOP_UNLOCK(vp);
2443 	}
2444 
2445 	return (error);
2446 }
2447 
2448 int
2449 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
2450     const char *attrname, struct thread *td)
2451 {
2452 	struct mount	*mp;
2453 	int	error;
2454 
2455 	if ((ioflg & IO_NODELOCKED) == 0) {
2456 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2457 			return (error);
2458 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2459 	}
2460 
2461 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2462 
2463 	/* authorize attribute removal as kernel */
2464 	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
2465 	if (error == EOPNOTSUPP)
2466 		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
2467 		    NULL, td);
2468 
2469 	if ((ioflg & IO_NODELOCKED) == 0) {
2470 		vn_finished_write(mp);
2471 		VOP_UNLOCK(vp);
2472 	}
2473 
2474 	return (error);
2475 }
2476 
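#if 0
/*
 * Editorial sketch, not part of the original file: storing and reading
 * back a small system-namespace extended attribute from kernel code.
 * The attribute name "example" and the function name are hypothetical;
 * EXTATTR_NAMESPACE_SYSTEM comes from <sys/extattr.h>.  Passing ioflg 0
 * lets the wrappers handle vn_start_write() and the vnode lock.
 */
static int
example_extattr_roundtrip(struct vnode *vp, struct thread *td)
{
	char buf[16] = "hello";
	int buflen, error;

	buflen = (int)sizeof(buf);
	error = vn_extattr_set(vp, 0, EXTATTR_NAMESPACE_SYSTEM, "example",
	    buflen, buf, td);
	if (error != 0)
		return (error);
	return (vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM, "example",
	    &buflen, buf, td));
}
#endif
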
2477 static int
2478 vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
2479     struct vnode **rvp)
2480 {
2481 
2482 	return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
2483 }
2484 
2485 int
2486 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
2487 {
2488 
2489 	return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
2490 	    lkflags, rvp));
2491 }
2492 
2493 int
2494 vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
2495     int lkflags, struct vnode **rvp)
2496 {
2497 	struct mount *mp;
2498 	int ltype, error;
2499 
2500 	ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
2501 	mp = vp->v_mount;
2502 	ltype = VOP_ISLOCKED(vp);
2503 	KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
2504 	    ("vn_vget_ino: vp not locked"));
2505 	error = vfs_busy(mp, MBF_NOWAIT);
2506 	if (error != 0) {
2507 		vfs_ref(mp);
2508 		VOP_UNLOCK(vp);
2509 		error = vfs_busy(mp, 0);
2510 		vn_lock(vp, ltype | LK_RETRY);
2511 		vfs_rel(mp);
2512 		if (error != 0)
2513 			return (ENOENT);
2514 		if (VN_IS_DOOMED(vp)) {
2515 			vfs_unbusy(mp);
2516 			return (ENOENT);
2517 		}
2518 	}
2519 	VOP_UNLOCK(vp);
2520 	error = alloc(mp, alloc_arg, lkflags, rvp);
2521 	vfs_unbusy(mp);
2522 	if (error != 0 || *rvp != vp)
2523 		vn_lock(vp, ltype | LK_RETRY);
2524 	if (VN_IS_DOOMED(vp)) {
2525 		if (error == 0) {
2526 			if (*rvp == vp)
2527 				vunref(vp);
2528 			else
2529 				vput(*rvp);
2530 		}
2531 		error = ENOENT;
2532 	}
2533 	return (error);
2534 }
2535 
2536 static void
2537 vn_send_sigxfsz(struct proc *p)
2538 {
2539 	PROC_LOCK(p);
2540 	kern_psignal(p, SIGXFSZ);
2541 	PROC_UNLOCK(p);
2542 }
2543 
2544 int
2545 vn_rlimit_trunc(u_quad_t size, struct thread *td)
2546 {
2547 	if (size <= lim_cur(td, RLIMIT_FSIZE))
2548 		return (0);
2549 	vn_send_sigxfsz(td->td_proc);
2550 	return (EFBIG);
2551 }
2552 
2553 static int
2554 vn_rlimit_fsizex1(const struct vnode *vp, struct uio *uio, off_t maxfsz,
2555     bool adj, struct thread *td)
2556 {
2557 	off_t lim;
2558 	bool ktr_write;
2559 
2560 	if (vp->v_type != VREG)
2561 		return (0);
2562 
2563 	/*
2564 	 * Handle file system maximum file size.
2565 	 */
2566 	if (maxfsz != 0 && uio->uio_offset + uio->uio_resid > maxfsz) {
2567 		if (!adj || uio->uio_offset >= maxfsz)
2568 			return (EFBIG);
2569 		uio->uio_resid = maxfsz - uio->uio_offset;
2570 	}
2571 
2572 	/*
2573 	 * This is kernel write (e.g. vnode_pager) or accounting
2574 	 * write, ignore limit.
2575 	 */
2576 	if (td == NULL || (td->td_pflags2 & TDP2_ACCT) != 0)
2577 		return (0);
2578 
2579 	/*
2580 	 * Calculate file size limit.
2581 	 */
2582 	ktr_write = (td->td_pflags & TDP_INKTRACE) != 0;
2583 	lim = __predict_false(ktr_write) ? td->td_ktr_io_lim :
2584 	    lim_cur(td, RLIMIT_FSIZE);
2585 
2586 	/*
2587 	 * Is the limit reached?
2588 	 */
2589 	if (__predict_true((uoff_t)uio->uio_offset + uio->uio_resid <= lim))
2590 		return (0);
2591 
2592 	/*
2593 	 * Prepared filesystems can handle writes truncated to the
2594 	 * file size limit.
2595 	 */
2596 	if (adj && (uoff_t)uio->uio_offset < lim) {
2597 		uio->uio_resid = lim - (uoff_t)uio->uio_offset;
2598 		return (0);
2599 	}
2600 
2601 	if (!ktr_write || ktr_filesize_limit_signal)
2602 		vn_send_sigxfsz(td->td_proc);
2603 	return (EFBIG);
2604 }
2605 
2606 /*
2607  * Helper for VOP_WRITE() implementations, the common code to
2608  * handle maximum supported file size on the filesystem, and
2609  * RLIMIT_FSIZE, except for special writes from accounting subsystem
2610  * and ktrace.
2611  *
2612  * For maximum file size (maxfsz argument):
2613  * - return EFBIG if uio_offset is beyond it
2614  * - otherwise, clamp uio_resid if write would extend file beyond maxfsz.
2615  *
2616  * For RLIMIT_FSIZE:
2617  * - return EFBIG and send SIGXFSZ if uio_offset is beyond the limit
2618  * - otherwise, clamp uio_resid if write would extend file beyond limit.
2619  *
2620  * If clamping occurred, the adjustment for uio_resid is stored in
2621  * *resid_adj, to be re-applied by vn_rlimit_fsizex_res() on return
2622  * from the VOP.
2623  */
2624 int
2625 vn_rlimit_fsizex(const struct vnode *vp, struct uio *uio, off_t maxfsz,
2626     ssize_t *resid_adj, struct thread *td)
2627 {
2628 	ssize_t resid_orig;
2629 	int error;
2630 	bool adj;
2631 
2632 	resid_orig = uio->uio_resid;
2633 	adj = resid_adj != NULL;
2634 	error = vn_rlimit_fsizex1(vp, uio, maxfsz, adj, td);
2635 	if (adj)
2636 		*resid_adj = resid_orig - uio->uio_resid;
2637 	return (error);
2638 }
2639 
2640 void
2641 vn_rlimit_fsizex_res(struct uio *uio, ssize_t resid_adj)
2642 {
2643 	uio->uio_resid += resid_adj;
2644 }
2645 
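#if 0
/*
 * Editorial sketch, not part of the original file: how a VOP_WRITE
 * implementation is expected to pair vn_rlimit_fsizex() with
 * vn_rlimit_fsizex_res(), per the comment above vn_rlimit_fsizex().
 * "myfs_write" and MYFS_MAXFILESIZE are hypothetical.
 */
#define	MYFS_MAXFILESIZE	((off_t)1 << 40)	/* hypothetical fs limit */

static int
myfs_write(struct vop_write_args *ap)
{
	struct uio *uio = ap->a_uio;
	ssize_t resid_adj;
	int error;

	error = vn_rlimit_fsizex(ap->a_vp, uio, MYFS_MAXFILESIZE,
	    &resid_adj, uio->uio_td);
	if (error != 0)
		return (error);
	/* ... perform the write with the possibly clamped uio_resid ... */
	/* Restore the clamped residual before returning to the caller. */
	vn_rlimit_fsizex_res(uio, resid_adj);
	return (error);
}
#endif
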
2646 int
2647 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
2648     struct thread *td)
2649 {
2650 	return (vn_rlimit_fsizex(vp, __DECONST(struct uio *, uio), 0, NULL,
2651 	    td));
2652 }
2653 
2654 int
2655 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
2656     struct thread *td)
2657 {
2658 	struct vnode *vp;
2659 
2660 	vp = fp->f_vnode;
2661 #ifdef AUDIT
2662 	vn_lock(vp, LK_SHARED | LK_RETRY);
2663 	AUDIT_ARG_VNODE1(vp);
2664 	VOP_UNLOCK(vp);
2665 #endif
2666 	return (setfmode(td, active_cred, vp, mode));
2667 }
2668 
2669 int
2670 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
2671     struct thread *td)
2672 {
2673 	struct vnode *vp;
2674 
2675 	vp = fp->f_vnode;
2676 #ifdef AUDIT
2677 	vn_lock(vp, LK_SHARED | LK_RETRY);
2678 	AUDIT_ARG_VNODE1(vp);
2679 	VOP_UNLOCK(vp);
2680 #endif
2681 	return (setfown(td, active_cred, vp, uid, gid));
2682 }
2683 
2684 /*
2685  * Remove pages in the range ["start", "end") from the vnode's VM object.  If
2686  * "end" is 0, then the range extends to the end of the object.
2687  */
2688 void
2689 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
2690 {
2691 	vm_object_t object;
2692 
2693 	if ((object = vp->v_object) == NULL)
2694 		return;
2695 	VM_OBJECT_WLOCK(object);
2696 	vm_object_page_remove(object, start, end, 0);
2697 	VM_OBJECT_WUNLOCK(object);
2698 }
2699 
2700 /*
2701  * Like vn_pages_remove(), but skips invalid pages, which by definition are not
2702  * mapped into any process' address space.  Filesystems may use this in
2703  * preference to vn_pages_remove() to avoid blocking on pages busied in
2704  * preparation for a VOP_GETPAGES.
2705  */
2706 void
2707 vn_pages_remove_valid(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
2708 {
2709 	vm_object_t object;
2710 
2711 	if ((object = vp->v_object) == NULL)
2712 		return;
2713 	VM_OBJECT_WLOCK(object);
2714 	vm_object_page_remove(object, start, end, OBJPR_VALIDONLY);
2715 	VM_OBJECT_WUNLOCK(object);
2716 }
2717 
2718 int
2719 vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off,
2720     struct ucred *cred)
2721 {
2722 	off_t size;
2723 	daddr_t bn, bnp;
2724 	uint64_t bsize;
2725 	off_t noff;
2726 	int error;
2727 
2728 	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
2729 	    ("%s: Wrong command %lu", __func__, cmd));
2730 	ASSERT_VOP_ELOCKED(vp, "vn_bmap_seekhole_locked");
2731 
2732 	if (vp->v_type != VREG) {
2733 		error = ENOTTY;
2734 		goto out;
2735 	}
2736 	error = vn_getsize_locked(vp, &size, cred);
2737 	if (error != 0)
2738 		goto out;
2739 	noff = *off;
2740 	if (noff < 0 || noff >= size) {
2741 		error = ENXIO;
2742 		goto out;
2743 	}
2744 
2745 	/* See the comment in ufs_bmap_seekdata(). */
2746 	vnode_pager_clean_sync(vp);
2747 
2748 	bsize = vp->v_mount->mnt_stat.f_iosize;
2749 	for (bn = noff / bsize; noff < size; bn++, noff += bsize -
2750 	    noff % bsize) {
2751 		error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
2752 		if (error == EOPNOTSUPP) {
2753 			error = ENOTTY;
2754 			goto out;
2755 		}
2756 		if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
2757 		    (bnp != -1 && cmd == FIOSEEKDATA)) {
2758 			noff = bn * bsize;
2759 			if (noff < *off)
2760 				noff = *off;
2761 			goto out;
2762 		}
2763 	}
2764 	if (noff > size)
2765 		noff = size;
2766 	/* noff == size. There is an implicit hole at the end of file. */
2767 	if (cmd == FIOSEEKDATA)
2768 		error = ENXIO;
2769 out:
2770 	if (error == 0)
2771 		*off = noff;
2772 	return (error);
2773 }
2774 
2775 int
2776 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
2777 {
2778 	int error;
2779 
2780 	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
2781 	    ("%s: Wrong command %lu", __func__, cmd));
2782 
2783 	if (vn_lock(vp, LK_EXCLUSIVE) != 0)
2784 		return (EBADF);
2785 	error = vn_bmap_seekhole_locked(vp, cmd, off, cred);
2786 	VOP_UNLOCK(vp);
2787 	return (error);
2788 }
2789 
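#if 0
/*
 * Editorial sketch, not part of the original file: a filesystem
 * VOP_IOCTL() delegating FIOSEEKDATA/FIOSEEKHOLE to the generic
 * bmap-based helper above.  The function name "myfs_ioctl" is
 * hypothetical.
 */
static int
myfs_ioctl(struct vop_ioctl_args *ap)
{
	switch (ap->a_command) {
	case FIOSEEKDATA:
	case FIOSEEKHOLE:
		return (vn_bmap_seekhole(ap->a_vp, ap->a_command,
		    (off_t *)ap->a_data, ap->a_cred));
	default:
		return (ENOTTY);
	}
}
#endif
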
2790 int
2791 vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
2792 {
2793 	struct ucred *cred;
2794 	struct vnode *vp;
2795 	off_t foffset, fsize, size;
2796 	int error, noneg;
2797 
2798 	cred = td->td_ucred;
2799 	vp = fp->f_vnode;
2800 	noneg = (vp->v_type != VCHR);
2801 	/*
2802 	 * Try to dodge locking for common case of querying the offset.
2803 	 */
2804 	if (whence == L_INCR && offset == 0) {
2805 		foffset = foffset_read(fp);
2806 		if (__predict_false(foffset < 0 && noneg)) {
2807 			return (EOVERFLOW);
2808 		}
2809 		td->td_uretoff.tdu_off = foffset;
2810 		return (0);
2811 	}
2812 	foffset = foffset_lock(fp, 0);
2813 	error = 0;
2814 	switch (whence) {
2815 	case L_INCR:
2816 		if (noneg &&
2817 		    (foffset < 0 ||
2818 		    (offset > 0 && foffset > OFF_MAX - offset))) {
2819 			error = EOVERFLOW;
2820 			break;
2821 		}
2822 		offset += foffset;
2823 		break;
2824 	case L_XTND:
2825 		error = vn_getsize(vp, &fsize, cred);
2826 		if (error != 0)
2827 			break;
2828 
2829 		/*
2830 		 * If the file references a disk device, then fetch
2831 		 * the media size and use that to determine the ending
2832 		 * offset.
2833 		 */
2834 		if (fsize == 0 && vp->v_type == VCHR &&
2835 		    fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
2836 			fsize = size;
2837 		if (noneg && offset > 0 && fsize > OFF_MAX - offset) {
2838 			error = EOVERFLOW;
2839 			break;
2840 		}
2841 		offset += fsize;
2842 		break;
2843 	case L_SET:
2844 		break;
2845 	case SEEK_DATA:
2846 		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
2847 		if (error == ENOTTY)
2848 			error = EINVAL;
2849 		break;
2850 	case SEEK_HOLE:
2851 		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
2852 		if (error == ENOTTY)
2853 			error = EINVAL;
2854 		break;
2855 	default:
2856 		error = EINVAL;
2857 	}
2858 	if (error == 0 && noneg && offset < 0)
2859 		error = EINVAL;
2860 	if (error != 0)
2861 		goto drop;
2862 	VFS_KNOTE_UNLOCKED(vp, 0);
2863 	td->td_uretoff.tdu_off = offset;
2864 drop:
2865 	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
2866 	return (error);
2867 }
2868 
2869 int
2870 vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
2871     struct thread *td)
2872 {
2873 	int error;
2874 
2875 	/*
2876 	 * Grant permission if the caller is the owner of the file, or
2877 	 * the super-user, or has ACL_WRITE_ATTRIBUTES permission on
2878 	 * the file.  If the time pointer is null, then write
2879 	 * permission on the file is also sufficient.
2880 	 *
2881 	 * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
2882 	 * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
2883 	 * will be allowed to set the times [..] to the current
2884 	 * server time.
2885 	 */
2886 	error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
2887 	if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0)
2888 		error = VOP_ACCESS(vp, VWRITE, cred, td);
2889 	return (error);
2890 }
2891 
2892 int
2893 vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
2894 {
2895 	struct vnode *vp;
2896 	int error;
2897 
2898 	if (fp->f_type == DTYPE_FIFO)
2899 		kif->kf_type = KF_TYPE_FIFO;
2900 	else
2901 		kif->kf_type = KF_TYPE_VNODE;
2902 	vp = fp->f_vnode;
2903 	vref(vp);
2904 	FILEDESC_SUNLOCK(fdp);
2905 	error = vn_fill_kinfo_vnode(vp, kif);
2906 	vrele(vp);
2907 	FILEDESC_SLOCK(fdp);
2908 	return (error);
2909 }
2910 
2911 static inline void
2912 vn_fill_junk(struct kinfo_file *kif)
2913 {
2914 	size_t len, olen;
2915 
2916 	/*
2917 	 * Simulate vn_fullpath returning changing values for a given
2918 	 * vp during e.g. coredump.
2919 	 */
2920 	len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
2921 	olen = strlen(kif->kf_path);
2922 	if (len < olen)
2923 		strcpy(&kif->kf_path[len - 1], "$");
2924 	else
2925 		for (; olen < len; olen++)
2926 			strcpy(&kif->kf_path[olen], "A");
2927 }
2928 
2929 int
2930 vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif)
2931 {
2932 	struct vattr va;
2933 	char *fullpath, *freepath;
2934 	int error;
2935 
2936 	kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type);
2937 	freepath = NULL;
2938 	fullpath = "-";
2939 	error = vn_fullpath(vp, &fullpath, &freepath);
2940 	if (error == 0) {
2941 		strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
2942 	}
2943 	if (freepath != NULL)
2944 		free(freepath, M_TEMP);
2945 
2946 	KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
2947 		vn_fill_junk(kif);
2948 	);
2949 
2950 	/*
2951 	 * Retrieve vnode attributes.
2952 	 */
2953 	va.va_fsid = VNOVAL;
2954 	va.va_rdev = NODEV;
2955 	vn_lock(vp, LK_SHARED | LK_RETRY);
2956 	error = VOP_GETATTR(vp, &va, curthread->td_ucred);
2957 	VOP_UNLOCK(vp);
2958 	if (error != 0)
2959 		return (error);
2960 	if (va.va_fsid != VNOVAL)
2961 		kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
2962 	else
2963 		kif->kf_un.kf_file.kf_file_fsid =
2964 		    vp->v_mount->mnt_stat.f_fsid.val[0];
2965 	kif->kf_un.kf_file.kf_file_fsid_freebsd11 =
2966 	    kif->kf_un.kf_file.kf_file_fsid; /* truncate */
2967 	kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
2968 	kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
2969 	kif->kf_un.kf_file.kf_file_size = va.va_size;
2970 	kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
2971 	kif->kf_un.kf_file.kf_file_rdev_freebsd11 =
2972 	    kif->kf_un.kf_file.kf_file_rdev; /* truncate */
2973 	kif->kf_un.kf_file.kf_file_nlink = va.va_nlink;
2974 	return (0);
2975 }
2976 
2977 int
2978 vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
2979     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
2980     struct thread *td)
2981 {
2982 #ifdef HWPMC_HOOKS
2983 	struct pmckern_map_in pkm;
2984 #endif
2985 	struct mount *mp;
2986 	struct vnode *vp;
2987 	vm_object_t object;
2988 	vm_prot_t maxprot;
2989 	boolean_t writecounted;
2990 	int error;
2991 
2992 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
2993     defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
2994 	/*
2995 	 * POSIX shared-memory objects are defined to have
2996 	 * kernel persistence, and are not defined to support
2997 	 * read(2)/write(2) -- or even open(2).  Thus, we can
2998 	 * use MAP_ASYNC to trade on-disk coherence for speed.
2999 	 * The shm_open(3) library routine turns on the FPOSIXSHM
3000 	 * flag to request this behavior.
3001 	 */
3002 	if ((fp->f_flag & FPOSIXSHM) != 0)
3003 		flags |= MAP_NOSYNC;
3004 #endif
3005 	vp = fp->f_vnode;
3006 
3007 	/*
3008 	 * Ensure that file and memory protections are
3009 	 * compatible.  Note that we only worry about
3010 	 * writability if mapping is shared; in this case,
3011 	 * current and max prot are dictated by the open file.
3012 	 * XXX use the vnode instead?  Problem is: what
3013 	 * credentials do we use for determination? What if
3014 	 * proc does a setuid?
3015 	 */
3016 	mp = vp->v_mount;
3017 	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
3018 		maxprot = VM_PROT_NONE;
3019 		if ((prot & VM_PROT_EXECUTE) != 0)
3020 			return (EACCES);
3021 	} else
3022 		maxprot = VM_PROT_EXECUTE;
3023 	if ((fp->f_flag & FREAD) != 0)
3024 		maxprot |= VM_PROT_READ;
3025 	else if ((prot & VM_PROT_READ) != 0)
3026 		return (EACCES);
3027 
3028 	/*
3029 	 * If we are sharing potential changes via MAP_SHARED and we
3030 	 * are trying to get write permission although we opened it
3031 	 * without asking for it, bail out.
3032 	 */
3033 	if ((flags & MAP_SHARED) != 0) {
3034 		if ((fp->f_flag & FWRITE) != 0)
3035 			maxprot |= VM_PROT_WRITE;
3036 		else if ((prot & VM_PROT_WRITE) != 0)
3037 			return (EACCES);
3038 	} else {
3039 		maxprot |= VM_PROT_WRITE;
3040 		cap_maxprot |= VM_PROT_WRITE;
3041 	}
3042 	maxprot &= cap_maxprot;
3043 
3044 	/*
3045 	 * For regular files and shared memory, POSIX requires that
3046 	 * the value of foff be a legitimate offset within the data
3047 	 * object.  In particular, negative offsets are invalid.
3048 	 * Blocking negative offsets and overflows here avoids
3049 	 * possible wraparound or user-level access into reserved
3050 	 * ranges of the data object later.  In contrast, POSIX does
3051 	 * not dictate how offsets are used by device drivers, so in
3052 	 * the case of a device mapping a negative offset is passed
3053 	 * on.
3054 	 */
3055 	if (
3056 #ifdef _LP64
3057 	    size > OFF_MAX ||
3058 #endif
3059 	    foff > OFF_MAX - size)
3060 		return (EINVAL);
3061 
3062 	writecounted = FALSE;
3063 	error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
3064 	    &foff, &object, &writecounted);
3065 	if (error != 0)
3066 		return (error);
3067 	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
3068 	    foff, writecounted, td);
3069 	if (error != 0) {
3070 		/*
3071 		 * If this mapping was accounted for in the vnode's
3072 		 * writecount, then undo that now.
3073 		 */
3074 		if (writecounted)
3075 			vm_pager_release_writecount(object, 0, size);
3076 		vm_object_deallocate(object);
3077 	}
3078 #ifdef HWPMC_HOOKS
3079 	/* Inform hwpmc(4) if an executable is being mapped. */
3080 	if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) {
3081 		if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) {
3082 			pkm.pm_file = vp;
3083 			pkm.pm_address = (uintptr_t) *addr;
3084 			PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm);
3085 		}
3086 	}
3087 #endif
3088 
3089 #ifdef HWT_HOOKS
3090 	if (HWT_HOOK_INSTALLED && (prot & VM_PROT_EXECUTE) != 0 &&
3091 	    error == 0) {
3092 		struct hwt_record_entry ent;
3093 		char *fullpath;
3094 		char *freepath;
3095 
3096 		if (vn_fullpath(vp, &fullpath, &freepath) == 0) {
3097 			ent.fullpath = fullpath;
3098 			ent.addr = (uintptr_t) *addr;
3099 			ent.record_type = HWT_RECORD_MMAP;
3100 			HWT_CALL_HOOK(td, HWT_MMAP, &ent);
3101 			free(freepath, M_TEMP);
3102 		}
3103 	}
3104 #endif
3105 
3106 	return (error);
3107 }
3108 
3109 void
3110 vn_fsid(struct vnode *vp, struct vattr *va)
3111 {
3112 	fsid_t *f;
3113 
3114 	f = &vp->v_mount->mnt_stat.f_fsid;
3115 	va->va_fsid = (uint32_t)f->val[1];
3116 	va->va_fsid <<= sizeof(f->val[1]) * NBBY;
3117 	va->va_fsid += (uint32_t)f->val[0];
3118 }
3119 
3120 int
3121 vn_fsync_buf(struct vnode *vp, int waitfor)
3122 {
3123 	struct buf *bp, *nbp;
3124 	struct bufobj *bo;
3125 	struct mount *mp;
3126 	int error, maxretry;
3127 
3128 	error = 0;
3129 	maxretry = 10000;     /* large, arbitrarily chosen */
3130 	mp = NULL;
3131 	if (vp->v_type == VCHR) {
3132 		VI_LOCK(vp);
3133 		mp = vp->v_rdev->si_mountpt;
3134 		VI_UNLOCK(vp);
3135 	}
3136 	bo = &vp->v_bufobj;
3137 	BO_LOCK(bo);
3138 loop1:
3139 	/*
3140 	 * MARK/SCAN initialization to avoid infinite loops.
3141 	 */
3142 	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
3143 		bp->b_vflags &= ~BV_SCANNED;
3144 		bp->b_error = 0;
3145 	}
3146 
3147 	/*
3148 	 * Flush all dirty buffers associated with a vnode.
3149 	 */
3150 loop2:
3151 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
3152 		if ((bp->b_vflags & BV_SCANNED) != 0)
3153 			continue;
3154 		bp->b_vflags |= BV_SCANNED;
3155 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
3156 			if (waitfor != MNT_WAIT)
3157 				continue;
3158 			if (BUF_LOCK(bp,
3159 			    LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
3160 			    BO_LOCKPTR(bo)) != 0) {
3161 				BO_LOCK(bo);
3162 				goto loop1;
3163 			}
3164 			BO_LOCK(bo);
3165 		}
3166 		BO_UNLOCK(bo);
3167 		KASSERT(bp->b_bufobj == bo,
3168 		    ("bp %p wrong b_bufobj %p should be %p",
3169 		    bp, bp->b_bufobj, bo));
3170 		if ((bp->b_flags & B_DELWRI) == 0)
3171 			panic("fsync: not dirty");
3172 		if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) {
3173 			vfs_bio_awrite(bp);
3174 		} else {
3175 			bremfree(bp);
3176 			bawrite(bp);
3177 		}
3178 		if (maxretry < 1000)
3179 			pause("dirty", hz < 1000 ? 1 : hz / 1000);
3180 		BO_LOCK(bo);
3181 		goto loop2;
3182 	}
3183 
3184 	/*
3185 	 * If synchronous the caller expects us to completely resolve all
3186 	 * dirty buffers in the system.  Wait for in-progress I/O to
3187 	 * complete (which could include background bitmap writes), then
3188 	 * retry if dirty blocks still exist.
3189 	 */
3190 	if (waitfor == MNT_WAIT) {
3191 		bufobj_wwait(bo, 0, 0);
3192 		if (bo->bo_dirty.bv_cnt > 0) {
3193 			/*
3194 			 * If we are unable to write any of these buffers
3195 			 * then we fail now rather than trying endlessly
3196 			 * to write them out.
3197 			 */
3198 			TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
3199 				if ((error = bp->b_error) != 0)
3200 					break;
3201 			if ((mp != NULL && mp->mnt_secondary_writes > 0) ||
3202 			    (error == 0 && --maxretry >= 0))
3203 				goto loop1;
3204 			if (error == 0)
3205 				error = EAGAIN;
3206 		}
3207 	}
3208 	BO_UNLOCK(bo);
3209 	if (error != 0)
3210 		vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error);
3211 
3212 	return (error);
3213 }
3214 
3215 /*
3216  * Copies a byte range from invp to outvp.  Calls VOP_COPY_FILE_RANGE()
3217  * or vn_generic_copy_file_range() after rangelocking the byte ranges,
3218  * to do the actual copy.
3219  * vn_generic_copy_file_range() is factored out, so it can be called
3220  * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from
3221  * different file systems.
3222  */
3223 int
3224 vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp,
3225     off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred,
3226     struct ucred *outcred, struct thread *fsize_td)
3227 {
3228 	struct mount *inmp, *outmp;
3229 	struct vnode *invpl, *outvpl;
3230 	int error;
3231 	size_t len;
3232 	uint64_t uval;
3233 
3234 	invpl = outvpl = NULL;
3235 	len = *lenp;
3236 	*lenp = 0;		/* For error returns. */
3237 	error = 0;
3238 
3239 	/* Do some sanity checks on the arguments. */
3240 	if (invp->v_type == VDIR || outvp->v_type == VDIR)
3241 		error = EISDIR;
3242 	else if (*inoffp < 0 || *outoffp < 0 ||
3243 	    invp->v_type != VREG || outvp->v_type != VREG)
3244 		error = EINVAL;
3245 	if (error != 0)
3246 		goto out;
3247 
3248 	/* Ensure offset + len does not wrap around. */
3249 	uval = *inoffp;
3250 	uval += len;
3251 	if (uval > INT64_MAX)
3252 		len = INT64_MAX - *inoffp;
3253 	uval = *outoffp;
3254 	uval += len;
3255 	if (uval > INT64_MAX)
3256 		len = INT64_MAX - *outoffp;
3257 	if (len == 0)
3258 		goto out;
3259 
3260 	error = VOP_GETLOWVNODE(invp, &invpl, FREAD);
3261 	if (error != 0)
3262 		goto out;
3263 	error = VOP_GETLOWVNODE(outvp, &outvpl, FWRITE);
3264 	if (error != 0)
3265 		goto out1;
3266 
3267 	inmp = invpl->v_mount;
3268 	outmp = outvpl->v_mount;
3269 	if (inmp == NULL || outmp == NULL)
3270 		goto out2;
3271 
3272 	for (;;) {
3273 		error = vfs_busy(inmp, 0);
3274 		if (error != 0)
3275 			goto out2;
3276 		if (inmp == outmp)
3277 			break;
3278 		error = vfs_busy(outmp, MBF_NOWAIT);
3279 		if (error != 0) {
3280 			vfs_unbusy(inmp);
3281 			error = vfs_busy(outmp, 0);
3282 			if (error == 0) {
3283 				vfs_unbusy(outmp);
3284 				continue;
3285 			}
3286 			goto out2;
3287 		}
3288 		break;
3289 	}
3290 
3291 	/*
3292 	 * If the two vnodes are for the same file system type, call
3293 	 * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range()
3294 	 * which can handle copies across multiple file system types.
3295 	 */
3296 	*lenp = len;
3297 	if (inmp == outmp || inmp->mnt_vfc == outmp->mnt_vfc)
3298 		error = VOP_COPY_FILE_RANGE(invpl, inoffp, outvpl, outoffp,
3299 		    lenp, flags, incred, outcred, fsize_td);
3300 	else
3301 		error = ENOSYS;
3302 	if (error == ENOSYS)
3303 		error = vn_generic_copy_file_range(invpl, inoffp, outvpl,
3304 		    outoffp, lenp, flags, incred, outcred, fsize_td);
3305 	vfs_unbusy(outmp);
3306 	if (inmp != outmp)
3307 		vfs_unbusy(inmp);
3308 out2:
3309 	if (outvpl != NULL)
3310 		vrele(outvpl);
3311 out1:
3312 	if (invpl != NULL)
3313 		vrele(invpl);
3314 out:
3315 	return (error);
3316 }
3317 
3318 /*
3319  * Test len bytes of data starting at dat for all bytes == 0.
3320  * Return true if all bytes are zero, false otherwise.
3321  * Expects dat to be well aligned.
3322  */
3323 static bool
3324 mem_iszero(void *dat, int len)
3325 {
3326 	int i;
3327 	const u_int *p;
3328 	const char *cp;
3329 
3330 	for (p = dat; len > 0; len -= sizeof(*p), p++) {
3331 		if (len >= sizeof(*p)) {
3332 			if (*p != 0)
3333 				return (false);
3334 		} else {
3335 			cp = (const char *)p;
3336 			for (i = 0; i < len; i++, cp++)
3337 				if (*cp != '\0')
3338 					return (false);
3339 		}
3340 	}
3341 	return (true);
3342 }
3343 
3344 /*
3345  * Look for a hole in the output file and, if found, adjust *outoffp
3346  * and *xferp to skip past the hole.
3347  * *xferp is the entire hole length to be written and xfer2 is how many bytes
3348  * are to be written as 0's upon return.
3349  */
3350 static off_t
3351 vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp,
3352     off_t *dataoffp, off_t *holeoffp, struct ucred *cred)
3353 {
3354 	int error;
3355 	off_t delta;
3356 
3357 	if (*holeoffp == 0 || *holeoffp <= *outoffp) {
3358 		*dataoffp = *outoffp;
3359 		error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred,
3360 		    curthread);
3361 		if (error == 0) {
3362 			*holeoffp = *dataoffp;
3363 			error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred,
3364 			    curthread);
3365 		}
3366 		if (error != 0 || *holeoffp == *dataoffp) {
3367 			/*
3368 			 * Since outvp is unlocked, it may be possible for
3369 			 * another thread to do a truncate(), lseek(), write()
3370 			 * creating a hole at startoff between the above
3371 			 * VOP_IOCTL() calls, if the other thread does not do
3372 			 * rangelocking.
3373 			 * If that happens, *holeoffp == *dataoffp and finding
3374 			 * the hole has failed, so disable vn_skip_hole().
3375 			 */
3376 			*holeoffp = -1;	/* Disable use of vn_skip_hole(). */
3377 			return (xfer2);
3378 		}
3379 		KASSERT(*dataoffp >= *outoffp,
3380 		    ("vn_skip_hole: dataoff=%jd < outoff=%jd",
3381 		    (intmax_t)*dataoffp, (intmax_t)*outoffp));
3382 		KASSERT(*holeoffp > *dataoffp,
3383 		    ("vn_skip_hole: holeoff=%jd <= dataoff=%jd",
3384 		    (intmax_t)*holeoffp, (intmax_t)*dataoffp));
3385 	}
3386 
3387 	/*
3388 	 * If there is a hole before the data starts, advance *outoffp and
3389 	 * *xferp past the hole.
3390 	 */
3391 	if (*dataoffp > *outoffp) {
3392 		delta = *dataoffp - *outoffp;
3393 		if (delta >= *xferp) {
3394 			/* Entire *xferp is a hole. */
3395 			*outoffp += *xferp;
3396 			*xferp = 0;
3397 			return (0);
3398 		}
3399 		*xferp -= delta;
3400 		*outoffp += delta;
3401 		xfer2 = MIN(xfer2, *xferp);
3402 	}
3403 
3404 	/*
3405 	 * If a hole starts before the end of this xfer2, reduce this xfer2 so
3406 	 * that the write ends at the start of the hole.
3407 	 * *holeoffp should always be greater than *outoffp, but for the
3408 	 * non-INVARIANTS case, check this to make sure xfer2 remains a sane
3409 	 * value.
3410 	 */
3411 	if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2)
3412 		xfer2 = *holeoffp - *outoffp;
3413 	return (xfer2);
3414 }
3415 
3416 /*
3417  * Write an xfer sized chunk to outvp in blksize blocks from dat.
3418  * dat is a maximum of blksize in length and can be written repeatedly in
3419  * the chunk.
3420  * If growfile == true, just grow the file via vn_truncate_locked() instead
3421  * of doing actual writes.
3422  * If checkhole == true, a hole is being punched, so skip over any hole
3423  * already in the output file.
3424  */
3425 static int
3426 vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer,
3427     u_long blksize, bool growfile, bool checkhole, struct ucred *cred)
3428 {
3429 	struct mount *mp;
3430 	off_t dataoff, holeoff, xfer2;
3431 	int error;
3432 
3433 	/*
3434 	 * Loop around doing writes of blksize until write has been completed.
3435 	 * Lock/unlock on each loop iteration so that a bwillwrite() can be
3436 	 * done for each iteration, since the xfer argument can be very
3437 	 * large if there is a large hole to punch in the output file.
3438 	 */
3439 	error = 0;
3440 	holeoff = 0;
3441 	do {
3442 		xfer2 = MIN(xfer, blksize);
3443 		if (checkhole) {
3444 			/*
3445 			 * Punching a hole.  Skip writing if there is
3446 			 * already a hole in the output file.
3447 			 */
3448 			xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer,
3449 			    &dataoff, &holeoff, cred);
3450 			if (xfer == 0)
3451 				break;
3452 			if (holeoff < 0)
3453 				checkhole = false;
3454 			KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd",
3455 			    (intmax_t)xfer2));
3456 		}
3457 		bwillwrite();
3458 		mp = NULL;
3459 		error = vn_start_write(outvp, &mp, V_WAIT);
3460 		if (error != 0)
3461 			break;
3462 		if (growfile) {
3463 			error = vn_lock(outvp, LK_EXCLUSIVE);
3464 			if (error == 0) {
3465 				error = vn_truncate_locked(outvp, outoff + xfer,
3466 				    false, cred);
3467 				VOP_UNLOCK(outvp);
3468 			}
3469 		} else {
3470 			error = vn_lock(outvp, vn_lktype_write(mp, outvp));
3471 			if (error == 0) {
3472 				error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2,
3473 				    outoff, UIO_SYSSPACE, IO_NODELOCKED,
3474 				    curthread->td_ucred, cred, NULL, curthread);
3475 				outoff += xfer2;
3476 				xfer -= xfer2;
3477 				VOP_UNLOCK(outvp);
3478 			}
3479 		}
3480 		if (mp != NULL)
3481 			vn_finished_write(mp);
3482 	} while (!growfile && xfer > 0 && error == 0);
3483 	return (error);
3484 }
3485 
3486 /*
3487  * Copy a byte range of one file to another.  This function can handle the
3488  * case where invp and outvp are on different file systems.
3489  * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there
3490  * is no better file system specific way to do it.
3491  */
3492 int
3493 vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
3494     struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags,
3495     struct ucred *incred, struct ucred *outcred, struct thread *fsize_td)
3496 {
3497 	struct vattr inva;
3498 	struct mount *mp;
3499 	off_t startoff, endoff, xfer, xfer2;
3500 	u_long blksize;
3501 	int error, interrupted;
3502 	bool cantseek, readzeros, eof, first, lastblock, holetoeof, sparse;
3503 	ssize_t aresid, r = 0;
3504 	size_t copylen, len, savlen;
3505 	off_t outsize;
3506 	char *dat;
3507 	long holein, holeout;
3508 	struct timespec curts, endts;
3509 
3510 	holein = holeout = 0;
3511 	savlen = len = *lenp;
3512 	error = 0;
3513 	interrupted = 0;
3514 	dat = NULL;
3515 
3516 	if ((flags & COPY_FILE_RANGE_CLONE) != 0) {
3517 		error = EOPNOTSUPP;
3518 		goto out;
3519 	}
3520 
3521 	error = vn_lock(invp, LK_SHARED);
3522 	if (error != 0)
3523 		goto out;
3524 	if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0)
3525 		holein = 0;
3526 	error = VOP_GETATTR(invp, &inva, incred);
3527 	if (error == 0 && inva.va_size > OFF_MAX)
3528 		error = EFBIG;
3529 	VOP_UNLOCK(invp);
3530 	if (error != 0)
3531 		goto out;
3532 
3533 	/*
3534 	 * Use va_bytes >= va_size as a hint that the file does not have
3535 	 * sufficient holes to justify the overhead of doing FIOSEEKHOLE.
3536 	 * This hint does not work well for file systems doing compression
3537 	 * and may fail when allocations for extended attributes increases
3538 	 * and may fail when allocations for extended attributes increase
3539 	 */
3540 	sparse = true;
3541 	if (holein != 0 && inva.va_bytes >= inva.va_size) {
3542 		holein = 0;
3543 		sparse = false;
3544 	}
3545 
3546 	mp = NULL;
3547 	error = vn_start_write(outvp, &mp, V_WAIT);
3548 	if (error == 0)
3549 		error = vn_lock(outvp, LK_EXCLUSIVE);
3550 	if (error == 0) {
3551 		/*
3552 		 * If fsize_td != NULL, do a vn_rlimit_fsizex() call,
3553 		 * now that outvp is locked.
3554 		 */
3555 		if (fsize_td != NULL) {
3556 			struct uio io;
3557 
3558 			io.uio_offset = *outoffp;
3559 			io.uio_resid = len;
3560 			error = vn_rlimit_fsizex(outvp, &io, 0, &r, fsize_td);
3561 			len = savlen = io.uio_resid;
3562 			/*
3563 			 * No need to call vn_rlimit_fsizex_res before return,
3564 			 * since the uio is local.
3565 			 */
3566 		}
3567 		if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0)
3568 			holeout = 0;
3569 		/*
3570 		 * Holes that are past EOF do not need to be written as a block
3571 		 * of zero bytes.  So, truncate the output file as far as
3572 		 * possible and then use size to decide if writing 0
3573 		 * bytes is necessary in the loop below.
3574 		 */
3575 		if (error == 0)
3576 			error = vn_getsize_locked(outvp, &outsize, outcred);
3577 		if (error == 0 && outsize > *outoffp &&
3578 		    *outoffp <= OFF_MAX - len && outsize <= *outoffp + len &&
3579 		    *inoffp < inva.va_size &&
3580 		    *outoffp <= OFF_MAX - (inva.va_size - *inoffp) &&
3581 		    outsize <= *outoffp + (inva.va_size - *inoffp)) {
3582 #ifdef MAC
3583 			error = mac_vnode_check_write(curthread->td_ucred,
3584 			    outcred, outvp);
3585 			if (error == 0)
3586 #endif
3587 				error = vn_truncate_locked(outvp, *outoffp,
3588 				    false, outcred);
3589 			if (error == 0)
3590 				outsize = *outoffp;
3591 		}
3592 		VOP_UNLOCK(outvp);
3593 	}
3594 	if (mp != NULL)
3595 		vn_finished_write(mp);
3596 	if (error != 0)
3597 		goto out;
3598 
3599 	if (sparse && holein == 0 && holeout > 0) {
3600 		/*
3601 		 * For this special case, the input data will be scanned
3602 		 * for blocks of all 0 bytes.  For these blocks, the
3603 		 * write can be skipped for the output file to create
3604 		 * an unallocated region.
3605 		 * Therefore, base the block size on the output file's hole granularity.
3606 		 */
3607 		blksize = holeout;
3608 		if (blksize <= 512) {
3609 			/*
3610 			 * Use f_iosize, since ZFS reports a _PC_MIN_HOLE_SIZE
3611 			 * of 512, although it actually only creates
3612 			 * unallocated regions for blocks >= f_iosize.
3613 			 */
3614 			blksize = outvp->v_mount->mnt_stat.f_iosize;
3615 		}
3616 	} else {
3617 		/*
3618 		 * Use the larger of the two f_iosize values.  If they are
3619 		 * not the same size, one will normally be an exact multiple of
3620 		 * the other, since they are both likely to be a power of 2.
3621 		 */
3622 		blksize = MAX(invp->v_mount->mnt_stat.f_iosize,
3623 		    outvp->v_mount->mnt_stat.f_iosize);
3624 	}
3625 
3626 	/* Clip to sane limits. */
3627 	if (blksize < 4096)
3628 		blksize = 4096;
3629 	else if (blksize > maxphys)
3630 		blksize = maxphys;
3631 	dat = malloc(blksize, M_TEMP, M_WAITOK);
3632 
3633 	/*
3634 	 * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA
3635 	 * to find holes.  Otherwise, just scan the read block for all 0s
3636 	 * in the inner loop where the data copying is done.
3637 	 * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may
3638 	 * support holes on the server, but do not support FIOSEEKHOLE.
3639 	 * The kernel flag COPY_FILE_RANGE_TIMEO1SEC is used to indicate
3640 	 * that this function should return after 1 second with a partial
3641 	 * completion.
3642 	 */
3643 	if ((flags & COPY_FILE_RANGE_TIMEO1SEC) != 0) {
3644 		getnanouptime(&endts);
3645 		endts.tv_sec++;
3646 	} else
3647 		timespecclear(&endts);
3648 	first = true;
3649 	holetoeof = eof = false;
3650 	while (len > 0 && error == 0 && !eof && interrupted == 0) {
3651 		endoff = 0;			/* To shut up compilers. */
3652 		cantseek = true;
3653 		startoff = *inoffp;
3654 		copylen = len;
3655 
3656 		/*
3657 		 * Find the next data area.  If there is just a hole to EOF,
3658 		 * FIOSEEKDATA should fail with ENXIO.
3659 		 * (I do not know if any file system will report a hole to
3660 		 *  EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA
3661 		 *  will fail for those file systems.)
3662 		 *
3663 		 * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE,
3664 		 * the code just falls through to the inner copy loop.
3665 		 */
3666 		error = EINVAL;
3667 		if (holein > 0) {
3668 			error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0,
3669 			    incred, curthread);
3670 			if (error == ENXIO) {
3671 				startoff = endoff = inva.va_size;
3672 				eof = holetoeof = true;
3673 				error = 0;
3674 			}
3675 		}
3676 		if (error == 0 && !holetoeof) {
3677 			endoff = startoff;
3678 			error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0,
3679 			    incred, curthread);
3680 			/*
3681 			 * Since invp is unlocked, it may be possible for
3682 			 * another thread to do a truncate(), lseek(), write()
3683 			 * creating a hole at startoff between the above
3684 			 * VOP_IOCTL() calls, if the other thread does not do
3685 			 * rangelocking.
3686 			 * If that happens, startoff == endoff and finding
3687 			 * the hole has failed, so set an error.
3688 			 */
3689 			if (error == 0 && startoff == endoff)
3690 				error = EINVAL; /* Any error; reset to 0 below. */
3691 		}
3692 		if (error == 0) {
3693 			if (startoff > *inoffp) {
3694 				/* Found hole before data block. */
3695 				xfer = MIN(startoff - *inoffp, len);
3696 				if (*outoffp < outsize) {
3697 					/* Must write 0s to punch hole. */
3698 					xfer2 = MIN(outsize - *outoffp,
3699 					    xfer);
3700 					memset(dat, 0, MIN(xfer2, blksize));
3701 					error = vn_write_outvp(outvp, dat,
3702 					    *outoffp, xfer2, blksize, false,
3703 					    holeout > 0, outcred);
3704 				}
3705 
3706 				if (error == 0 && *outoffp + xfer >
3707 				    outsize && (xfer == len || holetoeof)) {
3708 					/* Grow output file (hole at end). */
3709 					error = vn_write_outvp(outvp, dat,
3710 					    *outoffp, xfer, blksize, true,
3711 					    false, outcred);
3712 				}
3713 				if (error == 0) {
3714 					*inoffp += xfer;
3715 					*outoffp += xfer;
3716 					len -= xfer;
3717 					if (len < savlen) {
3718 						interrupted = sig_intr();
3719 						if (timespecisset(&endts) &&
3720 						    interrupted == 0) {
3721 							getnanouptime(&curts);
3722 							if (timespeccmp(&curts,
3723 							    &endts, >=))
3724 								interrupted =
3725 								    EINTR;
3726 						}
3727 					}
3728 				}
3729 			}
3730 			copylen = MIN(len, endoff - startoff);
3731 			cantseek = false;
3732 		} else {
3733 			cantseek = true;
3734 			if (!sparse)
3735 				cantseek = false;
3736 			startoff = *inoffp;
3737 			copylen = len;
3738 			error = 0;
3739 		}
3740 
3741 		xfer = blksize;
3742 		if (cantseek) {
3743 			/*
3744 			 * Set first xfer to end at a block boundary, so that
3745 			 * holes are more likely to be detected in the loop below
3746 			 * via the all-bytes-zero check.
3747 			 */
3748 			xfer -= (*inoffp % blksize);
3749 		}
3750 
3751 		/*
3752 		 * Loop copying the data block.  If this was our first attempt
3753 		 * to copy anything, allow a zero-length block so that the VOPs
3754 		 * get a chance to update metadata, specifically the atime.
3755 		 */
3756 		while (error == 0 && ((copylen > 0 && !eof) || first) &&
3757 		    interrupted == 0) {
3758 			if (copylen < xfer)
3759 				xfer = copylen;
3760 			first = false;
3761 			error = vn_lock(invp, LK_SHARED);
3762 			if (error != 0)
3763 				goto out;
3764 			error = vn_rdwr(UIO_READ, invp, dat, xfer,
3765 			    startoff, UIO_SYSSPACE, IO_NODELOCKED,
3766 			    curthread->td_ucred, incred, &aresid,
3767 			    curthread);
3768 			VOP_UNLOCK(invp);
3769 			lastblock = false;
3770 			if (error == 0 && (xfer == 0 || aresid > 0)) {
3771 				/* Stop the copy at EOF on the input file. */
3772 				xfer -= aresid;
3773 				eof = true;
3774 				lastblock = true;
3775 			}
3776 			if (error == 0) {
3777 				/*
3778 				 * Skip the write for holes past the initial EOF
3779 				 * of the output file, unless this is the last
3780 				 * write of the output file at EOF.
3781 				 */
3782 				readzeros = cantseek ? mem_iszero(dat, xfer) :
3783 				    false;
3784 				if (xfer == len)
3785 					lastblock = true;
3786 				if (!cantseek || *outoffp < outsize ||
3787 				    lastblock || !readzeros)
3788 					error = vn_write_outvp(outvp, dat,
3789 					    *outoffp, xfer, blksize,
3790 					    readzeros && lastblock &&
3791 					    *outoffp >= outsize, false,
3792 					    outcred);
3793 				if (error == 0) {
3794 					*inoffp += xfer;
3795 					startoff += xfer;
3796 					*outoffp += xfer;
3797 					copylen -= xfer;
3798 					len -= xfer;
3799 					if (len < savlen) {
3800 						interrupted = sig_intr();
3801 						if (timespecisset(&endts) &&
3802 						    interrupted == 0) {
3803 							getnanouptime(&curts);
3804 							if (timespeccmp(&curts,
3805 							    &endts, >=))
3806 								interrupted =
3807 								    EINTR;
3808 						}
3809 					}
3810 				}
3811 			}
3812 			xfer = blksize;
3813 		}
3814 	}
3815 out:
3816 	*lenp = savlen - len;
3817 	free(dat, M_TEMP);
3818 	return (error);
3819 }
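
/*
 * Illustrative sketch only: a file system whose VOP_COPY_FILE_RANGE() method
 * has no native (e.g. block-cloning) fast path can simply defer to
 * vn_generic_copy_file_range().  The function name below is hypothetical and
 * the argument-structure field names should be checked against vnode_if.h.
 *
 *	static int
 *	examplefs_copy_file_range(struct vop_copy_file_range_args *ap)
 *	{
 *
 *		return (vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp,
 *		    ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags,
 *		    ap->a_incred, ap->a_outcred, ap->a_fsizetd));
 *	}
 */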
3820 
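/*
 * Allocate backing store for a byte range of a regular file, calling
 * VOP_ALLOCATE() repeatedly (with maybe_yield() between passes) until the
 * whole range has been allocated or an error occurs.
 */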
3821 static int
3822 vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
3823 {
3824 	struct mount *mp;
3825 	struct vnode *vp;
3826 	off_t olen, ooffset;
3827 	int error;
3828 #ifdef AUDIT
3829 	int audited_vnode1 = 0;
3830 #endif
3831 
3832 	vp = fp->f_vnode;
3833 	if (vp->v_type != VREG)
3834 		return (ENODEV);
3835 
3836 	/* Allocating blocks may take a long time, so iterate. */
3837 	for (;;) {
3838 		olen = len;
3839 		ooffset = offset;
3840 
3841 		bwillwrite();
3842 		mp = NULL;
3843 		error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
3844 		if (error != 0)
3845 			break;
3846 		error = vn_lock(vp, LK_EXCLUSIVE);
3847 		if (error != 0) {
3848 			vn_finished_write(mp);
3849 			break;
3850 		}
3851 #ifdef AUDIT
3852 		if (!audited_vnode1) {
3853 			AUDIT_ARG_VNODE1(vp);
3854 			audited_vnode1 = 1;
3855 		}
3856 #endif
3857 #ifdef MAC
3858 		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
3859 		if (error == 0)
3860 #endif
3861 			error = VOP_ALLOCATE(vp, &offset, &len, 0,
3862 			    td->td_ucred);
3863 		VOP_UNLOCK(vp);
3864 		vn_finished_write(mp);
3865 
3866 		if (olen + ooffset != offset + len) {
3867 			panic("offset + len changed from %jx/%jx to %jx/%jx",
3868 			    ooffset, olen, offset, len);
3869 		}
3870 		if (error != 0 || len == 0)
3871 			break;
3872 		KASSERT(olen > len, ("Iteration did not make progress?"));
3873 		maybe_yield();
3874 	}
3875 
3876 	return (error);
3877 }
3878 
3879 static int
3880 vn_deallocate_impl(struct vnode *vp, off_t *offset, off_t *length, int flags,
3881     int ioflag, struct ucred *cred, struct ucred *active_cred,
3882     struct ucred *file_cred)
3883 {
3884 	struct mount *mp;
3885 	void *rl_cookie;
3886 	off_t off, len;
3887 	int error;
3888 #ifdef AUDIT
3889 	bool audited_vnode1 = false;
3890 #endif
3891 
3892 	rl_cookie = NULL;
3893 	error = 0;
3894 	mp = NULL;
3895 	off = *offset;
3896 	len = *length;
3897 
3898 	if ((ioflag & (IO_NODELOCKED | IO_RANGELOCKED)) == 0)
3899 		rl_cookie = vn_rangelock_wlock(vp, off, off + len);
3900 	while (len > 0 && error == 0) {
3901 		/*
3902 		 * Try to deallocate the longest range in one pass.
3903 		 * If a pass takes too long to execute, it may return a
3904 		 * partial result.  The residue will be processed in the
3905 		 * next pass.
3906 		 */
3907 
3908 		if ((ioflag & IO_NODELOCKED) == 0) {
3909 			bwillwrite();
3910 			if ((error = vn_start_write(vp, &mp,
3911 			    V_WAIT | V_PCATCH)) != 0)
3912 				goto out;
3913 			vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
3914 		}
3915 #ifdef AUDIT
3916 		if (!audited_vnode1) {
3917 			AUDIT_ARG_VNODE1(vp);
3918 			audited_vnode1 = true;
3919 		}
3920 #endif
3921 
3922 #ifdef MAC
3923 		if ((ioflag & IO_NOMACCHECK) == 0)
3924 			error = mac_vnode_check_write(active_cred, file_cred,
3925 			    vp);
3926 #endif
3927 		if (error == 0)
3928 			error = VOP_DEALLOCATE(vp, &off, &len, flags, ioflag,
3929 			    cred);
3930 
3931 		if ((ioflag & IO_NODELOCKED) == 0) {
3932 			VOP_UNLOCK(vp);
3933 			if (mp != NULL) {
3934 				vn_finished_write(mp);
3935 				mp = NULL;
3936 			}
3937 		}
3938 		if (error == 0 && len != 0)
3939 			maybe_yield();
3940 	}
3941 out:
3942 	if (rl_cookie != NULL)
3943 		vn_rangelock_unlock(vp, rl_cookie);
3944 	*offset = off;
3945 	*length = len;
3946 	return (error);
3947 }
3948 
3949 /*
3950  * This function is intended for situations where the deallocation is not
3951  * triggered by a user request.
3952  */
3953 int
3954 vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags,
3955     int ioflag, struct ucred *active_cred, struct ucred *file_cred)
3956 {
3957 	struct ucred *cred;
3958 
3959 	if (*offset < 0 || *length <= 0 || *length > OFF_MAX - *offset ||
3960 	    flags != 0)
3961 		return (EINVAL);
3962 	if (vp->v_type != VREG)
3963 		return (ENODEV);
3964 
3965 	cred = file_cred != NOCRED ? file_cred : active_cred;
3966 	return (vn_deallocate_impl(vp, offset, length, flags, ioflag, cred,
3967 	    active_cred, file_cred));
3968 }
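
/*
 * Illustrative sketch only (hypothetical caller and names): an in-kernel
 * consumer discarding the backing store of a byte range.  With ioflag 0,
 * vn_deallocate() acquires the range lock and vnode lock itself; 'flags'
 * must currently be 0.
 *
 *	static int
 *	example_punch_hole(struct vnode *vp, off_t start, off_t size,
 *	    struct ucred *cred)
 *	{
 *		off_t off, len;
 *
 *		off = start;
 *		len = size;
 *		return (vn_deallocate(vp, &off, &len, 0, 0, cred, NOCRED));
 *	}
 */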
3969 
3970 static int
3971 vn_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags,
3972     struct ucred *active_cred, struct thread *td)
3973 {
3974 	int error;
3975 	struct vnode *vp;
3976 	int ioflag;
3977 
3978 	KASSERT(cmd == SPACECTL_DEALLOC, ("vn_fspacectl: Invalid cmd"));
3979 	KASSERT((flags & ~SPACECTL_F_SUPPORTED) == 0,
3980 	    ("vn_fspacectl: unsupported flags"));
3981 	KASSERT(*offset >= 0 && *length > 0 && *length <= OFF_MAX - *offset,
3982 	    ("vn_fspacectl: offset/length overflow or underflow"));
3983 	vp = fp->f_vnode;
3984 
3985 	if (vp->v_type != VREG)
3986 		return (ENODEV);
3987 
3988 	ioflag = get_write_ioflag(fp);
3989 
3990 	switch (cmd) {
3991 	case SPACECTL_DEALLOC:
3992 		error = vn_deallocate_impl(vp, offset, length, flags, ioflag,
3993 		    active_cred, active_cred, fp->f_cred);
3994 		break;
3995 	default:
3996 		panic("vn_fspacectl: unknown cmd %d", cmd);
3997 	}
3998 
3999 	return (error);
4000 }
4001 
4002 /*
4003  * Keep this assert as long as sizeof(struct dirent) is used as the maximum
4004  * entry size.
4005  */
4006 _Static_assert(_GENERIC_MAXDIRSIZ == sizeof(struct dirent),
4007     "'struct dirent' size must be a multiple of its alignment "
4008     "(see _GENERIC_DIRLEN())");
4009 
4010 /*
4011  * Returns successive directory entries through a caller-provided buffer.
4012  *
4013  * This function automatically refills the provided buffer with calls to
4014  * VOP_READDIR() (after MAC permission checks).
4015  *
4016  * 'td' is used for credentials and passed to uiomove().  'dirbuf' is the
4017  * caller's buffer to fill and 'dirbuflen' its allocated size.  'dirbuf' must
4018  * be properly aligned to access 'struct dirent' structures and 'dirbuflen'
4019  * must be at least GENERIC_MAXDIRSIZ to avoid VOP_READDIR() returning
4020  * EINVAL (the latter is not a strong guarantee yet, but this function will
4021  * always return EINVAL if the requirement is not met).  '*dpp' points to the
4022  * current directory entry in the buffer and '*len' contains the remaining
4023  * valid bytes in 'dirbuf' after 'dpp' (including the pointed entry).
4024  *
4025  * At first call (or when restarting the read), '*len' must have been set to 0,
4026  * '*off' to 0 (or any valid start offset) and '*eofflag' to 0.  There are no
4027  * more entries as soon as '*len' is 0 after a call that returned 0.  Calling
4028  * again this function after such a condition is considered an error and EINVAL
4029  * this function again after such a condition is considered an error and EINVAL
4030  * EINTEGRITY if the returned entries do not pass coherency tests, or EINVAL
4031  * (bad call).  All errors are unrecoverable, i.e., the state ('*len', '*off'
4032  * and '*eofflag') must be re-initialized before a subsequent call.  On error
4033  * or at end of directory, '*dpp' is reset to NULL.
4034  *
4035  * '*len', '*off' and '*eofflag' are internal state the caller should not
4036  * tamper with except as explained above.  '*off' is the next directory offset
4037  * to read from to refill the buffer.  '*eofflag' is set to 0 or 1 by the last
4038  * internal call to VOP_READDIR() that returned without error, indicating
4039  * whether it reached the end of the directory, and to 2 by this function after
4040  * all entries have been read.
4041  */
4042 int
4043 vn_dir_next_dirent(struct vnode *vp, struct thread *td,
4044     char *dirbuf, size_t dirbuflen,
4045     struct dirent **dpp, size_t *len, off_t *off, int *eofflag)
4046 {
4047 	struct dirent *dp = NULL;
4048 	int reclen;
4049 	int error;
4050 	struct uio uio;
4051 	struct iovec iov;
4052 
4053 	ASSERT_VOP_LOCKED(vp, "vnode not locked");
4054 	VNASSERT(vp->v_type == VDIR, vp, ("vnode is not a directory"));
4055 	MPASS2((uintptr_t)dirbuf < (uintptr_t)dirbuf + dirbuflen,
4056 	    "Address space overflow");
4057 
4058 	if (__predict_false(dirbuflen < GENERIC_MAXDIRSIZ)) {
4059 		/* Don't take any chances in this case */
4060 		error = EINVAL;
4061 		goto out;
4062 	}
4063 
4064 	if (*len != 0) {
4065 		dp = *dpp;
4066 
4067 		/*
4068 		 * The caller continued to call us after an error (we set dp to
4069 		 * NULL in a previous iteration).  Bail out right now.
4070 		 */
4071 		if (__predict_false(dp == NULL))
4072 			return (EINVAL);
4073 
4074 		MPASS(*len <= dirbuflen);
4075 		MPASS2((uintptr_t)dirbuf <= (uintptr_t)dp &&
4076 		    (uintptr_t)dp + *len <= (uintptr_t)dirbuf + dirbuflen,
4077 		    "Filled range not inside buffer");
4078 
4079 		reclen = dp->d_reclen;
4080 		if (reclen >= *len) {
4081 			/* End of buffer reached */
4082 			*len = 0;
4083 		} else {
4084 			dp = (struct dirent *)((char *)dp + reclen);
4085 			*len -= reclen;
4086 		}
4087 	}
4088 
4089 	if (*len == 0) {
4090 		dp = NULL;
4091 
4092 		/* Have to refill. */
4093 		switch (*eofflag) {
4094 		case 0:
4095 			break;
4096 
4097 		case 1:
4098 			/* Nothing more to read. */
4099 			*eofflag = 2; /* Remember the caller reached EOF. */
4100 			goto success;
4101 
4102 		default:
4103 			/* The caller didn't test for EOF. */
4104 			error = EINVAL;
4105 			goto out;
4106 		}
4107 
4108 		iov.iov_base = dirbuf;
4109 		iov.iov_len = dirbuflen;
4110 
4111 		uio.uio_iov = &iov;
4112 		uio.uio_iovcnt = 1;
4113 		uio.uio_offset = *off;
4114 		uio.uio_resid = dirbuflen;
4115 		uio.uio_segflg = UIO_SYSSPACE;
4116 		uio.uio_rw = UIO_READ;
4117 		uio.uio_td = td;
4118 
4119 #ifdef MAC
4120 		error = mac_vnode_check_readdir(td->td_ucred, vp);
4121 		if (error == 0)
4122 #endif
4123 			error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag,
4124 			    NULL, NULL);
4125 		if (error != 0)
4126 			goto out;
4127 
4128 		*len = dirbuflen - uio.uio_resid;
4129 		*off = uio.uio_offset;
4130 
4131 		if (*len == 0) {
4132 			/* Sanity check on INVARIANTS. */
4133 			MPASS(*eofflag != 0);
4134 			*eofflag = 1;
4135 			goto success;
4136 		}
4137 
4138 		/*
4139 		 * Normalize the flag returned by VOP_READDIR(), since we use 2
4140 		 * as a sentinel value.
4141 		 */
4142 		if (*eofflag != 0)
4143 			*eofflag = 1;
4144 
4145 		dp = (struct dirent *)dirbuf;
4146 	}
4147 
4148 	if (__predict_false(*len < GENERIC_MINDIRSIZ ||
4149 	    dp->d_reclen < GENERIC_MINDIRSIZ)) {
4150 		error = EINTEGRITY;
4151 		dp = NULL;
4152 		goto out;
4153 	}
4154 
4155 success:
4156 	error = 0;
4157 out:
4158 	*dpp = dp;
4159 	return (error);
4160 }
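
/*
 * Illustrative sketch of the iteration protocol described above; see
 * vn_dir_check_empty() below for a real consumer.  'dirbuf'/'dirbuflen'
 * are a caller-supplied buffer of at least GENERIC_MAXDIRSIZ bytes and the
 * other variables are declared as in that function.
 *
 *	len = 0;
 *	off = 0;
 *	eofflag = 0;
 *	for (;;) {
 *		error = vn_dir_next_dirent(vp, td, dirbuf, dirbuflen,
 *		    &dp, &len, &off, &eofflag);
 *		if (error != 0 || len == 0)
 *			break;		(error, or no more entries)
 *		(examine '*dp' here)
 *	}
 */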
4161 
4162 /*
4163  * Checks whether a directory is empty or not.
4164  *
4165  * If the directory is empty, returns 0, and if it is not, ENOTEMPTY.  Other
4166  * values are genuine errors preventing the check.
4167  */
4168 int
4169 vn_dir_check_empty(struct vnode *vp)
4170 {
4171 	struct thread *const td = curthread;
4172 	char *dirbuf;
4173 	size_t dirbuflen, len;
4174 	off_t off;
4175 	int eofflag, error;
4176 	struct dirent *dp;
4177 	struct vattr va;
4178 
4179 	ASSERT_VOP_LOCKED(vp, "vfs_emptydir");
4180 	VNPASS(vp->v_type == VDIR, vp);
4181 
4182 	error = VOP_GETATTR(vp, &va, td->td_ucred);
4183 	if (error != 0)
4184 		return (error);
4185 
4186 	dirbuflen = max(DEV_BSIZE, GENERIC_MAXDIRSIZ);
4187 	if (dirbuflen < va.va_blocksize)
4188 		dirbuflen = va.va_blocksize;
4189 	dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK);
4190 
4191 	len = 0;
4192 	off = 0;
4193 	eofflag = 0;
4194 
4195 	for (;;) {
4196 		error = vn_dir_next_dirent(vp, td, dirbuf, dirbuflen,
4197 		    &dp, &len, &off, &eofflag);
4198 		if (error != 0)
4199 			goto end;
4200 
4201 		if (len == 0) {
4202 			/* EOF */
4203 			error = 0;
4204 			goto end;
4205 		}
4206 
4207 		/*
4208 		 * Skip whiteouts.  Unionfs operates on filesystems only and
4209 		 * not on hierarchies, so these whiteouts would be shadowed on
4210 		 * the system hierarchy but not for a union using the
4211 		 * filesystem of their directories as the upper layer.
4212 		 * Additionally, unionfs currently transparently exposes
4213 		 * union-specific metadata of its upper layer, meaning that
4214 		 * whiteouts can be seen through the union view in empty
4215 		 * directories.  Taking into account these whiteouts would then
4216 		 * prevent mounting another filesystem on such effectively
4217 		 * empty directories.
4218 		 */
4219 		if (dp->d_type == DT_WHT)
4220 			continue;
4221 
4222 		/*
4223 		 * Any file in the directory which is not '.' or '..' indicates
4224 		 * the directory is not empty.
4225 		 */
4226 		switch (dp->d_namlen) {
4227 		case 2:
4228 			if (dp->d_name[1] != '.') {
4229 				/* Can't be '..' (nor '.') */
4230 				error = ENOTEMPTY;
4231 				goto end;
4232 			}
4233 			/* FALLTHROUGH */
4234 		case 1:
4235 			if (dp->d_name[0] != '.') {
4236 				/* Can't be '..' nor '.' */
4237 				error = ENOTEMPTY;
4238 				goto end;
4239 			}
4240 			break;
4241 
4242 		default:
4243 			error = ENOTEMPTY;
4244 			goto end;
4245 		}
4246 	}
4247 
4248 end:
4249 	free(dirbuf, M_TEMP);
4250 	return (error);
4251 }
4252 
4253 
4254 static u_long vn_lock_pair_pause_cnt;
4255 SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD,
4256     &vn_lock_pair_pause_cnt, 0,
4257     "Count of vn_lock_pair deadlocks");
4258 
4259 u_int vn_lock_pair_pause_max;
4260 SYSCTL_UINT(_debug, OID_AUTO, vn_lock_pair_pause_max, CTLFLAG_RW,
4261     &vn_lock_pair_pause_max, 0,
4262     "Max ticks for vn_lock_pair deadlock avoidance sleep");
4263 
4264 static void
4265 vn_lock_pair_pause(const char *wmesg)
4266 {
4267 	atomic_add_long(&vn_lock_pair_pause_cnt, 1);
4268 	pause(wmesg, prng32_bounded(vn_lock_pair_pause_max));
4269 }
4270 
4271 /*
4272  * Lock pair of (possibly same) vnodes vp1, vp2, avoiding lock order
4273  * reversal.  vp1_locked indicates whether vp1 is locked; if not, vp1
4274  * must be unlocked.  Same for vp2 and vp2_locked.  One of the vnodes
4275  * can be NULL.
4276  *
4277  * The function returns with both vnodes exclusively or shared locked,
4278  * according to corresponding lkflags, and guarantees that it does not
4279  * create lock order reversal with other threads during its execution.
4280  * Both vnodes could be temporarily unlocked (and reclaimed).
4281  *
4282  * If requesting shared locking, the locked vnode's lock must not be recursed.
4283  *
4284  * Exactly one of LK_SHARED and LK_EXCLUSIVE must be specified.
4285  * LK_NODDLKTREAT can be optionally passed.
4286  *
4287  * If vp1 == vp2, only one, most exclusive, lock is obtained on it.
4288  */
4289 void
4290 vn_lock_pair(struct vnode *vp1, bool vp1_locked, int lkflags1,
4291     struct vnode *vp2, bool vp2_locked, int lkflags2)
4292 {
4293 	int error, locked1;
4294 
4295 	MPASS((((lkflags1 & LK_SHARED) != 0) ^ ((lkflags1 & LK_EXCLUSIVE) != 0)) ||
4296 	    (vp1 == NULL && lkflags1 == 0));
4297 	MPASS((lkflags1 & ~(LK_SHARED | LK_EXCLUSIVE | LK_NODDLKTREAT)) == 0);
4298 	MPASS((((lkflags2 & LK_SHARED) != 0) ^ ((lkflags2 & LK_EXCLUSIVE) != 0)) ||
4299 	    (vp2 == NULL && lkflags2 == 0));
4300 	MPASS((lkflags2 & ~(LK_SHARED | LK_EXCLUSIVE | LK_NODDLKTREAT)) == 0);
4301 
4302 	if (vp1 == NULL && vp2 == NULL)
4303 		return;
4304 
4305 	if (vp1 == vp2) {
4306 		MPASS(vp1_locked == vp2_locked);
4307 
4308 		/* Select the most exclusive mode for lock. */
4309 		if ((lkflags1 & LK_TYPE_MASK) != (lkflags2 & LK_TYPE_MASK))
4310 			lkflags1 = (lkflags1 & ~LK_SHARED) | LK_EXCLUSIVE;
4311 
4312 		if (vp1_locked) {
4313 			ASSERT_VOP_LOCKED(vp1, "vp1");
4314 
4315 			/* No need to relock if any lock is exclusive. */
4316 			if ((vp1->v_vnlock->lock_object.lo_flags &
4317 			    LK_NOSHARE) != 0)
4318 				return;
4319 
4320 			locked1 = VOP_ISLOCKED(vp1);
4321 			if (((lkflags1 & LK_SHARED) != 0 &&
4322 			    locked1 != LK_EXCLUSIVE) ||
4323 			    ((lkflags1 & LK_EXCLUSIVE) != 0 &&
4324 			    locked1 == LK_EXCLUSIVE))
4325 				return;
4326 			VOP_UNLOCK(vp1);
4327 		}
4328 
4329 		ASSERT_VOP_UNLOCKED(vp1, "vp1");
4330 		vn_lock(vp1, lkflags1 | LK_RETRY);
4331 		return;
4332 	}
4333 
4334 	if (vp1 != NULL) {
4335 		if ((lkflags1 & LK_SHARED) != 0 &&
4336 		    (vp1->v_vnlock->lock_object.lo_flags & LK_NOSHARE) != 0)
4337 			lkflags1 = (lkflags1 & ~LK_SHARED) | LK_EXCLUSIVE;
4338 		if (vp1_locked && VOP_ISLOCKED(vp1) != LK_EXCLUSIVE) {
4339 			ASSERT_VOP_LOCKED(vp1, "vp1");
4340 			if ((lkflags1 & LK_EXCLUSIVE) != 0) {
4341 				VOP_UNLOCK(vp1);
4342 				ASSERT_VOP_UNLOCKED(vp1,
4343 				    "vp1 shared recursed");
4344 				vp1_locked = false;
4345 			}
4346 		} else if (!vp1_locked)
4347 			ASSERT_VOP_UNLOCKED(vp1, "vp1");
4348 	} else {
4349 		vp1_locked = true;
4350 	}
4351 
4352 	if (vp2 != NULL) {
4353 		if ((lkflags2 & LK_SHARED) != 0 &&
4354 		    (vp2->v_vnlock->lock_object.lo_flags & LK_NOSHARE) != 0)
4355 			lkflags2 = (lkflags2 & ~LK_SHARED) | LK_EXCLUSIVE;
4356 		if (vp2_locked && VOP_ISLOCKED(vp2) != LK_EXCLUSIVE) {
4357 			ASSERT_VOP_LOCKED(vp2, "vp2");
4358 			if ((lkflags2 & LK_EXCLUSIVE) != 0) {
4359 				VOP_UNLOCK(vp2);
4360 				ASSERT_VOP_UNLOCKED(vp2,
4361 				    "vp2 shared recursed");
4362 				vp2_locked = false;
4363 			}
4364 		} else if (!vp2_locked)
4365 			ASSERT_VOP_UNLOCKED(vp2, "vp2");
4366 	} else {
4367 		vp2_locked = true;
4368 	}
4369 
4370 	if (!vp1_locked && !vp2_locked) {
4371 		vn_lock(vp1, lkflags1 | LK_RETRY);
4372 		vp1_locked = true;
4373 	}
4374 
4375 	while (!vp1_locked || !vp2_locked) {
4376 		if (vp1_locked && vp2 != NULL) {
4377 			if (vp1 != NULL) {
4378 				error = VOP_LOCK1(vp2, lkflags2 | LK_NOWAIT,
4379 				    __FILE__, __LINE__);
4380 				if (error == 0)
4381 					break;
4382 				VOP_UNLOCK(vp1);
4383 				vp1_locked = false;
4384 				vn_lock_pair_pause("vlp1");
4385 			}
4386 			vn_lock(vp2, lkflags2 | LK_RETRY);
4387 			vp2_locked = true;
4388 		}
4389 		if (vp2_locked && vp1 != NULL) {
4390 			if (vp2 != NULL) {
4391 				error = VOP_LOCK1(vp1, lkflags1 | LK_NOWAIT,
4392 				    __FILE__, __LINE__);
4393 				if (error == 0)
4394 					break;
4395 				VOP_UNLOCK(vp2);
4396 				vp2_locked = false;
4397 				vn_lock_pair_pause("vlp2");
4398 			}
4399 			vn_lock(vp1, lkflags1 | LK_RETRY);
4400 			vp1_locked = true;
4401 		}
4402 	}
4403 	if (vp1 != NULL) {
4404 		if (lkflags1 == LK_EXCLUSIVE)
4405 			ASSERT_VOP_ELOCKED(vp1, "vp1 ret");
4406 		else
4407 			ASSERT_VOP_LOCKED(vp1, "vp1 ret");
4408 	}
4409 	if (vp2 != NULL) {
4410 		if (lkflags2 == LK_EXCLUSIVE)
4411 			ASSERT_VOP_ELOCKED(vp2, "vp2 ret");
4412 		else
4413 			ASSERT_VOP_LOCKED(vp2, "vp2 ret");
4414 	}
4415 }
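
/*
 * Illustrative sketch only (hypothetical caller): exclusively locking two
 * unlocked vnodes without risking a lock order reversal against other
 * threads.  Since either vnode may be transiently unlocked inside
 * vn_lock_pair(), callers may need to recheck that neither vnode was doomed.
 *
 *	vn_lock_pair(dvp, false, LK_EXCLUSIVE, tvp, false, LK_EXCLUSIVE);
 *	(operate on both vnodes)
 *	VOP_UNLOCK(dvp);
 *	VOP_UNLOCK(tvp);
 */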
4416 
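/*
 * Return the vnode lock type to use for a write: LK_SHARED when the mount
 * point (mp, or vp->v_mount if mp is NULL) supports shared writes,
 * LK_EXCLUSIVE otherwise.
 */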
4417 int
4418 vn_lktype_write(struct mount *mp, struct vnode *vp)
4419 {
4420 	if (MNT_SHARED_WRITES(mp) ||
4421 	    (mp == NULL && MNT_SHARED_WRITES(vp->v_mount)))
4422 		return (LK_SHARED);
4423 	return (LK_EXCLUSIVE);
4424 }
4425 
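/*
 * Order two vnode-backed files for kcmp(2) by comparing their underlying
 * vnodes; returns 3 when fp2 is not backed by a vnode.
 */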
4426 int
4427 vn_cmp(struct file *fp1, struct file *fp2, struct thread *td)
4428 {
4429 	if (fp2->f_type != DTYPE_VNODE)
4430 		return (3);
4431 	return (kcmp_cmp((uintptr_t)fp1->f_vnode, (uintptr_t)fp2->f_vnode));
4432 }
4433