xref: /illumos-gate/usr/src/uts/common/fs/vnode.c (revision 2e0fe3efe5f9d579d4e44b3532d8e342c68b40ca)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
26 /*	  All Rights Reserved  	*/
27 
28 /*
29  * University Copyright- Copyright (c) 1982, 1986, 1988
30  * The Regents of the University of California
31  * All Rights Reserved
32  *
33  * University Acknowledgment- Portions of this document are derived from
34  * software developed by the University of California, Berkeley, and its
35  * contributors.
36  */
37 
38 #include <sys/types.h>
39 #include <sys/param.h>
40 #include <sys/t_lock.h>
41 #include <sys/errno.h>
42 #include <sys/cred.h>
43 #include <sys/user.h>
44 #include <sys/uio.h>
45 #include <sys/file.h>
46 #include <sys/pathname.h>
47 #include <sys/vfs.h>
48 #include <sys/vfs_opreg.h>
49 #include <sys/vnode.h>
50 #include <sys/rwstlock.h>
51 #include <sys/fem.h>
52 #include <sys/stat.h>
53 #include <sys/mode.h>
54 #include <sys/conf.h>
55 #include <sys/sysmacros.h>
56 #include <sys/cmn_err.h>
57 #include <sys/systm.h>
58 #include <sys/kmem.h>
59 #include <sys/debug.h>
60 #include <c2/audit.h>
61 #include <sys/acl.h>
62 #include <sys/nbmlock.h>
63 #include <sys/fcntl.h>
64 #include <fs/fs_subr.h>
65 #include <sys/taskq.h>
66 #include <fs/fs_reparse.h>
67 
68 /* Determine if this vnode is a file that is read-only */
69 #define	ISROFILE(vp)	\
70 	((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
71 	    (vp)->v_type != VFIFO && vn_is_readonly(vp))
72 
73 /* Tunable via /etc/system; used only by admin/install */
74 int nfs_global_client_only;
75 
76 /*
77  * Array of vopstats_t for per-FS-type vopstats.  This array is parallel
78  * to the vfssw table and has the same number of entries.  (Arguably, it
79  * could be part of the vfssw table.)  Once it's initialized, it's accessed
80  * using the same fstype index that is used to index into the vfssw table.
81  */
82 vopstats_t **vopstats_fstype;
83 
84 /* vopstats initialization template used for fast initialization via bcopy() */
85 static vopstats_t *vs_templatep;
86 
87 /* Kmem cache handle for vsk_anchor_t allocations */
88 kmem_cache_t *vsk_anchor_cache;
89 
90 /* file events cleanup routine */
91 extern void free_fopdata(vnode_t *);
92 
93 /*
94  * Root of AVL tree for the kstats associated with vopstats.  Lock protects
95  * updates to vskstat_tree.
96  */
97 avl_tree_t	vskstat_tree;
98 kmutex_t	vskstat_tree_lock;
99 
100 /* Global variable which enables/disables the vopstats collection */
101 int vopstats_enabled = 1;
102 
103 /*
104  * forward declarations for internal vnode specific data (vsd)
105  */
106 static void *vsd_realloc(void *, size_t, size_t);
107 
108 /*
109  * forward declarations for reparse point functions
110  */
111 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
112 
113 /*
114  * VSD -- VNODE SPECIFIC DATA
115  * The v_data pointer is typically used by a file system to store a
116  * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
117  * However, there are times when additional project private data needs
118  * to be stored separately from the data (node) pointed to by v_data.
119  * This additional data could be stored by the file system itself or
120  * by a completely different kernel entity.  VSD provides a way for
121  * callers to obtain a key and store a pointer to private data associated
122  * with a vnode.
123  *
124  * Callers are responsible for protecting the vsd by holding v_vsd_lock
125  * for calls to vsd_set() and vsd_get().
126  */
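/*
 * Illustrative sketch only (not part of the original source): a
 * hypothetical VSD consumer allocates a key once with vsd_create(),
 * then stores and fetches per-vnode values while holding v_vsd_lock,
 * as required above.  All "mymod_" names are assumptions, not real
 * kernel symbols.
 */
#if 0
static uint_t mymod_vsd_key;
static void mymod_destructor(void *);

	/* Once, at module initialization */
	vsd_create(&mymod_vsd_key, mymod_destructor);

	/* Per-vnode accesses, protected by v_vsd_lock */
	mutex_enter(&vp->v_vsd_lock);
	(void) vsd_set(vp, mymod_vsd_key, mymod_data);
	mymod_data = vsd_get(vp, mymod_vsd_key);
	mutex_exit(&vp->v_vsd_lock);
#endif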
127 
128 /*
129  * vsd_lock protects:
130  *   vsd_nkeys - creation and deletion of vsd keys
131  *   vsd_list - insertion and deletion of vsd_node in the vsd_list
132  *   vsd_destructor - adding and removing destructors to the list
133  */
134 static kmutex_t		vsd_lock;
135 static uint_t		vsd_nkeys;	 /* size of destructor array */
136 /* list of vsd_node's */
137 static list_t *vsd_list = NULL;
138 /* per-key destructor funcs */
139 static void 		(**vsd_destructor)(void *);
140 
141 /*
142  * The following is the common set of actions needed to update the
143  * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
144  * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
145  * recording of the bytes transferred.  Since the code is similar
146  * but small, it is nearly a duplicate.  Consequently any changes
147  * to one may need to be reflected in the other.
148  * Rundown of the variables:
149  * vp - Pointer to the vnode
150  * counter - Partial name structure member to update in vopstats for counts
151  * bytecounter - Partial name structure member to update in vopstats for bytes
152  * bytesval - Value to update in vopstats for bytes
153  * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
154  * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
155  */
156 
157 #define	VOPSTATS_UPDATE(vp, counter) {					\
158 	vfs_t *vfsp = (vp)->v_vfsp;					\
159 	if (vfsp && vfsp->vfs_implp &&					\
160 	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
161 		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
162 		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
163 		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
164 		    size_t, uint64_t *);				\
165 		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
166 		(*stataddr)++;						\
167 		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
168 			vsp->n##counter.value.ui64++;			\
169 		}							\
170 	}								\
171 }
172 
173 #define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
174 	vfs_t *vfsp = (vp)->v_vfsp;					\
175 	if (vfsp && vfsp->vfs_implp &&					\
176 	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
177 		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
178 		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
179 		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
180 		    size_t, uint64_t *);				\
181 		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
182 		(*stataddr)++;						\
183 		vsp->bytecounter.value.ui64 += bytesval;		\
184 		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
185 			vsp->n##counter.value.ui64++;			\
186 			vsp->bytecounter.value.ui64 += bytesval;	\
187 		}							\
188 	}								\
189 }
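/*
 * Illustrative sketch of how the I/O variant is used: a VOP wrapper
 * computes the bytes transferred from the change in uio_resid around
 * the underlying call, mirroring the pattern of the fop_*() wrappers
 * elsewhere in this file.
 */
#if 0
	ssize_t resid_start = uiop->uio_resid;
	int err;

	err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
	VOPSTATS_UPDATE_IO(vp, read,
	    read_bytes, (resid_start - uiop->uio_resid));
#endif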
190 
191 /*
192  * If the filesystem does not support XIDs, map the credential.
193  * If the vfsp is NULL, perhaps we should also map?
194  */
195 #define	VOPXID_MAP_CR(vp, cr)	{					\
196 	vfs_t *vfsp = (vp)->v_vfsp;					\
197 	if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)		\
198 		cr = crgetmapped(cr);					\
199 	}
200 
201 /*
202  * Convert stat(2) formats to vnode types and vice versa.  (Knows about
203  * numerical order of S_IFMT and vnode types.)
204  */
205 enum vtype iftovt_tab[] = {
206 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
207 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
208 };
209 
210 ushort_t vttoif_tab[] = {
211 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
212 	S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
213 };
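/*
 * Illustrative sketch: these tables back the IFTOVT()/VTTOIF()
 * conversion macros in <sys/mode.h>; the mode bits below are chosen
 * only for the example.
 */
#if 0
	enum vtype vt = IFTOVT(S_IFREG | 0644);	/* yields VREG */
	mode_t fmt = VTTOIF(VDIR);		/* yields S_IFDIR */
#endif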
214 
215 /*
216  * The system vnode cache.
217  */
218 
219 kmem_cache_t *vn_cache;
220 
221 
222 /*
223  * Vnode operations vector.
224  */
225 
226 static const fs_operation_trans_def_t vn_ops_table[] = {
227 	VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
228 	    fs_nosys, fs_nosys,
229 
230 	VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
231 	    fs_nosys, fs_nosys,
232 
233 	VOPNAME_READ, offsetof(struct vnodeops, vop_read),
234 	    fs_nosys, fs_nosys,
235 
236 	VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
237 	    fs_nosys, fs_nosys,
238 
239 	VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
240 	    fs_nosys, fs_nosys,
241 
242 	VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
243 	    fs_setfl, fs_nosys,
244 
245 	VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
246 	    fs_nosys, fs_nosys,
247 
248 	VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
249 	    fs_nosys, fs_nosys,
250 
251 	VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
252 	    fs_nosys, fs_nosys,
253 
254 	VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
255 	    fs_nosys, fs_nosys,
256 
257 	VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
258 	    fs_nosys, fs_nosys,
259 
260 	VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
261 	    fs_nosys, fs_nosys,
262 
263 	VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
264 	    fs_nosys, fs_nosys,
265 
266 	VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
267 	    fs_nosys, fs_nosys,
268 
269 	VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
270 	    fs_nosys, fs_nosys,
271 
272 	VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
273 	    fs_nosys, fs_nosys,
274 
275 	VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
276 	    fs_nosys, fs_nosys,
277 
278 	VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
279 	    fs_nosys, fs_nosys,
280 
281 	VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
282 	    fs_nosys, fs_nosys,
283 
284 	VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
285 	    fs_nosys, fs_nosys,
286 
287 	VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
288 	    fs_nosys, fs_nosys,
289 
290 	VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
291 	    fs_nosys, fs_nosys,
292 
293 	VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
294 	    fs_rwlock, fs_rwlock,
295 
296 	VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
297 	    (fs_generic_func_p) fs_rwunlock,
298 	    (fs_generic_func_p) fs_rwunlock,	/* no errors allowed */
299 
300 	VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
301 	    fs_nosys, fs_nosys,
302 
303 	VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
304 	    fs_cmp, fs_cmp,		/* no errors allowed */
305 
306 	VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
307 	    fs_frlock, fs_nosys,
308 
309 	VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
310 	    fs_nosys, fs_nosys,
311 
312 	VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
313 	    fs_nosys, fs_nosys,
314 
315 	VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
316 	    fs_nosys, fs_nosys,
317 
318 	VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
319 	    fs_nosys, fs_nosys,
320 
321 	VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
322 	    (fs_generic_func_p) fs_nosys_map,
323 	    (fs_generic_func_p) fs_nosys_map,
324 
325 	VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
326 	    (fs_generic_func_p) fs_nosys_addmap,
327 	    (fs_generic_func_p) fs_nosys_addmap,
328 
329 	VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
330 	    fs_nosys, fs_nosys,
331 
332 	VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
333 	    (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,
334 
335 	VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
336 	    fs_nosys, fs_nosys,
337 
338 	VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
339 	    fs_pathconf, fs_nosys,
340 
341 	VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
342 	    fs_nosys, fs_nosys,
343 
344 	VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
345 	    fs_nosys, fs_nosys,
346 
347 	VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
348 	    (fs_generic_func_p) fs_dispose,
349 	    (fs_generic_func_p) fs_nodispose,
350 
351 	VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
352 	    fs_nosys, fs_nosys,
353 
354 	VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
355 	    fs_fab_acl, fs_nosys,
356 
357 	VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
358 	    fs_shrlock, fs_nosys,
359 
360 	VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
361 	    (fs_generic_func_p) fs_vnevent_nosupport,
362 	    (fs_generic_func_p) fs_vnevent_nosupport,
363 
364 	VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
365 	    fs_nosys, fs_nosys,
366 
367 	VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
368 	    fs_nosys, fs_nosys,
369 
370 	NULL, 0, NULL, NULL
371 };
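/*
 * Illustrative sketch: a file system registers its vnode operations
 * against this template via vn_make_ops().  The "myfs_" names are
 * assumptions for the example, not existing kernel symbols.
 */
#if 0
static const fs_operation_def_t myfs_vnodeops_template[] = {
	{ VOPNAME_OPEN,		{ .vop_open = myfs_open } },
	{ VOPNAME_CLOSE,	{ .vop_close = myfs_close } },
	{ NULL,			{ NULL } }
};
static vnodeops_t *myfs_vnodeops;

	error = vn_make_ops("myfs", myfs_vnodeops_template, &myfs_vnodeops);
#endif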
372 
373 /* Extensible attribute (xva) routines. */
374 
375 /*
376  * Zero out the structure, set the size of the requested/returned bitmaps,
377  * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
378  * to the returned attributes array.
379  */
380 void
381 xva_init(xvattr_t *xvap)
382 {
383 	bzero(xvap, sizeof (xvattr_t));
384 	xvap->xva_mapsize = XVA_MAPSIZE;
385 	xvap->xva_magic = XVA_MAGIC;
386 	xvap->xva_vattr.va_mask = AT_XVATTR;
387 	xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
388 }
389 
390 /*
391  * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
392  * structure.  Otherwise, returns NULL.
393  */
394 xoptattr_t *
395 xva_getxoptattr(xvattr_t *xvap)
396 {
397 	xoptattr_t *xoap = NULL;
398 	if (xvap->xva_vattr.va_mask & AT_XVATTR)
399 		xoap = &xvap->xva_xoptattrs;
400 	return (xoap);
401 }
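/*
 * Illustrative sketch: a caller requests an extensible attribute by
 * initializing an xvattr_t, marking the attribute of interest, and
 * reading the result from the embedded xoptattr_t after VOP_GETATTR().
 */
#if 0
	xvattr_t xva;
	boolean_t immutable = B_FALSE;

	xva_init(&xva);
	XVA_SET_REQ(&xva, XAT_IMMUTABLE);
	if (VOP_GETATTR(vp, &xva.xva_vattr, 0, CRED(), NULL) == 0 &&
	    XVA_ISSET_RTN(&xva, XAT_IMMUTABLE))
		immutable = xva.xva_xoptattrs.xoa_immutable;
#endif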
402 
403 /*
404  * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
405  * We use the f_fsid reported by VFS_STATVFS() since we use that for the
406  * kstat name.
407  */
408 static int
409 vska_compar(const void *n1, const void *n2)
410 {
411 	int ret;
412 	ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
413 	ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
414 
415 	if (p1 < p2) {
416 		ret = -1;
417 	} else if (p1 > p2) {
418 		ret = 1;
419 	} else {
420 		ret = 0;
421 	}
422 
423 	return (ret);
424 }
425 
426 /*
427  * Used to create a single template which will be bcopy()ed to a newly
428  * allocated vsanchor_combo_t structure in new_vsanchor(), below.
429  */
430 static vopstats_t *
431 create_vopstats_template()
432 {
433 	vopstats_t		*vsp;
434 
435 	vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
436 	bzero(vsp, sizeof (*vsp));	/* Start fresh */
437 
438 	/* VOP_OPEN */
439 	kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
440 	/* VOP_CLOSE */
441 	kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
442 	/* VOP_READ I/O */
443 	kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
444 	kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
445 	/* VOP_WRITE I/O */
446 	kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
447 	kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
448 	/* VOP_IOCTL */
449 	kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
450 	/* VOP_SETFL */
451 	kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
452 	/* VOP_GETATTR */
453 	kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
454 	/* VOP_SETATTR */
455 	kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
456 	/* VOP_ACCESS */
457 	kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
458 	/* VOP_LOOKUP */
459 	kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
460 	/* VOP_CREATE */
461 	kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
462 	/* VOP_REMOVE */
463 	kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
464 	/* VOP_LINK */
465 	kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
466 	/* VOP_RENAME */
467 	kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
468 	/* VOP_MKDIR */
469 	kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
470 	/* VOP_RMDIR */
471 	kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
472 	/* VOP_READDIR I/O */
473 	kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
474 	kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
475 	    KSTAT_DATA_UINT64);
476 	/* VOP_SYMLINK */
477 	kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
478 	/* VOP_READLINK */
479 	kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
480 	/* VOP_FSYNC */
481 	kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
482 	/* VOP_INACTIVE */
483 	kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
484 	/* VOP_FID */
485 	kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
486 	/* VOP_RWLOCK */
487 	kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
488 	/* VOP_RWUNLOCK */
489 	kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
490 	/* VOP_SEEK */
491 	kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
492 	/* VOP_CMP */
493 	kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
494 	/* VOP_FRLOCK */
495 	kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
496 	/* VOP_SPACE */
497 	kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
498 	/* VOP_REALVP */
499 	kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
500 	/* VOP_GETPAGE */
501 	kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
502 	/* VOP_PUTPAGE */
503 	kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
504 	/* VOP_MAP */
505 	kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
506 	/* VOP_ADDMAP */
507 	kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
508 	/* VOP_DELMAP */
509 	kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
510 	/* VOP_POLL */
511 	kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
512 	/* VOP_DUMP */
513 	kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
514 	/* VOP_PATHCONF */
515 	kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
516 	/* VOP_PAGEIO */
517 	kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
518 	/* VOP_DUMPCTL */
519 	kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
520 	/* VOP_DISPOSE */
521 	kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
522 	/* VOP_SETSECATTR */
523 	kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
524 	/* VOP_GETSECATTR */
525 	kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
526 	/* VOP_SHRLOCK */
527 	kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
528 	/* VOP_VNEVENT */
529 	kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
530 	/* VOP_REQZCBUF */
531 	kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
532 	/* VOP_RETZCBUF */
533 	kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
534 
535 	return (vsp);
536 }
537 
538 /*
539  * Creates a kstat structure associated with a vopstats structure.
540  */
541 kstat_t *
542 new_vskstat(char *ksname, vopstats_t *vsp)
543 {
544 	kstat_t		*ksp;
545 
546 	if (!vopstats_enabled) {
547 		return (NULL);
548 	}
549 
550 	ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
551 	    sizeof (vopstats_t)/sizeof (kstat_named_t),
552 	    KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
553 	if (ksp) {
554 		ksp->ks_data = vsp;
555 		kstat_install(ksp);
556 	}
557 
558 	return (ksp);
559 }
560 
561 /*
562  * Called from vfsinit() to initialize the support mechanisms for vopstats
563  */
564 void
565 vopstats_startup()
566 {
567 	if (!vopstats_enabled)
568 		return;
569 
570 	/*
571 	 * Create the AVL tree that holds per-vfs vopstat anchors.  This
572 	 * is necessary since we need to check whether a kstat exists before
573 	 * we attempt to create it.  Also, initialize its lock.
574 	 */
575 	avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
576 	    offsetof(vsk_anchor_t, vsk_node));
577 	mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
578 
579 	vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
580 	    sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
581 	    NULL, NULL, 0);
582 
583 	/*
584 	 * Set up the array of pointers for the vopstats-by-FS-type.
585 	 * The entries will be allocated/initialized as each file system
586 	 * goes through modload/mod_installfs.
587 	 */
588 	vopstats_fstype = (vopstats_t **)kmem_zalloc(
589 	    (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
590 
591 	/* Set up the global vopstats initialization template */
592 	vs_templatep = create_vopstats_template();
593 }
594 
595 /*
596  * We need to have all of the counters zeroed.
597  * The initialization of the vopstats_t includes on the order of
598  * 50 calls to kstat_named_init().  Rather than do that on every call,
599  * we do it once in a template (vs_templatep), then bcopy it over.
600  */
601 void
602 initialize_vopstats(vopstats_t *vsp)
603 {
604 	if (vsp == NULL)
605 		return;
606 
607 	bcopy(vs_templatep, vsp, sizeof (vopstats_t));
608 }
609 
610 /*
611  * If possible, determine which vopstats by fstype to use and
612  * return a pointer to the caller.
613  */
614 vopstats_t *
615 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
616 {
617 	int		fstype = 0;	/* Index into vfssw[] */
618 	vopstats_t	*vsp = NULL;
619 
620 	if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
621 	    !vopstats_enabled)
622 		return (NULL);
623 	/*
624 	 * Set up the fstype.  We go to so much trouble because all versions
625 	 * of NFS use the same fstype in their vfs even though they have
626 	 * distinct entries in the vfssw[] table.
627 	 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
628 	 */
629 	if (vswp) {
630 		fstype = vswp - vfssw;	/* Gets us the index */
631 	} else {
632 		fstype = vfsp->vfs_fstype;
633 	}
634 
635 	/*
636 	 * Point to the per-fstype vopstats. The only valid values are
637 	 * positive values less than the number of vfssw[] table
638 	 * entries.
639 	 */
640 	if (fstype > 0 && fstype < nfstype) {
641 		vsp = vopstats_fstype[fstype];
642 	}
643 
644 	return (vsp);
645 }
646 
647 /*
648  * Generate a kstat name, create the kstat structure, and allocate a
649  * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
650  * to the caller.  This must only be called from a mount.
651  */
652 vsk_anchor_t *
653 get_vskstat_anchor(vfs_t *vfsp)
654 {
655 	char		kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
656 	statvfs64_t	statvfsbuf;		/* Needed to find f_fsid */
657 	vsk_anchor_t	*vskp = NULL;		/* vfs <--> kstat anchor */
658 	kstat_t		*ksp;			/* Ptr to new kstat */
659 	avl_index_t	where;			/* Location in the AVL tree */
660 
661 	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
662 	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
663 		return (NULL);
664 
665 	/* Need to get the fsid to build a kstat name */
666 	if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
667 		/* Create a name for our kstats based on fsid */
668 		(void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
669 		    VOPSTATS_STR, statvfsbuf.f_fsid);
670 
671 		/* Allocate and initialize the vsk_anchor_t */
672 		vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
673 		bzero(vskp, sizeof (*vskp));
674 		vskp->vsk_fsid = statvfsbuf.f_fsid;
675 
676 		mutex_enter(&vskstat_tree_lock);
677 		if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
678 			avl_insert(&vskstat_tree, vskp, where);
679 			mutex_exit(&vskstat_tree_lock);
680 
681 			/*
682 			 * Now that we've got the anchor in the AVL
683 			 * tree, we can create the kstat.
684 			 */
685 			ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
686 			if (ksp) {
687 				vskp->vsk_ksp = ksp;
688 			}
689 		} else {
690 			/* Oops, found one! Release memory and lock. */
691 			mutex_exit(&vskstat_tree_lock);
692 			kmem_cache_free(vsk_anchor_cache, vskp);
693 			vskp = NULL;
694 		}
695 	}
696 	return (vskp);
697 }
698 
699 /*
700  * We're in the process of tearing down the vfs and need to cleanup
701  * the data structures associated with the vopstats. Must only be called
702  * from dounmount().
703  */
704 void
705 teardown_vopstats(vfs_t *vfsp)
706 {
707 	vsk_anchor_t	*vskap;
708 	avl_index_t	where;
709 
710 	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
711 	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
712 		return;
713 
714 	/* This is a safe check since VFS_STATS must be set (see above) */
715 	if ((vskap = vfsp->vfs_vskap) == NULL)
716 		return;
717 
718 	/* Whack the pointer right away */
719 	vfsp->vfs_vskap = NULL;
720 
721 	/* Lock the tree, remove the node, and delete the kstat */
722 	mutex_enter(&vskstat_tree_lock);
723 	if (avl_find(&vskstat_tree, vskap, &where)) {
724 		avl_remove(&vskstat_tree, vskap);
725 	}
726 
727 	if (vskap->vsk_ksp) {
728 		kstat_delete(vskap->vsk_ksp);
729 	}
730 	mutex_exit(&vskstat_tree_lock);
731 
732 	kmem_cache_free(vsk_anchor_cache, vskap);
733 }
734 
735 /*
736  * Read or write a vnode.  Called from kernel code.
737  */
738 int
739 vn_rdwr(
740 	enum uio_rw rw,
741 	struct vnode *vp,
742 	caddr_t base,
743 	ssize_t len,
744 	offset_t offset,
745 	enum uio_seg seg,
746 	int ioflag,
747 	rlim64_t ulimit,	/* meaningful only if rw is UIO_WRITE */
748 	cred_t *cr,
749 	ssize_t *residp)
750 {
751 	struct uio uio;
752 	struct iovec iov;
753 	int error;
754 	int in_crit = 0;
755 
756 	if (rw == UIO_WRITE && ISROFILE(vp))
757 		return (EROFS);
758 
759 	if (len < 0)
760 		return (EIO);
761 
762 	VOPXID_MAP_CR(vp, cr);
763 
764 	iov.iov_base = base;
765 	iov.iov_len = len;
766 	uio.uio_iov = &iov;
767 	uio.uio_iovcnt = 1;
768 	uio.uio_loffset = offset;
769 	uio.uio_segflg = (short)seg;
770 	uio.uio_resid = len;
771 	uio.uio_llimit = ulimit;
772 
773 	/*
774 	 * We have to enter the critical region before calling VOP_RWLOCK
775 	 * to avoid a deadlock with ufs.
776 	 */
777 	if (nbl_need_check(vp)) {
778 		int svmand;
779 
780 		nbl_start_crit(vp, RW_READER);
781 		in_crit = 1;
782 		error = nbl_svmand(vp, cr, &svmand);
783 		if (error != 0)
784 			goto done;
785 		if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
786 		    uio.uio_offset, uio.uio_resid, svmand, NULL)) {
787 			error = EACCES;
788 			goto done;
789 		}
790 	}
791 
792 	(void) VOP_RWLOCK(vp,
793 	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
794 	if (rw == UIO_WRITE) {
795 		uio.uio_fmode = FWRITE;
796 		uio.uio_extflg = UIO_COPY_DEFAULT;
797 		error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
798 	} else {
799 		uio.uio_fmode = FREAD;
800 		uio.uio_extflg = UIO_COPY_CACHED;
801 		error = VOP_READ(vp, &uio, ioflag, cr, NULL);
802 	}
803 	VOP_RWUNLOCK(vp,
804 	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
805 	if (residp)
806 		*residp = uio.uio_resid;
807 	else if (uio.uio_resid)
808 		error = EIO;
809 
810 done:
811 	if (in_crit)
812 		nbl_end_crit(vp);
813 	return (error);
814 }
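/*
 * Illustrative sketch: a typical in-kernel caller reads from a held
 * vnode into a kernel buffer ("buf" and its size are assumptions).
 */
#if 0
	char buf[256];
	ssize_t resid;
	int error;

	error = vn_rdwr(UIO_READ, vp, (caddr_t)buf, sizeof (buf),
	    (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid);
#endif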
815 
816 /*
817  * Release a vnode.  Call VOP_INACTIVE on last reference or
818  * decrement reference count.
819  *
820  * To avoid race conditions, the v_count is left at 1 for
821  * the call to VOP_INACTIVE. This prevents another thread
822  * from reclaiming and releasing the vnode *before* the
823  * VOP_INACTIVE routine has a chance to destroy the vnode.
824  * We can't have more than 1 thread calling VOP_INACTIVE
825  * on a vnode.
826  */
827 void
828 vn_rele(vnode_t *vp)
829 {
830 	VERIFY(vp->v_count > 0);
831 	mutex_enter(&vp->v_lock);
832 	if (vp->v_count == 1) {
833 		mutex_exit(&vp->v_lock);
834 		VOP_INACTIVE(vp, CRED(), NULL);
835 		return;
836 	}
837 	vp->v_count--;
838 	mutex_exit(&vp->v_lock);
839 }
840 
841 /*
842  * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
843  * as a single reference, so v_count is not decremented until the last DNLC hold
844  * is released. This makes it possible to distinguish vnodes that are referenced
845  * only by the DNLC.
846  */
847 void
848 vn_rele_dnlc(vnode_t *vp)
849 {
850 	VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
851 	mutex_enter(&vp->v_lock);
852 	if (--vp->v_count_dnlc == 0) {
853 		if (vp->v_count == 1) {
854 			mutex_exit(&vp->v_lock);
855 			VOP_INACTIVE(vp, CRED(), NULL);
856 			return;
857 		}
858 		vp->v_count--;
859 	}
860 	mutex_exit(&vp->v_lock);
861 }
862 
863 /*
864  * Like vn_rele() except that it clears v_stream under v_lock.
865  * This is used by sockfs when it dismantles the association between
866  * the sockfs node and the vnode in the underlying file system.
867  * v_lock has to be held to prevent a thread coming through the lookupname
868  * path from accessing a stream head that is going away.
869  */
870 void
871 vn_rele_stream(vnode_t *vp)
872 {
873 	VERIFY(vp->v_count > 0);
874 	mutex_enter(&vp->v_lock);
875 	vp->v_stream = NULL;
876 	if (vp->v_count == 1) {
877 		mutex_exit(&vp->v_lock);
878 		VOP_INACTIVE(vp, CRED(), NULL);
879 		return;
880 	}
881 	vp->v_count--;
882 	mutex_exit(&vp->v_lock);
883 }
884 
885 static void
886 vn_rele_inactive(vnode_t *vp)
887 {
888 	VOP_INACTIVE(vp, CRED(), NULL);
889 }
890 
891 /*
892  * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
893  * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
894  * the file system as a result of releasing the vnode. Note, file systems
895  * already have to handle the race where the vnode is incremented before the
896  * inactive routine is called and does its locking.
897  *
898  * Warning: Excessive use of this routine can lead to performance problems.
899  * This is because taskqs throttle back allocation if too many are created.
900  */
901 void
902 vn_rele_async(vnode_t *vp, taskq_t *taskq)
903 {
904 	VERIFY(vp->v_count > 0);
905 	mutex_enter(&vp->v_lock);
906 	if (vp->v_count == 1) {
907 		mutex_exit(&vp->v_lock);
908 		VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
909 		    vp, TQ_SLEEP) != NULL);
910 		return;
911 	}
912 	vp->v_count--;
913 	mutex_exit(&vp->v_lock);
914 }
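/*
 * Illustrative sketch: a caller that could deadlock by re-entering the
 * file system hands the release off to a taskq, e.g. the system taskq.
 */
#if 0
	vn_rele_async(vp, system_taskq);
#endif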
915 
916 int
917 vn_open(
918 	char *pnamep,
919 	enum uio_seg seg,
920 	int filemode,
921 	int createmode,
922 	struct vnode **vpp,
923 	enum create crwhy,
924 	mode_t umask)
925 {
926 	return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
927 	    umask, NULL, -1));
928 }
929 
930 
931 /*
932  * Open/create a vnode.
933  * This may be called by the kernel; the only known use
934  * of user context is that the current user credentials
935  * are used for permissions.  crwhy is defined iff filemode & FCREAT.
936  */
937 int
938 vn_openat(
939 	char *pnamep,
940 	enum uio_seg seg,
941 	int filemode,
942 	int createmode,
943 	struct vnode **vpp,
944 	enum create crwhy,
945 	mode_t umask,
946 	struct vnode *startvp,
947 	int fd)
948 {
949 	struct vnode *vp;
950 	int mode;
951 	int accessflags;
952 	int error;
953 	int in_crit = 0;
954 	int open_done = 0;
955 	int shrlock_done = 0;
956 	struct vattr vattr;
957 	enum symfollow follow;
958 	int estale_retry = 0;
959 	struct shrlock shr;
960 	struct shr_locowner shr_own;
961 
962 	mode = 0;
963 	accessflags = 0;
964 	if (filemode & FREAD)
965 		mode |= VREAD;
966 	if (filemode & (FWRITE|FTRUNC))
967 		mode |= VWRITE;
968 	if (filemode & FXATTRDIROPEN)
969 		mode |= VEXEC;
970 
971 	/* symlink interpretation */
972 	if (filemode & FNOFOLLOW)
973 		follow = NO_FOLLOW;
974 	else
975 		follow = FOLLOW;
976 
977 	if (filemode & FAPPEND)
978 		accessflags |= V_APPEND;
979 
980 top:
981 	if (filemode & FCREAT) {
982 		enum vcexcl excl;
983 
984 		/*
985 		 * Wish to create a file.
986 		 */
987 		vattr.va_type = VREG;
988 		vattr.va_mode = createmode;
989 		vattr.va_mask = AT_TYPE|AT_MODE;
990 		if (filemode & FTRUNC) {
991 			vattr.va_size = 0;
992 			vattr.va_mask |= AT_SIZE;
993 		}
994 		if (filemode & FEXCL)
995 			excl = EXCL;
996 		else
997 			excl = NONEXCL;
998 
999 		if (error =
1000 		    vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
1001 		    (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
1002 			return (error);
1003 	} else {
1004 		/*
1005 		 * Wish to open a file.  Just look it up.
1006 		 */
1007 		if (error = lookupnameat(pnamep, seg, follow,
1008 		    NULLVPP, &vp, startvp)) {
1009 			if ((error == ESTALE) &&
1010 			    fs_need_estale_retry(estale_retry++))
1011 				goto top;
1012 			return (error);
1013 		}
1014 
1015 		/*
1016 		 * Get the attributes to check whether file is large.
1017 		 * We do this only if the FOFFMAX flag is not set and
1018 		 * only for regular files.
1019 		 */
1020 
1021 		if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
1022 			vattr.va_mask = AT_SIZE;
1023 			if ((error = VOP_GETATTR(vp, &vattr, 0,
1024 			    CRED(), NULL))) {
1025 				goto out;
1026 			}
1027 			if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
1028 				/*
1029 				 * Large File API - regular open fails
1030 				 * if FOFFMAX flag is set in file mode
1031 				 */
1032 				error = EOVERFLOW;
1033 				goto out;
1034 			}
1035 		}
1036 		/*
1037 		 * Can't write directories, active texts, or
1038 		 * read-only filesystems.  Can't truncate files
1039 		 * on which mandatory locking is in effect.
1040 		 */
1041 		if (filemode & (FWRITE|FTRUNC)) {
1042 			/*
1043 			 * Allow writable directory if VDIROPEN flag is set.
1044 			 */
1045 			if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
1046 				error = EISDIR;
1047 				goto out;
1048 			}
1049 			if (ISROFILE(vp)) {
1050 				error = EROFS;
1051 				goto out;
1052 			}
1053 			/*
1054 			 * Can't truncate files on which
1055 			 * sysv mandatory locking is in effect.
1056 			 */
1057 			if (filemode & FTRUNC) {
1058 				vnode_t *rvp;
1059 
1060 				if (VOP_REALVP(vp, &rvp, NULL) != 0)
1061 					rvp = vp;
1062 				if (rvp->v_filocks != NULL) {
1063 					vattr.va_mask = AT_MODE;
1064 					if ((error = VOP_GETATTR(vp,
1065 					    &vattr, 0, CRED(), NULL)) == 0 &&
1066 					    MANDLOCK(vp, vattr.va_mode))
1067 						error = EAGAIN;
1068 				}
1069 			}
1070 			if (error)
1071 				goto out;
1072 		}
1073 		/*
1074 		 * Check permissions.
1075 		 */
1076 		if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
1077 			goto out;
1078 	}
1079 
1080 	/*
1081 	 * Do remaining checks for FNOFOLLOW and FNOLINKS.
1082 	 */
1083 	if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
1084 		error = ELOOP;
1085 		goto out;
1086 	}
1087 	if (filemode & FNOLINKS) {
1088 		vattr.va_mask = AT_NLINK;
1089 		if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
1090 			goto out;
1091 		}
1092 		if (vattr.va_nlink != 1) {
1093 			error = EMLINK;
1094 			goto out;
1095 		}
1096 	}
1097 
1098 	/*
1099 	 * Opening a socket corresponding to the AF_UNIX pathname
1100 	 * in the filesystem name space is not supported.
1101 	 * However, VSOCK nodes in namefs are supported in order
1102 	 * to make fattach work for sockets.
1103 	 *
1104 	 * XXX This uses VOP_REALVP to distinguish between
1105 	 * an unopened namefs node (where VOP_REALVP returns a
1106 	 * different VSOCK vnode) and a VSOCK created by vn_create
1107 	 * in some file system (where VOP_REALVP would never return
1108 	 * a different vnode).
1109 	 */
1110 	if (vp->v_type == VSOCK) {
1111 		struct vnode *nvp;
1112 
1113 		error = VOP_REALVP(vp, &nvp, NULL);
1114 		if (error != 0 || nvp == NULL || nvp == vp ||
1115 		    nvp->v_type != VSOCK) {
1116 			error = EOPNOTSUPP;
1117 			goto out;
1118 		}
1119 	}
1120 
1121 	if ((vp->v_type == VREG) && nbl_need_check(vp)) {
1122 		/* get share reservation */
1123 		shr.s_access = 0;
1124 		if (filemode & FWRITE)
1125 			shr.s_access |= F_WRACC;
1126 		if (filemode & FREAD)
1127 			shr.s_access |= F_RDACC;
1128 		shr.s_deny = 0;
1129 		shr.s_sysid = 0;
1130 		shr.s_pid = ttoproc(curthread)->p_pid;
1131 		shr_own.sl_pid = shr.s_pid;
1132 		shr_own.sl_id = fd;
1133 		shr.s_own_len = sizeof (shr_own);
1134 		shr.s_owner = (caddr_t)&shr_own;
1135 		error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
1136 		    NULL);
1137 		if (error)
1138 			goto out;
1139 		shrlock_done = 1;
1140 
1141 		/* nbmand conflict check if truncating file */
1142 		if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1143 			nbl_start_crit(vp, RW_READER);
1144 			in_crit = 1;
1145 
1146 			vattr.va_mask = AT_SIZE;
1147 			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
1148 				goto out;
1149 			if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
1150 			    NULL)) {
1151 				error = EACCES;
1152 				goto out;
1153 			}
1154 		}
1155 	}
1156 
1157 	/*
1158 	 * Do opening protocol.
1159 	 */
1160 	error = VOP_OPEN(&vp, filemode, CRED(), NULL);
1161 	if (error)
1162 		goto out;
1163 	open_done = 1;
1164 
1165 	/*
1166 	 * Truncate if required.
1167 	 */
1168 	if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1169 		vattr.va_size = 0;
1170 		vattr.va_mask = AT_SIZE;
1171 		if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
1172 			goto out;
1173 	}
1174 out:
1175 	ASSERT(vp->v_count > 0);
1176 
1177 	if (in_crit) {
1178 		nbl_end_crit(vp);
1179 		in_crit = 0;
1180 	}
1181 	if (error) {
1182 		if (open_done) {
1183 			(void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
1184 			    NULL);
1185 			open_done = 0;
1186 			shrlock_done = 0;
1187 		}
1188 		if (shrlock_done) {
1189 			(void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
1190 			    NULL);
1191 			shrlock_done = 0;
1192 		}
1193 
1194 		/*
1195 		 * The following clause was added to handle a problem
1196 		 * with NFS consistency.  It is possible that a lookup
1197 		 * of the file to be opened succeeded, but the file
1198 		 * itself doesn't actually exist on the server.  This
1199 		 * is chiefly due to the DNLC containing an entry for
1200 		 * the file which has been removed on the server.  In
1201 		 * this case, we just start over.  If there was some
1202 		 * other cause for the ESTALE error, then the lookup
1203 		 * of the file will fail and the error will be returned
1204 		 * above instead of looping around from here.
1205 		 */
1206 		VN_RELE(vp);
1207 		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1208 			goto top;
1209 	} else
1210 		*vpp = vp;
1211 	return (error);
1212 }
1213 
1214 /*
1215  * The following two accessor functions are for the NFSv4 server.  Since there
1216  * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1217  * vnode open counts correct when a client "upgrades" an open or does an
1218  * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
1219  * open mode (add or subtract read or write), but also change the share/deny
1220  * modes.  However, share reservations are not integrated with OPEN, yet, so
1221  * we need to handle each separately.  These functions are cleaner than having
1222  * the NFS server manipulate the counts directly, however, nobody else should
1223  * use these functions.
1224  */
1225 void
1226 vn_open_upgrade(
1227 	vnode_t *vp,
1228 	int filemode)
1229 {
1230 	ASSERT(vp->v_type == VREG);
1231 
1232 	if (filemode & FREAD)
1233 		atomic_add_32(&(vp->v_rdcnt), 1);
1234 	if (filemode & FWRITE)
1235 		atomic_add_32(&(vp->v_wrcnt), 1);
1236 
1237 }
1238 
1239 void
1240 vn_open_downgrade(
1241 	vnode_t *vp,
1242 	int filemode)
1243 {
1244 	ASSERT(vp->v_type == VREG);
1245 
1246 	if (filemode & FREAD) {
1247 		ASSERT(vp->v_rdcnt > 0);
1248 		atomic_add_32(&(vp->v_rdcnt), -1);
1249 	}
1250 	if (filemode & FWRITE) {
1251 		ASSERT(vp->v_wrcnt > 0);
1252 		atomic_add_32(&(vp->v_wrcnt), -1);
1253 	}
1254 
1255 }
1256 
1257 int
1258 vn_create(
1259 	char *pnamep,
1260 	enum uio_seg seg,
1261 	struct vattr *vap,
1262 	enum vcexcl excl,
1263 	int mode,
1264 	struct vnode **vpp,
1265 	enum create why,
1266 	int flag,
1267 	mode_t umask)
1268 {
1269 	return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1270 	    umask, NULL));
1271 }
1272 
1273 /*
1274  * Create a vnode (makenode).
1275  */
1276 int
1277 vn_createat(
1278 	char *pnamep,
1279 	enum uio_seg seg,
1280 	struct vattr *vap,
1281 	enum vcexcl excl,
1282 	int mode,
1283 	struct vnode **vpp,
1284 	enum create why,
1285 	int flag,
1286 	mode_t umask,
1287 	struct vnode *startvp)
1288 {
1289 	struct vnode *dvp;	/* ptr to parent dir vnode */
1290 	struct vnode *vp = NULL;
1291 	struct pathname pn;
1292 	int error;
1293 	int in_crit = 0;
1294 	struct vattr vattr;
1295 	enum symfollow follow;
1296 	int estale_retry = 0;
1297 	uint32_t auditing = AU_AUDITING();
1298 
1299 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1300 
1301 	/* symlink interpretation */
1302 	if ((flag & FNOFOLLOW) || excl == EXCL)
1303 		follow = NO_FOLLOW;
1304 	else
1305 		follow = FOLLOW;
1306 	flag &= ~(FNOFOLLOW|FNOLINKS);
1307 
1308 top:
1309 	/*
1310 	 * Lookup directory.
1311 	 * If new object is a file, call lower level to create it.
1312 	 * Note that it is up to the lower level to enforce exclusive
1313 	 * creation, if the file is already there.
1314 	 * This allows the lower level to do whatever
1315 	 * locking or protocol that is needed to prevent races.
1316 	 * If the new object is directory call lower level to make
1317 	 * the new directory, with "." and "..".
1318 	 */
1319 	if (error = pn_get(pnamep, seg, &pn))
1320 		return (error);
1321 	if (auditing)
1322 		audit_vncreate_start();
1323 	dvp = NULL;
1324 	*vpp = NULL;
1325 	/*
1326 	 * lookup will find the parent directory for the vnode.
1327 	 * When it is done the pn holds the name of the entry
1328 	 * in the directory.
1329 	 * If this is a non-exclusive create we also find the node itself.
1330 	 */
1331 	error = lookuppnat(&pn, NULL, follow, &dvp,
1332 	    (excl == EXCL) ? NULLVPP : vpp, startvp);
1333 	if (error) {
1334 		pn_free(&pn);
1335 		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1336 			goto top;
1337 		if (why == CRMKDIR && error == EINVAL)
1338 			error = EEXIST;		/* SVID */
1339 		return (error);
1340 	}
1341 
1342 	if (why != CRMKNOD)
1343 		vap->va_mode &= ~VSVTX;
1344 
1345 	/*
1346 	 * If default ACLs are defined for the directory don't apply the
1347 	 * umask if umask is passed.
1348 	 */
1349 
1350 	if (umask) {
1351 
1352 		vsecattr_t vsec;
1353 
1354 		vsec.vsa_aclcnt = 0;
1355 		vsec.vsa_aclentp = NULL;
1356 		vsec.vsa_dfaclcnt = 0;
1357 		vsec.vsa_dfaclentp = NULL;
1358 		vsec.vsa_mask = VSA_DFACLCNT;
1359 		error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
1360 		/*
1361 		 * If error is ENOSYS then treat it as no error
1362 		 * Don't want to force all file systems to support
1363 		 * aclent_t style of ACL's.
1364 		 */
1365 		if (error == ENOSYS)
1366 			error = 0;
1367 		if (error) {
1368 			if (*vpp != NULL)
1369 				VN_RELE(*vpp);
1370 			goto out;
1371 		} else {
1372 			/*
1373 			 * Apply the umask if no default ACLs.
1374 			 */
1375 			if (vsec.vsa_dfaclcnt == 0)
1376 				vap->va_mode &= ~umask;
1377 
1378 			/*
1379 			 * VOP_GETSECATTR() may have allocated memory for
1380 			 * ACLs we didn't request, so double-check and
1381 			 * free it if necessary.
1382 			 */
1383 			if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1384 				kmem_free((caddr_t)vsec.vsa_aclentp,
1385 				    vsec.vsa_aclcnt * sizeof (aclent_t));
1386 			if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1387 				kmem_free((caddr_t)vsec.vsa_dfaclentp,
1388 				    vsec.vsa_dfaclcnt * sizeof (aclent_t));
1389 		}
1390 	}
1391 
1392 	/*
1393 	 * In general we want to generate EROFS if the file system is
1394 	 * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
1395 	 * documents the open system call, and it says that O_CREAT has no
1396 	 * effect if the file already exists.  Bug 1119649 states
1397 	 * that open(path, O_CREAT, ...) fails when attempting to open an
1398 	 * existing file on a read only file system.  Thus, the first part
1399 	 * of the following if statement has 3 checks:
1400 	 *	if the file exists &&
1401 	 *		it is being open with write access &&
1402 	 *		the file system is read only
1403 	 *	then generate EROFS
1404 	 */
1405 	if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1406 	    (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1407 		if (*vpp)
1408 			VN_RELE(*vpp);
1409 		error = EROFS;
1410 	} else if (excl == NONEXCL && *vpp != NULL) {
1411 		vnode_t *rvp;
1412 
1413 		/*
1414 		 * File already exists.  If a mandatory lock has been
1415 		 * applied, return error.
1416 		 */
1417 		vp = *vpp;
1418 		if (VOP_REALVP(vp, &rvp, NULL) != 0)
1419 			rvp = vp;
1420 		if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1421 			nbl_start_crit(vp, RW_READER);
1422 			in_crit = 1;
1423 		}
1424 		if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1425 			vattr.va_mask = AT_MODE|AT_SIZE;
1426 			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
1427 				goto out;
1428 			}
1429 			if (MANDLOCK(vp, vattr.va_mode)) {
1430 				error = EAGAIN;
1431 				goto out;
1432 			}
1433 			/*
1434 			 * File cannot be truncated if non-blocking mandatory
1435 			 * locks are currently on the file.
1436 			 */
1437 			if ((vap->va_mask & AT_SIZE) && in_crit) {
1438 				u_offset_t offset;
1439 				ssize_t length;
1440 
1441 				offset = vap->va_size > vattr.va_size ?
1442 				    vattr.va_size : vap->va_size;
1443 				length = vap->va_size > vattr.va_size ?
1444 				    vap->va_size - vattr.va_size :
1445 				    vattr.va_size - vap->va_size;
1446 				if (nbl_conflict(vp, NBL_WRITE, offset,
1447 				    length, 0, NULL)) {
1448 					error = EACCES;
1449 					goto out;
1450 				}
1451 			}
1452 		}
1453 
1454 		/*
1455 		 * If the file is the root of a VFS, we've crossed a
1456 		 * mount point and the "containing" directory that we
1457 		 * acquired above (dvp) is irrelevant because it's in
1458 		 * a different file system.  We apply VOP_CREATE to the
1459 		 * target itself instead of to the containing directory
1460 		 * and supply a null path name to indicate (conventionally)
1461 		 * the node itself as the "component" of interest.
1462 		 *
1463 		 * The intercession of the file system is necessary to
1464 		 * ensure that the appropriate permission checks are
1465 		 * done.
1466 		 */
1467 		if (vp->v_flag & VROOT) {
1468 			ASSERT(why != CRMKDIR);
1469 			error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
1470 			    CRED(), flag, NULL, NULL);
1471 			/*
1472 			 * If the create succeeded, it will have created
1473 			 * a new reference to the vnode.  Give up the
1474 			 * original reference.  The assertion should not
1475 			 * get triggered because NBMAND locks only apply to
1476 			 * VREG files.  And if in_crit is non-zero for some
1477 			 * reason, detect that here, rather than when we
1478 			 * dereference a null vp.
1479 			 */
1480 			ASSERT(in_crit == 0);
1481 			VN_RELE(vp);
1482 			vp = NULL;
1483 			goto out;
1484 		}
1485 
1486 		/*
1487 		 * Large File API - non-large open (FOFFMAX flag not set)
1488 		 * of regular file fails if the file size exceeds MAXOFF32_T.
1489 		 */
1490 		if (why != CRMKDIR &&
1491 		    !(flag & FOFFMAX) &&
1492 		    (vp->v_type == VREG)) {
1493 			vattr.va_mask = AT_SIZE;
1494 			if ((error = VOP_GETATTR(vp, &vattr, 0,
1495 			    CRED(), NULL))) {
1496 				goto out;
1497 			}
1498 			if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
1499 				error = EOVERFLOW;
1500 				goto out;
1501 			}
1502 		}
1503 	}
1504 
1505 	if (error == 0) {
1506 		/*
1507 		 * Call mkdir() if specified, otherwise create().
1508 		 */
1509 		int must_be_dir = pn_fixslash(&pn);	/* trailing '/'? */
1510 
1511 		if (why == CRMKDIR)
1512 			/*
1513 			 * N.B., if vn_createat() ever requests
1514 			 * case-insensitive behavior then it will need
1515 			 * to be passed to VOP_MKDIR().  VOP_CREATE()
1516 			 * will already get it via "flag"
1517 			 */
1518 			error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
1519 			    NULL, 0, NULL);
1520 		else if (!must_be_dir)
1521 			error = VOP_CREATE(dvp, pn.pn_path, vap,
1522 			    excl, mode, vpp, CRED(), flag, NULL, NULL);
1523 		else
1524 			error = ENOTDIR;
1525 	}
1526 
1527 out:
1528 
1529 	if (auditing)
1530 		audit_vncreate_finish(*vpp, error);
1531 	if (in_crit) {
1532 		nbl_end_crit(vp);
1533 		in_crit = 0;
1534 	}
1535 	if (vp != NULL) {
1536 		VN_RELE(vp);
1537 		vp = NULL;
1538 	}
1539 	pn_free(&pn);
1540 	VN_RELE(dvp);
1541 	/*
1542 	 * The following clause was added to handle a problem
1543 	 * with NFS consistency.  It is possible that a lookup
1544 	 * of the file to be created succeeded, but the file
1545 	 * itself doesn't actually exist on the server.  This
1546 	 * is chiefly due to the DNLC containing an entry for
1547 	 * the file which has been removed on the server.  In
1548 	 * this case, we just start over.  If there was some
1549 	 * other cause for the ESTALE error, then the lookup
1550 	 * of the file will fail and the error will be returned
1551 	 * above instead of looping around from here.
1552 	 */
1553 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1554 		goto top;
1555 	return (error);
1556 }
1557 
1558 int
1559 vn_link(char *from, char *to, enum uio_seg seg)
1560 {
1561 	struct vnode *fvp;		/* from vnode ptr */
1562 	struct vnode *tdvp;		/* to directory vnode ptr */
1563 	struct pathname pn;
1564 	int error;
1565 	struct vattr vattr;
1566 	dev_t fsid;
1567 	int estale_retry = 0;
1568 
1569 top:
1570 	fvp = tdvp = NULL;
1571 	if (error = pn_get(to, seg, &pn))
1572 		return (error);
1573 	if (error = lookupname(from, seg, NO_FOLLOW, NULLVPP, &fvp))
1574 		goto out;
1575 	if (error = lookuppn(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP))
1576 		goto out;
1577 	/*
1578 	 * Make sure both source vnode and target directory vnode are
1579 	 * in the same vfs and that the vfs is writable.
1580 	 */
1581 	vattr.va_mask = AT_FSID;
1582 	if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
1583 		goto out;
1584 	fsid = vattr.va_fsid;
1585 	vattr.va_mask = AT_FSID;
1586 	if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
1587 		goto out;
1588 	if (fsid != vattr.va_fsid) {
1589 		error = EXDEV;
1590 		goto out;
1591 	}
1592 	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
1593 		error = EROFS;
1594 		goto out;
1595 	}
1596 	/*
1597 	 * Do the link.
1598 	 */
1599 	(void) pn_fixslash(&pn);
1600 	error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
1601 out:
1602 	pn_free(&pn);
1603 	if (fvp)
1604 		VN_RELE(fvp);
1605 	if (tdvp)
1606 		VN_RELE(tdvp);
1607 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1608 		goto top;
1609 	return (error);
1610 }
1611 
1612 int
1613 vn_rename(char *from, char *to, enum uio_seg seg)
1614 {
1615 	return (vn_renameat(NULL, from, NULL, to, seg));
1616 }
1617 
1618 int
1619 vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
1620 		char *tname, enum uio_seg seg)
1621 {
1622 	int error;
1623 	struct vattr vattr;
1624 	struct pathname fpn;		/* from pathname */
1625 	struct pathname tpn;		/* to pathname */
1626 	dev_t fsid;
1627 	int in_crit_src, in_crit_targ;
1628 	vnode_t *fromvp, *fvp;
1629 	vnode_t *tovp, *targvp;
1630 	int estale_retry = 0;
1631 	uint32_t auditing = AU_AUDITING();
1632 
1633 top:
1634 	fvp = fromvp = tovp = targvp = NULL;
1635 	in_crit_src = in_crit_targ = 0;
1636 	/*
1637 	 * Get to and from pathnames.
1638 	 */
1639 	if (error = pn_get(fname, seg, &fpn))
1640 		return (error);
1641 	if (error = pn_get(tname, seg, &tpn)) {
1642 		pn_free(&fpn);
1643 		return (error);
1644 	}
1645 
1646 	/*
1647 	 * First we need to resolve the correct directories.
1648 	 * The passed-in directories may only be a starting point,
1649 	 * but we need the real directories the file(s) live in.
1650 	 * For example the fname may be something like usr/lib/sparc
1651 	 * and we were passed in the / directory, but we need to
1652 	 * use the lib directory for the rename.
1653 	 */
1654 
1655 	if (auditing)
1656 		audit_setfsat_path(1);
1657 	/*
1658 	 * Lookup to and from directories.
1659 	 */
1660 	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
1661 		goto out;
1662 	}
1663 
1664 	/*
1665 	 * Make sure there is an entry.
1666 	 */
1667 	if (fvp == NULL) {
1668 		error = ENOENT;
1669 		goto out;
1670 	}
1671 
1672 	if (auditing)
1673 		audit_setfsat_path(3);
1674 	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
1675 		goto out;
1676 	}
1677 
1678 	/*
1679 	 * Make sure both the from vnode directory and the to directory
1680 	 * are in the same vfs and the to directory is writable.
1681 	 * We check fsid's, not vfs pointers, so loopback fs works.
1682 	 */
1683 	if (fromvp != tovp) {
1684 		vattr.va_mask = AT_FSID;
1685 		if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
1686 			goto out;
1687 		fsid = vattr.va_fsid;
1688 		vattr.va_mask = AT_FSID;
1689 		if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
1690 			goto out;
1691 		if (fsid != vattr.va_fsid) {
1692 			error = EXDEV;
1693 			goto out;
1694 		}
1695 	}
1696 
1697 	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
1698 		error = EROFS;
1699 		goto out;
1700 	}
1701 
1702 	if (targvp && (fvp != targvp)) {
1703 		nbl_start_crit(targvp, RW_READER);
1704 		in_crit_targ = 1;
1705 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1706 			error = EACCES;
1707 			goto out;
1708 		}
1709 	}
1710 
1711 	if (nbl_need_check(fvp)) {
1712 		nbl_start_crit(fvp, RW_READER);
1713 		in_crit_src = 1;
1714 		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
1715 			error = EACCES;
1716 			goto out;
1717 		}
1718 	}
1719 
1720 	/*
1721 	 * Do the rename.
1722 	 */
1723 	(void) pn_fixslash(&tpn);
1724 	error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
1725 	    NULL, 0);
1726 
1727 out:
1728 	pn_free(&fpn);
1729 	pn_free(&tpn);
1730 	if (in_crit_src)
1731 		nbl_end_crit(fvp);
1732 	if (in_crit_targ)
1733 		nbl_end_crit(targvp);
1734 	if (fromvp)
1735 		VN_RELE(fromvp);
1736 	if (tovp)
1737 		VN_RELE(tovp);
1738 	if (targvp)
1739 		VN_RELE(targvp);
1740 	if (fvp)
1741 		VN_RELE(fvp);
1742 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1743 		goto top;
1744 	return (error);
1745 }
1746 
1747 /*
1748  * Remove a file or directory.
1749  */
1750 int
1751 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1752 {
1753 	return (vn_removeat(NULL, fnamep, seg, dirflag));
1754 }
1755 
1756 int
1757 vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
1758 {
1759 	struct vnode *vp;		/* entry vnode */
1760 	struct vnode *dvp;		/* ptr to parent dir vnode */
1761 	struct vnode *coveredvp;
1762 	struct pathname pn;		/* name of entry */
1763 	enum vtype vtype;
1764 	int error;
1765 	struct vfs *vfsp;
1766 	struct vfs *dvfsp;	/* ptr to parent dir vfs */
1767 	int in_crit = 0;
1768 	int estale_retry = 0;
1769 
1770 top:
1771 	if (error = pn_get(fnamep, seg, &pn))
1772 		return (error);
1773 	dvp = vp = NULL;
1774 	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
1775 		pn_free(&pn);
1776 		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1777 			goto top;
1778 		return (error);
1779 	}
1780 
1781 	/*
1782 	 * Make sure there is an entry.
1783 	 */
1784 	if (vp == NULL) {
1785 		error = ENOENT;
1786 		goto out;
1787 	}
1788 
1789 	vfsp = vp->v_vfsp;
1790 	dvfsp = dvp->v_vfsp;
1791 
1792 	/*
1793 	 * If the named file is the root of a mounted filesystem, fail,
1794 	 * unless it's marked unlinkable.  In that case, unmount the
1795 	 * filesystem and proceed to unlink the covered vnode.  (If the
1796 	 * covered vnode is a directory, use rmdir instead of unlink,
1797 	 * to avoid file system corruption.)
1798 	 */
1799 	if (vp->v_flag & VROOT) {
1800 		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
1801 			error = EBUSY;
1802 			goto out;
1803 		}
1804 
1805 		/*
1806 		 * Namefs specific code starts here.
1807 		 */
1808 
1809 		if (dirflag == RMDIRECTORY) {
1810 			/*
1811 			 * User called rmdir(2) on a file that has
1812 			 * had namefs mounted on top of it.  Since
1813 			 * namefs doesn't allow directories to
1814 			 * be mounted on other files, we know
1815 			 * vp is not of type VDIR, so fail the operation.
1816 			 */
1817 			error = ENOTDIR;
1818 			goto out;
1819 		}
1820 
1821 		/*
1822 		 * If VROOT is still set after grabbing vp->v_lock,
1823 		 * no one has finished nm_unmount so far and coveredvp
1824 		 * is valid.
1825 		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
1826 		 * vp->v_lock, any race window is eliminated.
1827 		 */
1828 
1829 		mutex_enter(&vp->v_lock);
1830 		if ((vp->v_flag & VROOT) == 0) {
1831 			/* Someone beat us to the unmount */
1832 			mutex_exit(&vp->v_lock);
1833 			error = EBUSY;
1834 			goto out;
1835 		}
1836 		vfsp = vp->v_vfsp;
1837 		coveredvp = vfsp->vfs_vnodecovered;
1838 		ASSERT(coveredvp);
1839 		/*
1840 		 * Note: Implementation of vn_vfswlock shows that ordering of
1841 		 * v_lock / vn_vfswlock is not an issue here.
1842 		 */
1843 		error = vn_vfswlock(coveredvp);
1844 		mutex_exit(&vp->v_lock);
1845 
1846 		if (error)
1847 			goto out;
1848 
1849 		VN_HOLD(coveredvp);
1850 		VN_RELE(vp);
1851 		error = dounmount(vfsp, 0, CRED());
1852 
1853 		/*
1854 		 * Unmounted the namefs file system; now get
1855 		 * the object it was mounted over.
1856 		 */
1857 		vp = coveredvp;
1858 		/*
1859 		 * If namefs was mounted over a directory, then
1860 		 * we want to use rmdir() instead of unlink().
1861 		 */
1862 		if (vp->v_type == VDIR)
1863 			dirflag = RMDIRECTORY;
1864 
1865 		if (error)
1866 			goto out;
1867 	}
1868 
1869 	/*
1870 	 * Make sure the filesystem is writable.
1871 	 * We check the parent directory's vfs in case this is an lofs vnode.
1872 	 */
1873 	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
1874 		error = EROFS;
1875 		goto out;
1876 	}
1877 
1878 	vtype = vp->v_type;
1879 
1880 	/*
1881 	 * If there is the possibility of an nbmand share reservation, make
1882 	 * sure it's okay to remove the file.  Keep a reference to the
1883 	 * vnode, so that we can exit the nbl critical region after
1884 	 * calling VOP_REMOVE.
1885 	 * If there is no possibility of an nbmand share reservation,
1886 	 * release the vnode reference now.  Filesystems like NFS may
1887 	 * behave differently if there is an extra reference, so get rid of
1888 	 * this one.  Fortunately, we can't have nbmand mounts on NFS
1889 	 * filesystems.
1890 	 */
1891 	if (nbl_need_check(vp)) {
1892 		nbl_start_crit(vp, RW_READER);
1893 		in_crit = 1;
1894 		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
1895 			error = EACCES;
1896 			goto out;
1897 		}
1898 	} else {
1899 		VN_RELE(vp);
1900 		vp = NULL;
1901 	}
1902 
1903 	if (dirflag == RMDIRECTORY) {
1904 		/*
1905 		 * Caller is using rmdir(2), which can only be applied to
1906 		 * directories.
1907 		 */
1908 		if (vtype != VDIR) {
1909 			error = ENOTDIR;
1910 		} else {
1911 			vnode_t *cwd;
1912 			proc_t *pp = curproc;
1913 
1914 			mutex_enter(&pp->p_lock);
1915 			cwd = PTOU(pp)->u_cdir;
1916 			VN_HOLD(cwd);
1917 			mutex_exit(&pp->p_lock);
1918 			error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
1919 			    NULL, 0);
1920 			VN_RELE(cwd);
1921 		}
1922 	} else {
1923 		/*
1924 		 * Unlink(2) can be applied to anything.
1925 		 */
1926 		error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
1927 	}
1928 
1929 out:
1930 	pn_free(&pn);
1931 	if (in_crit) {
1932 		nbl_end_crit(vp);
1933 		in_crit = 0;
1934 	}
1935 	if (vp != NULL)
1936 		VN_RELE(vp);
1937 	if (dvp != NULL)
1938 		VN_RELE(dvp);
1939 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1940 		goto top;
1941 	return (error);
1942 }
1943 
1944 /*
1945  * Utility function to compare equality of vnodes.
1946  * Compare the underlying real vnodes, if there are underlying vnodes.
1947  * This is a more thorough comparison than the VN_CMP() macro provides.
1948  */
1949 int
1950 vn_compare(vnode_t *vp1, vnode_t *vp2)
1951 {
1952 	vnode_t *realvp;
1953 
1954 	if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
1955 		vp1 = realvp;
1956 	if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
1957 		vp2 = realvp;
1958 	return (VN_CMP(vp1, vp2));
1959 }
1960 
1961 /*
1962  * The number of buckets to hash locks into.  This value must be a power
1963  * of 2 minus 1 (it is used as a bit mask) and should probably also be prime.
1964  */
1965 #define	NUM_BUCKETS	1023
1966 
1967 struct  vn_vfslocks_bucket {
1968 	kmutex_t vb_lock;
1969 	vn_vfslocks_entry_t *vb_list;
1970 	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
1971 };
1972 
1973 /*
1974  * Total number of buckets will be NUM_BUCKETS + 1.
1975  */
1976 
1977 #pragma	align	64(vn_vfslocks_buckets)
1978 static	struct vn_vfslocks_bucket	vn_vfslocks_buckets[NUM_BUCKETS + 1];
1979 
1980 #define	VN_VFSLOCKS_SHIFT	9
1981 
1982 #define	VN_VFSLOCKS_HASH(vfsvpptr)	\
1983 	((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
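
/*
 * Worked example (editorial note): VN_VFSLOCKS_SHIFT discards the
 * low-order pointer bits, which carry little entropy because of
 * allocator alignment, and the result is masked with NUM_BUCKETS
 * (0x3ff).  For a hypothetical pointer value 0x30001d46540:
 *
 *	(0x30001d46540 >> 9) & 0x3ff == 0x232	(bucket 562)
 */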
1984 
1985 /*
1986  * vn_vfslocks_getlock() uses a hash scheme to find (or create) the
1987  * rwstlock associated with the vfs/vnode pointer passed to it.
1988  *
1989  * vn_vfslocks_rele() releases a reference in the
1990  * hash table, which allows the entry allocated by
1991  * vn_vfslocks_getlock() to be freed at a later
1992  * stage when the refcount drops to zero.
1993  */
1994 
1995 vn_vfslocks_entry_t *
1996 vn_vfslocks_getlock(void *vfsvpptr)
1997 {
1998 	struct vn_vfslocks_bucket *bp;
1999 	vn_vfslocks_entry_t *vep;
2000 	vn_vfslocks_entry_t *tvep;
2001 
2002 	ASSERT(vfsvpptr != NULL);
2003 	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];
2004 
2005 	mutex_enter(&bp->vb_lock);
2006 	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2007 		if (vep->ve_vpvfs == vfsvpptr) {
2008 			vep->ve_refcnt++;
2009 			mutex_exit(&bp->vb_lock);
2010 			return (vep);
2011 		}
2012 	}
2013 	mutex_exit(&bp->vb_lock);
2014 	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
2015 	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
2016 	vep->ve_vpvfs = (char *)vfsvpptr;
2017 	vep->ve_refcnt = 1;
2018 	mutex_enter(&bp->vb_lock);
2019 	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
2020 		if (tvep->ve_vpvfs == vfsvpptr) {
2021 			tvep->ve_refcnt++;
2022 			mutex_exit(&bp->vb_lock);
2023 
2024 			/*
2025 			 * There is already an entry in the hash;
2026 			 * destroy what we just allocated.
2027 			 */
2028 			rwst_destroy(&vep->ve_lock);
2029 			kmem_free(vep, sizeof (*vep));
2030 			return (tvep);
2031 		}
2032 	}
2033 	vep->ve_next = bp->vb_list;
2034 	bp->vb_list = vep;
2035 	mutex_exit(&bp->vb_lock);
2036 	return (vep);
2037 }
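
/*
 * Minimal usage sketch (editorial addition, not from the original
 * source): every vn_vfslocks_getlock() must eventually be balanced by
 * a vn_vfslocks_rele() on the returned entry, or the entry and its
 * rwstlock are never freed:
 *
 *	vn_vfslocks_entry_t *vep;
 *
 *	vep = vn_vfslocks_getlock(vp);
 *	(use rwst_enter()/rwst_exit() on &vep->ve_lock)
 *	vn_vfslocks_rele(vep);
 */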
2038 
2039 void
2040 vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
2041 {
2042 	struct vn_vfslocks_bucket *bp;
2043 	vn_vfslocks_entry_t *vep;
2044 	vn_vfslocks_entry_t *pvep;
2045 
2046 	ASSERT(vepent != NULL);
2047 	ASSERT(vepent->ve_vpvfs != NULL);
2048 
2049 	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];
2050 
2051 	mutex_enter(&bp->vb_lock);
2052 	vepent->ve_refcnt--;
2053 
2054 	if ((int32_t)vepent->ve_refcnt < 0)
2055 		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");
2056 
2057 	if (vepent->ve_refcnt == 0) {
2058 		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2059 			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
2060 				if (bp->vb_list == vep)
2061 					bp->vb_list = vep->ve_next;
2062 				else {
2063 					/* LINTED */
2064 					pvep->ve_next = vep->ve_next;
2065 				}
2066 				mutex_exit(&bp->vb_lock);
2067 				rwst_destroy(&vep->ve_lock);
2068 				kmem_free(vep, sizeof (*vep));
2069 				return;
2070 			}
2071 			pvep = vep;
2072 		}
2073 		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
2074 	}
2075 	mutex_exit(&bp->vb_lock);
2076 }
2077 
2078 /*
2079  * vn_vfswlock_wait is used to implement a lock which is logically a writers
2080  * lock protecting the v_vfsmountedhere field.
2081  * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
2082  * except that it blocks to acquire the lock VVFSLOCK.
2083  *
2084  * traverse() and routines re-implementing part of traverse (e.g. autofs)
2085  * need to hold this lock.  mount(), vn_rename(), vn_remove(), and so on
2086  * need the non-blocking version of the writers lock, i.e., vn_vfswlock.
2087  */
2088 int
2089 vn_vfswlock_wait(vnode_t *vp)
2090 {
2091 	int retval;
2092 	vn_vfslocks_entry_t *vpvfsentry;
2093 	ASSERT(vp != NULL);
2094 
2095 	vpvfsentry = vn_vfslocks_getlock(vp);
2096 	retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2097 
2098 	if (retval == EINTR) {
2099 		vn_vfslocks_rele(vpvfsentry);
2100 		return (EINTR);
2101 	}
2102 	return (retval);
2103 }
2104 
2105 int
2106 vn_vfsrlock_wait(vnode_t *vp)
2107 {
2108 	int retval;
2109 	vn_vfslocks_entry_t *vpvfsentry;
2110 	ASSERT(vp != NULL);
2111 
2112 	vpvfsentry = vn_vfslocks_getlock(vp);
2113 	retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2114 
2115 	if (retval == EINTR) {
2116 		vn_vfslocks_rele(vpvfsentry);
2117 		return (EINTR);
2118 	}
2119 
2120 	return (retval);
2121 }
2122 
2123 
2124 /*
2125  * vn_vfswlock is used to implement a lock which is logically a writers lock
2126  * protecting the v_vfsmountedhere field.
2127  */
2128 int
2129 vn_vfswlock(vnode_t *vp)
2130 {
2131 	vn_vfslocks_entry_t *vpvfsentry;
2132 
2133 	/*
2134 	 * If vp is NULL then somebody is trying to lock the covered vnode
2135 	 * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2136 	 * only happen when unmounting /.  Since that operation will fail
2137 	 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2138 	 */
2139 	if (vp == NULL)
2140 		return (EBUSY);
2141 
2142 	vpvfsentry = vn_vfslocks_getlock(vp);
2143 
2144 	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2145 		return (0);
2146 
2147 	vn_vfslocks_rele(vpvfsentry);
2148 	return (EBUSY);
2149 }
2150 
2151 int
2152 vn_vfsrlock(vnode_t *vp)
2153 {
2154 	vn_vfslocks_entry_t *vpvfsentry;
2155 
2156 	/*
2157 	 * If vp is NULL then somebody is trying to lock the covered vnode
2158 	 * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2159 	 * only happen when unmounting /.  Since that operation will fail
2160 	 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2161 	 */
2162 	if (vp == NULL)
2163 		return (EBUSY);
2164 
2165 	vpvfsentry = vn_vfslocks_getlock(vp);
2166 
2167 	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2168 		return (0);
2169 
2170 	vn_vfslocks_rele(vpvfsentry);
2171 	return (EBUSY);
2172 }
2173 
2174 void
2175 vn_vfsunlock(vnode_t *vp)
2176 {
2177 	vn_vfslocks_entry_t *vpvfsentry;
2178 
2179 	/*
2180 	 * ve_refcnt needs to be decremented twice.
2181 	 * 1. To release the reference after a call to vn_vfslocks_getlock().
2182 	 * 2. To release the reference from the locking routines like
2183 	 *    vn_vfsrlock/vn_vfswlock, etc.
2184 	 */
2185 	vpvfsentry = vn_vfslocks_getlock(vp);
2186 	vn_vfslocks_rele(vpvfsentry);
2187 
2188 	rwst_exit(&vpvfsentry->ve_lock);
2189 	vn_vfslocks_rele(vpvfsentry);
2190 }
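
/*
 * Putting the pieces together (editorial sketch): a typical caller,
 * such as the mount path, brackets its update of v_vfsmountedhere as
 * below; vn_vfsunlock() drops both the rwstlock and the hash reference
 * taken by vn_vfswlock():
 *
 *	if (vn_vfswlock(vp) != 0)
 *		return (EBUSY);
 *	(update vp->v_vfsmountedhere)
 *	vn_vfsunlock(vp);
 */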
2191 
2192 int
2193 vn_vfswlock_held(vnode_t *vp)
2194 {
2195 	int held;
2196 	vn_vfslocks_entry_t *vpvfsentry;
2197 
2198 	ASSERT(vp != NULL);
2199 
2200 	vpvfsentry = vn_vfslocks_getlock(vp);
2201 	held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2202 
2203 	vn_vfslocks_rele(vpvfsentry);
2204 	return (held);
2205 }
2206 
2207 
2208 int
2209 vn_make_ops(
2210 	const char *name,			/* Name of file system */
2211 	const fs_operation_def_t *templ,	/* Operation specification */
2212 	vnodeops_t **actual)			/* Return the vnodeops */
2213 {
2214 	int unused_ops;
2215 	int error;
2216 
2217 	*actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2218 
2219 	(*actual)->vnop_name = name;
2220 
2221 	error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2222 	if (error) {
2223 		kmem_free(*actual, sizeof (vnodeops_t));
2224 	}
2225 
2226 #if DEBUG
2227 	if (unused_ops != 0)
2228 		cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2229 		    "but not used", name, unused_ops);
2230 #endif
2231 
2232 	return (error);
2233 }
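
/*
 * Typical usage (editorial sketch; the "myfs" names are hypothetical):
 * a file system declares a NULL-terminated template pairing operation
 * names with its handlers and registers it at initialization:
 *
 *	static const fs_operation_def_t myfs_vnodeops_template[] = {
 *		VOPNAME_OPEN,	{ .vop_open = myfs_open },
 *		VOPNAME_CLOSE,	{ .vop_close = myfs_close },
 *		VOPNAME_READ,	{ .vop_read = myfs_read },
 *		NULL,		NULL
 *	};
 *
 *	error = vn_make_ops("myfs", myfs_vnodeops_template,
 *	    &myfs_vnodeops);
 *
 * Operations absent from the template fall back to the defaults in
 * vn_ops_table (typically fs_nosys()).
 */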
2234 
2235 /*
2236  * Free the vnodeops created as a result of vn_make_ops()
2237  */
2238 void
2239 vn_freevnodeops(vnodeops_t *vnops)
2240 {
2241 	kmem_free(vnops, sizeof (vnodeops_t));
2242 }
2243 
2244 /*
2245  * Vnode cache.
2246  */
2247 
2248 /* ARGSUSED */
2249 static int
2250 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2251 {
2252 	struct vnode *vp;
2253 
2254 	vp = buf;
2255 
2256 	mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2257 	mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2258 	cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2259 	rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2260 	vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
2261 	vp->v_path = NULL;
2262 	vp->v_mpssdata = NULL;
2263 	vp->v_vsd = NULL;
2264 	vp->v_fopdata = NULL;
2265 
2266 	return (0);
2267 }
2268 
2269 /* ARGSUSED */
2270 static void
2271 vn_cache_destructor(void *buf, void *cdrarg)
2272 {
2273 	struct vnode *vp;
2274 
2275 	vp = buf;
2276 
2277 	rw_destroy(&vp->v_nbllock);
2278 	cv_destroy(&vp->v_cv);
2279 	mutex_destroy(&vp->v_vsd_lock);
2280 	mutex_destroy(&vp->v_lock);
2281 }
2282 
2283 void
2284 vn_create_cache(void)
2285 {
2286 	/* LINTED */
2287 	ASSERT((1 << VNODE_ALIGN_LOG2) ==
2288 	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2289 	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2290 	    VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2291 	    NULL, 0);
2292 }
2293 
2294 void
2295 vn_destroy_cache(void)
2296 {
2297 	kmem_cache_destroy(vn_cache);
2298 }
2299 
2300 /*
2301  * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2302  * cached by the file system and vnodes remain associated.
2303  */
2304 void
2305 vn_recycle(vnode_t *vp)
2306 {
2307 	ASSERT(vp->v_pages == NULL);
2308 
2309 	/*
2310 	 * XXX - This really belongs in vn_reinit(), but we have some issues
2311 	 * with the counts.  Best to have it here for clean initialization.
2312 	 */
2313 	vp->v_rdcnt = 0;
2314 	vp->v_wrcnt = 0;
2315 	vp->v_mmap_read = 0;
2316 	vp->v_mmap_write = 0;
2317 
2318 	/*
2319 	 * If FEM was in use, make sure everything gets cleaned up.
2320 	 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2321 	 * constructor.
2322 	 */
2323 	if (vp->v_femhead) {
2324 		/* XXX - There should be a free_femhead() that does all this */
2325 		ASSERT(vp->v_femhead->femh_list == NULL);
2326 		mutex_destroy(&vp->v_femhead->femh_lock);
2327 		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2328 		vp->v_femhead = NULL;
2329 	}
2330 	if (vp->v_path) {
2331 		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2332 		vp->v_path = NULL;
2333 	}
2334 
2335 	if (vp->v_fopdata != NULL) {
2336 		free_fopdata(vp);
2337 	}
2338 	vp->v_mpssdata = NULL;
2339 	vsd_free(vp);
2340 }
2341 
2342 /*
2343  * Used to reset the vnode fields including those that are directly accessible
2344  * as well as those which require an accessor function.
2345  *
2346  * Does not initialize:
2347  *	synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2348  *	v_data (since FS-nodes and vnodes point to each other and should
2349  *		be updated simultaneously)
2350  *	v_op (in case someone needs to make a VOP call on this object)
2351  */
2352 void
2353 vn_reinit(vnode_t *vp)
2354 {
2355 	vp->v_count = 1;
2356 	vp->v_count_dnlc = 0;
2357 	vp->v_vfsp = NULL;
2358 	vp->v_stream = NULL;
2359 	vp->v_vfsmountedhere = NULL;
2360 	vp->v_flag = 0;
2361 	vp->v_type = VNON;
2362 	vp->v_rdev = NODEV;
2363 
2364 	vp->v_filocks = NULL;
2365 	vp->v_shrlocks = NULL;
2366 	vp->v_pages = NULL;
2367 
2368 	vp->v_locality = NULL;
2369 	vp->v_xattrdir = NULL;
2370 
2371 	/* Handles v_femhead, v_path, and the r/w/map counts */
2372 	vn_recycle(vp);
2373 }
2374 
2375 vnode_t *
2376 vn_alloc(int kmflag)
2377 {
2378 	vnode_t *vp;
2379 
2380 	vp = kmem_cache_alloc(vn_cache, kmflag);
2381 
2382 	if (vp != NULL) {
2383 		vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
2384 		vp->v_fopdata = NULL;
2385 		vn_reinit(vp);
2386 	}
2387 
2388 	return (vp);
2389 }
2390 
2391 void
2392 vn_free(vnode_t *vp)
2393 {
2394 	ASSERT(vp->v_shrlocks == NULL);
2395 	ASSERT(vp->v_filocks == NULL);
2396 
2397 	/*
2398 	 * Some file systems call vn_free() with v_count of zero,
2399 	 * some with v_count of 1.  In any case, the value should
2400 	 * never be anything else.
2401 	 */
2402 	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2403 	ASSERT(vp->v_count_dnlc == 0);
2404 	if (vp->v_path != NULL) {
2405 		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2406 		vp->v_path = NULL;
2407 	}
2408 
2409 	/* If FEM was in use, make sure everything gets cleaned up */
2410 	if (vp->v_femhead) {
2411 		/* XXX - There should be a free_femhead() that does all this */
2412 		ASSERT(vp->v_femhead->femh_list == NULL);
2413 		mutex_destroy(&vp->v_femhead->femh_lock);
2414 		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2415 		vp->v_femhead = NULL;
2416 	}
2417 
2418 	if (vp->v_fopdata != NULL) {
2419 		free_fopdata(vp);
2420 	}
2421 	vp->v_mpssdata = NULL;
2422 	vsd_free(vp);
2423 	kmem_cache_free(vn_cache, vp);
2424 }
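
/*
 * Lifecycle sketch (editorial addition; names are illustrative): a
 * file system creating a fresh node typically pairs these routines as
 * follows, tearing the vnode down with vn_free() once the node is no
 * longer referenced:
 *
 *	vp = vn_alloc(KM_SLEEP);
 *	vn_setops(vp, myfs_vnodeops);
 *	vp->v_data = (caddr_t)mynode;
 *	...
 *	vn_free(vp);
 */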
2425 
2426 /*
2427  * Vnode status changes.  We should define better states than 1 and 0.
2428  */
2429 void
2430 vn_reclaim(vnode_t *vp)
2431 {
2432 	vfs_t   *vfsp = vp->v_vfsp;
2433 
2434 	if (vfsp == NULL ||
2435 	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2436 		return;
2437 	}
2438 	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2439 }
2440 
2441 void
2442 vn_idle(vnode_t *vp)
2443 {
2444 	vfs_t   *vfsp = vp->v_vfsp;
2445 
2446 	if (vfsp == NULL ||
2447 	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2448 		return;
2449 	}
2450 	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2451 }

2452 void
2453 vn_exists(vnode_t *vp)
2454 {
2455 	vfs_t   *vfsp = vp->v_vfsp;
2456 
2457 	if (vfsp == NULL ||
2458 	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2459 		return;
2460 	}
2461 	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2462 }
2463 
2464 void
2465 vn_invalid(vnode_t *vp)
2466 {
2467 	vfs_t   *vfsp = vp->v_vfsp;
2468 
2469 	if (vfsp == NULL ||
2470 	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2471 		return;
2472 	}
2473 	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2474 }
2475 
2476 /* Vnode event notification */
2477 
2478 int
2479 vnevent_support(vnode_t *vp, caller_context_t *ct)
2480 {
2481 	if (vp == NULL)
2482 		return (EINVAL);
2483 
2484 	return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2485 }
2486 
2487 void
2488 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2489 {
2490 	if (vp == NULL || vp->v_femhead == NULL) {
2491 		return;
2492 	}
2493 	(void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2494 }
2495 
2496 void
2497 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2498     caller_context_t *ct)
2499 {
2500 	if (vp == NULL || vp->v_femhead == NULL) {
2501 		return;
2502 	}
2503 	(void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2504 }
2505 
2506 void
2507 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
2508 {
2509 	if (vp == NULL || vp->v_femhead == NULL) {
2510 		return;
2511 	}
2512 	(void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
2513 }
2514 
2515 void
2516 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2517 {
2518 	if (vp == NULL || vp->v_femhead == NULL) {
2519 		return;
2520 	}
2521 	(void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2522 }
2523 
2524 void
2525 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2526 {
2527 	if (vp == NULL || vp->v_femhead == NULL) {
2528 		return;
2529 	}
2530 	(void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2531 }
2532 
2533 void
2534 vnevent_create(vnode_t *vp, caller_context_t *ct)
2535 {
2536 	if (vp == NULL || vp->v_femhead == NULL) {
2537 		return;
2538 	}
2539 	(void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2540 }
2541 
2542 void
2543 vnevent_link(vnode_t *vp, caller_context_t *ct)
2544 {
2545 	if (vp == NULL || vp->v_femhead == NULL) {
2546 		return;
2547 	}
2548 	(void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2549 }
2550 
2551 void
2552 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2553 {
2554 	if (vp == NULL || vp->v_femhead == NULL) {
2555 		return;
2556 	}
2557 	(void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2558 }
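
/*
 * Editorial sketch of the intended call sites: a file system raises
 * these events from within its own vnode operations once the operation
 * is known to have succeeded, e.g. in a remove implementation (names
 * are hypothetical):
 *
 *	error = myfs_do_remove(dvp, nm, ...);
 *	if (error == 0)
 *		vnevent_remove(vp, dvp, nm, ct);
 *
 * The v_femhead check in each routine above makes the call a cheap
 * no-op unless an event monitor is actually installed on the vnode.
 */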
2559 
2560 /*
2561  * Vnode accessors.
2562  */
2563 
2564 int
2565 vn_is_readonly(vnode_t *vp)
2566 {
2567 	return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2568 }
2569 
2570 int
2571 vn_has_flocks(vnode_t *vp)
2572 {
2573 	return (vp->v_filocks != NULL);
2574 }
2575 
2576 int
2577 vn_has_mandatory_locks(vnode_t *vp, int mode)
2578 {
2579 	return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2580 }
2581 
2582 int
2583 vn_has_cached_data(vnode_t *vp)
2584 {
2585 	return (vp->v_pages != NULL);
2586 }
2587 
2588 /*
2589  * Return 0 if the vnode in question shouldn't be permitted into a zone via
2590  * zone_enter(2).
2591  */
2592 int
2593 vn_can_change_zones(vnode_t *vp)
2594 {
2595 	struct vfssw *vswp;
2596 	int allow = 1;
2597 	vnode_t *rvp;
2598 
2599 	if (nfs_global_client_only != 0)
2600 		return (1);
2601 
2602 	/*
2603 	 * We always want to look at the underlying vnode if there is one.
2604 	 */
2605 	if (VOP_REALVP(vp, &rvp, NULL) != 0)
2606 		rvp = vp;
2607 	/*
2608 	 * Some pseudo filesystems (including doorfs) don't actually register
2609 	 * their vfsops_t, so the following may return NULL; we happily let
2610 	 * such vnodes switch zones.
2611 	 */
2612 	vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2613 	if (vswp != NULL) {
2614 		if (vswp->vsw_flag & VSW_NOTZONESAFE)
2615 			allow = 0;
2616 		vfs_unrefvfssw(vswp);
2617 	}
2618 	return (allow);
2619 }
2620 
2621 /*
2622  * Return nonzero if the vnode is a mount point, zero if not.
2623  */
2624 int
2625 vn_ismntpt(vnode_t *vp)
2626 {
2627 	return (vp->v_vfsmountedhere != NULL);
2628 }
2629 
2630 /* Retrieve the vfs (if any) mounted on this vnode */
2631 vfs_t *
2632 vn_mountedvfs(vnode_t *vp)
2633 {
2634 	return (vp->v_vfsmountedhere);
2635 }
2636 
2637 /*
2638  * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2639  */
2640 int
2641 vn_in_dnlc(vnode_t *vp)
2642 {
2643 	return (vp->v_count_dnlc > 0);
2644 }
2645 
2646 /*
2647  * vn_has_other_opens() checks whether a particular file is opened by more than
2648  * just the caller and whether the open is for read and/or write.
2649  * This routine is for calling after the caller has already called VOP_OPEN()
2650  * and the caller wishes to know if they are the only one with it open for
2651  * the mode(s) specified.
2652  *
2653  * Vnode counts are only kept on regular files (v_type=VREG).
2654  */
2655 int
2656 vn_has_other_opens(
2657 	vnode_t *vp,
2658 	v_mode_t mode)
2659 {
2660 
2661 	ASSERT(vp != NULL);
2662 
2663 	switch (mode) {
2664 	case V_WRITE:
2665 		if (vp->v_wrcnt > 1)
2666 			return (V_TRUE);
2667 		break;
2668 	case V_RDORWR:
2669 		if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2670 			return (V_TRUE);
2671 		break;
2672 	case V_RDANDWR:
2673 		if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2674 			return (V_TRUE);
2675 		break;
2676 	case V_READ:
2677 		if (vp->v_rdcnt > 1)
2678 			return (V_TRUE);
2679 		break;
2680 	}
2681 
2682 	return (V_FALSE);
2683 }
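
/*
 * Example check (editorial sketch): a server deciding whether it is
 * safe to hand out something like an NFS write delegation might, after
 * its own VOP_OPEN() has succeeded, verify that it is the only writer:
 *
 *	if (vn_has_other_opens(vp, V_WRITE) == V_TRUE)
 *		(another writer exists; do not delegate)
 */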
2684 
2685 /*
2686  * vn_is_opened() checks whether a particular file is opened and
2687  * whether the open is for read and/or write.
2688  *
2689  * Vnode counts are only kept on regular files (v_type=VREG).
2690  */
2691 int
2692 vn_is_opened(
2693 	vnode_t *vp,
2694 	v_mode_t mode)
2695 {
2696 
2697 	ASSERT(vp != NULL);
2698 
2699 	switch (mode) {
2700 	case V_WRITE:
2701 		if (vp->v_wrcnt)
2702 			return (V_TRUE);
2703 		break;
2704 	case V_RDANDWR:
2705 		if (vp->v_rdcnt && vp->v_wrcnt)
2706 			return (V_TRUE);
2707 		break;
2708 	case V_RDORWR:
2709 		if (vp->v_rdcnt || vp->v_wrcnt)
2710 			return (V_TRUE);
2711 		break;
2712 	case V_READ:
2713 		if (vp->v_rdcnt)
2714 			return (V_TRUE);
2715 		break;
2716 	}
2717 
2718 	return (V_FALSE);
2719 }
2720 
2721 /*
2722  * vn_is_mapped() checks whether a particular file is mapped and whether
2723  * the file is mapped read and/or write.
2724  */
2725 int
2726 vn_is_mapped(
2727 	vnode_t *vp,
2728 	v_mode_t mode)
2729 {
2730 
2731 	ASSERT(vp != NULL);
2732 
2733 #if !defined(_LP64)
2734 	switch (mode) {
2735 	/*
2736 	 * The atomic_add_64_nv functions force atomicity in the
2737 	 * case of 32-bit architectures.  Otherwise the 64-bit values
2738 	 * would require two fetches, and the value of the fields
2739 	 * could (potentially) change between the first fetch and
2740 	 * the second.
2741 	 */
2742 	case V_WRITE:
2743 		if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2744 			return (V_TRUE);
2745 		break;
2746 	case V_RDANDWR:
2747 		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2748 		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2749 			return (V_TRUE);
2750 		break;
2751 	case V_RDORWR:
2752 		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2753 		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2754 			return (V_TRUE);
2755 		break;
2756 	case V_READ:
2757 		if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2758 			return (V_TRUE);
2759 		break;
2760 	}
2761 #else
2762 	switch (mode) {
2763 	case V_WRITE:
2764 		if (vp->v_mmap_write)
2765 			return (V_TRUE);
2766 		break;
2767 	case V_RDANDWR:
2768 		if (vp->v_mmap_read && vp->v_mmap_write)
2769 			return (V_TRUE);
2770 		break;
2771 	case V_RDORWR:
2772 		if (vp->v_mmap_read || vp->v_mmap_write)
2773 			return (V_TRUE);
2774 		break;
2775 	case V_READ:
2776 		if (vp->v_mmap_read)
2777 			return (V_TRUE);
2778 		break;
2779 	}
2780 #endif
2781 
2782 	return (V_FALSE);
2783 }
2784 
2785 /*
2786  * Set the operations vector for a vnode.
2787  *
2788  * FEM ensures that the v_femhead pointer is filled in before the
2789  * v_op pointer is changed.  This means that if the v_femhead pointer
2790  * is NULL, and the v_op field hasn't changed since we last checked
2791  * the v_femhead pointer, then our update is safe: we are not racing
2792  * with FEM.
2793  */
2794 void
2795 vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
2796 {
2797 	vnodeops_t	*op;
2798 
2799 	ASSERT(vp != NULL);
2800 	ASSERT(vnodeops != NULL);
2801 
2802 	op = vp->v_op;
2803 	membar_consumer();
2804 	/*
2805 	 * If vp->v_femhead == NULL, then we'll call casptr() to do the
2806 	 * compare-and-swap on vp->v_op.  If either fails, then FEM is
2807 	 * in effect on the vnode and we need to have FEM deal with it.
2808 	 */
2809 	if (vp->v_femhead != NULL || casptr(&vp->v_op, op, vnodeops) != op) {
2810 		fem_setvnops(vp, vnodeops);
2811 	}
2812 }
2813 
2814 /*
2815  * Retrieve the operations vector for a vnode.
2816  * As with vn_setops() above, make sure we aren't racing with FEM.
2817  * FEM sets v_op to a special, internal vnodeops that wouldn't
2818  * make sense to the callers of this routine.
2819  */
2820 vnodeops_t *
2821 vn_getops(vnode_t *vp)
2822 {
2823 	vnodeops_t	*op;
2824 
2825 	ASSERT(vp != NULL);
2826 
2827 	op = vp->v_op;
2828 	membar_consumer();
2829 	if (vp->v_femhead == NULL && op == vp->v_op) {
2830 		return (op);
2831 	} else {
2832 		return (fem_getvnops(vp));
2833 	}
2834 }
2835 
2836 /*
2837  * Returns non-zero (1) if the vnodeops matches that of the vnode.
2838  * Returns zero (0) if not.
2839  */
2840 int
2841 vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
2842 {
2843 	return (vn_getops(vp) == vnodeops);
2844 }
2845 
2846 /*
2847  * Returns non-zero (1) if the specified operation matches the
2848  * corresponding operation for that vnode.
2849  * Returns zero (0) if not.
2850  */
2851 
2852 #define	MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
2853 
2854 int
2855 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
2856 {
2857 	const fs_operation_trans_def_t *otdp;
2858 	fs_generic_func_p *loc = NULL;
2859 	vnodeops_t	*vop = vn_getops(vp);
2860 
2861 	ASSERT(vopname != NULL);
2862 
2863 	for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
2864 		if (MATCHNAME(otdp->name, vopname)) {
2865 			loc = (fs_generic_func_p *)
2866 			    ((char *)(vop) + otdp->offset);
2867 			break;
2868 		}
2869 	}
2870 
2871 	return ((loc != NULL) && (*loc == funcp));
2872 }
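
/*
 * Example (editorial sketch): callers can use this to ask whether a
 * vnode's entry for a named operation is a particular function, e.g.
 * to detect a file system that left an operation at the fs_nosys()
 * default:
 *
 *	if (vn_matchopval(vp, VOPNAME_VNEVENT,
 *	    (fs_generic_func_p)fs_nosys))
 *		(vnode events are unsupported here)
 */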
2873 
2874 /*
2875  * fs_new_caller_id() needs to return a unique ID on a given local system.
2876  * The IDs do not need to survive across reboots.  These are primarily
2877  * used so that (FEM) monitors can detect particular callers (such as
2878  * the NFS server) to a given vnode/vfs operation.
2879  */
2880 u_longlong_t
2881 fs_new_caller_id()
2882 {
2883 	static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2884 
2885 	return ((u_longlong_t)atomic_add_64_nv(&next_caller_id, 1));
2886 }
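
/*
 * Usage sketch (editorial addition): a subsystem such as the NFS
 * server obtains one ID at initialization and stamps it into the
 * caller_context_t passed to vnode operations so that FEM monitors can
 * recognize it; cc_caller_id is the caller_context_t field assumed
 * here:
 *
 *	static u_longlong_t my_caller_id;
 *
 *	my_caller_id = fs_new_caller_id();	(once, at init)
 *	...
 *	ct.cc_caller_id = my_caller_id;		(per operation)
 */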
2887 
2888 /*
2889  * Given a starting vnode and a path, updates the path in the target vnode in
2890  * a safe manner.  If the vnode already has path information embedded, then the
2891  * cached path is left untouched.
2892  */
2893 
2894 size_t max_vnode_path = 4 * MAXPATHLEN;
2895 
2896 void
2897 vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
2898     const char *path, size_t plen)
2899 {
2900 	char	*rpath;
2901 	vnode_t	*base;
2902 	size_t	rpathlen, rpathalloc;
2903 	int	doslash = 1;
2904 
2905 	if (*path == '/') {
2906 		base = rootvp;
2907 		path++;
2908 		plen--;
2909 	} else {
2910 		base = startvp;
2911 	}
2912 
2913 	/*
2914 	 * We cannot grab base->v_lock while we hold vp->v_lock because of
2915 	 * the potential for deadlock.
2916 	 */
2917 	mutex_enter(&base->v_lock);
2918 	if (base->v_path == NULL) {
2919 		mutex_exit(&base->v_lock);
2920 		return;
2921 	}
2922 
2923 	rpathlen = strlen(base->v_path);
2924 	rpathalloc = rpathlen + plen + 1;
2925 	/* Avoid adding a slash if there's already one there */
2926 	if (base->v_path[rpathlen-1] == '/')
2927 		doslash = 0;
2928 	else
2929 		rpathalloc++;
2930 
2931 	/*
2932 	 * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held,
2933 	 * so we must do this dance.  If, by chance, something changes the path,
2934 	 * just give up since there is no real harm.
2935 	 */
2936 	mutex_exit(&base->v_lock);
2937 
2938 	/* Paths should stay within reason */
2939 	if (rpathalloc > max_vnode_path)
2940 		return;
2941 
2942 	rpath = kmem_alloc(rpathalloc, KM_SLEEP);
2943 
2944 	mutex_enter(&base->v_lock);
2945 	if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
2946 		mutex_exit(&base->v_lock);
2947 		kmem_free(rpath, rpathalloc);
2948 		return;
2949 	}
2950 	bcopy(base->v_path, rpath, rpathlen);
2951 	mutex_exit(&base->v_lock);
2952 
2953 	if (doslash)
2954 		rpath[rpathlen++] = '/';
2955 	bcopy(path, rpath + rpathlen, plen);
2956 	rpath[rpathlen + plen] = '\0';
2957 
2958 	mutex_enter(&vp->v_lock);
2959 	if (vp->v_path != NULL) {
2960 		mutex_exit(&vp->v_lock);
2961 		kmem_free(rpath, rpathalloc);
2962 	} else {
2963 		vp->v_path = rpath;
2964 		mutex_exit(&vp->v_lock);
2965 	}
2966 }
2967 
2968 /*
2969  * Sets the path to the vnode to be the given string, regardless of current
2970  * context.  The string must be a complete path from rootdir.  This is only used
2971  * by fsop_root() for setting the path based on the mountpoint.
2972  */
2973 void
2974 vn_setpath_str(struct vnode *vp, const char *str, size_t len)
2975 {
2976 	char *buf = kmem_alloc(len + 1, KM_SLEEP);
2977 
2978 	mutex_enter(&vp->v_lock);
2979 	if (vp->v_path != NULL) {
2980 		mutex_exit(&vp->v_lock);
2981 		kmem_free(buf, len + 1);
2982 		return;
2983 	}
2984 
2985 	vp->v_path = buf;
2986 	bcopy(str, vp->v_path, len);
2987 	vp->v_path[len] = '\0';
2988 
2989 	mutex_exit(&vp->v_lock);
2990 }
2991 
2992 /*
2993  * Called from within filesystem's vop_rename() to handle renames once the
2994  * target vnode is available.
2995  */
2996 void
2997 vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
2998 {
2999 	char *tmp;
3000 
3001 	mutex_enter(&vp->v_lock);
3002 	tmp = vp->v_path;
3003 	vp->v_path = NULL;
3004 	mutex_exit(&vp->v_lock);
3005 	vn_setpath(rootdir, dvp, vp, nm, len);
3006 	if (tmp != NULL)
3007 		kmem_free(tmp, strlen(tmp) + 1);
3008 }
3009 
3010 /*
3011  * Similar to vn_setpath_str(), this function sets the path of the destination
3012  * vnode to be the same as that of the source vnode.
3013  */
3014 void
3015 vn_copypath(struct vnode *src, struct vnode *dst)
3016 {
3017 	char *buf;
3018 	int alloc;
3019 
3020 	mutex_enter(&src->v_lock);
3021 	if (src->v_path == NULL) {
3022 		mutex_exit(&src->v_lock);
3023 		return;
3024 	}
3025 	alloc = strlen(src->v_path) + 1;
3026 
3027 	/* avoid kmem_alloc() with lock held */
3028 	mutex_exit(&src->v_lock);
3029 	buf = kmem_alloc(alloc, KM_SLEEP);
3030 	mutex_enter(&src->v_lock);
3031 	if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
3032 		mutex_exit(&src->v_lock);
3033 		kmem_free(buf, alloc);
3034 		return;
3035 	}
3036 	bcopy(src->v_path, buf, alloc);
3037 	mutex_exit(&src->v_lock);
3038 
3039 	mutex_enter(&dst->v_lock);
3040 	if (dst->v_path != NULL) {
3041 		mutex_exit(&dst->v_lock);
3042 		kmem_free(buf, alloc);
3043 		return;
3044 	}
3045 	dst->v_path = buf;
3046 	mutex_exit(&dst->v_lock);
3047 }
3048 
3049 /*
3050  * XXX Private interface for segvn routines that handle vnode
3051  * large page segments.
3052  *
3053  * Return 1 if vp's file system's VOP_PAGEIO() implementation
3054  * can safely be used instead of VOP_GETPAGE() for handling
3055  * pagefaults against regular non-swap files.  The VOP_PAGEIO()
3056  * interface is considered safe here if its implementation
3057  * is very close to the VOP_GETPAGE() implementation:
3058  * e.g., it zeroes out the part of the page beyond EOF, doesn't
3059  * panic on file holes but instead returns an error, and doesn't
3060  * assume the file won't be changed by user writes, etc.
3061  *
3062  * Return 0 otherwise.
3063  *
3064  * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3065  */
3066 int
3067 vn_vmpss_usepageio(vnode_t *vp)
3068 {
3069 	vfs_t   *vfsp = vp->v_vfsp;
3070 	char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3071 	char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3072 	char **fsok = pageio_ok_fss;
3073 
3074 	if (fsname == NULL) {
3075 		return (0);
3076 	}
3077 
3078 	for (; *fsok; fsok++) {
3079 		if (strcmp(*fsok, fsname) == 0) {
3080 			return (1);
3081 		}
3082 	}
3083 	return (0);
3084 }
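
/*
 * Editorial sketch of the intended use: segvn consults this routine
 * when choosing an I/O path for a large-page fault on a regular file:
 *
 *	if (vn_vmpss_usepageio(vp))
 *		(fault the pages in via VOP_PAGEIO())
 *	else
 *		(fall back to VOP_GETPAGE())
 */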
3085 
3086 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3087 
3088 int
3089 fop_open(
3090 	vnode_t **vpp,
3091 	int mode,
3092 	cred_t *cr,
3093 	caller_context_t *ct)
3094 {
3095 	int ret;
3096 	vnode_t *vp = *vpp;
3097 
3098 	VN_HOLD(vp);
3099 	/*
3100 	 * Adding to the vnode counts before calling open
3101 	 * avoids the need for a mutex. It circumvents a race
3102 	 * condition where a query made on the vnode counts results in a
3103 	 * false negative. The inquirer goes away believing the file is
3104 	 * not open when there is an open on the file already under way.
3105 	 *
3106 	 * The counts are meant to prevent NFS from granting a delegation
3107 	 * when it would be dangerous to do so.
3108 	 *
3109 	 * The vnode counts are only kept on regular files.
3110 	 */
3111 	if ((*vpp)->v_type == VREG) {
3112 		if (mode & FREAD)
3113 			atomic_add_32(&((*vpp)->v_rdcnt), 1);
3114 		if (mode & FWRITE)
3115 			atomic_add_32(&((*vpp)->v_wrcnt), 1);
3116 	}
3117 
3118 	VOPXID_MAP_CR(vp, cr);
3119 
3120 	ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);
3121 
3122 	if (ret) {
3123 		/*
3124 		 * Use the saved vp just in case the vnode ptr got trashed
3125 		 * by the error.
3126 		 */
3127 		VOPSTATS_UPDATE(vp, open);
3128 		if ((vp->v_type == VREG) && (mode & FREAD))
3129 			atomic_add_32(&(vp->v_rdcnt), -1);
3130 		if ((vp->v_type == VREG) && (mode & FWRITE))
3131 			atomic_add_32(&(vp->v_wrcnt), -1);
3132 	} else {
3133 		/*
3134 		 * Some filesystems will return a different vnode,
3135 		 * but the same path was still used to open it.
3136 		 * So if we do change the vnode and need to
3137 		 * copy over the path, do so here, rather than special
3138 		 * casing each filesystem. Adjust the vnode counts to
3139 		 * reflect the vnode switch.
3140 		 */
3141 		VOPSTATS_UPDATE(*vpp, open);
3142 		if (*vpp != vp && *vpp != NULL) {
3143 			vn_copypath(vp, *vpp);
3144 			if (((*vpp)->v_type == VREG) && (mode & FREAD))
3145 				atomic_add_32(&((*vpp)->v_rdcnt), 1);
3146 			if ((vp->v_type == VREG) && (mode & FREAD))
3147 				atomic_add_32(&(vp->v_rdcnt), -1);
3148 			if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3149 				atomic_add_32(&((*vpp)->v_wrcnt), 1);
3150 			if ((vp->v_type == VREG) && (mode & FWRITE))
3151 				atomic_add_32(&(vp->v_wrcnt), -1);
3152 		}
3153 	}
3154 	VN_RELE(vp);
3155 	return (ret);
3156 }
3157 
3158 int
3159 fop_close(
3160 	vnode_t *vp,
3161 	int flag,
3162 	int count,
3163 	offset_t offset,
3164 	cred_t *cr,
3165 	caller_context_t *ct)
3166 {
3167 	int err;
3168 
3169 	VOPXID_MAP_CR(vp, cr);
3170 
3171 	err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
3172 	VOPSTATS_UPDATE(vp, close);
3173 	/*
3174 	 * Check the passed-in count to handle possible dups.  Vnode counts
3175 	 * are only kept on regular files.
3176 	 */
3177 	if ((vp->v_type == VREG) && (count == 1))  {
3178 		if (flag & FREAD) {
3179 			ASSERT(vp->v_rdcnt > 0);
3180 			atomic_add_32(&(vp->v_rdcnt), -1);
3181 		}
3182 		if (flag & FWRITE) {
3183 			ASSERT(vp->v_wrcnt > 0);
3184 			atomic_add_32(&(vp->v_wrcnt), -1);
3185 		}
3186 	}
3187 	return (err);
3188 }
3189 
3190 int
3191 fop_read(
3192 	vnode_t *vp,
3193 	uio_t *uiop,
3194 	int ioflag,
3195 	cred_t *cr,
3196 	caller_context_t *ct)
3197 {
3198 	int	err;
3199 	ssize_t	resid_start = uiop->uio_resid;
3200 
3201 	VOPXID_MAP_CR(vp, cr);
3202 
3203 	err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3204 	VOPSTATS_UPDATE_IO(vp, read,
3205 	    read_bytes, (resid_start - uiop->uio_resid));
3206 	return (err);
3207 }
3208 
3209 int
3210 fop_write(
3211 	vnode_t *vp,
3212 	uio_t *uiop,
3213 	int ioflag,
3214 	cred_t *cr,
3215 	caller_context_t *ct)
3216 {
3217 	int	err;
3218 	ssize_t	resid_start = uiop->uio_resid;
3219 
3220 	VOPXID_MAP_CR(vp, cr);
3221 
3222 	err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3223 	VOPSTATS_UPDATE_IO(vp, write,
3224 	    write_bytes, (resid_start - uiop->uio_resid));
3225 	return (err);
3226 }
3227 
3228 int
3229 fop_ioctl(
3230 	vnode_t *vp,
3231 	int cmd,
3232 	intptr_t arg,
3233 	int flag,
3234 	cred_t *cr,
3235 	int *rvalp,
3236 	caller_context_t *ct)
3237 {
3238 	int	err;
3239 
3240 	VOPXID_MAP_CR(vp, cr);
3241 
3242 	err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3243 	VOPSTATS_UPDATE(vp, ioctl);
3244 	return (err);
3245 }
3246 
3247 int
3248 fop_setfl(
3249 	vnode_t *vp,
3250 	int oflags,
3251 	int nflags,
3252 	cred_t *cr,
3253 	caller_context_t *ct)
3254 {
3255 	int	err;
3256 
3257 	VOPXID_MAP_CR(vp, cr);
3258 
3259 	err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
3260 	VOPSTATS_UPDATE(vp, setfl);
3261 	return (err);
3262 }
3263 
3264 int
3265 fop_getattr(
3266 	vnode_t *vp,
3267 	vattr_t *vap,
3268 	int flags,
3269 	cred_t *cr,
3270 	caller_context_t *ct)
3271 {
3272 	int	err;
3273 
3274 	VOPXID_MAP_CR(vp, cr);
3275 
3276 	/*
3277 	 * If this file system doesn't understand the xvattr extensions
3278 	 * then turn off the xvattr bit.
3279 	 */
3280 	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3281 		vap->va_mask &= ~AT_XVATTR;
3282 	}
3283 
3284 	/*
3285 	 * We're only allowed to skip the ACL check iff we used a 32 bit
3286 	 * ACE mask with VOP_ACCESS() to determine permissions.
3287 	 */
3288 	if ((flags & ATTR_NOACLCHECK) &&
3289 	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3290 		return (EINVAL);
3291 	}
3292 	err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
3293 	VOPSTATS_UPDATE(vp, getattr);
3294 	return (err);
3295 }
3296 
3297 int
3298 fop_setattr(
3299 	vnode_t *vp,
3300 	vattr_t *vap,
3301 	int flags,
3302 	cred_t *cr,
3303 	caller_context_t *ct)
3304 {
3305 	int	err;
3306 
3307 	VOPXID_MAP_CR(vp, cr);
3308 
3309 	/*
3310 	 * If this file system doesn't understand the xvattr extensions
3311 	 * then turn off the xvattr bit.
3312 	 */
3313 	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3314 		vap->va_mask &= ~AT_XVATTR;
3315 	}
3316 
3317 	/*
3318 	 * We're only allowed to skip the ACL check iff we used a 32 bit
3319 	 * ACE mask with VOP_ACCESS() to determine permissions.
3320 	 */
3321 	if ((flags & ATTR_NOACLCHECK) &&
3322 	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3323 		return (EINVAL);
3324 	}
3325 	err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
3326 	VOPSTATS_UPDATE(vp, setattr);
3327 	return (err);
3328 }
3329 
3330 int
3331 fop_access(
3332 	vnode_t *vp,
3333 	int mode,
3334 	int flags,
3335 	cred_t *cr,
3336 	caller_context_t *ct)
3337 {
3338 	int	err;
3339 
3340 	if ((flags & V_ACE_MASK) &&
3341 	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3342 		return (EINVAL);
3343 	}
3344 
3345 	VOPXID_MAP_CR(vp, cr);
3346 
3347 	err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
3348 	VOPSTATS_UPDATE(vp, access);
3349 	return (err);
3350 }
3351 
3352 int
3353 fop_lookup(
3354 	vnode_t *dvp,
3355 	char *nm,
3356 	vnode_t **vpp,
3357 	pathname_t *pnp,
3358 	int flags,
3359 	vnode_t *rdir,
3360 	cred_t *cr,
3361 	caller_context_t *ct,
3362 	int *deflags,		/* Returned per-dirent flags */
3363 	pathname_t *ppnp)	/* Returned case-preserved name in directory */
3364 {
3365 	int ret;
3366 
3367 	/*
3368 	 * If this file system doesn't support case-insensitive access
3369 	 * and said access is requested, fail quickly.  It is required
3370 	 * that if the vfs supports case-insensitive lookup, it also
3371 	 * supports extended dirent flags.
3372 	 */
3373 	if (flags & FIGNORECASE &&
3374 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3375 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3376 		return (EINVAL);
3377 
3378 	VOPXID_MAP_CR(dvp, cr);
3379 
3380 	if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3381 		ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3382 	} else {
3383 		ret = (*(dvp)->v_op->vop_lookup)
3384 		    (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
3385 	}
3386 	if (ret == 0 && *vpp) {
3387 		VOPSTATS_UPDATE(*vpp, lookup);
3388 		if ((*vpp)->v_path == NULL) {
3389 			vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
3390 		}
3391 	}
3392 
3393 	return (ret);
3394 }
3395 
3396 int
3397 fop_create(
3398 	vnode_t *dvp,
3399 	char *name,
3400 	vattr_t *vap,
3401 	vcexcl_t excl,
3402 	int mode,
3403 	vnode_t **vpp,
3404 	cred_t *cr,
3405 	int flags,
3406 	caller_context_t *ct,
3407 	vsecattr_t *vsecp)	/* ACL to set during create */
3408 {
3409 	int ret;
3410 
3411 	if (vsecp != NULL &&
3412 	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3413 		return (EINVAL);
3414 	}
3415 	/*
3416 	 * If this file system doesn't support case-insensitive access
3417 	 * and said access is requested, fail quickly.
3418 	 */
3419 	if (flags & FIGNORECASE &&
3420 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3421 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3422 		return (EINVAL);
3423 
3424 	VOPXID_MAP_CR(dvp, cr);
3425 
3426 	ret = (*(dvp)->v_op->vop_create)
3427 	    (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3428 	if (ret == 0 && *vpp) {
3429 		VOPSTATS_UPDATE(*vpp, create);
3430 		if ((*vpp)->v_path == NULL) {
3431 			vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
3432 		}
3433 	}
3434 
3435 	return (ret);
3436 }
3437 
3438 int
3439 fop_remove(
3440 	vnode_t *dvp,
3441 	char *nm,
3442 	cred_t *cr,
3443 	caller_context_t *ct,
3444 	int flags)
3445 {
3446 	int	err;
3447 
3448 	/*
3449 	 * If this file system doesn't support case-insensitive access
3450 	 * and said access is requested, fail quickly.
3451 	 */
3452 	if (flags & FIGNORECASE &&
3453 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3454 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3455 		return (EINVAL);
3456 
3457 	VOPXID_MAP_CR(dvp, cr);
3458 
3459 	err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
3460 	VOPSTATS_UPDATE(dvp, remove);
3461 	return (err);
3462 }
3463 
3464 int
3465 fop_link(
3466 	vnode_t *tdvp,
3467 	vnode_t *svp,
3468 	char *tnm,
3469 	cred_t *cr,
3470 	caller_context_t *ct,
3471 	int flags)
3472 {
3473 	int	err;
3474 
3475 	/*
3476 	 * If the target file system doesn't support case-insensitive access
3477 	 * and said access is requested, fail quickly.
3478 	 */
3479 	if (flags & FIGNORECASE &&
3480 	    (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3481 	    vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3482 		return (EINVAL);
3483 
3484 	VOPXID_MAP_CR(tdvp, cr);
3485 
3486 	err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
3487 	VOPSTATS_UPDATE(tdvp, link);
3488 	return (err);
3489 }
3490 
3491 int
3492 fop_rename(
3493 	vnode_t *sdvp,
3494 	char *snm,
3495 	vnode_t *tdvp,
3496 	char *tnm,
3497 	cred_t *cr,
3498 	caller_context_t *ct,
3499 	int flags)
3500 {
3501 	int	err;
3502 
3503 	/*
3504 	 * If the file system involved does not support
3505 	 * case-insensitive access and said access is requested, fail
3506 	 * quickly.
3507 	 */
3508 	if (flags & FIGNORECASE &&
3509 	    ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3510 	    vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3511 		return (EINVAL);
3512 
3513 	VOPXID_MAP_CR(tdvp, cr);
3514 
3515 	err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
3516 	VOPSTATS_UPDATE(sdvp, rename);
3517 	return (err);
3518 }
3519 
3520 int
3521 fop_mkdir(
3522 	vnode_t *dvp,
3523 	char *dirname,
3524 	vattr_t *vap,
3525 	vnode_t **vpp,
3526 	cred_t *cr,
3527 	caller_context_t *ct,
3528 	int flags,
3529 	vsecattr_t *vsecp)	/* ACL to set during create */
3530 {
3531 	int ret;
3532 
3533 	if (vsecp != NULL &&
3534 	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3535 		return (EINVAL);
3536 	}
3537 	/*
3538 	 * If this file system doesn't support case-insensitive access
3539 	 * and said access is requested, fail quickly.
3540 	 */
3541 	if (flags & FIGNORECASE &&
3542 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3543 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3544 		return (EINVAL);
3545 
3546 	VOPXID_MAP_CR(dvp, cr);
3547 
3548 	ret = (*(dvp)->v_op->vop_mkdir)
3549 	    (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3550 	if (ret == 0 && *vpp) {
3551 		VOPSTATS_UPDATE(*vpp, mkdir);
3552 		if ((*vpp)->v_path == NULL) {
3553 			vn_setpath(rootdir, dvp, *vpp, dirname,
3554 			    strlen(dirname));
3555 		}
3556 	}
3557 
3558 	return (ret);
3559 }
3560 
3561 int
3562 fop_rmdir(
3563 	vnode_t *dvp,
3564 	char *nm,
3565 	vnode_t *cdir,
3566 	cred_t *cr,
3567 	caller_context_t *ct,
3568 	int flags)
3569 {
3570 	int	err;
3571 
3572 	/*
3573 	 * If this file system doesn't support case-insensitive access
3574 	 * and said access is requested, fail quickly.
3575 	 */
3576 	if (flags & FIGNORECASE &&
3577 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3578 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3579 		return (EINVAL);
3580 
3581 	VOPXID_MAP_CR(dvp, cr);
3582 
3583 	err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
3584 	VOPSTATS_UPDATE(dvp, rmdir);
3585 	return (err);
3586 }
3587 
3588 int
3589 fop_readdir(
3590 	vnode_t *vp,
3591 	uio_t *uiop,
3592 	cred_t *cr,
3593 	int *eofp,
3594 	caller_context_t *ct,
3595 	int flags)
3596 {
3597 	int	err;
3598 	ssize_t	resid_start = uiop->uio_resid;
3599 
3600 	/*
3601 	 * If this file system doesn't support retrieving directory
3602 	 * entry flags and said access is requested, fail quickly.
3603 	 */
3604 	if (flags & V_RDDIR_ENTFLAGS &&
3605 	    vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3606 		return (EINVAL);
3607 
3608 	VOPXID_MAP_CR(vp, cr);
3609 
3610 	err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
3611 	VOPSTATS_UPDATE_IO(vp, readdir,
3612 	    readdir_bytes, (resid_start - uiop->uio_resid));
3613 	return (err);
3614 }
3615 
3616 int
3617 fop_symlink(
3618 	vnode_t *dvp,
3619 	char *linkname,
3620 	vattr_t *vap,
3621 	char *target,
3622 	cred_t *cr,
3623 	caller_context_t *ct,
3624 	int flags)
3625 {
3626 	int	err;
3627 	xvattr_t xvattr;
3628 
3629 	/*
3630 	 * If this file system doesn't support case-insensitive access
3631 	 * and said access is requested, fail quickly.
3632 	 */
3633 	if (flags & FIGNORECASE &&
3634 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3635 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3636 		return (EINVAL);
3637 
3638 	VOPXID_MAP_CR(dvp, cr);
3639 
3640 	/* check for reparse point */
3641 	if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
3642 	    (strncmp(target, FS_REPARSE_TAG_STR,
3643 	    strlen(FS_REPARSE_TAG_STR)) == 0)) {
3644 		if (!fs_reparse_mark(target, vap, &xvattr))
3645 			vap = (vattr_t *)&xvattr;
3646 	}
3647 
3648 	err = (*(dvp)->v_op->vop_symlink)
3649 	    (dvp, linkname, vap, target, cr, ct, flags);
3650 	VOPSTATS_UPDATE(dvp, symlink);
3651 	return (err);
3652 }
3653 
3654 int
3655 fop_readlink(
3656 	vnode_t *vp,
3657 	uio_t *uiop,
3658 	cred_t *cr,
3659 	caller_context_t *ct)
3660 {
3661 	int	err;
3662 
3663 	VOPXID_MAP_CR(vp, cr);
3664 
3665 	err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
3666 	VOPSTATS_UPDATE(vp, readlink);
3667 	return (err);
3668 }
3669 
3670 int
3671 fop_fsync(
3672 	vnode_t *vp,
3673 	int syncflag,
3674 	cred_t *cr,
3675 	caller_context_t *ct)
3676 {
3677 	int	err;
3678 
3679 	VOPXID_MAP_CR(vp, cr);
3680 
3681 	err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
3682 	VOPSTATS_UPDATE(vp, fsync);
3683 	return (err);
3684 }
3685 
3686 void
3687 fop_inactive(
3688 	vnode_t *vp,
3689 	cred_t *cr,
3690 	caller_context_t *ct)
3691 {
3692 	/* Need to update stats before vop call since we may lose the vnode */
3693 	VOPSTATS_UPDATE(vp, inactive);
3694 
3695 	VOPXID_MAP_CR(vp, cr);
3696 
3697 	(*(vp)->v_op->vop_inactive)(vp, cr, ct);
3698 }
3699 
3700 int
3701 fop_fid(
3702 	vnode_t *vp,
3703 	fid_t *fidp,
3704 	caller_context_t *ct)
3705 {
3706 	int	err;
3707 
3708 	err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
3709 	VOPSTATS_UPDATE(vp, fid);
3710 	return (err);
3711 }
3712 
3713 int
3714 fop_rwlock(
3715 	vnode_t *vp,
3716 	int write_lock,
3717 	caller_context_t *ct)
3718 {
3719 	int	ret;
3720 
3721 	ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
3722 	VOPSTATS_UPDATE(vp, rwlock);
3723 	return (ret);
3724 }
3725 
3726 void
3727 fop_rwunlock(
3728 	vnode_t *vp,
3729 	int write_lock,
3730 	caller_context_t *ct)
3731 {
3732 	(*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
3733 	VOPSTATS_UPDATE(vp, rwunlock);
3734 }
3735 
3736 int
3737 fop_seek(
3738 	vnode_t *vp,
3739 	offset_t ooff,
3740 	offset_t *noffp,
3741 	caller_context_t *ct)
3742 {
3743 	int	err;
3744 
3745 	err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
3746 	VOPSTATS_UPDATE(vp, seek);
3747 	return (err);
3748 }
3749 
3750 int
3751 fop_cmp(
3752 	vnode_t *vp1,
3753 	vnode_t *vp2,
3754 	caller_context_t *ct)
3755 {
3756 	int	err;
3757 
3758 	err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
3759 	VOPSTATS_UPDATE(vp1, cmp);
3760 	return (err);
3761 }
3762 
3763 int
3764 fop_frlock(
3765 	vnode_t *vp,
3766 	int cmd,
3767 	flock64_t *bfp,
3768 	int flag,
3769 	offset_t offset,
3770 	struct flk_callback *flk_cbp,
3771 	cred_t *cr,
3772 	caller_context_t *ct)
3773 {
3774 	int	err;
3775 
3776 	VOPXID_MAP_CR(vp, cr);
3777 
3778 	err = (*(vp)->v_op->vop_frlock)
3779 	    (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
3780 	VOPSTATS_UPDATE(vp, frlock);
3781 	return (err);
3782 }
3783 
3784 int
3785 fop_space(
3786 	vnode_t *vp,
3787 	int cmd,
3788 	flock64_t *bfp,
3789 	int flag,
3790 	offset_t offset,
3791 	cred_t *cr,
3792 	caller_context_t *ct)
3793 {
3794 	int	err;
3795 
3796 	VOPXID_MAP_CR(vp, cr);
3797 
3798 	err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
3799 	VOPSTATS_UPDATE(vp, space);
3800 	return (err);
3801 }
3802 
3803 int
3804 fop_realvp(
3805 	vnode_t *vp,
3806 	vnode_t **vpp,
3807 	caller_context_t *ct)
3808 {
3809 	int	err;
3810 
3811 	err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
3812 	VOPSTATS_UPDATE(vp, realvp);
3813 	return (err);
3814 }
3815 
3816 int
3817 fop_getpage(
3818 	vnode_t *vp,
3819 	offset_t off,
3820 	size_t len,
3821 	uint_t *protp,
3822 	page_t **plarr,
3823 	size_t plsz,
3824 	struct seg *seg,
3825 	caddr_t addr,
3826 	enum seg_rw rw,
3827 	cred_t *cr,
3828 	caller_context_t *ct)
3829 {
3830 	int	err;
3831 
3832 	VOPXID_MAP_CR(vp, cr);
3833 
3834 	err = (*(vp)->v_op->vop_getpage)
3835 	    (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
3836 	VOPSTATS_UPDATE(vp, getpage);
3837 	return (err);
3838 }
3839 
3840 int
3841 fop_putpage(
3842 	vnode_t *vp,
3843 	offset_t off,
3844 	size_t len,
3845 	int flags,
3846 	cred_t *cr,
3847 	caller_context_t *ct)
3848 {
3849 	int	err;
3850 
3851 	VOPXID_MAP_CR(vp, cr);
3852 
3853 	err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
3854 	VOPSTATS_UPDATE(vp, putpage);
3855 	return (err);
3856 }
3857 
3858 int
3859 fop_map(
3860 	vnode_t *vp,
3861 	offset_t off,
3862 	struct as *as,
3863 	caddr_t *addrp,
3864 	size_t len,
3865 	uchar_t prot,
3866 	uchar_t maxprot,
3867 	uint_t flags,
3868 	cred_t *cr,
3869 	caller_context_t *ct)
3870 {
3871 	int	err;
3872 
3873 	VOPXID_MAP_CR(vp, cr);
3874 
3875 	err = (*(vp)->v_op->vop_map)
3876 	    (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
3877 	VOPSTATS_UPDATE(vp, map);
3878 	return (err);
3879 }
3880 
3881 int
3882 fop_addmap(
3883 	vnode_t *vp,
3884 	offset_t off,
3885 	struct as *as,
3886 	caddr_t addr,
3887 	size_t len,
3888 	uchar_t prot,
3889 	uchar_t maxprot,
3890 	uint_t flags,
3891 	cred_t *cr,
3892 	caller_context_t *ct)
3893 {
3894 	int error;
3895 	u_longlong_t delta;
3896 
3897 	VOPXID_MAP_CR(vp, cr);
3898 
3899 	error = (*(vp)->v_op->vop_addmap)
3900 	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
3901 
3902 	if ((!error) && (vp->v_type == VREG)) {
3903 		delta = (u_longlong_t)btopr(len);
3904 		/*
3905 		 * If the file is mapped MAP_PRIVATE, it can't be written back
3906 		 * even if open for write.  Handle it as a read.
3907 		 */
3908 		if (flags & MAP_PRIVATE) {
3909 			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3910 			    (int64_t)delta);
3911 		} else {
3912 			/*
3913 			 * atomic_add_64 forces the fetch of a 64 bit value to
3914 			 * be atomic on 32 bit machines
3915 			 */
3916 			if (maxprot & PROT_WRITE)
3917 				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
3918 				    (int64_t)delta);
3919 			if (maxprot & PROT_READ)
3920 				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3921 				    (int64_t)delta);
3922 			if (maxprot & PROT_EXEC)
3923 				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3924 				    (int64_t)delta);
3925 		}
3926 	}
3927 	VOPSTATS_UPDATE(vp, addmap);
3928 	return (error);
3929 }
3930 
3931 int
3932 fop_delmap(
3933 	vnode_t *vp,
3934 	offset_t off,
3935 	struct as *as,
3936 	caddr_t addr,
3937 	size_t len,
3938 	uint_t prot,
3939 	uint_t maxprot,
3940 	uint_t flags,
3941 	cred_t *cr,
3942 	caller_context_t *ct)
3943 {
3944 	int error;
3945 	u_longlong_t delta;
3946 
3947 	VOPXID_MAP_CR(vp, cr);
3948 
3949 	error = (*(vp)->v_op->vop_delmap)
3950 	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
3951 
3952 	/*
3953 	 * NFS calls into delmap twice: the first time it simply
3954 	 * establishes a callback mechanism and returns EAGAIN, while
3955 	 * the real work is done upon the second invocation.
3956 	 * We have to detect this here and only decrement the counts upon
3957 	 * the second delmap request.
3958 	 */
3959 	if ((error != EAGAIN) && (vp->v_type == VREG)) {
3960 
3961 		delta = (u_longlong_t)btopr(len);
3962 
3963 		if (flags & MAP_PRIVATE) {
3964 			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3965 			    (int64_t)(-delta));
3966 		} else {
3967 			/*
3968 			 * atomic_add_64 forces the fetch of a 64 bit value
3969 			 * to be atomic on 32 bit machines
3970 			 */
3971 			if (maxprot & PROT_WRITE)
3972 				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
3973 				    (int64_t)(-delta));
3974 			if (maxprot & PROT_READ)
3975 				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3976 				    (int64_t)(-delta));
3977 			if (maxprot & PROT_EXEC)
3978 				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3979 				    (int64_t)(-delta));
3980 		}
3981 	}
3982 	VOPSTATS_UPDATE(vp, delmap);
3983 	return (error);
3984 }
3985 
3986 
3987 int
3988 fop_poll(
3989 	vnode_t *vp,
3990 	short events,
3991 	int anyyet,
3992 	short *reventsp,
3993 	struct pollhead **phpp,
3994 	caller_context_t *ct)
3995 {
3996 	int	err;
3997 
3998 	err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
3999 	VOPSTATS_UPDATE(vp, poll);
4000 	return (err);
4001 }
4002 
4003 int
4004 fop_dump(
4005 	vnode_t *vp,
4006 	caddr_t addr,
4007 	offset_t lbdn,
4008 	offset_t dblks,
4009 	caller_context_t *ct)
4010 {
4011 	int	err;
4012 
4013 	/* ensure lbdn and dblks can be passed safely to bdev_dump */
4014 	if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
4015 		return (EIO);
4016 
4017 	err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
4018 	VOPSTATS_UPDATE(vp, dump);
4019 	return (err);
4020 }
4021 
4022 int
4023 fop_pathconf(
4024 	vnode_t *vp,
4025 	int cmd,
4026 	ulong_t *valp,
4027 	cred_t *cr,
4028 	caller_context_t *ct)
4029 {
4030 	int	err;
4031 
4032 	VOPXID_MAP_CR(vp, cr);
4033 
4034 	err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
4035 	VOPSTATS_UPDATE(vp, pathconf);
4036 	return (err);
4037 }
4038 
4039 int
4040 fop_pageio(
4041 	vnode_t *vp,
4042 	struct page *pp,
4043 	u_offset_t io_off,
4044 	size_t io_len,
4045 	int flags,
4046 	cred_t *cr,
4047 	caller_context_t *ct)
4048 {
4049 	int	err;
4050 
4051 	VOPXID_MAP_CR(vp, cr);
4052 
4053 	err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
4054 	VOPSTATS_UPDATE(vp, pageio);
4055 	return (err);
4056 }
4057 
4058 int
4059 fop_dumpctl(
4060 	vnode_t *vp,
4061 	int action,
4062 	offset_t *blkp,
4063 	caller_context_t *ct)
4064 {
4065 	int	err;
4066 	err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
4067 	VOPSTATS_UPDATE(vp, dumpctl);
4068 	return (err);
4069 }
4070 
4071 void
4072 fop_dispose(
4073 	vnode_t *vp,
4074 	page_t *pp,
4075 	int flag,
4076 	int dn,
4077 	cred_t *cr,
4078 	caller_context_t *ct)
4079 {
4080 	/* Must do stats first since it's possible to lose the vnode */
4081 	VOPSTATS_UPDATE(vp, dispose);
4082 
4083 	VOPXID_MAP_CR(vp, cr);
4084 
4085 	(*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
4086 }
4087 
4088 int
4089 fop_setsecattr(
4090 	vnode_t *vp,
4091 	vsecattr_t *vsap,
4092 	int flag,
4093 	cred_t *cr,
4094 	caller_context_t *ct)
4095 {
4096 	int	err;
4097 
4098 	VOPXID_MAP_CR(vp, cr);
4099 
4100 	/*
4101 	 * We're allowed to skip the ACL check only if a 32-bit ACE mask
4102 	 * was used with VOP_ACCESS() to determine permissions.
4103 	 */
4104 	if ((flag & ATTR_NOACLCHECK) &&
4105 	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4106 		return (EINVAL);
4107 	}
4108 	err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
4109 	VOPSTATS_UPDATE(vp, setsecattr);
4110 	return (err);
4111 }
4112 
4113 int
4114 fop_getsecattr(
4115 	vnode_t *vp,
4116 	vsecattr_t *vsap,
4117 	int flag,
4118 	cred_t *cr,
4119 	caller_context_t *ct)
4120 {
4121 	int	err;
4122 
4123 	/*
4124 	 * We're allowed to skip the ACL check only if a 32-bit ACE mask
4125 	 * was used with VOP_ACCESS() to determine permissions.
4126 	 */
4127 	if ((flag & ATTR_NOACLCHECK) &&
4128 	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4129 		return (EINVAL);
4130 	}
4131 
4132 	VOPXID_MAP_CR(vp, cr);
4133 
4134 	err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
4135 	VOPSTATS_UPDATE(vp, getsecattr);
4136 	return (err);
4137 }
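
/*
 * Illustrative sketch (hypothetical caller): a server that has already
 * performed its own ACE-mask access check may ask the filesystem to skip
 * the ACL check, but only when the filesystem advertises
 * VFSFT_ACEMASKONACCESS; otherwise the fop_setsecattr()/fop_getsecattr()
 * wrappers above reject ATTR_NOACLCHECK with EINVAL.
 */
static int
example_get_acl(vnode_t *vp, vsecattr_t *vsap, cred_t *cr)
{
	int flag = 0;

	if (vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS))
		flag = ATTR_NOACLCHECK;	/* access was checked via ACE masks */

	return (VOP_GETSECATTR(vp, vsap, flag, cr, NULL));
}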
4138 
4139 int
4140 fop_shrlock(
4141 	vnode_t *vp,
4142 	int cmd,
4143 	struct shrlock *shr,
4144 	int flag,
4145 	cred_t *cr,
4146 	caller_context_t *ct)
4147 {
4148 	int	err;
4149 
4150 	VOPXID_MAP_CR(vp, cr);
4151 
4152 	err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
4153 	VOPSTATS_UPDATE(vp, shrlock);
4154 	return (err);
4155 }
4156 
4157 int
4158 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4159     caller_context_t *ct)
4160 {
4161 	int	err;
4162 
4163 	err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
4164 	VOPSTATS_UPDATE(vp, vnevent);
4165 	return (err);
4166 }
4167 
4168 int
4169 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4170     caller_context_t *ct)
4171 {
4172 	int err;
4173 
4174 	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4175 		return (ENOTSUP);
4176 	err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
4177 	VOPSTATS_UPDATE(vp, reqzcbuf);
4178 	return (err);
4179 }
4180 
4181 int
4182 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4183 {
4184 	int err;
4185 
4186 	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4187 		return (ENOTSUP);
4188 	err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
4189 	VOPSTATS_UPDATE(vp, retzcbuf);
4190 	return (err);
4191 }
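
/*
 * Illustrative sketch (hypothetical caller): zero-copy buffers are
 * borrowed with VOP_REQZCBUF() and must be returned with VOP_RETZCBUF().
 * Both wrappers above fail fast with ENOTSUP unless the filesystem
 * advertises VFSFT_ZEROCOPY_SUPPORTED, so callers can fall back to an
 * ordinary copying uio path.
 */
static int
example_zc_read(vnode_t *vp, xuio_t *xuiop, cred_t *cr)
{
	int err;

	err = VOP_REQZCBUF(vp, UIO_READ, xuiop, cr, NULL);
	if (err != 0)
		return (err);	/* e.g. ENOTSUP: use the copying path */

	/* ... consume the loaned buffers described by xuiop ... */

	return (VOP_RETZCBUF(vp, xuiop, cr, NULL));
}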
4192 
4193 /*
4194  * Default destructor
4195  *	Needed because NULL destructor means that the key is unused
4196  */
4197 /* ARGSUSED */
4198 void
4199 vsd_defaultdestructor(void *value)
4200 {}
4201 
4202 /*
4203  * Create a key (index into per vnode array)
4204  *	Locks out vsd_create, vsd_destroy, and vsd_free
4205  *	May allocate memory with lock held
4206  */
4207 void
4208 vsd_create(uint_t *keyp, void (*destructor)(void *))
4209 {
4210 	int	i;
4211 	uint_t	nkeys;
4212 
4213 	/*
4214 	 * if key is allocated, do nothing
4215 	 */
4216 	mutex_enter(&vsd_lock);
4217 	if (*keyp) {
4218 		mutex_exit(&vsd_lock);
4219 		return;
4220 	}
4221 	/*
4222 	 * find an unused key
4223 	 */
4224 	if (destructor == NULL)
4225 		destructor = vsd_defaultdestructor;
4226 
4227 	for (i = 0; i < vsd_nkeys; ++i)
4228 		if (vsd_destructor[i] == NULL)
4229 			break;
4230 
4231 	/*
4232 	 * if no unused keys, increase the size of the destructor array
4233 	 */
4234 	if (i == vsd_nkeys) {
4235 		if ((nkeys = (vsd_nkeys << 1)) == 0)
4236 			nkeys = 1;
4237 		vsd_destructor =
4238 		    (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4239 		    (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4240 		    (size_t)(nkeys * sizeof (void (*)(void *))));
4241 		vsd_nkeys = nkeys;
4242 	}
4243 
4244 	/*
4245 	 * allocate the next available unused key
4246 	 */
4247 	vsd_destructor[i] = destructor;
4248 	*keyp = i + 1;
4249 
4250 	/* create vsd_list, if it doesn't exist */
4251 	if (vsd_list == NULL) {
4252 		vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4253 		list_create(vsd_list, sizeof (struct vsd_node),
4254 		    offsetof(struct vsd_node, vs_nodes));
4255 	}
4256 
4257 	mutex_exit(&vsd_lock);
4258 }
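
/*
 * Illustrative sketch (hypothetical module): keys are handed out as
 * (array index + 1) so a key of zero always means "unallocated", and a
 * second vsd_create() call on an already-initialized key is a no-op.
 */
static uint_t example_vsd_key;		/* 0 until vsd_create() assigns it */

static void
example_destroy_value(void *value)
{
	kmem_free(value, sizeof (uint64_t));	/* hypothetical payload */
}

static void
example_vsd_init(void)
{
	vsd_create(&example_vsd_key, example_destroy_value);
}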
4259 
4260 /*
4261  * Destroy a key
4262  *
4263  * Assumes that the caller is preventing vsd_set and vsd_get
4264  * Locks out vsd_create, vsd_destroy, and vsd_free
4265  * May free memory with lock held
4266  */
4267 void
4268 vsd_destroy(uint_t *keyp)
4269 {
4270 	uint_t key;
4271 	struct vsd_node *vsd;
4272 
4273 	/*
4274 	 * protect the key namespace and our destructor lists
4275 	 */
4276 	mutex_enter(&vsd_lock);
4277 	key = *keyp;
4278 	*keyp = 0;
4279 
4280 	ASSERT(key <= vsd_nkeys);
4281 
4282 	/*
4283 	 * if the key is valid
4284 	 */
4285 	if (key != 0) {
4286 		uint_t k = key - 1;
4287 		/*
4288 		 * for every vnode with VSD, call key's destructor
4289 		 */
4290 		for (vsd = list_head(vsd_list); vsd != NULL;
4291 		    vsd = list_next(vsd_list, vsd)) {
4292 			/*
4293 			 * no VSD for key in this vnode
4294 			 */
4295 			if (key > vsd->vs_nkeys)
4296 				continue;
4297 			/*
4298 			 * call destructor for key
4299 			 */
4300 			if (vsd->vs_value[k] && vsd_destructor[k])
4301 				(*vsd_destructor[k])(vsd->vs_value[k]);
4302 			/*
4303 			 * reset value for key
4304 			 */
4305 			vsd->vs_value[k] = NULL;
4306 		}
4307 		/*
4308 		 * actually free the key (NULL destructor == unused)
4309 		 */
4310 		vsd_destructor[k] = NULL;
4311 	}
4312 
4313 	mutex_exit(&vsd_lock);
4314 }
4315 
4316 /*
4317  * Quickly return the per vnode value that was stored with the specified key
4318  * Assumes the caller is protecting key from vsd_create and vsd_destroy
4319  * Assumes the caller is holding v_vsd_lock to protect the vsd.
4320  */
4321 void *
4322 vsd_get(vnode_t *vp, uint_t key)
4323 {
4324 	struct vsd_node *vsd;
4325 
4326 	ASSERT(vp != NULL);
4327 	ASSERT(mutex_owned(&vp->v_vsd_lock));
4328 
4329 	vsd = vp->v_vsd;
4330 
4331 	if (key && vsd != NULL && key <= vsd->vs_nkeys)
4332 		return (vsd->vs_value[key - 1]);
4333 	return (NULL);
4334 }
4335 
4336 /*
4337  * Set a per vnode value indexed with the specified key
4338  * Assumes the caller is holding v_vsd_lock to protect the vsd.
4339  */
4340 int
4341 vsd_set(vnode_t *vp, uint_t key, void *value)
4342 {
4343 	struct vsd_node *vsd;
4344 
4345 	ASSERT(vp != NULL);
4346 	ASSERT(mutex_owned(&vp->v_vsd_lock));
4347 
4348 	if (key == 0)
4349 		return (EINVAL);
4350 
4351 	vsd = vp->v_vsd;
4352 	if (vsd == NULL)
4353 		vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4354 
4355 	/*
4356 	 * If the vsd was just allocated, vs_nkeys will be 0, so the test
4357 	 * below fails and we fall through to allocate space for the
4358 	 * vs_value array.
4359 	 * If the caller is replacing one value with another, then it is up
4360 	 * to the caller to free/rele/destroy the previous value (if needed).
4361 	 */
4362 	if (key <= vsd->vs_nkeys) {
4363 		vsd->vs_value[key - 1] = value;
4364 		return (0);
4365 	}
4366 
4367 	ASSERT(key <= vsd_nkeys);
4368 
4369 	if (vsd->vs_nkeys == 0) {
4370 		mutex_enter(&vsd_lock);	/* lock out vsd_destroy() */
4371 		/*
4372 		 * Link onto list of all VSD nodes.
4373 		 */
4374 		list_insert_head(vsd_list, vsd);
4375 		mutex_exit(&vsd_lock);
4376 	}
4377 
4378 	/*
4379 	 * Allocate vnode local storage and set the value for key
4380 	 */
4381 	vsd->vs_value = vsd_realloc(vsd->vs_value,
4382 	    vsd->vs_nkeys * sizeof (void *),
4383 	    key * sizeof (void *));
4384 	vsd->vs_nkeys = key;
4385 	vsd->vs_value[key - 1] = value;
4386 
4387 	return (0);
4388 }
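
/*
 * Illustrative sketch (hypothetical consumer): vsd_get() and vsd_set()
 * both assert that the caller holds v_vsd_lock, so a typical
 * lookup-or-create sequence brackets the pair with that mutex.
 */
static void *
example_vsd_lookup_or_create(vnode_t *vp, uint_t key)
{
	void *val;

	mutex_enter(&vp->v_vsd_lock);
	if ((val = vsd_get(vp, key)) == NULL) {
		val = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		(void) vsd_set(vp, key, val);
	}
	mutex_exit(&vp->v_vsd_lock);
	return (val);
}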
4389 
4390 /*
4391  * Called from vn_free() to run the destructor function for each vsd
4392  *	Locks out vsd_create and vsd_destroy
4393  *	Assumes that the destructor *DOES NOT* use vsd
4394  */
4395 void
4396 vsd_free(vnode_t *vp)
4397 {
4398 	int i;
4399 	struct vsd_node *vsd = vp->v_vsd;
4400 
4401 	if (vsd == NULL)
4402 		return;
4403 
4404 	if (vsd->vs_nkeys == 0) {
4405 		kmem_free(vsd, sizeof (*vsd));
4406 		vp->v_vsd = NULL;
4407 		return;
4408 	}
4409 
4410 	/*
4411 	 * lock out vsd_create and vsd_destroy, call
4412 	 * the destructor, and mark the value as destroyed.
4413 	 */
4414 	mutex_enter(&vsd_lock);
4415 
4416 	for (i = 0; i < vsd->vs_nkeys; i++) {
4417 		if (vsd->vs_value[i] && vsd_destructor[i])
4418 			(*vsd_destructor[i])(vsd->vs_value[i]);
4419 		vsd->vs_value[i] = NULL;
4420 	}
4421 
4422 	/*
4423 	 * remove from linked list of VSD nodes
4424 	 */
4425 	list_remove(vsd_list, vsd);
4426 
4427 	mutex_exit(&vsd_lock);
4428 
4429 	/*
4430 	 * free up the VSD
4431 	 */
4432 	kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4433 	kmem_free(vsd, sizeof (struct vsd_node));
4434 	vp->v_vsd = NULL;
4435 }
4436 
4437 /*
4438  * realloc: grow to a new zeroed buffer, copying and freeing the old
4439  */
4440 static void *
4441 vsd_realloc(void *old, size_t osize, size_t nsize)
4442 {
4443 	void *new;
4444 
4445 	new = kmem_zalloc(nsize, KM_SLEEP);
4446 	if (old) {
4447 		bcopy(old, new, osize);
4448 		kmem_free(old, osize);
4449 	}
4450 	return (new);
4451 }
4452 
4453 /*
4454  * Set up the extensible system attribute for creating a reparse point.
4455  * The symlink data 'target' is validated for the proper format of a
4456  * reparse string, and a check is also made to ensure the symlink data
4457  * does not point to an existing file.
4458  *
4459  * Returns 0 if OK, else -1.
4460  */
4461 static int
4462 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4463 {
4464 	xoptattr_t *xoap;
4465 
4466 	if ((!target) || (!vap) || (!xvattr))
4467 		return (-1);
4468 
4469 	/* validate reparse string */
4470 	if (reparse_validate((const char *)target))
4471 		return (-1);
4472 
4473 	xva_init(xvattr);
4474 	xvattr->xva_vattr = *vap;
4475 	xvattr->xva_vattr.va_mask |= AT_XVATTR;
4476 	xoap = xva_getxoptattr(xvattr);
4477 	ASSERT(xoap);
4478 	XVA_SET_REQ(xvattr, XAT_REPARSE);
4479 	xoap->xoa_reparse = 1;
4480 
4481 	return (0);
4482 }
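
/*
 * Illustrative sketch (hypothetical caller): a reparse point is a symlink
 * whose attributes carry the XAT_REPARSE bit, so a creator runs its vattr
 * through fs_reparse_mark() and passes the embedded xva_vattr (by
 * reference, never a copy, since filesystems recover the surrounding
 * xvattr through it) to the symlink code.
 */
static int
example_create_reparse(vnode_t *dvp, char *name, char *target, vattr_t *vap,
    cred_t *cr)
{
	xvattr_t xvattr;

	if (fs_reparse_mark(target, vap, &xvattr) != 0)
		return (EINVAL);	/* malformed reparse string */

	return (VOP_SYMLINK(dvp, name, &xvattr.xva_vattr, target, cr,
	    NULL, 0));
}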
4483 
4484 /*
4485  * Function to check whether a symlink is a reparse point.
4486  * Returns B_TRUE if it is a reparse point, else returns B_FALSE.
4487  */
4488 boolean_t
4489 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4490 {
4491 	xvattr_t xvattr;
4492 	xoptattr_t *xoap;
4493 
4494 	if ((vp->v_type != VLNK) ||
4495 	    !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4496 		return (B_FALSE);
4497 
4498 	xva_init(&xvattr);
4499 	xoap = xva_getxoptattr(&xvattr);
4500 	ASSERT(xoap);
4501 	XVA_SET_REQ(&xvattr, XAT_REPARSE);
4502 
4503 	if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
4504 		return (B_FALSE);
4505 
4506 	if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4507 	    (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4508 		return (B_FALSE);
4509 
4510 	return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
4511 }
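
/*
 * Illustrative sketch (hypothetical caller): path-walking code that wants
 * to divert reparse points before following them as ordinary symlinks can
 * use vn_is_reparse() as a cheap predicate.
 */
static int
example_follow_link(vnode_t *vp, cred_t *cr)
{
	if (vn_is_reparse(vp, cr, NULL))
		return (EREMOTE);	/* hypothetical: hand off to a service */
	return (0);
}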
4512