xref: /titanic_51/usr/src/uts/common/fs/vnode.c (revision d2365b013d4199b49b3a1438d57aea23423e02ad)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 #include <sys/types.h>
40 #include <sys/param.h>
41 #include <sys/t_lock.h>
42 #include <sys/errno.h>
43 #include <sys/cred.h>
44 #include <sys/user.h>
45 #include <sys/uio.h>
46 #include <sys/file.h>
47 #include <sys/pathname.h>
48 #include <sys/vfs.h>
49 #include <sys/vfs_opreg.h>
50 #include <sys/vnode.h>
51 #include <sys/rwstlock.h>
52 #include <sys/fem.h>
53 #include <sys/stat.h>
54 #include <sys/mode.h>
55 #include <sys/conf.h>
56 #include <sys/sysmacros.h>
57 #include <sys/cmn_err.h>
58 #include <sys/systm.h>
59 #include <sys/kmem.h>
60 #include <sys/debug.h>
61 #include <c2/audit.h>
62 #include <sys/acl.h>
63 #include <sys/nbmlock.h>
64 #include <sys/fcntl.h>
65 #include <fs/fs_subr.h>
66 #include <sys/taskq.h>
67 #include <fs/fs_reparse.h>
68 
/*
 * Determine if this vnode is a file that is read-only.
 *
 * Evaluates to non-zero when vp is not a character device (VCHR), block
 * device (VBLK) or FIFO (VFIFO) and vn_is_readonly(vp) reports true.
 * The argument is expanded several times, so it must not have side effects.
 */
#define	ISROFILE(vp)	\
	((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
	    (vp)->v_type != VFIFO && vn_is_readonly(vp))
73 
/* Tunable via /etc/system; used only by admin/install */
int nfs_global_client_only;

/*
 * Array of vopstats_t pointers for per-FS-type vopstats.  This array has
 * the same number of entries as and parallel to the vfssw table.  (Arguably,
 * it could be part of the vfssw table.)  Once it's initialized, it's accessed
 * using the same fstype index that is used to index into the vfssw table.
 * The array itself is allocated (zero-filled, nfstype entries) by
 * vopstats_startup().
 */
vopstats_t **vopstats_fstype;

/*
 * vopstats initialization template used for fast initialization via bcopy().
 * Built once by vopstats_startup(); it stays NULL if vopstats_enabled was
 * zero when vopstats_startup() ran.
 */
static vopstats_t *vs_templatep;

/*
 * Kmem cache handle for vsk_anchor_t allocations; created by
 * vopstats_startup().
 */
kmem_cache_t *vsk_anchor_cache;

/* file events cleanup routine */
extern void free_fopdata(vnode_t *);

/*
 * Root of AVL tree for the kstats associated with vopstats.  Lock protects
 * updates to vskstat_tree.  Nodes are vsk_anchor_t structures ordered by
 * vsk_fsid (see vska_compar()).
 */
avl_tree_t	vskstat_tree;
kmutex_t	vskstat_tree_lock;

/*
 * Global variable which enables/disables the vopstats collection.
 * Non-zero (the default) enables it; when it is zero the vopstats setup,
 * lookup and teardown routines in this file return without doing anything.
 */
int vopstats_enabled = 1;
103 
/*
 * Forward declaration for the internal vnode specific data (vsd) helper.
 */
static void *vsd_realloc(void *, size_t, size_t);

/*
 * Forward declaration for the reparse point helper.
 */
static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);

/*
 * VSD -- VNODE SPECIFIC DATA
 * The v_data pointer is typically used by a file system to store a
 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 * However, there are times when additional project private data needs
 * to be stored separately from the data (node) pointed to by v_data.
 * This additional data could be stored by the file system itself or
 * by a completely different kernel entity.  VSD provides a way for
 * callers to obtain a key and store a pointer to private data associated
 * with a vnode.
 *
 * Callers are responsible for protecting the vsd by holding v_vsd_lock
 * for calls to vsd_set() and vsd_get().
 */

/*
 * vsd_lock protects:
 *   vsd_nkeys - creation and deletion of vsd keys
 *   vsd_list - insertion and deletion of vsd_node in the vsd_list
 *   vsd_destructor - adding and removing destructors to the list
 */
static kmutex_t		vsd_lock;
static uint_t		vsd_nkeys;	 /* size of destructor array */
/* list of vsd_node's; NULL until the list is set up */
static list_t *vsd_list = NULL;
/* per-key destructor funcs (array of pointers to void (*)(void *)) */
static void 		(**vsd_destructor)(void *);
141 
/*
 * The following is the common set of actions needed to update the
 * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 * recording of the bytes transferred.  Since the code is similar
 * but small, it is nearly a duplicate.  Consequently any changes
 * to one may need to be reflected in the other.
 *
 * Nothing is recorded unless the vnode's vfs pointer is non-NULL, that
 * vfs has a non-NULL vfs_implp and has VFS_STATS set, and the vnode type
 * is not VBAD.  When those conditions hold, the fsinfo DTrace probe for
 * the operation is called first, then the per-vfs counter(s) are updated,
 * and finally, if vfs_fstypevsp is non-NULL, the per-fstype counter(s)
 * are updated as well.  The updates are plain ++ and += operations.
 *
 * Macro arguments:
 * vp - Pointer to the vnode
 * counter - Partial name of the vopstats member to update for counts; "n"
 *	is pasted in front of it (e.g. read selects nread).  It is also
 *	pasted onto __dtrace_probe___fsinfo_ to name the probe function.
 * bytecounter - Full name of the vopstats member to update for bytes
 * bytesval - Value to add to the byte counter; also passed to the probe
 *
 * Locals declared inside the expansion:
 * vfsp - The vnode's v_vfsp
 * vsp - Pointer to vopstats structure (first the one embedded in the vfs,
 *	then the per-fstype one from vfs_fstypevsp)
 * stataddr - Address of the per-vfs count, handed to the probe
 *
 * Each macro expands to a brace-enclosed block, and vp and bytesval are
 * expanded more than once, so pass expressions without side effects.
 */

#define	VOPSTATS_UPDATE(vp, counter) {					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
		(*stataddr)++;						\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
		}							\
	}								\
}

#define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
		(*stataddr)++;						\
		vsp->bytecounter.value.ui64 += bytesval;		\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
			vsp->bytecounter.value.ui64 += bytesval;	\
		}							\
	}								\
}
191 
/*
 * If the filesystem does not support XIDs, map the credential:
 * when the vnode's vfs is non-NULL and does not have VFS_XID set, cr is
 * replaced with crgetmapped(cr).  Because the macro assigns to cr, the
 * caller must pass a modifiable lvalue.
 *
 * If the vfsp is NULL the credential is left alone; perhaps we should
 * also map in that case?
 */
#define	VOPXID_MAP_CR(vp, cr)	{					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)		\
		cr = crgetmapped(cr);					\
	}
201 
/*
 * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 * numerical order of S_IFMT and vnode types.)
 *
 * iftovt_tab has 16 entries and yields VNON for slots that have no
 * matching vnode type.
 * NOTE(review): presumably it is indexed by the file-type bits of a mode
 * (the S_IFMT field shifted down to 0..15) -- confirm against the lookup
 * macro in the vnode header.
 *
 * vttoif_tab maps the other way and holds 0 where there is no S_IF* value.
 * NOTE(review): presumably indexed by enum vtype, so its order must track
 * that enum -- confirm against its declaration.
 */
enum vtype iftovt_tab[] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};

ushort_t vttoif_tab[] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
	S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
};

/*
 * The system vnode cache.
 */

kmem_cache_t *vn_cache;
221 
222 
/*
 * Vnode operations vector.
 *
 * Translation table with one entry per vnode operation.  Each entry gives
 * the operation name (VOPNAME_*), the byte offset of that operation's slot
 * within struct vnodeops, and two functions.  The table ends with an entry
 * whose name is NULL.
 *
 * NOTE(review): the meaning of the two functions comes from
 * fs_operation_trans_def_t, which is declared outside this file; presumably
 * the first is the default used when a file system supplies no
 * implementation and the second is the one used for an error-state
 * vector -- confirm against that declaration.  Entries marked
 * "no errors allowed" use the same function in both positions.
 */

static const fs_operation_trans_def_t vn_ops_table[] = {
	VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
	    fs_nosys, fs_nosys,

	VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
	    fs_nosys, fs_nosys,

	VOPNAME_READ, offsetof(struct vnodeops, vop_read),
	    fs_nosys, fs_nosys,

	VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
	    fs_nosys, fs_nosys,

	VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
	    fs_nosys, fs_nosys,

	VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
	    fs_setfl, fs_nosys,

	VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
	    fs_nosys, fs_nosys,

	VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
	    fs_nosys, fs_nosys,

	VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
	    fs_nosys, fs_nosys,

	VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
	    fs_nosys, fs_nosys,

	VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
	    fs_nosys, fs_nosys,

	VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
	    fs_nosys, fs_nosys,

	VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
	    fs_nosys, fs_nosys,

	VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
	    fs_nosys, fs_nosys,

	VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
	    fs_nosys, fs_nosys,

	VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
	    fs_nosys, fs_nosys,

	VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
	    fs_nosys, fs_nosys,

	VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
	    fs_nosys, fs_nosys,

	VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
	    fs_nosys, fs_nosys,

	VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
	    fs_nosys, fs_nosys,

	VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
	    fs_nosys, fs_nosys,

	VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
	    fs_nosys, fs_nosys,

	VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
	    fs_rwlock, fs_rwlock,

	VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
	    (fs_generic_func_p) fs_rwunlock,
	    (fs_generic_func_p) fs_rwunlock,	/* no errors allowed */

	VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
	    fs_nosys, fs_nosys,

	VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
	    fs_cmp, fs_cmp,		/* no errors allowed */

	VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
	    fs_frlock, fs_nosys,

	VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
	    fs_nosys, fs_nosys,

	VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
	    fs_nosys, fs_nosys,

	VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
	    fs_nosys, fs_nosys,

	VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
	    fs_nosys, fs_nosys,

	VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
	    (fs_generic_func_p) fs_nosys_map,
	    (fs_generic_func_p) fs_nosys_map,

	VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
	    (fs_generic_func_p) fs_nosys_addmap,
	    (fs_generic_func_p) fs_nosys_addmap,

	VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
	    fs_nosys, fs_nosys,

	VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
	    (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,

	VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
	    fs_nosys, fs_nosys,

	VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
	    fs_pathconf, fs_nosys,

	VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
	    fs_nosys, fs_nosys,

	VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
	    fs_nosys, fs_nosys,

	VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
	    (fs_generic_func_p) fs_dispose,
	    (fs_generic_func_p) fs_nodispose,

	VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
	    fs_nosys, fs_nosys,

	VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
	    fs_fab_acl, fs_nosys,

	VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
	    fs_shrlock, fs_nosys,

	VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
	    (fs_generic_func_p) fs_vnevent_nosupport,
	    (fs_generic_func_p) fs_vnevent_nosupport,

	VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
	    fs_nosys, fs_nosys,

	VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
	    fs_nosys, fs_nosys,

	NULL, 0, NULL, NULL
};
373 
374 /* Extensible attribute (xva) routines. */
375 
376 /*
377  * Zero out the structure, set the size of the requested/returned bitmaps,
378  * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
379  * to the returned attributes array.
380  */
381 void
382 xva_init(xvattr_t *xvap)
383 {
384 	bzero(xvap, sizeof (xvattr_t));
385 	xvap->xva_mapsize = XVA_MAPSIZE;
386 	xvap->xva_magic = XVA_MAGIC;
387 	xvap->xva_vattr.va_mask = AT_XVATTR;
388 	xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
389 }
390 
391 /*
392  * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
393  * structure.  Otherwise, returns NULL.
394  */
395 xoptattr_t *
396 xva_getxoptattr(xvattr_t *xvap)
397 {
398 	xoptattr_t *xoap = NULL;
399 	if (xvap->xva_vattr.va_mask & AT_XVATTR)
400 		xoap = &xvap->xva_xoptattrs;
401 	return (xoap);
402 }
403 
404 /*
405  * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
406  * We use the f_fsid reported by VFS_STATVFS() since we use that for the
407  * kstat name.
408  */
409 static int
410 vska_compar(const void *n1, const void *n2)
411 {
412 	int ret;
413 	ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
414 	ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
415 
416 	if (p1 < p2) {
417 		ret = -1;
418 	} else if (p1 > p2) {
419 		ret = 1;
420 	} else {
421 		ret = 0;
422 	}
423 
424 	return (ret);
425 }
426 
427 /*
428  * Used to create a single template which will be bcopy()ed to a newly
429  * allocated vsanchor_combo_t structure in new_vsanchor(), below.
430  */
431 static vopstats_t *
432 create_vopstats_template()
433 {
434 	vopstats_t		*vsp;
435 
436 	vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
437 	bzero(vsp, sizeof (*vsp));	/* Start fresh */
438 
439 	/* VOP_OPEN */
440 	kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
441 	/* VOP_CLOSE */
442 	kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
443 	/* VOP_READ I/O */
444 	kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
445 	kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
446 	/* VOP_WRITE I/O */
447 	kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
448 	kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
449 	/* VOP_IOCTL */
450 	kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
451 	/* VOP_SETFL */
452 	kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
453 	/* VOP_GETATTR */
454 	kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
455 	/* VOP_SETATTR */
456 	kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
457 	/* VOP_ACCESS */
458 	kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
459 	/* VOP_LOOKUP */
460 	kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
461 	/* VOP_CREATE */
462 	kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
463 	/* VOP_REMOVE */
464 	kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
465 	/* VOP_LINK */
466 	kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
467 	/* VOP_RENAME */
468 	kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
469 	/* VOP_MKDIR */
470 	kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
471 	/* VOP_RMDIR */
472 	kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
473 	/* VOP_READDIR I/O */
474 	kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
475 	kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
476 	    KSTAT_DATA_UINT64);
477 	/* VOP_SYMLINK */
478 	kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
479 	/* VOP_READLINK */
480 	kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
481 	/* VOP_FSYNC */
482 	kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
483 	/* VOP_INACTIVE */
484 	kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
485 	/* VOP_FID */
486 	kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
487 	/* VOP_RWLOCK */
488 	kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
489 	/* VOP_RWUNLOCK */
490 	kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
491 	/* VOP_SEEK */
492 	kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
493 	/* VOP_CMP */
494 	kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
495 	/* VOP_FRLOCK */
496 	kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
497 	/* VOP_SPACE */
498 	kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
499 	/* VOP_REALVP */
500 	kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
501 	/* VOP_GETPAGE */
502 	kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
503 	/* VOP_PUTPAGE */
504 	kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
505 	/* VOP_MAP */
506 	kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
507 	/* VOP_ADDMAP */
508 	kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
509 	/* VOP_DELMAP */
510 	kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
511 	/* VOP_POLL */
512 	kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
513 	/* VOP_DUMP */
514 	kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
515 	/* VOP_PATHCONF */
516 	kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
517 	/* VOP_PAGEIO */
518 	kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
519 	/* VOP_DUMPCTL */
520 	kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
521 	/* VOP_DISPOSE */
522 	kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
523 	/* VOP_SETSECATTR */
524 	kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
525 	/* VOP_GETSECATTR */
526 	kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
527 	/* VOP_SHRLOCK */
528 	kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
529 	/* VOP_VNEVENT */
530 	kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
531 	/* VOP_REQZCBUF */
532 	kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
533 	/* VOP_RETZCBUF */
534 	kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
535 
536 	return (vsp);
537 }
538 
539 /*
540  * Creates a kstat structure associated with a vopstats structure.
541  */
542 kstat_t *
543 new_vskstat(char *ksname, vopstats_t *vsp)
544 {
545 	kstat_t		*ksp;
546 
547 	if (!vopstats_enabled) {
548 		return (NULL);
549 	}
550 
551 	ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
552 	    sizeof (vopstats_t)/sizeof (kstat_named_t),
553 	    KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
554 	if (ksp) {
555 		ksp->ks_data = vsp;
556 		kstat_install(ksp);
557 	}
558 
559 	return (ksp);
560 }
561 
562 /*
563  * Called from vfsinit() to initialize the support mechanisms for vopstats
564  */
565 void
566 vopstats_startup()
567 {
568 	if (!vopstats_enabled)
569 		return;
570 
571 	/*
572 	 * Creates the AVL tree which holds per-vfs vopstat anchors.  This
573 	 * is necessary since we need to check if a kstat exists before we
574 	 * attempt to create it.  Also, initialize its lock.
575 	 */
576 	avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
577 	    offsetof(vsk_anchor_t, vsk_node));
578 	mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
579 
580 	vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
581 	    sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
582 	    NULL, NULL, 0);
583 
584 	/*
585 	 * Set up the array of pointers for the vopstats-by-FS-type.
586 	 * The entries will be allocated/initialized as each file system
587 	 * goes through modload/mod_installfs.
588 	 */
589 	vopstats_fstype = (vopstats_t **)kmem_zalloc(
590 	    (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
591 
592 	/* Set up the global vopstats initialization template */
593 	vs_templatep = create_vopstats_template();
594 }
595 
596 /*
597  * We need to have the all of the counters zeroed.
598  * The initialization of the vopstats_t includes on the order of
599  * 50 calls to kstat_named_init().  Rather that do that on every call,
600  * we do it once in a template (vs_templatep) then bcopy it over.
601  */
602 void
603 initialize_vopstats(vopstats_t *vsp)
604 {
605 	if (vsp == NULL)
606 		return;
607 
608 	bcopy(vs_templatep, vsp, sizeof (vopstats_t));
609 }
610 
611 /*
612  * If possible, determine which vopstats by fstype to use and
613  * return a pointer to the caller.
614  */
615 vopstats_t *
616 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
617 {
618 	int		fstype = 0;	/* Index into vfssw[] */
619 	vopstats_t	*vsp = NULL;
620 
621 	if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
622 	    !vopstats_enabled)
623 		return (NULL);
624 	/*
625 	 * Set up the fstype.  We go to so much trouble because all versions
626 	 * of NFS use the same fstype in their vfs even though they have
627 	 * distinct entries in the vfssw[] table.
628 	 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
629 	 */
630 	if (vswp) {
631 		fstype = vswp - vfssw;	/* Gets us the index */
632 	} else {
633 		fstype = vfsp->vfs_fstype;
634 	}
635 
636 	/*
637 	 * Point to the per-fstype vopstats. The only valid values are
638 	 * non-zero positive values less than the number of vfssw[] table
639 	 * entries.
640 	 */
641 	if (fstype > 0 && fstype < nfstype) {
642 		vsp = vopstats_fstype[fstype];
643 	}
644 
645 	return (vsp);
646 }
647 
648 /*
649  * Generate a kstat name, create the kstat structure, and allocate a
650  * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
651  * to the caller.  This must only be called from a mount.
652  */
653 vsk_anchor_t *
654 get_vskstat_anchor(vfs_t *vfsp)
655 {
656 	char		kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
657 	statvfs64_t	statvfsbuf;		/* Needed to find f_fsid */
658 	vsk_anchor_t	*vskp = NULL;		/* vfs <--> kstat anchor */
659 	kstat_t		*ksp;			/* Ptr to new kstat */
660 	avl_index_t	where;			/* Location in the AVL tree */
661 
662 	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
663 	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
664 		return (NULL);
665 
666 	/* Need to get the fsid to build a kstat name */
667 	if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
668 		/* Create a name for our kstats based on fsid */
669 		(void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
670 		    VOPSTATS_STR, statvfsbuf.f_fsid);
671 
672 		/* Allocate and initialize the vsk_anchor_t */
673 		vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
674 		bzero(vskp, sizeof (*vskp));
675 		vskp->vsk_fsid = statvfsbuf.f_fsid;
676 
677 		mutex_enter(&vskstat_tree_lock);
678 		if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
679 			avl_insert(&vskstat_tree, vskp, where);
680 			mutex_exit(&vskstat_tree_lock);
681 
682 			/*
683 			 * Now that we've got the anchor in the AVL
684 			 * tree, we can create the kstat.
685 			 */
686 			ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
687 			if (ksp) {
688 				vskp->vsk_ksp = ksp;
689 			}
690 		} else {
691 			/* Oops, found one! Release memory and lock. */
692 			mutex_exit(&vskstat_tree_lock);
693 			kmem_cache_free(vsk_anchor_cache, vskp);
694 			vskp = NULL;
695 		}
696 	}
697 	return (vskp);
698 }
699 
700 /*
701  * We're in the process of tearing down the vfs and need to cleanup
702  * the data structures associated with the vopstats. Must only be called
703  * from dounmount().
704  */
705 void
706 teardown_vopstats(vfs_t *vfsp)
707 {
708 	vsk_anchor_t	*vskap;
709 	avl_index_t	where;
710 
711 	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
712 	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
713 		return;
714 
715 	/* This is a safe check since VFS_STATS must be set (see above) */
716 	if ((vskap = vfsp->vfs_vskap) == NULL)
717 		return;
718 
719 	/* Whack the pointer right away */
720 	vfsp->vfs_vskap = NULL;
721 
722 	/* Lock the tree, remove the node, and delete the kstat */
723 	mutex_enter(&vskstat_tree_lock);
724 	if (avl_find(&vskstat_tree, vskap, &where)) {
725 		avl_remove(&vskstat_tree, vskap);
726 	}
727 
728 	if (vskap->vsk_ksp) {
729 		kstat_delete(vskap->vsk_ksp);
730 	}
731 	mutex_exit(&vskstat_tree_lock);
732 
733 	kmem_cache_free(vsk_anchor_cache, vskap);
734 }
735 
736 /*
737  * Read or write a vnode.  Called from kernel code.
738  */
739 int
740 vn_rdwr(
741 	enum uio_rw rw,
742 	struct vnode *vp,
743 	caddr_t base,
744 	ssize_t len,
745 	offset_t offset,
746 	enum uio_seg seg,
747 	int ioflag,
748 	rlim64_t ulimit,	/* meaningful only if rw is UIO_WRITE */
749 	cred_t *cr,
750 	ssize_t *residp)
751 {
752 	struct uio uio;
753 	struct iovec iov;
754 	int error;
755 	int in_crit = 0;
756 
757 	if (rw == UIO_WRITE && ISROFILE(vp))
758 		return (EROFS);
759 
760 	if (len < 0)
761 		return (EIO);
762 
763 	VOPXID_MAP_CR(vp, cr);
764 
765 	iov.iov_base = base;
766 	iov.iov_len = len;
767 	uio.uio_iov = &iov;
768 	uio.uio_iovcnt = 1;
769 	uio.uio_loffset = offset;
770 	uio.uio_segflg = (short)seg;
771 	uio.uio_resid = len;
772 	uio.uio_llimit = ulimit;
773 
774 	/*
775 	 * We have to enter the critical region before calling VOP_RWLOCK
776 	 * to avoid a deadlock with ufs.
777 	 */
778 	if (nbl_need_check(vp)) {
779 		int svmand;
780 
781 		nbl_start_crit(vp, RW_READER);
782 		in_crit = 1;
783 		error = nbl_svmand(vp, cr, &svmand);
784 		if (error != 0)
785 			goto done;
786 		if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
787 		    uio.uio_offset, uio.uio_resid, svmand, NULL)) {
788 			error = EACCES;
789 			goto done;
790 		}
791 	}
792 
793 	(void) VOP_RWLOCK(vp,
794 	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
795 	if (rw == UIO_WRITE) {
796 		uio.uio_fmode = FWRITE;
797 		uio.uio_extflg = UIO_COPY_DEFAULT;
798 		error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
799 	} else {
800 		uio.uio_fmode = FREAD;
801 		uio.uio_extflg = UIO_COPY_CACHED;
802 		error = VOP_READ(vp, &uio, ioflag, cr, NULL);
803 	}
804 	VOP_RWUNLOCK(vp,
805 	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
806 	if (residp)
807 		*residp = uio.uio_resid;
808 	else if (uio.uio_resid)
809 		error = EIO;
810 
811 done:
812 	if (in_crit)
813 		nbl_end_crit(vp);
814 	return (error);
815 }
816 
817 /*
818  * Release a vnode.  Call VOP_INACTIVE on last reference or
819  * decrement reference count.
820  *
821  * To avoid race conditions, the v_count is left at 1 for
822  * the call to VOP_INACTIVE. This prevents another thread
823  * from reclaiming and releasing the vnode *before* the
824  * VOP_INACTIVE routine has a chance to destroy the vnode.
825  * We can't have more than 1 thread calling VOP_INACTIVE
826  * on a vnode.
827  */
828 void
829 vn_rele(vnode_t *vp)
830 {
831 	VERIFY(vp->v_count > 0);
832 	mutex_enter(&vp->v_lock);
833 	if (vp->v_count == 1) {
834 		mutex_exit(&vp->v_lock);
835 		VOP_INACTIVE(vp, CRED(), NULL);
836 		return;
837 	}
838 	vp->v_count--;
839 	mutex_exit(&vp->v_lock);
840 }
841 
842 /*
843  * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
844  * as a single reference, so v_count is not decremented until the last DNLC hold
845  * is released. This makes it possible to distinguish vnodes that are referenced
846  * only by the DNLC.
847  */
848 void
849 vn_rele_dnlc(vnode_t *vp)
850 {
851 	VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
852 	mutex_enter(&vp->v_lock);
853 	if (--vp->v_count_dnlc == 0) {
854 		if (vp->v_count == 1) {
855 			mutex_exit(&vp->v_lock);
856 			VOP_INACTIVE(vp, CRED(), NULL);
857 			return;
858 		}
859 		vp->v_count--;
860 	}
861 	mutex_exit(&vp->v_lock);
862 }
863 
864 /*
865  * Like vn_rele() except that it clears v_stream under v_lock.
866  * This is used by sockfs when it dismantels the association between
867  * the sockfs node and the vnode in the underlaying file system.
868  * v_lock has to be held to prevent a thread coming through the lookupname
869  * path from accessing a stream head that is going away.
870  */
871 void
872 vn_rele_stream(vnode_t *vp)
873 {
874 	VERIFY(vp->v_count > 0);
875 	mutex_enter(&vp->v_lock);
876 	vp->v_stream = NULL;
877 	if (vp->v_count == 1) {
878 		mutex_exit(&vp->v_lock);
879 		VOP_INACTIVE(vp, CRED(), NULL);
880 		return;
881 	}
882 	vp->v_count--;
883 	mutex_exit(&vp->v_lock);
884 }
885 
886 static void
887 vn_rele_inactive(vnode_t *vp)
888 {
889 	VOP_INACTIVE(vp, CRED(), NULL);
890 }
891 
892 /*
893  * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
894  * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
895  * the file system as a result of releasing the vnode. Note, file systems
896  * already have to handle the race where the vnode is incremented before the
897  * inactive routine is called and does its locking.
898  *
899  * Warning: Excessive use of this routine can lead to performance problems.
900  * This is because taskqs throttle back allocation if too many are created.
901  */
902 void
903 vn_rele_async(vnode_t *vp, taskq_t *taskq)
904 {
905 	VERIFY(vp->v_count > 0);
906 	mutex_enter(&vp->v_lock);
907 	if (vp->v_count == 1) {
908 		mutex_exit(&vp->v_lock);
909 		VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
910 		    vp, TQ_SLEEP) != NULL);
911 		return;
912 	}
913 	vp->v_count--;
914 	mutex_exit(&vp->v_lock);
915 }
916 
917 int
918 vn_open(
919 	char *pnamep,
920 	enum uio_seg seg,
921 	int filemode,
922 	int createmode,
923 	struct vnode **vpp,
924 	enum create crwhy,
925 	mode_t umask)
926 {
927 	return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
928 	    umask, NULL, -1));
929 }
930 
931 
/*
 * Open/create a vnode.
 * This may be callable by the kernel, the only known use
 * of user context being that the current user credentials
 * are used for permissions.  crwhy is defined iff filemode & FCREAT.
 *
 * Arguments:
 *	pnamep, seg	path name and the address space it lives in
 *	filemode	F* open flags (FREAD, FWRITE, FCREAT, FTRUNC, ...)
 *	createmode	permission bits used when FCREAT creates the file
 *	vpp		on success, receives the held, opened vnode
 *	crwhy		passed to vn_createat() when FCREAT is set
 *	umask		passed to vn_createat() when FCREAT is set
 *	startvp		starting vnode handed to the lookup/create routines
 *	fd		stored as the share-reservation owner id (sl_id)
 *
 * Returns 0 with *vpp set, or an errno value.  On failure any open,
 * share reservation and vnode hold taken here are undone before
 * returning, and *vpp is not written by this function.
 *
 * An ESTALE failure is retried from "top" for as long as
 * fs_need_estale_retry() allows.
 */
int
vn_openat(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask,
	struct vnode *startvp,
	int fd)
{
	struct vnode *vp;
	int mode;
	int accessflags;
	int error;
	int in_crit = 0;
	int open_done = 0;
	int shrlock_done = 0;
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;
	struct shrlock shr;
	struct shr_locowner shr_own;

	/*
	 * Translate the open flags into the V* access bits that are
	 * checked with VOP_ACCESS() (or handed to vn_createat()).
	 * Truncation counts as a write.
	 */
	mode = 0;
	accessflags = 0;
	if (filemode & FREAD)
		mode |= VREAD;
	if (filemode & (FWRITE|FTRUNC))
		mode |= VWRITE;
	if (filemode & FXATTRDIROPEN)
		mode |= VEXEC;

	/* symlink interpretation */
	if (filemode & FNOFOLLOW)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;

	if (filemode & FAPPEND)
		accessflags |= V_APPEND;

top:
	if (filemode & FCREAT) {
		enum vcexcl excl;

		/*
		 * Wish to create a file.
		 */
		vattr.va_type = VREG;
		vattr.va_mode = createmode;
		vattr.va_mask = AT_TYPE|AT_MODE;
		if (filemode & FTRUNC) {
			vattr.va_size = 0;
			vattr.va_mask |= AT_SIZE;
		}
		if (filemode & FEXCL)
			excl = EXCL;
		else
			excl = NONEXCL;

		/*
		 * FTRUNC and FEXCL have already been folded into vattr
		 * and excl, so they are masked out of the flags passed on.
		 */
		if (error =
		    vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
		    (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
			return (error);
	} else {
		/*
		 * Wish to open a file.  Just look it up.
		 */
		if (error = lookupnameat(pnamep, seg, follow,
		    NULLVPP, &vp, startvp)) {
			if ((error == ESTALE) &&
			    fs_need_estale_retry(estale_retry++))
				goto top;
			return (error);
		}

		/*
		 * Get the attributes to check whether file is large.
		 * We do this only if the FOFFMAX flag is not set and
		 * only for regular files.
		 */

		if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
			vattr.va_mask = AT_SIZE;
			if ((error = VOP_GETATTR(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
				/*
				 * Large File API - a regular open fails
				 * if the FOFFMAX flag is NOT set in the
				 * file mode and the file is larger than
				 * MAXOFF32_T.
				 */
				error = EOVERFLOW;
				goto out;
			}
		}
		/*
		 * Can't write directories, active texts, or
		 * read-only filesystems.  Can't truncate files
		 * on which mandatory locking is in effect.
		 */
		if (filemode & (FWRITE|FTRUNC)) {
			/*
			 * Allow writable directory if VDIROPEN flag is set.
			 */
			if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
				error = EISDIR;
				goto out;
			}
			if (ISROFILE(vp)) {
				error = EROFS;
				goto out;
			}
			/*
			 * Can't truncate files on which
			 * sysv mandatory locking is in effect.
			 */
			if (filemode & FTRUNC) {
				vnode_t *rvp;

				/*
				 * The lock list is examined on the real
				 * vnode; fall back to vp itself when
				 * VOP_REALVP() fails.
				 */
				if (VOP_REALVP(vp, &rvp, NULL) != 0)
					rvp = vp;
				if (rvp->v_filocks != NULL) {
					vattr.va_mask = AT_MODE;
					if ((error = VOP_GETATTR(vp,
					    &vattr, 0, CRED(), NULL)) == 0 &&
					    MANDLOCK(vp, vattr.va_mode))
						error = EAGAIN;
				}
			}
			/* Catches both a GETATTR failure and EAGAIN. */
			if (error)
				goto out;
		}
		/*
		 * Check permissions.
		 */
		if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
			goto out;
	}

	/*
	 * Do remaining checks for FNOFOLLOW and FNOLINKS.
	 */
	if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
		error = ELOOP;
		goto out;
	}
	if (filemode & FNOLINKS) {
		vattr.va_mask = AT_NLINK;
		if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
			goto out;
		}
		if (vattr.va_nlink != 1) {
			error = EMLINK;
			goto out;
		}
	}

	/*
	 * Opening a socket corresponding to the AF_UNIX pathname
	 * in the filesystem name space is not supported.
	 * However, VSOCK nodes in namefs are supported in order
	 * to make fattach work for sockets.
	 *
	 * XXX This uses VOP_REALVP to distinguish between
	 * an unopened namefs node (where VOP_REALVP returns a
	 * different VSOCK vnode) and a VSOCK created by vn_create
	 * in some file system (where VOP_REALVP would never return
	 * a different vnode).
	 */
	if (vp->v_type == VSOCK) {
		struct vnode *nvp;

		error = VOP_REALVP(vp, &nvp, NULL);
		if (error != 0 || nvp == NULL || nvp == vp ||
		    nvp->v_type != VSOCK) {
			error = EOPNOTSUPP;
			goto out;
		}
	}

	if ((vp->v_type == VREG) && nbl_need_check(vp)) {
		/* get share reservation */
		shr.s_access = 0;
		if (filemode & FWRITE)
			shr.s_access |= F_WRACC;
		if (filemode & FREAD)
			shr.s_access |= F_RDACC;
		shr.s_deny = 0;
		shr.s_sysid = 0;
		shr.s_pid = ttoproc(curthread)->p_pid;
		/* The reservation owner is identified by (pid, fd). */
		shr_own.sl_pid = shr.s_pid;
		shr_own.sl_id = fd;
		shr.s_own_len = sizeof (shr_own);
		shr.s_owner = (caddr_t)&shr_own;
		error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
		    NULL);
		if (error)
			goto out;
		shrlock_done = 1;

		/* nbmand conflict check if truncating file */
		if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
			/*
			 * The critical region is held across VOP_OPEN()
			 * and the truncating VOP_SETATTR() below; it is
			 * left at "out".
			 */
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;

			vattr.va_mask = AT_SIZE;
			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
				goto out;
			if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
			    NULL)) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * Do opening protocol.
	 * vp is passed by reference, so VOP_OPEN() may replace it.
	 */
	error = VOP_OPEN(&vp, filemode, CRED(), NULL);
	if (error)
		goto out;
	open_done = 1;

	/*
	 * Truncate if required.
	 * (With FCREAT the truncation was requested through
	 * vn_createat() via AT_SIZE above.)
	 */
	if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
		vattr.va_size = 0;
		vattr.va_mask = AT_SIZE;
		if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
			goto out;
	}
out:
	ASSERT(vp->v_count > 0);

	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (error) {
		/* Undo, in reverse order, whatever was accomplished. */
		if (open_done) {
			(void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
			    NULL);
			open_done = 0;
			/*
			 * NOTE(review): the share reservation is not
			 * explicitly released after a close; presumably
			 * VOP_CLOSE() takes care of it - confirm.
			 */
			shrlock_done = 0;
		}
		if (shrlock_done) {
			(void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
			    NULL);
			shrlock_done = 0;
		}

		/*
		 * The following clause was added to handle a problem
		 * with NFS consistency.  It is possible that a lookup
		 * of the file to be opened succeeded, but the file
		 * itself doesn't actually exist on the server.  This
		 * is chiefly due to the DNLC containing an entry for
		 * the file which has been removed on the server.  In
		 * this case, we just start over.  If there was some
		 * other cause for the ESTALE error, then the lookup
		 * of the file will fail and the error will be returned
		 * above instead of looping around from here.
		 */
		VN_RELE(vp);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
	} else
		*vpp = vp;
	return (error);
}
1214 
1215 /*
1216  * The following two accessor functions are for the NFSv4 server.  Since there
1217  * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1218  * vnode open counts correct when a client "upgrades" an open or does an
1219  * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
1220  * open mode (add or subtract read or write), but also change the share/deny
1221  * modes.  However, share reservations are not integrated with OPEN, yet, so
1222  * we need to handle each separately.  These functions are cleaner than having
1223  * the NFS server manipulate the counts directly, however, nobody else should
1224  * use these functions.
1225  */
1226 void
1227 vn_open_upgrade(
1228 	vnode_t *vp,
1229 	int filemode)
1230 {
1231 	ASSERT(vp->v_type == VREG);
1232 
1233 	if (filemode & FREAD)
1234 		atomic_add_32(&(vp->v_rdcnt), 1);
1235 	if (filemode & FWRITE)
1236 		atomic_add_32(&(vp->v_wrcnt), 1);
1237 
1238 }
1239 
1240 void
1241 vn_open_downgrade(
1242 	vnode_t *vp,
1243 	int filemode)
1244 {
1245 	ASSERT(vp->v_type == VREG);
1246 
1247 	if (filemode & FREAD) {
1248 		ASSERT(vp->v_rdcnt > 0);
1249 		atomic_add_32(&(vp->v_rdcnt), -1);
1250 	}
1251 	if (filemode & FWRITE) {
1252 		ASSERT(vp->v_wrcnt > 0);
1253 		atomic_add_32(&(vp->v_wrcnt), -1);
1254 	}
1255 
1256 }
1257 
1258 int
1259 vn_create(
1260 	char *pnamep,
1261 	enum uio_seg seg,
1262 	struct vattr *vap,
1263 	enum vcexcl excl,
1264 	int mode,
1265 	struct vnode **vpp,
1266 	enum create why,
1267 	int flag,
1268 	mode_t umask)
1269 {
1270 	return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1271 	    umask, NULL));
1272 }
1273 
1274 /*
1275  * Create a vnode (makenode).
1276  */
1277 int
1278 vn_createat(
1279 	char *pnamep,
1280 	enum uio_seg seg,
1281 	struct vattr *vap,
1282 	enum vcexcl excl,
1283 	int mode,
1284 	struct vnode **vpp,
1285 	enum create why,
1286 	int flag,
1287 	mode_t umask,
1288 	struct vnode *startvp)
1289 {
1290 	struct vnode *dvp;	/* ptr to parent dir vnode */
1291 	struct vnode *vp = NULL;
1292 	struct pathname pn;
1293 	int error;
1294 	int in_crit = 0;
1295 	struct vattr vattr;
1296 	enum symfollow follow;
1297 	int estale_retry = 0;
1298 
1299 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1300 
1301 	/* symlink interpretation */
1302 	if ((flag & FNOFOLLOW) || excl == EXCL)
1303 		follow = NO_FOLLOW;
1304 	else
1305 		follow = FOLLOW;
1306 	flag &= ~(FNOFOLLOW|FNOLINKS);
1307 
1308 top:
1309 	/*
1310 	 * Lookup directory.
1311 	 * If new object is a file, call lower level to create it.
1312 	 * Note that it is up to the lower level to enforce exclusive
1313 	 * creation, if the file is already there.
1314 	 * This allows the lower level to do whatever
1315 	 * locking or protocol that is needed to prevent races.
1316 	 * If the new object is directory call lower level to make
1317 	 * the new directory, with "." and "..".
1318 	 */
1319 	if (error = pn_get(pnamep, seg, &pn))
1320 		return (error);
1321 	if (audit_active)
1322 		audit_vncreate_start();
1323 	dvp = NULL;
1324 	*vpp = NULL;
1325 	/*
1326 	 * lookup will find the parent directory for the vnode.
1327 	 * When it is done the pn holds the name of the entry
1328 	 * in the directory.
1329 	 * If this is a non-exclusive create we also find the node itself.
1330 	 */
1331 	error = lookuppnat(&pn, NULL, follow, &dvp,
1332 	    (excl == EXCL) ? NULLVPP : vpp, startvp);
1333 	if (error) {
1334 		pn_free(&pn);
1335 		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1336 			goto top;
1337 		if (why == CRMKDIR && error == EINVAL)
1338 			error = EEXIST;		/* SVID */
1339 		return (error);
1340 	}
1341 
1342 	if (why != CRMKNOD)
1343 		vap->va_mode &= ~VSVTX;
1344 
1345 	/*
1346 	 * If default ACLs are defined for the directory don't apply the
1347 	 * umask if umask is passed.
1348 	 */
1349 
1350 	if (umask) {
1351 
1352 		vsecattr_t vsec;
1353 
1354 		vsec.vsa_aclcnt = 0;
1355 		vsec.vsa_aclentp = NULL;
1356 		vsec.vsa_dfaclcnt = 0;
1357 		vsec.vsa_dfaclentp = NULL;
1358 		vsec.vsa_mask = VSA_DFACLCNT;
1359 		error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
1360 		/*
1361 		 * If error is ENOSYS then treat it as no error
1362 		 * Don't want to force all file systems to support
1363 		 * aclent_t style of ACL's.
1364 		 */
1365 		if (error == ENOSYS)
1366 			error = 0;
1367 		if (error) {
1368 			if (*vpp != NULL)
1369 				VN_RELE(*vpp);
1370 			goto out;
1371 		} else {
1372 			/*
1373 			 * Apply the umask if no default ACLs.
1374 			 */
1375 			if (vsec.vsa_dfaclcnt == 0)
1376 				vap->va_mode &= ~umask;
1377 
1378 			/*
1379 			 * VOP_GETSECATTR() may have allocated memory for
1380 			 * ACLs we didn't request, so double-check and
1381 			 * free it if necessary.
1382 			 */
1383 			if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1384 				kmem_free((caddr_t)vsec.vsa_aclentp,
1385 				    vsec.vsa_aclcnt * sizeof (aclent_t));
1386 			if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1387 				kmem_free((caddr_t)vsec.vsa_dfaclentp,
1388 				    vsec.vsa_dfaclcnt * sizeof (aclent_t));
1389 		}
1390 	}
1391 
1392 	/*
1393 	 * In general we want to generate EROFS if the file system is
1394 	 * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
1395 	 * documents the open system call, and it says that O_CREAT has no
1396 	 * effect if the file already exists.  Bug 1119649 states
1397 	 * that open(path, O_CREAT, ...) fails when attempting to open an
1398 	 * existing file on a read only file system.  Thus, the first part
1399 	 * of the following if statement has 3 checks:
1400 	 *	if the file exists &&
1401 	 *		it is being open with write access &&
1402 	 *		the file system is read only
1403 	 *	then generate EROFS
1404 	 */
1405 	if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1406 	    (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1407 		if (*vpp)
1408 			VN_RELE(*vpp);
1409 		error = EROFS;
1410 	} else if (excl == NONEXCL && *vpp != NULL) {
1411 		vnode_t *rvp;
1412 
1413 		/*
1414 		 * File already exists.  If a mandatory lock has been
1415 		 * applied, return error.
1416 		 */
1417 		vp = *vpp;
1418 		if (VOP_REALVP(vp, &rvp, NULL) != 0)
1419 			rvp = vp;
1420 		if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1421 			nbl_start_crit(vp, RW_READER);
1422 			in_crit = 1;
1423 		}
1424 		if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1425 			vattr.va_mask = AT_MODE|AT_SIZE;
1426 			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
1427 				goto out;
1428 			}
1429 			if (MANDLOCK(vp, vattr.va_mode)) {
1430 				error = EAGAIN;
1431 				goto out;
1432 			}
1433 			/*
1434 			 * File cannot be truncated if non-blocking mandatory
1435 			 * locks are currently on the file.
1436 			 */
1437 			if ((vap->va_mask & AT_SIZE) && in_crit) {
1438 				u_offset_t offset;
1439 				ssize_t length;
1440 
1441 				offset = vap->va_size > vattr.va_size ?
1442 				    vattr.va_size : vap->va_size;
1443 				length = vap->va_size > vattr.va_size ?
1444 				    vap->va_size - vattr.va_size :
1445 				    vattr.va_size - vap->va_size;
1446 				if (nbl_conflict(vp, NBL_WRITE, offset,
1447 				    length, 0, NULL)) {
1448 					error = EACCES;
1449 					goto out;
1450 				}
1451 			}
1452 		}
1453 
1454 		/*
1455 		 * If the file is the root of a VFS, we've crossed a
1456 		 * mount point and the "containing" directory that we
1457 		 * acquired above (dvp) is irrelevant because it's in
1458 		 * a different file system.  We apply VOP_CREATE to the
1459 		 * target itself instead of to the containing directory
1460 		 * and supply a null path name to indicate (conventionally)
1461 		 * the node itself as the "component" of interest.
1462 		 *
1463 		 * The intercession of the file system is necessary to
1464 		 * ensure that the appropriate permission checks are
1465 		 * done.
1466 		 */
1467 		if (vp->v_flag & VROOT) {
1468 			ASSERT(why != CRMKDIR);
1469 			error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
1470 			    CRED(), flag, NULL, NULL);
1471 			/*
1472 			 * If the create succeeded, it will have created
1473 			 * a new reference to the vnode.  Give up the
1474 			 * original reference.  The assertion should not
1475 			 * get triggered because NBMAND locks only apply to
1476 			 * VREG files.  And if in_crit is non-zero for some
1477 			 * reason, detect that here, rather than when we
1478 			 * deference a null vp.
1479 			 */
1480 			ASSERT(in_crit == 0);
1481 			VN_RELE(vp);
1482 			vp = NULL;
1483 			goto out;
1484 		}
1485 
1486 		/*
1487 		 * Large File API - non-large open (FOFFMAX flag not set)
1488 		 * of regular file fails if the file size exceeds MAXOFF32_T.
1489 		 */
1490 		if (why != CRMKDIR &&
1491 		    !(flag & FOFFMAX) &&
1492 		    (vp->v_type == VREG)) {
1493 			vattr.va_mask = AT_SIZE;
1494 			if ((error = VOP_GETATTR(vp, &vattr, 0,
1495 			    CRED(), NULL))) {
1496 				goto out;
1497 			}
1498 			if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
1499 				error = EOVERFLOW;
1500 				goto out;
1501 			}
1502 		}
1503 	}
1504 
1505 	if (error == 0) {
1506 		/*
1507 		 * Call mkdir() if specified, otherwise create().
1508 		 */
1509 		int must_be_dir = pn_fixslash(&pn);	/* trailing '/'? */
1510 
1511 		if (why == CRMKDIR)
1512 			/*
1513 			 * N.B., if vn_createat() ever requests
1514 			 * case-insensitive behavior then it will need
1515 			 * to be passed to VOP_MKDIR().  VOP_CREATE()
1516 			 * will already get it via "flag"
1517 			 */
1518 			error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
1519 			    NULL, 0, NULL);
1520 		else if (!must_be_dir)
1521 			error = VOP_CREATE(dvp, pn.pn_path, vap,
1522 			    excl, mode, vpp, CRED(), flag, NULL, NULL);
1523 		else
1524 			error = ENOTDIR;
1525 	}
1526 
1527 out:
1528 
1529 	if (audit_active)
1530 		audit_vncreate_finish(*vpp, error);
1531 	if (in_crit) {
1532 		nbl_end_crit(vp);
1533 		in_crit = 0;
1534 	}
1535 	if (vp != NULL) {
1536 		VN_RELE(vp);
1537 		vp = NULL;
1538 	}
1539 	pn_free(&pn);
1540 	VN_RELE(dvp);
1541 	/*
1542 	 * The following clause was added to handle a problem
1543 	 * with NFS consistency.  It is possible that a lookup
1544 	 * of the file to be created succeeded, but the file
1545 	 * itself doesn't actually exist on the server.  This
1546 	 * is chiefly due to the DNLC containing an entry for
1547 	 * the file which has been removed on the server.  In
1548 	 * this case, we just start over.  If there was some
1549 	 * other cause for the ESTALE error, then the lookup
1550 	 * of the file will fail and the error will be returned
1551 	 * above instead of looping around from here.
1552 	 */
1553 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1554 		goto top;
1555 	return (error);
1556 }
1557 
1558 int
1559 vn_link(char *from, char *to, enum uio_seg seg)
1560 {
1561 	struct vnode *fvp;		/* from vnode ptr */
1562 	struct vnode *tdvp;		/* to directory vnode ptr */
1563 	struct pathname pn;
1564 	int error;
1565 	struct vattr vattr;
1566 	dev_t fsid;
1567 	int estale_retry = 0;
1568 
1569 top:
1570 	fvp = tdvp = NULL;
1571 	if (error = pn_get(to, seg, &pn))
1572 		return (error);
1573 	if (error = lookupname(from, seg, NO_FOLLOW, NULLVPP, &fvp))
1574 		goto out;
1575 	if (error = lookuppn(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP))
1576 		goto out;
1577 	/*
1578 	 * Make sure both source vnode and target directory vnode are
1579 	 * in the same vfs and that it is writeable.
1580 	 */
1581 	vattr.va_mask = AT_FSID;
1582 	if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
1583 		goto out;
1584 	fsid = vattr.va_fsid;
1585 	vattr.va_mask = AT_FSID;
1586 	if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
1587 		goto out;
1588 	if (fsid != vattr.va_fsid) {
1589 		error = EXDEV;
1590 		goto out;
1591 	}
1592 	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
1593 		error = EROFS;
1594 		goto out;
1595 	}
1596 	/*
1597 	 * Do the link.
1598 	 */
1599 	(void) pn_fixslash(&pn);
1600 	error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
1601 out:
1602 	pn_free(&pn);
1603 	if (fvp)
1604 		VN_RELE(fvp);
1605 	if (tdvp)
1606 		VN_RELE(tdvp);
1607 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1608 		goto top;
1609 	return (error);
1610 }
1611 
1612 int
1613 vn_rename(char *from, char *to, enum uio_seg seg)
1614 {
1615 	return (vn_renameat(NULL, from, NULL, to, seg));
1616 }
1617 
1618 int
1619 vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
1620 		char *tname, enum uio_seg seg)
1621 {
1622 	int error;
1623 	struct vattr vattr;
1624 	struct pathname fpn;		/* from pathname */
1625 	struct pathname tpn;		/* to pathname */
1626 	dev_t fsid;
1627 	int in_crit_src, in_crit_targ;
1628 	vnode_t *fromvp, *fvp;
1629 	vnode_t *tovp, *targvp;
1630 	int estale_retry = 0;
1631 
1632 top:
1633 	fvp = fromvp = tovp = targvp = NULL;
1634 	in_crit_src = in_crit_targ = 0;
1635 	/*
1636 	 * Get to and from pathnames.
1637 	 */
1638 	if (error = pn_get(fname, seg, &fpn))
1639 		return (error);
1640 	if (error = pn_get(tname, seg, &tpn)) {
1641 		pn_free(&fpn);
1642 		return (error);
1643 	}
1644 
1645 	/*
1646 	 * First we need to resolve the correct directories
1647 	 * The passed in directories may only be a starting point,
1648 	 * but we need the real directories the file(s) live in.
1649 	 * For example the fname may be something like usr/lib/sparc
1650 	 * and we were passed in the / directory, but we need to
1651 	 * use the lib directory for the rename.
1652 	 */
1653 
1654 	if (audit_active)
1655 		audit_setfsat_path(1);
1656 	/*
1657 	 * Lookup to and from directories.
1658 	 */
1659 	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
1660 		goto out;
1661 	}
1662 
1663 	/*
1664 	 * Make sure there is an entry.
1665 	 */
1666 	if (fvp == NULL) {
1667 		error = ENOENT;
1668 		goto out;
1669 	}
1670 
1671 	if (audit_active)
1672 		audit_setfsat_path(3);
1673 	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
1674 		goto out;
1675 	}
1676 
1677 	/*
1678 	 * Make sure both the from vnode directory and the to directory
1679 	 * are in the same vfs and the to directory is writable.
1680 	 * We check fsid's, not vfs pointers, so loopback fs works.
1681 	 */
1682 	if (fromvp != tovp) {
1683 		vattr.va_mask = AT_FSID;
1684 		if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
1685 			goto out;
1686 		fsid = vattr.va_fsid;
1687 		vattr.va_mask = AT_FSID;
1688 		if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
1689 			goto out;
1690 		if (fsid != vattr.va_fsid) {
1691 			error = EXDEV;
1692 			goto out;
1693 		}
1694 	}
1695 
1696 	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
1697 		error = EROFS;
1698 		goto out;
1699 	}
1700 
1701 	if (targvp && (fvp != targvp)) {
1702 		nbl_start_crit(targvp, RW_READER);
1703 		in_crit_targ = 1;
1704 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1705 			error = EACCES;
1706 			goto out;
1707 		}
1708 	}
1709 
1710 	if (nbl_need_check(fvp)) {
1711 		nbl_start_crit(fvp, RW_READER);
1712 		in_crit_src = 1;
1713 		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
1714 			error = EACCES;
1715 			goto out;
1716 		}
1717 	}
1718 
1719 	/*
1720 	 * Do the rename.
1721 	 */
1722 	(void) pn_fixslash(&tpn);
1723 	error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
1724 	    NULL, 0);
1725 
1726 out:
1727 	pn_free(&fpn);
1728 	pn_free(&tpn);
1729 	if (in_crit_src)
1730 		nbl_end_crit(fvp);
1731 	if (in_crit_targ)
1732 		nbl_end_crit(targvp);
1733 	if (fromvp)
1734 		VN_RELE(fromvp);
1735 	if (tovp)
1736 		VN_RELE(tovp);
1737 	if (targvp)
1738 		VN_RELE(targvp);
1739 	if (fvp)
1740 		VN_RELE(fvp);
1741 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1742 		goto top;
1743 	return (error);
1744 }
1745 
1746 /*
1747  * Remove a file or directory.
1748  */
1749 int
1750 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1751 {
1752 	return (vn_removeat(NULL, fnamep, seg, dirflag));
1753 }
1754 
/*
 * Remove the file or directory named by fnamep, with the lookup starting
 * at startvp.
 *
 * dirflag selects the operation: RMDIRECTORY uses VOP_RMDIR() (and fails
 * with ENOTDIR if the entry is not a directory); anything else uses
 * VOP_REMOVE().
 *
 * If the entry is the root of a mounted file system the call fails with
 * EBUSY unless that file system is marked VFS_UNLINKABLE, in which case
 * it is unmounted first and the covered vnode is removed instead.
 *
 * Returns 0 or an errno value.  An ESTALE failure restarts the whole
 * operation for as long as fs_need_estale_retry() allows.
 */
int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;
	int estale_retry = 0;

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail the operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * no one has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		/* Re-read v_vfsp now that v_lock is held. */
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		/*
		 * Swap our hold from the namefs root to the covered vnode
		 * before unmounting; vp is repointed at coveredvp below so
		 * that the hold is released at "out" on every path.
		 */
		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		/* The dounmount() result is only acted on here. */
		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/* Remember the type; vp may be released before it is needed. */
	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling VOP_REMOVE.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			/*
			 * Take a hold on the process's current directory
			 * under p_lock and pass it to VOP_RMDIR().
			 */
			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1942 
1943 /*
1944  * Utility function to compare equality of vnodes.
1945  * Compare the underlying real vnodes, if there are underlying vnodes.
1946  * This is a more thorough comparison than the VN_CMP() macro provides.
1947  */
1948 int
1949 vn_compare(vnode_t *vp1, vnode_t *vp2)
1950 {
1951 	vnode_t *realvp;
1952 
1953 	if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
1954 		vp1 = realvp;
1955 	if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
1956 		vp2 = realvp;
1957 	return (VN_CMP(vp1, vp2));
1958 }
1959 
/*
 * The number of locks to hash into.  VN_VFSLOCKS_HASH() uses this value
 * as a bit mask, so it must be a power of 2 minus 1.  (The historical
 * note that it "should probably also be prime" does not hold for the
 * current value: 1023 = 3 * 11 * 31.)
 */
#define	NUM_BUCKETS	1023

/*
 * One hash bucket: a mutex protecting a singly linked list of lock
 * entries.  The pad array rounds the members up to 64 bytes, matching
 * the 64-byte alignment requested for the bucket array below.
 * NOTE(review): this assumes the compiler inserts no padding of its own
 * between the members - confirm.
 */
struct  vn_vfslocks_bucket {
	kmutex_t vb_lock;
	vn_vfslocks_entry_t *vb_list;
	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
};

/*
 * Total number of buckets will be NUM_BUCKETS + 1 .
 */

#pragma	align	64(vn_vfslocks_buckets)
static	struct vn_vfslocks_bucket	vn_vfslocks_buckets[NUM_BUCKETS + 1];

/* Number of low-order pointer bits discarded before hashing. */
#define	VN_VFSLOCKS_SHIFT	9

/*
 * Map a vfs/vnode pointer to a bucket index in [0, NUM_BUCKETS]:
 * shift the address right by VN_VFSLOCKS_SHIFT and mask with NUM_BUCKETS.
 */
#define	VN_VFSLOCKS_HASH(vfsvpptr)	\
	((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
1983 
1984 /*
1985  * vn_vfslocks_getlock() uses an HASH scheme to generate
1986  * rwstlock using vfs/vnode pointer passed to it.
1987  *
1988  * vn_vfslocks_rele() releases a reference in the
1989  * HASH table which allows the entry allocated by
1990  * vn_vfslocks_getlock() to be freed at a later
1991  * stage when the refcount drops to zero.
1992  */
1993 
1994 vn_vfslocks_entry_t *
1995 vn_vfslocks_getlock(void *vfsvpptr)
1996 {
1997 	struct vn_vfslocks_bucket *bp;
1998 	vn_vfslocks_entry_t *vep;
1999 	vn_vfslocks_entry_t *tvep;
2000 
2001 	ASSERT(vfsvpptr != NULL);
2002 	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];
2003 
2004 	mutex_enter(&bp->vb_lock);
2005 	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2006 		if (vep->ve_vpvfs == vfsvpptr) {
2007 			vep->ve_refcnt++;
2008 			mutex_exit(&bp->vb_lock);
2009 			return (vep);
2010 		}
2011 	}
2012 	mutex_exit(&bp->vb_lock);
2013 	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
2014 	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
2015 	vep->ve_vpvfs = (char *)vfsvpptr;
2016 	vep->ve_refcnt = 1;
2017 	mutex_enter(&bp->vb_lock);
2018 	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
2019 		if (tvep->ve_vpvfs == vfsvpptr) {
2020 			tvep->ve_refcnt++;
2021 			mutex_exit(&bp->vb_lock);
2022 
2023 			/*
2024 			 * There is already an entry in the hash
2025 			 * destroy what we just allocated.
2026 			 */
2027 			rwst_destroy(&vep->ve_lock);
2028 			kmem_free(vep, sizeof (*vep));
2029 			return (tvep);
2030 		}
2031 	}
2032 	vep->ve_next = bp->vb_list;
2033 	bp->vb_list = vep;
2034 	mutex_exit(&bp->vb_lock);
2035 	return (vep);
2036 }
2037 
2038 void
2039 vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
2040 {
2041 	struct vn_vfslocks_bucket *bp;
2042 	vn_vfslocks_entry_t *vep;
2043 	vn_vfslocks_entry_t *pvep;
2044 
2045 	ASSERT(vepent != NULL);
2046 	ASSERT(vepent->ve_vpvfs != NULL);
2047 
2048 	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];
2049 
2050 	mutex_enter(&bp->vb_lock);
2051 	vepent->ve_refcnt--;
2052 
2053 	if ((int32_t)vepent->ve_refcnt < 0)
2054 		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");
2055 
2056 	if (vepent->ve_refcnt == 0) {
2057 		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2058 			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
2059 				if (bp->vb_list == vep)
2060 					bp->vb_list = vep->ve_next;
2061 				else {
2062 					/* LINTED */
2063 					pvep->ve_next = vep->ve_next;
2064 				}
2065 				mutex_exit(&bp->vb_lock);
2066 				rwst_destroy(&vep->ve_lock);
2067 				kmem_free(vep, sizeof (*vep));
2068 				return;
2069 			}
2070 			pvep = vep;
2071 		}
2072 		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
2073 	}
2074 	mutex_exit(&bp->vb_lock);
2075 }
2076 
2077 /*
2078  * vn_vfswlock_wait is used to implement a lock which is logically a writers
2079  * lock protecting the v_vfsmountedhere field.
2080  * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
2081  * except that it blocks to acquire the lock VVFSLOCK.
2082  *
2083  * traverse() and routines re-implementing part of traverse (e.g. autofs)
2084  * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
2085  * need the non-blocking version of the writers lock i.e. vn_vfswlock
2086  */
2087 int
2088 vn_vfswlock_wait(vnode_t *vp)
2089 {
2090 	int retval;
2091 	vn_vfslocks_entry_t *vpvfsentry;
2092 	ASSERT(vp != NULL);
2093 
2094 	vpvfsentry = vn_vfslocks_getlock(vp);
2095 	retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2096 
2097 	if (retval == EINTR) {
2098 		vn_vfslocks_rele(vpvfsentry);
2099 		return (EINTR);
2100 	}
2101 	return (retval);
2102 }
2103 
2104 int
2105 vn_vfsrlock_wait(vnode_t *vp)
2106 {
2107 	int retval;
2108 	vn_vfslocks_entry_t *vpvfsentry;
2109 	ASSERT(vp != NULL);
2110 
2111 	vpvfsentry = vn_vfslocks_getlock(vp);
2112 	retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2113 
2114 	if (retval == EINTR) {
2115 		vn_vfslocks_rele(vpvfsentry);
2116 		return (EINTR);
2117 	}
2118 
2119 	return (retval);
2120 }
2121 
2122 
2123 /*
2124  * vn_vfswlock is used to implement a lock which is logically a writers lock
2125  * protecting the v_vfsmountedhere field.
2126  */
2127 int
2128 vn_vfswlock(vnode_t *vp)
2129 {
2130 	vn_vfslocks_entry_t *vpvfsentry;
2131 
2132 	/*
2133 	 * If vp is NULL then somebody is trying to lock the covered vnode
2134 	 * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2135 	 * only happen when unmounting /.  Since that operation will fail
2136 	 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2137 	 */
2138 	if (vp == NULL)
2139 		return (EBUSY);
2140 
2141 	vpvfsentry = vn_vfslocks_getlock(vp);
2142 
2143 	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2144 		return (0);
2145 
2146 	vn_vfslocks_rele(vpvfsentry);
2147 	return (EBUSY);
2148 }
2149 
2150 int
2151 vn_vfsrlock(vnode_t *vp)
2152 {
2153 	vn_vfslocks_entry_t *vpvfsentry;
2154 
2155 	/*
2156 	 * If vp is NULL then somebody is trying to lock the covered vnode
2157 	 * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2158 	 * only happen when unmounting /.  Since that operation will fail
2159 	 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2160 	 */
2161 	if (vp == NULL)
2162 		return (EBUSY);
2163 
2164 	vpvfsentry = vn_vfslocks_getlock(vp);
2165 
2166 	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2167 		return (0);
2168 
2169 	vn_vfslocks_rele(vpvfsentry);
2170 	return (EBUSY);
2171 }
2172 
2173 void
2174 vn_vfsunlock(vnode_t *vp)
2175 {
2176 	vn_vfslocks_entry_t *vpvfsentry;
2177 
2178 	/*
2179 	 * ve_refcnt needs to be decremented twice.
2180 	 * 1. To release refernce after a call to vn_vfslocks_getlock()
2181 	 * 2. To release the reference from the locking routines like
2182 	 *    vn_vfsrlock/vn_vfswlock etc,.
2183 	 */
2184 	vpvfsentry = vn_vfslocks_getlock(vp);
2185 	vn_vfslocks_rele(vpvfsentry);
2186 
2187 	rwst_exit(&vpvfsentry->ve_lock);
2188 	vn_vfslocks_rele(vpvfsentry);
2189 }
2190 
2191 int
2192 vn_vfswlock_held(vnode_t *vp)
2193 {
2194 	int held;
2195 	vn_vfslocks_entry_t *vpvfsentry;
2196 
2197 	ASSERT(vp != NULL);
2198 
2199 	vpvfsentry = vn_vfslocks_getlock(vp);
2200 	held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2201 
2202 	vn_vfslocks_rele(vpvfsentry);
2203 	return (held);
2204 }
2205 
2206 
2207 int
2208 vn_make_ops(
2209 	const char *name,			/* Name of file system */
2210 	const fs_operation_def_t *templ,	/* Operation specification */
2211 	vnodeops_t **actual)			/* Return the vnodeops */
2212 {
2213 	int unused_ops;
2214 	int error;
2215 
2216 	*actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2217 
2218 	(*actual)->vnop_name = name;
2219 
2220 	error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2221 	if (error) {
2222 		kmem_free(*actual, sizeof (vnodeops_t));
2223 	}
2224 
2225 #if DEBUG
2226 	if (unused_ops != 0)
2227 		cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2228 		    "but not used", name, unused_ops);
2229 #endif
2230 
2231 	return (error);
2232 }
2233 
2234 /*
2235  * Free the vnodeops created as a result of vn_make_ops()
2236  */
2237 void
2238 vn_freevnodeops(vnodeops_t *vnops)
2239 {
2240 	kmem_free(vnops, sizeof (vnodeops_t));
2241 }
2242 
2243 /*
2244  * Vnode cache.
2245  */
2246 
2247 /* ARGSUSED */
2248 static int
2249 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2250 {
2251 	struct vnode *vp;
2252 
2253 	vp = buf;
2254 
2255 	mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2256 	mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2257 	cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2258 	rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2259 	vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
2260 	vp->v_path = NULL;
2261 	vp->v_mpssdata = NULL;
2262 	vp->v_vsd = NULL;
2263 	vp->v_fopdata = NULL;
2264 
2265 	return (0);
2266 }
2267 
2268 /* ARGSUSED */
2269 static void
2270 vn_cache_destructor(void *buf, void *cdrarg)
2271 {
2272 	struct vnode *vp;
2273 
2274 	vp = buf;
2275 
2276 	rw_destroy(&vp->v_nbllock);
2277 	cv_destroy(&vp->v_cv);
2278 	mutex_destroy(&vp->v_vsd_lock);
2279 	mutex_destroy(&vp->v_lock);
2280 }
2281 
2282 void
2283 vn_create_cache(void)
2284 {
2285 	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode), 64,
2286 	    vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2287 	    NULL, 0);
2288 }
2289 
2290 void
2291 vn_destroy_cache(void)
2292 {
2293 	kmem_cache_destroy(vn_cache);
2294 }
2295 
2296 /*
2297  * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2298  * cached by the file system and vnodes remain associated.
2299  */
2300 void
2301 vn_recycle(vnode_t *vp)
2302 {
2303 	ASSERT(vp->v_pages == NULL);
2304 
2305 	/*
2306 	 * XXX - This really belongs in vn_reinit(), but we have some issues
2307 	 * with the counts.  Best to have it here for clean initialization.
2308 	 */
2309 	vp->v_rdcnt = 0;
2310 	vp->v_wrcnt = 0;
2311 	vp->v_mmap_read = 0;
2312 	vp->v_mmap_write = 0;
2313 
2314 	/*
2315 	 * If FEM was in use, make sure everything gets cleaned up
2316 	 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2317 	 * constructor.
2318 	 */
2319 	if (vp->v_femhead) {
2320 		/* XXX - There should be a free_femhead() that does all this */
2321 		ASSERT(vp->v_femhead->femh_list == NULL);
2322 		mutex_destroy(&vp->v_femhead->femh_lock);
2323 		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2324 		vp->v_femhead = NULL;
2325 	}
2326 	if (vp->v_path) {
2327 		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2328 		vp->v_path = NULL;
2329 	}
2330 
2331 	if (vp->v_fopdata != NULL) {
2332 		free_fopdata(vp);
2333 	}
2334 	vp->v_mpssdata = NULL;
2335 	vsd_free(vp);
2336 }
2337 
2338 /*
2339  * Used to reset the vnode fields including those that are directly accessible
2340  * as well as those which require an accessor function.
2341  *
2342  * Does not initialize:
2343  *	synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2344  *	v_data (since FS-nodes and vnodes point to each other and should
2345  *		be updated simultaneously)
2346  *	v_op (in case someone needs to make a VOP call on this object)
2347  */
2348 void
2349 vn_reinit(vnode_t *vp)
2350 {
2351 	vp->v_count = 1;
2352 	vp->v_count_dnlc = 0;
2353 	vp->v_vfsp = NULL;
2354 	vp->v_stream = NULL;
2355 	vp->v_vfsmountedhere = NULL;
2356 	vp->v_flag = 0;
2357 	vp->v_type = VNON;
2358 	vp->v_rdev = NODEV;
2359 
2360 	vp->v_filocks = NULL;
2361 	vp->v_shrlocks = NULL;
2362 	vp->v_pages = NULL;
2363 
2364 	vp->v_locality = NULL;
2365 	vp->v_xattrdir = NULL;
2366 
2367 	/* Handles v_femhead, v_path, and the r/w/map counts */
2368 	vn_recycle(vp);
2369 }
2370 
2371 vnode_t *
2372 vn_alloc(int kmflag)
2373 {
2374 	vnode_t *vp;
2375 
2376 	vp = kmem_cache_alloc(vn_cache, kmflag);
2377 
2378 	if (vp != NULL) {
2379 		vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
2380 		vp->v_fopdata = NULL;
2381 		vn_reinit(vp);
2382 	}
2383 
2384 	return (vp);
2385 }
2386 
2387 void
2388 vn_free(vnode_t *vp)
2389 {
2390 	ASSERT(vp->v_shrlocks == NULL);
2391 	ASSERT(vp->v_filocks == NULL);
2392 
2393 	/*
2394 	 * Some file systems call vn_free() with v_count of zero,
2395 	 * some with v_count of 1.  In any case, the value should
2396 	 * never be anything else.
2397 	 */
2398 	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2399 	ASSERT(vp->v_count_dnlc == 0);
2400 	if (vp->v_path != NULL) {
2401 		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2402 		vp->v_path = NULL;
2403 	}
2404 
2405 	/* If FEM was in use, make sure everything gets cleaned up */
2406 	if (vp->v_femhead) {
2407 		/* XXX - There should be a free_femhead() that does all this */
2408 		ASSERT(vp->v_femhead->femh_list == NULL);
2409 		mutex_destroy(&vp->v_femhead->femh_lock);
2410 		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2411 		vp->v_femhead = NULL;
2412 	}
2413 
2414 	if (vp->v_fopdata != NULL) {
2415 		free_fopdata(vp);
2416 	}
2417 	vp->v_mpssdata = NULL;
2418 	vsd_free(vp);
2419 	kmem_cache_free(vn_cache, vp);
2420 }
2421 
2422 /*
2423  * vnode status changes, should define better states than 1, 0.
2424  */
2425 void
2426 vn_reclaim(vnode_t *vp)
2427 {
2428 	vfs_t   *vfsp = vp->v_vfsp;
2429 
2430 	if (vfsp == NULL ||
2431 	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2432 		return;
2433 	}
2434 	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2435 }
2436 
2437 void
2438 vn_idle(vnode_t *vp)
2439 {
2440 	vfs_t   *vfsp = vp->v_vfsp;
2441 
2442 	if (vfsp == NULL ||
2443 	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2444 		return;
2445 	}
2446 	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2447 }
2448 void
2449 vn_exists(vnode_t *vp)
2450 {
2451 	vfs_t   *vfsp = vp->v_vfsp;
2452 
2453 	if (vfsp == NULL ||
2454 	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2455 		return;
2456 	}
2457 	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2458 }
2459 
2460 void
2461 vn_invalid(vnode_t *vp)
2462 {
2463 	vfs_t   *vfsp = vp->v_vfsp;
2464 
2465 	if (vfsp == NULL ||
2466 	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2467 		return;
2468 	}
2469 	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2470 }
2471 
2472 /* Vnode event notification */
2473 
2474 int
2475 vnevent_support(vnode_t *vp, caller_context_t *ct)
2476 {
2477 	if (vp == NULL)
2478 		return (EINVAL);
2479 
2480 	return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2481 }
2482 
2483 void
2484 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2485 {
2486 	if (vp == NULL || vp->v_femhead == NULL) {
2487 		return;
2488 	}
2489 	(void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2490 }
2491 
2492 void
2493 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2494     caller_context_t *ct)
2495 {
2496 	if (vp == NULL || vp->v_femhead == NULL) {
2497 		return;
2498 	}
2499 	(void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2500 }
2501 
2502 void
2503 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
2504 {
2505 	if (vp == NULL || vp->v_femhead == NULL) {
2506 		return;
2507 	}
2508 	(void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
2509 }
2510 
2511 void
2512 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2513 {
2514 	if (vp == NULL || vp->v_femhead == NULL) {
2515 		return;
2516 	}
2517 	(void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2518 }
2519 
2520 void
2521 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2522 {
2523 	if (vp == NULL || vp->v_femhead == NULL) {
2524 		return;
2525 	}
2526 	(void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2527 }
2528 
2529 void
2530 vnevent_create(vnode_t *vp, caller_context_t *ct)
2531 {
2532 	if (vp == NULL || vp->v_femhead == NULL) {
2533 		return;
2534 	}
2535 	(void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2536 }
2537 
2538 void
2539 vnevent_link(vnode_t *vp, caller_context_t *ct)
2540 {
2541 	if (vp == NULL || vp->v_femhead == NULL) {
2542 		return;
2543 	}
2544 	(void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2545 }
2546 
2547 void
2548 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2549 {
2550 	if (vp == NULL || vp->v_femhead == NULL) {
2551 		return;
2552 	}
2553 	(void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2554 }
2555 
2556 /*
2557  * Vnode accessors.
2558  */
2559 
2560 int
2561 vn_is_readonly(vnode_t *vp)
2562 {
2563 	return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2564 }
2565 
2566 int
2567 vn_has_flocks(vnode_t *vp)
2568 {
2569 	return (vp->v_filocks != NULL);
2570 }
2571 
2572 int
2573 vn_has_mandatory_locks(vnode_t *vp, int mode)
2574 {
2575 	return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2576 }
2577 
2578 int
2579 vn_has_cached_data(vnode_t *vp)
2580 {
2581 	return (vp->v_pages != NULL);
2582 }
2583 
2584 /*
2585  * Return 0 if the vnode in question shouldn't be permitted into a zone via
2586  * zone_enter(2).
2587  */
2588 int
2589 vn_can_change_zones(vnode_t *vp)
2590 {
2591 	struct vfssw *vswp;
2592 	int allow = 1;
2593 	vnode_t *rvp;
2594 
2595 	if (nfs_global_client_only != 0)
2596 		return (1);
2597 
2598 	/*
2599 	 * We always want to look at the underlying vnode if there is one.
2600 	 */
2601 	if (VOP_REALVP(vp, &rvp, NULL) != 0)
2602 		rvp = vp;
2603 	/*
2604 	 * Some pseudo filesystems (including doorfs) don't actually register
2605 	 * their vfsops_t, so the following may return NULL; we happily let
2606 	 * such vnodes switch zones.
2607 	 */
2608 	vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2609 	if (vswp != NULL) {
2610 		if (vswp->vsw_flag & VSW_NOTZONESAFE)
2611 			allow = 0;
2612 		vfs_unrefvfssw(vswp);
2613 	}
2614 	return (allow);
2615 }
2616 
2617 /*
2618  * Return nonzero if the vnode is a mount point, zero if not.
2619  */
2620 int
2621 vn_ismntpt(vnode_t *vp)
2622 {
2623 	return (vp->v_vfsmountedhere != NULL);
2624 }
2625 
2626 /* Retrieve the vfs (if any) mounted on this vnode */
2627 vfs_t *
2628 vn_mountedvfs(vnode_t *vp)
2629 {
2630 	return (vp->v_vfsmountedhere);
2631 }
2632 
2633 /*
2634  * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2635  */
2636 int
2637 vn_in_dnlc(vnode_t *vp)
2638 {
2639 	return (vp->v_count_dnlc > 0);
2640 }
2641 
2642 /*
2643  * vn_has_other_opens() checks whether a particular file is opened by more than
2644  * just the caller and whether the open is for read and/or write.
2645  * This routine is for calling after the caller has already called VOP_OPEN()
2646  * and the caller wishes to know if they are the only one with it open for
2647  * the mode(s) specified.
2648  *
2649  * Vnode counts are only kept on regular files (v_type=VREG).
2650  */
2651 int
2652 vn_has_other_opens(
2653 	vnode_t *vp,
2654 	v_mode_t mode)
2655 {
2656 
2657 	ASSERT(vp != NULL);
2658 
2659 	switch (mode) {
2660 	case V_WRITE:
2661 		if (vp->v_wrcnt > 1)
2662 			return (V_TRUE);
2663 		break;
2664 	case V_RDORWR:
2665 		if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2666 			return (V_TRUE);
2667 		break;
2668 	case V_RDANDWR:
2669 		if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2670 			return (V_TRUE);
2671 		break;
2672 	case V_READ:
2673 		if (vp->v_rdcnt > 1)
2674 			return (V_TRUE);
2675 		break;
2676 	}
2677 
2678 	return (V_FALSE);
2679 }
2680 
2681 /*
2682  * vn_is_opened() checks whether a particular file is opened and
2683  * whether the open is for read and/or write.
2684  *
2685  * Vnode counts are only kept on regular files (v_type=VREG).
2686  */
2687 int
2688 vn_is_opened(
2689 	vnode_t *vp,
2690 	v_mode_t mode)
2691 {
2692 
2693 	ASSERT(vp != NULL);
2694 
2695 	switch (mode) {
2696 	case V_WRITE:
2697 		if (vp->v_wrcnt)
2698 			return (V_TRUE);
2699 		break;
2700 	case V_RDANDWR:
2701 		if (vp->v_rdcnt && vp->v_wrcnt)
2702 			return (V_TRUE);
2703 		break;
2704 	case V_RDORWR:
2705 		if (vp->v_rdcnt || vp->v_wrcnt)
2706 			return (V_TRUE);
2707 		break;
2708 	case V_READ:
2709 		if (vp->v_rdcnt)
2710 			return (V_TRUE);
2711 		break;
2712 	}
2713 
2714 	return (V_FALSE);
2715 }
2716 
2717 /*
2718  * vn_is_mapped() checks whether a particular file is mapped and whether
2719  * the file is mapped read and/or write.
2720  */
2721 int
2722 vn_is_mapped(
2723 	vnode_t *vp,
2724 	v_mode_t mode)
2725 {
2726 
2727 	ASSERT(vp != NULL);
2728 
2729 #if !defined(_LP64)
2730 	switch (mode) {
2731 	/*
2732 	 * The atomic_add_64_nv functions force atomicity in the
2733 	 * case of 32 bit architectures. Otherwise the 64 bit values
2734 	 * require two fetches. The value of the fields may be
2735 	 * (potentially) changed between the first fetch and the
2736 	 * second
2737 	 */
2738 	case V_WRITE:
2739 		if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2740 			return (V_TRUE);
2741 		break;
2742 	case V_RDANDWR:
2743 		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2744 		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2745 			return (V_TRUE);
2746 		break;
2747 	case V_RDORWR:
2748 		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2749 		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2750 			return (V_TRUE);
2751 		break;
2752 	case V_READ:
2753 		if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2754 			return (V_TRUE);
2755 		break;
2756 	}
2757 #else
2758 	switch (mode) {
2759 	case V_WRITE:
2760 		if (vp->v_mmap_write)
2761 			return (V_TRUE);
2762 		break;
2763 	case V_RDANDWR:
2764 		if (vp->v_mmap_read && vp->v_mmap_write)
2765 			return (V_TRUE);
2766 		break;
2767 	case V_RDORWR:
2768 		if (vp->v_mmap_read || vp->v_mmap_write)
2769 			return (V_TRUE);
2770 		break;
2771 	case V_READ:
2772 		if (vp->v_mmap_read)
2773 			return (V_TRUE);
2774 		break;
2775 	}
2776 #endif
2777 
2778 	return (V_FALSE);
2779 }
2780 
2781 /*
2782  * Set the operations vector for a vnode.
2783  *
2784  * FEM ensures that the v_femhead pointer is filled in before the
2785  * v_op pointer is changed.  This means that if the v_femhead pointer
2786  * is NULL, and the v_op field hasn't changed since before which checked
2787  * the v_femhead pointer; then our update is ok - we are not racing with
2788  * FEM.
2789  */
2790 void
2791 vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
2792 {
2793 	vnodeops_t	*op;
2794 
2795 	ASSERT(vp != NULL);
2796 	ASSERT(vnodeops != NULL);
2797 
2798 	op = vp->v_op;
2799 	membar_consumer();
2800 	/*
2801 	 * If vp->v_femhead == NULL, then we'll call casptr() to do the
2802 	 * compare-and-swap on vp->v_op.  If either fails, then FEM is
2803 	 * in effect on the vnode and we need to have FEM deal with it.
2804 	 */
2805 	if (vp->v_femhead != NULL || casptr(&vp->v_op, op, vnodeops) != op) {
2806 		fem_setvnops(vp, vnodeops);
2807 	}
2808 }
2809 
2810 /*
2811  * Retrieve the operations vector for a vnode
2812  * As with vn_setops(above); make sure we aren't racing with FEM.
2813  * FEM sets the v_op to a special, internal, vnodeops that wouldn't
2814  * make sense to the callers of this routine.
2815  */
2816 vnodeops_t *
2817 vn_getops(vnode_t *vp)
2818 {
2819 	vnodeops_t	*op;
2820 
2821 	ASSERT(vp != NULL);
2822 
2823 	op = vp->v_op;
2824 	membar_consumer();
2825 	if (vp->v_femhead == NULL && op == vp->v_op) {
2826 		return (op);
2827 	} else {
2828 		return (fem_getvnops(vp));
2829 	}
2830 }
2831 
2832 /*
2833  * Returns non-zero (1) if the vnodeops matches that of the vnode.
2834  * Returns zero (0) if not.
2835  */
2836 int
2837 vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
2838 {
2839 	return (vn_getops(vp) == vnodeops);
2840 }
2841 
2842 /*
2843  * Returns non-zero (1) if the specified operation matches the
2844  * corresponding operation for that the vnode.
2845  * Returns zero (0) if not.
2846  */
2847 
2848 #define	MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
2849 
2850 int
2851 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
2852 {
2853 	const fs_operation_trans_def_t *otdp;
2854 	fs_generic_func_p *loc = NULL;
2855 	vnodeops_t	*vop = vn_getops(vp);
2856 
2857 	ASSERT(vopname != NULL);
2858 
2859 	for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
2860 		if (MATCHNAME(otdp->name, vopname)) {
2861 			loc = (fs_generic_func_p *)
2862 			    ((char *)(vop) + otdp->offset);
2863 			break;
2864 		}
2865 	}
2866 
2867 	return ((loc != NULL) && (*loc == funcp));
2868 }
2869 
2870 /*
2871  * fs_new_caller_id() needs to return a unique ID on a given local system.
2872  * The IDs do not need to survive across reboots.  These are primarily
2873  * used so that (FEM) monitors can detect particular callers (such as
2874  * the NFS server) to a given vnode/vfs operation.
2875  */
2876 u_longlong_t
2877 fs_new_caller_id()
2878 {
2879 	static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2880 
2881 	return ((u_longlong_t)atomic_add_64_nv(&next_caller_id, 1));
2882 }
2883 
2884 /*
2885  * Given a starting vnode and a path, updates the path in the target vnode in
2886  * a safe manner.  If the vnode already has path information embedded, then the
2887  * cached path is left untouched.
2888  */
2889 
2890 size_t max_vnode_path = 4 * MAXPATHLEN;
2891 
2892 void
2893 vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
2894     const char *path, size_t plen)
2895 {
2896 	char	*rpath;
2897 	vnode_t	*base;
2898 	size_t	rpathlen, rpathalloc;
2899 	int	doslash = 1;
2900 
2901 	if (*path == '/') {
2902 		base = rootvp;
2903 		path++;
2904 		plen--;
2905 	} else {
2906 		base = startvp;
2907 	}
2908 
2909 	/*
2910 	 * We cannot grab base->v_lock while we hold vp->v_lock because of
2911 	 * the potential for deadlock.
2912 	 */
2913 	mutex_enter(&base->v_lock);
2914 	if (base->v_path == NULL) {
2915 		mutex_exit(&base->v_lock);
2916 		return;
2917 	}
2918 
2919 	rpathlen = strlen(base->v_path);
2920 	rpathalloc = rpathlen + plen + 1;
2921 	/* Avoid adding a slash if there's already one there */
2922 	if (base->v_path[rpathlen-1] == '/')
2923 		doslash = 0;
2924 	else
2925 		rpathalloc++;
2926 
2927 	/*
2928 	 * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held,
2929 	 * so we must do this dance.  If, by chance, something changes the path,
2930 	 * just give up since there is no real harm.
2931 	 */
2932 	mutex_exit(&base->v_lock);
2933 
2934 	/* Paths should stay within reason */
2935 	if (rpathalloc > max_vnode_path)
2936 		return;
2937 
2938 	rpath = kmem_alloc(rpathalloc, KM_SLEEP);
2939 
2940 	mutex_enter(&base->v_lock);
2941 	if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
2942 		mutex_exit(&base->v_lock);
2943 		kmem_free(rpath, rpathalloc);
2944 		return;
2945 	}
2946 	bcopy(base->v_path, rpath, rpathlen);
2947 	mutex_exit(&base->v_lock);
2948 
2949 	if (doslash)
2950 		rpath[rpathlen++] = '/';
2951 	bcopy(path, rpath + rpathlen, plen);
2952 	rpath[rpathlen + plen] = '\0';
2953 
2954 	mutex_enter(&vp->v_lock);
2955 	if (vp->v_path != NULL) {
2956 		mutex_exit(&vp->v_lock);
2957 		kmem_free(rpath, rpathalloc);
2958 	} else {
2959 		vp->v_path = rpath;
2960 		mutex_exit(&vp->v_lock);
2961 	}
2962 }
2963 
2964 /*
2965  * Sets the path to the vnode to be the given string, regardless of current
2966  * context.  The string must be a complete path from rootdir.  This is only used
2967  * by fsop_root() for setting the path based on the mountpoint.
2968  */
2969 void
2970 vn_setpath_str(struct vnode *vp, const char *str, size_t len)
2971 {
2972 	char *buf = kmem_alloc(len + 1, KM_SLEEP);
2973 
2974 	mutex_enter(&vp->v_lock);
2975 	if (vp->v_path != NULL) {
2976 		mutex_exit(&vp->v_lock);
2977 		kmem_free(buf, len + 1);
2978 		return;
2979 	}
2980 
2981 	vp->v_path = buf;
2982 	bcopy(str, vp->v_path, len);
2983 	vp->v_path[len] = '\0';
2984 
2985 	mutex_exit(&vp->v_lock);
2986 }
2987 
2988 /*
2989  * Called from within filesystem's vop_rename() to handle renames once the
2990  * target vnode is available.
2991  */
2992 void
2993 vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
2994 {
2995 	char *tmp;
2996 
2997 	mutex_enter(&vp->v_lock);
2998 	tmp = vp->v_path;
2999 	vp->v_path = NULL;
3000 	mutex_exit(&vp->v_lock);
3001 	vn_setpath(rootdir, dvp, vp, nm, len);
3002 	if (tmp != NULL)
3003 		kmem_free(tmp, strlen(tmp) + 1);
3004 }
3005 
3006 /*
3007  * Similar to vn_setpath_str(), this function sets the path of the destination
3008  * vnode to the be the same as the source vnode.
3009  */
3010 void
3011 vn_copypath(struct vnode *src, struct vnode *dst)
3012 {
3013 	char *buf;
3014 	int alloc;
3015 
3016 	mutex_enter(&src->v_lock);
3017 	if (src->v_path == NULL) {
3018 		mutex_exit(&src->v_lock);
3019 		return;
3020 	}
3021 	alloc = strlen(src->v_path) + 1;
3022 
3023 	/* avoid kmem_alloc() with lock held */
3024 	mutex_exit(&src->v_lock);
3025 	buf = kmem_alloc(alloc, KM_SLEEP);
3026 	mutex_enter(&src->v_lock);
3027 	if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
3028 		mutex_exit(&src->v_lock);
3029 		kmem_free(buf, alloc);
3030 		return;
3031 	}
3032 	bcopy(src->v_path, buf, alloc);
3033 	mutex_exit(&src->v_lock);
3034 
3035 	mutex_enter(&dst->v_lock);
3036 	if (dst->v_path != NULL) {
3037 		mutex_exit(&dst->v_lock);
3038 		kmem_free(buf, alloc);
3039 		return;
3040 	}
3041 	dst->v_path = buf;
3042 	mutex_exit(&dst->v_lock);
3043 }
3044 
3045 /*
3046  * XXX Private interface for segvn routines that handle vnode
3047  * large page segments.
3048  *
3049  * return 1 if vp's file system VOP_PAGEIO() implementation
3050  * can be safely used instead of VOP_GETPAGE() for handling
3051  * pagefaults against regular non swap files. VOP_PAGEIO()
3052  * interface is considered safe here if its implementation
3053  * is very close to VOP_GETPAGE() implementation.
3054  * e.g. It zero's out the part of the page beyond EOF. Doesn't
3055  * panic if there're file holes but instead returns an error.
3056  * Doesn't assume file won't be changed by user writes, etc.
3057  *
3058  * return 0 otherwise.
3059  *
3060  * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3061  */
3062 int
3063 vn_vmpss_usepageio(vnode_t *vp)
3064 {
3065 	vfs_t   *vfsp = vp->v_vfsp;
3066 	char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3067 	char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3068 	char **fsok = pageio_ok_fss;
3069 
3070 	if (fsname == NULL) {
3071 		return (0);
3072 	}
3073 
3074 	for (; *fsok; fsok++) {
3075 		if (strcmp(*fsok, fsname) == 0) {
3076 			return (1);
3077 		}
3078 	}
3079 	return (0);
3080 }
3081 
3082 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3083 
3084 int
3085 fop_open(
3086 	vnode_t **vpp,
3087 	int mode,
3088 	cred_t *cr,
3089 	caller_context_t *ct)
3090 {
3091 	int ret;
3092 	vnode_t *vp = *vpp;
3093 
3094 	VN_HOLD(vp);
3095 	/*
3096 	 * Adding to the vnode counts before calling open
3097 	 * avoids the need for a mutex. It circumvents a race
3098 	 * condition where a query made on the vnode counts results in a
3099 	 * false negative. The inquirer goes away believing the file is
3100 	 * not open when there is an open on the file already under way.
3101 	 *
3102 	 * The counts are meant to prevent NFS from granting a delegation
3103 	 * when it would be dangerous to do so.
3104 	 *
3105 	 * The vnode counts are only kept on regular files
3106 	 */
3107 	if ((*vpp)->v_type == VREG) {
3108 		if (mode & FREAD)
3109 			atomic_add_32(&((*vpp)->v_rdcnt), 1);
3110 		if (mode & FWRITE)
3111 			atomic_add_32(&((*vpp)->v_wrcnt), 1);
3112 	}
3113 
3114 	VOPXID_MAP_CR(vp, cr);
3115 
3116 	ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);
3117 
3118 	if (ret) {
3119 		/*
3120 		 * Use the saved vp just in case the vnode ptr got trashed
3121 		 * by the error.
3122 		 */
3123 		VOPSTATS_UPDATE(vp, open);
3124 		if ((vp->v_type == VREG) && (mode & FREAD))
3125 			atomic_add_32(&(vp->v_rdcnt), -1);
3126 		if ((vp->v_type == VREG) && (mode & FWRITE))
3127 			atomic_add_32(&(vp->v_wrcnt), -1);
3128 	} else {
3129 		/*
3130 		 * Some filesystems will return a different vnode,
3131 		 * but the same path was still used to open it.
3132 		 * So if we do change the vnode and need to
3133 		 * copy over the path, do so here, rather than special
3134 		 * casing each filesystem. Adjust the vnode counts to
3135 		 * reflect the vnode switch.
3136 		 */
3137 		VOPSTATS_UPDATE(*vpp, open);
3138 		if (*vpp != vp && *vpp != NULL) {
3139 			vn_copypath(vp, *vpp);
3140 			if (((*vpp)->v_type == VREG) && (mode & FREAD))
3141 				atomic_add_32(&((*vpp)->v_rdcnt), 1);
3142 			if ((vp->v_type == VREG) && (mode & FREAD))
3143 				atomic_add_32(&(vp->v_rdcnt), -1);
3144 			if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3145 				atomic_add_32(&((*vpp)->v_wrcnt), 1);
3146 			if ((vp->v_type == VREG) && (mode & FWRITE))
3147 				atomic_add_32(&(vp->v_wrcnt), -1);
3148 		}
3149 	}
3150 	VN_RELE(vp);
3151 	return (ret);
3152 }
3153 
3154 int
3155 fop_close(
3156 	vnode_t *vp,
3157 	int flag,
3158 	int count,
3159 	offset_t offset,
3160 	cred_t *cr,
3161 	caller_context_t *ct)
3162 {
3163 	int err;
3164 
3165 	VOPXID_MAP_CR(vp, cr);
3166 
3167 	err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
3168 	VOPSTATS_UPDATE(vp, close);
3169 	/*
3170 	 * Check passed in count to handle possible dups. Vnode counts are only
3171 	 * kept on regular files
3172 	 */
3173 	if ((vp->v_type == VREG) && (count == 1))  {
3174 		if (flag & FREAD) {
3175 			ASSERT(vp->v_rdcnt > 0);
3176 			atomic_add_32(&(vp->v_rdcnt), -1);
3177 		}
3178 		if (flag & FWRITE) {
3179 			ASSERT(vp->v_wrcnt > 0);
3180 			atomic_add_32(&(vp->v_wrcnt), -1);
3181 		}
3182 	}
3183 	return (err);
3184 }
3185 
3186 int
3187 fop_read(
3188 	vnode_t *vp,
3189 	uio_t *uiop,
3190 	int ioflag,
3191 	cred_t *cr,
3192 	caller_context_t *ct)
3193 {
3194 	int	err;
3195 	ssize_t	resid_start = uiop->uio_resid;
3196 
3197 	VOPXID_MAP_CR(vp, cr);
3198 
3199 	err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3200 	VOPSTATS_UPDATE_IO(vp, read,
3201 	    read_bytes, (resid_start - uiop->uio_resid));
3202 	return (err);
3203 }
3204 
3205 int
3206 fop_write(
3207 	vnode_t *vp,
3208 	uio_t *uiop,
3209 	int ioflag,
3210 	cred_t *cr,
3211 	caller_context_t *ct)
3212 {
3213 	int	err;
3214 	ssize_t	resid_start = uiop->uio_resid;
3215 
3216 	VOPXID_MAP_CR(vp, cr);
3217 
3218 	err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3219 	VOPSTATS_UPDATE_IO(vp, write,
3220 	    write_bytes, (resid_start - uiop->uio_resid));
3221 	return (err);
3222 }
3223 
3224 int
3225 fop_ioctl(
3226 	vnode_t *vp,
3227 	int cmd,
3228 	intptr_t arg,
3229 	int flag,
3230 	cred_t *cr,
3231 	int *rvalp,
3232 	caller_context_t *ct)
3233 {
3234 	int	err;
3235 
3236 	VOPXID_MAP_CR(vp, cr);
3237 
3238 	err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3239 	VOPSTATS_UPDATE(vp, ioctl);
3240 	return (err);
3241 }
3242 
3243 int
3244 fop_setfl(
3245 	vnode_t *vp,
3246 	int oflags,
3247 	int nflags,
3248 	cred_t *cr,
3249 	caller_context_t *ct)
3250 {
3251 	int	err;
3252 
3253 	VOPXID_MAP_CR(vp, cr);
3254 
3255 	err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
3256 	VOPSTATS_UPDATE(vp, setfl);
3257 	return (err);
3258 }
3259 
3260 int
3261 fop_getattr(
3262 	vnode_t *vp,
3263 	vattr_t *vap,
3264 	int flags,
3265 	cred_t *cr,
3266 	caller_context_t *ct)
3267 {
3268 	int	err;
3269 
3270 	VOPXID_MAP_CR(vp, cr);
3271 
3272 	/*
3273 	 * If this file system doesn't understand the xvattr extensions
3274 	 * then turn off the xvattr bit.
3275 	 */
3276 	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3277 		vap->va_mask &= ~AT_XVATTR;
3278 	}
3279 
3280 	/*
3281 	 * We're only allowed to skip the ACL check iff we used a 32 bit
3282 	 * ACE mask with VOP_ACCESS() to determine permissions.
3283 	 */
3284 	if ((flags & ATTR_NOACLCHECK) &&
3285 	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3286 		return (EINVAL);
3287 	}
3288 	err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
3289 	VOPSTATS_UPDATE(vp, getattr);
3290 	return (err);
3291 }
3292 
3293 int
3294 fop_setattr(
3295 	vnode_t *vp,
3296 	vattr_t *vap,
3297 	int flags,
3298 	cred_t *cr,
3299 	caller_context_t *ct)
3300 {
3301 	int	err;
3302 
3303 	VOPXID_MAP_CR(vp, cr);
3304 
3305 	/*
3306 	 * If this file system doesn't understand the xvattr extensions
3307 	 * then turn off the xvattr bit.
3308 	 */
3309 	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3310 		vap->va_mask &= ~AT_XVATTR;
3311 	}
3312 
3313 	/*
3314 	 * We're only allowed to skip the ACL check iff we used a 32 bit
3315 	 * ACE mask with VOP_ACCESS() to determine permissions.
3316 	 */
3317 	if ((flags & ATTR_NOACLCHECK) &&
3318 	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3319 		return (EINVAL);
3320 	}
3321 	err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
3322 	VOPSTATS_UPDATE(vp, setattr);
3323 	return (err);
3324 }
3325 
3326 int
3327 fop_access(
3328 	vnode_t *vp,
3329 	int mode,
3330 	int flags,
3331 	cred_t *cr,
3332 	caller_context_t *ct)
3333 {
3334 	int	err;
3335 
3336 	if ((flags & V_ACE_MASK) &&
3337 	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3338 		return (EINVAL);
3339 	}
3340 
3341 	VOPXID_MAP_CR(vp, cr);
3342 
3343 	err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
3344 	VOPSTATS_UPDATE(vp, access);
3345 	return (err);
3346 }
3347 
3348 int
3349 fop_lookup(
3350 	vnode_t *dvp,
3351 	char *nm,
3352 	vnode_t **vpp,
3353 	pathname_t *pnp,
3354 	int flags,
3355 	vnode_t *rdir,
3356 	cred_t *cr,
3357 	caller_context_t *ct,
3358 	int *deflags,		/* Returned per-dirent flags */
3359 	pathname_t *ppnp)	/* Returned case-preserved name in directory */
3360 {
3361 	int ret;
3362 
3363 	/*
3364 	 * If this file system doesn't support case-insensitive access
3365 	 * and said access is requested, fail quickly.  It is required
3366 	 * that if the vfs supports case-insensitive lookup, it also
3367 	 * supports extended dirent flags.
3368 	 */
3369 	if (flags & FIGNORECASE &&
3370 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3371 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3372 		return (EINVAL);
3373 
3374 	VOPXID_MAP_CR(dvp, cr);
3375 
3376 	if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3377 		ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3378 	} else {
3379 		ret = (*(dvp)->v_op->vop_lookup)
3380 		    (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
3381 	}
3382 	if (ret == 0 && *vpp) {
3383 		VOPSTATS_UPDATE(*vpp, lookup);
3384 		if ((*vpp)->v_path == NULL) {
3385 			vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
3386 		}
3387 	}
3388 
3389 	return (ret);
3390 }
3391 
3392 int
3393 fop_create(
3394 	vnode_t *dvp,
3395 	char *name,
3396 	vattr_t *vap,
3397 	vcexcl_t excl,
3398 	int mode,
3399 	vnode_t **vpp,
3400 	cred_t *cr,
3401 	int flags,
3402 	caller_context_t *ct,
3403 	vsecattr_t *vsecp)	/* ACL to set during create */
3404 {
3405 	int ret;
3406 
3407 	if (vsecp != NULL &&
3408 	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3409 		return (EINVAL);
3410 	}
3411 	/*
3412 	 * If this file system doesn't support case-insensitive access
3413 	 * and said access is requested, fail quickly.
3414 	 */
3415 	if (flags & FIGNORECASE &&
3416 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3417 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3418 		return (EINVAL);
3419 
3420 	VOPXID_MAP_CR(dvp, cr);
3421 
3422 	ret = (*(dvp)->v_op->vop_create)
3423 	    (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3424 	if (ret == 0 && *vpp) {
3425 		VOPSTATS_UPDATE(*vpp, create);
3426 		if ((*vpp)->v_path == NULL) {
3427 			vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
3428 		}
3429 	}
3430 
3431 	return (ret);
3432 }
3433 
3434 int
3435 fop_remove(
3436 	vnode_t *dvp,
3437 	char *nm,
3438 	cred_t *cr,
3439 	caller_context_t *ct,
3440 	int flags)
3441 {
3442 	int	err;
3443 
3444 	/*
3445 	 * If this file system doesn't support case-insensitive access
3446 	 * and said access is requested, fail quickly.
3447 	 */
3448 	if (flags & FIGNORECASE &&
3449 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3450 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3451 		return (EINVAL);
3452 
3453 	VOPXID_MAP_CR(dvp, cr);
3454 
3455 	err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
3456 	VOPSTATS_UPDATE(dvp, remove);
3457 	return (err);
3458 }
3459 
3460 int
3461 fop_link(
3462 	vnode_t *tdvp,
3463 	vnode_t *svp,
3464 	char *tnm,
3465 	cred_t *cr,
3466 	caller_context_t *ct,
3467 	int flags)
3468 {
3469 	int	err;
3470 
3471 	/*
3472 	 * If the target file system doesn't support case-insensitive access
3473 	 * and said access is requested, fail quickly.
3474 	 */
3475 	if (flags & FIGNORECASE &&
3476 	    (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3477 	    vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3478 		return (EINVAL);
3479 
3480 	VOPXID_MAP_CR(tdvp, cr);
3481 
3482 	err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
3483 	VOPSTATS_UPDATE(tdvp, link);
3484 	return (err);
3485 }
3486 
3487 int
3488 fop_rename(
3489 	vnode_t *sdvp,
3490 	char *snm,
3491 	vnode_t *tdvp,
3492 	char *tnm,
3493 	cred_t *cr,
3494 	caller_context_t *ct,
3495 	int flags)
3496 {
3497 	int	err;
3498 
3499 	/*
3500 	 * If the file system involved does not support
3501 	 * case-insensitive access and said access is requested, fail
3502 	 * quickly.
3503 	 */
3504 	if (flags & FIGNORECASE &&
3505 	    ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3506 	    vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3507 		return (EINVAL);
3508 
3509 	VOPXID_MAP_CR(tdvp, cr);
3510 
3511 	err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
3512 	VOPSTATS_UPDATE(sdvp, rename);
3513 	return (err);
3514 }
3515 
3516 int
3517 fop_mkdir(
3518 	vnode_t *dvp,
3519 	char *dirname,
3520 	vattr_t *vap,
3521 	vnode_t **vpp,
3522 	cred_t *cr,
3523 	caller_context_t *ct,
3524 	int flags,
3525 	vsecattr_t *vsecp)	/* ACL to set during create */
3526 {
3527 	int ret;
3528 
3529 	if (vsecp != NULL &&
3530 	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3531 		return (EINVAL);
3532 	}
3533 	/*
3534 	 * If this file system doesn't support case-insensitive access
3535 	 * and said access is requested, fail quickly.
3536 	 */
3537 	if (flags & FIGNORECASE &&
3538 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3539 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3540 		return (EINVAL);
3541 
3542 	VOPXID_MAP_CR(dvp, cr);
3543 
3544 	ret = (*(dvp)->v_op->vop_mkdir)
3545 	    (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3546 	if (ret == 0 && *vpp) {
3547 		VOPSTATS_UPDATE(*vpp, mkdir);
3548 		if ((*vpp)->v_path == NULL) {
3549 			vn_setpath(rootdir, dvp, *vpp, dirname,
3550 			    strlen(dirname));
3551 		}
3552 	}
3553 
3554 	return (ret);
3555 }
3556 
3557 int
3558 fop_rmdir(
3559 	vnode_t *dvp,
3560 	char *nm,
3561 	vnode_t *cdir,
3562 	cred_t *cr,
3563 	caller_context_t *ct,
3564 	int flags)
3565 {
3566 	int	err;
3567 
3568 	/*
3569 	 * If this file system doesn't support case-insensitive access
3570 	 * and said access is requested, fail quickly.
3571 	 */
3572 	if (flags & FIGNORECASE &&
3573 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3574 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3575 		return (EINVAL);
3576 
3577 	VOPXID_MAP_CR(dvp, cr);
3578 
3579 	err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
3580 	VOPSTATS_UPDATE(dvp, rmdir);
3581 	return (err);
3582 }
3583 
3584 int
3585 fop_readdir(
3586 	vnode_t *vp,
3587 	uio_t *uiop,
3588 	cred_t *cr,
3589 	int *eofp,
3590 	caller_context_t *ct,
3591 	int flags)
3592 {
3593 	int	err;
3594 	ssize_t	resid_start = uiop->uio_resid;
3595 
3596 	/*
3597 	 * If this file system doesn't support retrieving directory
3598 	 * entry flags and said access is requested, fail quickly.
3599 	 */
3600 	if (flags & V_RDDIR_ENTFLAGS &&
3601 	    vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3602 		return (EINVAL);
3603 
3604 	VOPXID_MAP_CR(vp, cr);
3605 
3606 	err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
3607 	VOPSTATS_UPDATE_IO(vp, readdir,
3608 	    readdir_bytes, (resid_start - uiop->uio_resid));
3609 	return (err);
3610 }
3611 
3612 int
3613 fop_symlink(
3614 	vnode_t *dvp,
3615 	char *linkname,
3616 	vattr_t *vap,
3617 	char *target,
3618 	cred_t *cr,
3619 	caller_context_t *ct,
3620 	int flags)
3621 {
3622 	int	err;
3623 	xvattr_t xvattr;
3624 
3625 	/*
3626 	 * If this file system doesn't support case-insensitive access
3627 	 * and said access is requested, fail quickly.
3628 	 */
3629 	if (flags & FIGNORECASE &&
3630 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3631 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3632 		return (EINVAL);
3633 
3634 	VOPXID_MAP_CR(dvp, cr);
3635 
3636 	/* check for reparse point */
3637 	if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
3638 	    (strncmp(target, FS_REPARSE_TAG_STR,
3639 	    strlen(FS_REPARSE_TAG_STR)) == 0)) {
3640 		if (!fs_reparse_mark(target, vap, &xvattr))
3641 			vap = (vattr_t *)&xvattr;
3642 	}
3643 
3644 	err = (*(dvp)->v_op->vop_symlink)
3645 	    (dvp, linkname, vap, target, cr, ct, flags);
3646 	VOPSTATS_UPDATE(dvp, symlink);
3647 	return (err);
3648 }
3649 
3650 int
3651 fop_readlink(
3652 	vnode_t *vp,
3653 	uio_t *uiop,
3654 	cred_t *cr,
3655 	caller_context_t *ct)
3656 {
3657 	int	err;
3658 
3659 	VOPXID_MAP_CR(vp, cr);
3660 
3661 	err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
3662 	VOPSTATS_UPDATE(vp, readlink);
3663 	return (err);
3664 }
3665 
3666 int
3667 fop_fsync(
3668 	vnode_t *vp,
3669 	int syncflag,
3670 	cred_t *cr,
3671 	caller_context_t *ct)
3672 {
3673 	int	err;
3674 
3675 	VOPXID_MAP_CR(vp, cr);
3676 
3677 	err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
3678 	VOPSTATS_UPDATE(vp, fsync);
3679 	return (err);
3680 }
3681 
3682 void
3683 fop_inactive(
3684 	vnode_t *vp,
3685 	cred_t *cr,
3686 	caller_context_t *ct)
3687 {
3688 	/* Need to update stats before vop call since we may lose the vnode */
3689 	VOPSTATS_UPDATE(vp, inactive);
3690 
3691 	VOPXID_MAP_CR(vp, cr);
3692 
3693 	(*(vp)->v_op->vop_inactive)(vp, cr, ct);
3694 }
3695 
3696 int
3697 fop_fid(
3698 	vnode_t *vp,
3699 	fid_t *fidp,
3700 	caller_context_t *ct)
3701 {
3702 	int	err;
3703 
3704 	err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
3705 	VOPSTATS_UPDATE(vp, fid);
3706 	return (err);
3707 }
3708 
3709 int
3710 fop_rwlock(
3711 	vnode_t *vp,
3712 	int write_lock,
3713 	caller_context_t *ct)
3714 {
3715 	int	ret;
3716 
3717 	ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
3718 	VOPSTATS_UPDATE(vp, rwlock);
3719 	return (ret);
3720 }
3721 
3722 void
3723 fop_rwunlock(
3724 	vnode_t *vp,
3725 	int write_lock,
3726 	caller_context_t *ct)
3727 {
3728 	(*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
3729 	VOPSTATS_UPDATE(vp, rwunlock);
3730 }
3731 
3732 int
3733 fop_seek(
3734 	vnode_t *vp,
3735 	offset_t ooff,
3736 	offset_t *noffp,
3737 	caller_context_t *ct)
3738 {
3739 	int	err;
3740 
3741 	err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
3742 	VOPSTATS_UPDATE(vp, seek);
3743 	return (err);
3744 }
3745 
3746 int
3747 fop_cmp(
3748 	vnode_t *vp1,
3749 	vnode_t *vp2,
3750 	caller_context_t *ct)
3751 {
3752 	int	err;
3753 
3754 	err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
3755 	VOPSTATS_UPDATE(vp1, cmp);
3756 	return (err);
3757 }
3758 
3759 int
3760 fop_frlock(
3761 	vnode_t *vp,
3762 	int cmd,
3763 	flock64_t *bfp,
3764 	int flag,
3765 	offset_t offset,
3766 	struct flk_callback *flk_cbp,
3767 	cred_t *cr,
3768 	caller_context_t *ct)
3769 {
3770 	int	err;
3771 
3772 	VOPXID_MAP_CR(vp, cr);
3773 
3774 	err = (*(vp)->v_op->vop_frlock)
3775 	    (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
3776 	VOPSTATS_UPDATE(vp, frlock);
3777 	return (err);
3778 }
3779 
3780 int
3781 fop_space(
3782 	vnode_t *vp,
3783 	int cmd,
3784 	flock64_t *bfp,
3785 	int flag,
3786 	offset_t offset,
3787 	cred_t *cr,
3788 	caller_context_t *ct)
3789 {
3790 	int	err;
3791 
3792 	VOPXID_MAP_CR(vp, cr);
3793 
3794 	err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
3795 	VOPSTATS_UPDATE(vp, space);
3796 	return (err);
3797 }
3798 
3799 int
3800 fop_realvp(
3801 	vnode_t *vp,
3802 	vnode_t **vpp,
3803 	caller_context_t *ct)
3804 {
3805 	int	err;
3806 
3807 	err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
3808 	VOPSTATS_UPDATE(vp, realvp);
3809 	return (err);
3810 }
3811 
3812 int
3813 fop_getpage(
3814 	vnode_t *vp,
3815 	offset_t off,
3816 	size_t len,
3817 	uint_t *protp,
3818 	page_t **plarr,
3819 	size_t plsz,
3820 	struct seg *seg,
3821 	caddr_t addr,
3822 	enum seg_rw rw,
3823 	cred_t *cr,
3824 	caller_context_t *ct)
3825 {
3826 	int	err;
3827 
3828 	VOPXID_MAP_CR(vp, cr);
3829 
3830 	err = (*(vp)->v_op->vop_getpage)
3831 	    (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
3832 	VOPSTATS_UPDATE(vp, getpage);
3833 	return (err);
3834 }
3835 
3836 int
3837 fop_putpage(
3838 	vnode_t *vp,
3839 	offset_t off,
3840 	size_t len,
3841 	int flags,
3842 	cred_t *cr,
3843 	caller_context_t *ct)
3844 {
3845 	int	err;
3846 
3847 	VOPXID_MAP_CR(vp, cr);
3848 
3849 	err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
3850 	VOPSTATS_UPDATE(vp, putpage);
3851 	return (err);
3852 }
3853 
3854 int
3855 fop_map(
3856 	vnode_t *vp,
3857 	offset_t off,
3858 	struct as *as,
3859 	caddr_t *addrp,
3860 	size_t len,
3861 	uchar_t prot,
3862 	uchar_t maxprot,
3863 	uint_t flags,
3864 	cred_t *cr,
3865 	caller_context_t *ct)
3866 {
3867 	int	err;
3868 
3869 	VOPXID_MAP_CR(vp, cr);
3870 
3871 	err = (*(vp)->v_op->vop_map)
3872 	    (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
3873 	VOPSTATS_UPDATE(vp, map);
3874 	return (err);
3875 }
3876 
3877 int
3878 fop_addmap(
3879 	vnode_t *vp,
3880 	offset_t off,
3881 	struct as *as,
3882 	caddr_t addr,
3883 	size_t len,
3884 	uchar_t prot,
3885 	uchar_t maxprot,
3886 	uint_t flags,
3887 	cred_t *cr,
3888 	caller_context_t *ct)
3889 {
3890 	int error;
3891 	u_longlong_t delta;
3892 
3893 	VOPXID_MAP_CR(vp, cr);
3894 
3895 	error = (*(vp)->v_op->vop_addmap)
3896 	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
3897 
3898 	if ((!error) && (vp->v_type == VREG)) {
3899 		delta = (u_longlong_t)btopr(len);
3900 		/*
3901 		 * If file is declared MAP_PRIVATE, it can't be written back
3902 		 * even if open for write. Handle as read.
3903 		 */
3904 		if (flags & MAP_PRIVATE) {
3905 			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3906 			    (int64_t)delta);
3907 		} else {
3908 			/*
3909 			 * atomic_add_64 forces the fetch of a 64 bit value to
3910 			 * be atomic on 32 bit machines
3911 			 */
3912 			if (maxprot & PROT_WRITE)
3913 				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
3914 				    (int64_t)delta);
3915 			if (maxprot & PROT_READ)
3916 				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3917 				    (int64_t)delta);
3918 			if (maxprot & PROT_EXEC)
3919 				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3920 				    (int64_t)delta);
3921 		}
3922 	}
3923 	VOPSTATS_UPDATE(vp, addmap);
3924 	return (error);
3925 }
3926 
3927 int
3928 fop_delmap(
3929 	vnode_t *vp,
3930 	offset_t off,
3931 	struct as *as,
3932 	caddr_t addr,
3933 	size_t len,
3934 	uint_t prot,
3935 	uint_t maxprot,
3936 	uint_t flags,
3937 	cred_t *cr,
3938 	caller_context_t *ct)
3939 {
3940 	int error;
3941 	u_longlong_t delta;
3942 
3943 	VOPXID_MAP_CR(vp, cr);
3944 
3945 	error = (*(vp)->v_op->vop_delmap)
3946 	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
3947 
3948 	/*
3949 	 * NFS calls into delmap twice, the first time
3950 	 * it simply establishes a callback mechanism and returns EAGAIN
3951 	 * while the real work is being done upon the second invocation.
3952 	 * We have to detect this here and only decrement the counts upon
3953 	 * the second delmap request.
3954 	 */
3955 	if ((error != EAGAIN) && (vp->v_type == VREG)) {
3956 
3957 		delta = (u_longlong_t)btopr(len);
3958 
3959 		if (flags & MAP_PRIVATE) {
3960 			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3961 			    (int64_t)(-delta));
3962 		} else {
3963 			/*
3964 			 * atomic_add_64 forces the fetch of a 64 bit value
3965 			 * to be atomic on 32 bit machines
3966 			 */
3967 			if (maxprot & PROT_WRITE)
3968 				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
3969 				    (int64_t)(-delta));
3970 			if (maxprot & PROT_READ)
3971 				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3972 				    (int64_t)(-delta));
3973 			if (maxprot & PROT_EXEC)
3974 				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3975 				    (int64_t)(-delta));
3976 		}
3977 	}
3978 	VOPSTATS_UPDATE(vp, delmap);
3979 	return (error);
3980 }
3981 
3982 
3983 int
3984 fop_poll(
3985 	vnode_t *vp,
3986 	short events,
3987 	int anyyet,
3988 	short *reventsp,
3989 	struct pollhead **phpp,
3990 	caller_context_t *ct)
3991 {
3992 	int	err;
3993 
3994 	err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
3995 	VOPSTATS_UPDATE(vp, poll);
3996 	return (err);
3997 }
3998 
3999 int
4000 fop_dump(
4001 	vnode_t *vp,
4002 	caddr_t addr,
4003 	offset_t lbdn,
4004 	offset_t dblks,
4005 	caller_context_t *ct)
4006 {
4007 	int	err;
4008 
4009 	/* ensure lbdn and dblks can be passed safely to bdev_dump */
4010 	if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
4011 		return (EIO);
4012 
4013 	err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
4014 	VOPSTATS_UPDATE(vp, dump);
4015 	return (err);
4016 }
4017 
4018 int
4019 fop_pathconf(
4020 	vnode_t *vp,
4021 	int cmd,
4022 	ulong_t *valp,
4023 	cred_t *cr,
4024 	caller_context_t *ct)
4025 {
4026 	int	err;
4027 
4028 	VOPXID_MAP_CR(vp, cr);
4029 
4030 	err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
4031 	VOPSTATS_UPDATE(vp, pathconf);
4032 	return (err);
4033 }
4034 
4035 int
4036 fop_pageio(
4037 	vnode_t *vp,
4038 	struct page *pp,
4039 	u_offset_t io_off,
4040 	size_t io_len,
4041 	int flags,
4042 	cred_t *cr,
4043 	caller_context_t *ct)
4044 {
4045 	int	err;
4046 
4047 	VOPXID_MAP_CR(vp, cr);
4048 
4049 	err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
4050 	VOPSTATS_UPDATE(vp, pageio);
4051 	return (err);
4052 }
4053 
4054 int
4055 fop_dumpctl(
4056 	vnode_t *vp,
4057 	int action,
4058 	offset_t *blkp,
4059 	caller_context_t *ct)
4060 {
4061 	int	err;
4062 	err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
4063 	VOPSTATS_UPDATE(vp, dumpctl);
4064 	return (err);
4065 }
4066 
4067 void
4068 fop_dispose(
4069 	vnode_t *vp,
4070 	page_t *pp,
4071 	int flag,
4072 	int dn,
4073 	cred_t *cr,
4074 	caller_context_t *ct)
4075 {
4076 	/* Must do stats first since it's possible to lose the vnode */
4077 	VOPSTATS_UPDATE(vp, dispose);
4078 
4079 	VOPXID_MAP_CR(vp, cr);
4080 
4081 	(*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
4082 }
4083 
4084 int
4085 fop_setsecattr(
4086 	vnode_t *vp,
4087 	vsecattr_t *vsap,
4088 	int flag,
4089 	cred_t *cr,
4090 	caller_context_t *ct)
4091 {
4092 	int	err;
4093 
4094 	VOPXID_MAP_CR(vp, cr);
4095 
4096 	/*
4097 	 * We're only allowed to skip the ACL check iff we used a 32 bit
4098 	 * ACE mask with VOP_ACCESS() to determine permissions.
4099 	 */
4100 	if ((flag & ATTR_NOACLCHECK) &&
4101 	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4102 		return (EINVAL);
4103 	}
4104 	err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
4105 	VOPSTATS_UPDATE(vp, setsecattr);
4106 	return (err);
4107 }
4108 
4109 int
4110 fop_getsecattr(
4111 	vnode_t *vp,
4112 	vsecattr_t *vsap,
4113 	int flag,
4114 	cred_t *cr,
4115 	caller_context_t *ct)
4116 {
4117 	int	err;
4118 
4119 	/*
4120 	 * We're only allowed to skip the ACL check iff we used a 32 bit
4121 	 * ACE mask with VOP_ACCESS() to determine permissions.
4122 	 */
4123 	if ((flag & ATTR_NOACLCHECK) &&
4124 	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4125 		return (EINVAL);
4126 	}
4127 
4128 	VOPXID_MAP_CR(vp, cr);
4129 
4130 	err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
4131 	VOPSTATS_UPDATE(vp, getsecattr);
4132 	return (err);
4133 }
4134 
4135 int
4136 fop_shrlock(
4137 	vnode_t *vp,
4138 	int cmd,
4139 	struct shrlock *shr,
4140 	int flag,
4141 	cred_t *cr,
4142 	caller_context_t *ct)
4143 {
4144 	int	err;
4145 
4146 	VOPXID_MAP_CR(vp, cr);
4147 
4148 	err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
4149 	VOPSTATS_UPDATE(vp, shrlock);
4150 	return (err);
4151 }
4152 
4153 int
4154 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4155     caller_context_t *ct)
4156 {
4157 	int	err;
4158 
4159 	err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
4160 	VOPSTATS_UPDATE(vp, vnevent);
4161 	return (err);
4162 }
4163 
4164 int
4165 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4166     caller_context_t *ct)
4167 {
4168 	int err;
4169 
4170 	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4171 		return (ENOTSUP);
4172 	err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
4173 	VOPSTATS_UPDATE(vp, reqzcbuf);
4174 	return (err);
4175 }
4176 
4177 int
4178 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4179 {
4180 	int err;
4181 
4182 	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4183 		return (ENOTSUP);
4184 	err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
4185 	VOPSTATS_UPDATE(vp, retzcbuf);
4186 	return (err);
4187 }
4188 
/*
 * Default VSD destructor: does nothing with the value it is given.
 *
 * It exists because a NULL entry in the destructor table marks a key as
 * unused, so a key created without a destructor still needs a non-NULL
 * placeholder.
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{
}
4197 
4198 /*
4199  * Create a key (index into per vnode array)
4200  *	Locks out vsd_create, vsd_destroy, and vsd_free
4201  *	May allocate memory with lock held
4202  */
4203 void
4204 vsd_create(uint_t *keyp, void (*destructor)(void *))
4205 {
4206 	int	i;
4207 	uint_t	nkeys;
4208 
4209 	/*
4210 	 * if key is allocated, do nothing
4211 	 */
4212 	mutex_enter(&vsd_lock);
4213 	if (*keyp) {
4214 		mutex_exit(&vsd_lock);
4215 		return;
4216 	}
4217 	/*
4218 	 * find an unused key
4219 	 */
4220 	if (destructor == NULL)
4221 		destructor = vsd_defaultdestructor;
4222 
4223 	for (i = 0; i < vsd_nkeys; ++i)
4224 		if (vsd_destructor[i] == NULL)
4225 			break;
4226 
4227 	/*
4228 	 * if no unused keys, increase the size of the destructor array
4229 	 */
4230 	if (i == vsd_nkeys) {
4231 		if ((nkeys = (vsd_nkeys << 1)) == 0)
4232 			nkeys = 1;
4233 		vsd_destructor =
4234 		    (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4235 		    (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4236 		    (size_t)(nkeys * sizeof (void (*)(void *))));
4237 		vsd_nkeys = nkeys;
4238 	}
4239 
4240 	/*
4241 	 * allocate the next available unused key
4242 	 */
4243 	vsd_destructor[i] = destructor;
4244 	*keyp = i + 1;
4245 
4246 	/* create vsd_list, if it doesn't exist */
4247 	if (vsd_list == NULL) {
4248 		vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4249 		list_create(vsd_list, sizeof (struct vsd_node),
4250 		    offsetof(struct vsd_node, vs_nodes));
4251 	}
4252 
4253 	mutex_exit(&vsd_lock);
4254 }
4255 
4256 /*
4257  * Destroy a key
4258  *
4259  * Assumes that the caller is preventing vsd_set and vsd_get
4260  * Locks out vsd_create, vsd_destroy, and vsd_free
4261  * May free memory with lock held
4262  */
4263 void
4264 vsd_destroy(uint_t *keyp)
4265 {
4266 	uint_t key;
4267 	struct vsd_node *vsd;
4268 
4269 	/*
4270 	 * protect the key namespace and our destructor lists
4271 	 */
4272 	mutex_enter(&vsd_lock);
4273 	key = *keyp;
4274 	*keyp = 0;
4275 
4276 	ASSERT(key <= vsd_nkeys);
4277 
4278 	/*
4279 	 * if the key is valid
4280 	 */
4281 	if (key != 0) {
4282 		uint_t k = key - 1;
4283 		/*
4284 		 * for every vnode with VSD, call key's destructor
4285 		 */
4286 		for (vsd = list_head(vsd_list); vsd != NULL;
4287 		    vsd = list_next(vsd_list, vsd)) {
4288 			/*
4289 			 * no VSD for key in this vnode
4290 			 */
4291 			if (key > vsd->vs_nkeys)
4292 				continue;
4293 			/*
4294 			 * call destructor for key
4295 			 */
4296 			if (vsd->vs_value[k] && vsd_destructor[k])
4297 				(*vsd_destructor[k])(vsd->vs_value[k]);
4298 			/*
4299 			 * reset value for key
4300 			 */
4301 			vsd->vs_value[k] = NULL;
4302 		}
4303 		/*
4304 		 * actually free the key (NULL destructor == unused)
4305 		 */
4306 		vsd_destructor[k] = NULL;
4307 	}
4308 
4309 	mutex_exit(&vsd_lock);
4310 }
4311 
4312 /*
4313  * Quickly return the per vnode value that was stored with the specified key
4314  * Assumes the caller is protecting key from vsd_create and vsd_destroy
4315  * Assumes the caller is holding v_vsd_lock to protect the vsd.
4316  */
4317 void *
4318 vsd_get(vnode_t *vp, uint_t key)
4319 {
4320 	struct vsd_node *vsd;
4321 
4322 	ASSERT(vp != NULL);
4323 	ASSERT(mutex_owned(&vp->v_vsd_lock));
4324 
4325 	vsd = vp->v_vsd;
4326 
4327 	if (key && vsd != NULL && key <= vsd->vs_nkeys)
4328 		return (vsd->vs_value[key - 1]);
4329 	return (NULL);
4330 }
4331 
4332 /*
4333  * Set a per vnode value indexed with the specified key
4334  * Assumes the caller is holding v_vsd_lock to protect the vsd.
4335  */
4336 int
4337 vsd_set(vnode_t *vp, uint_t key, void *value)
4338 {
4339 	struct vsd_node *vsd;
4340 
4341 	ASSERT(vp != NULL);
4342 	ASSERT(mutex_owned(&vp->v_vsd_lock));
4343 
4344 	if (key == 0)
4345 		return (EINVAL);
4346 
4347 	vsd = vp->v_vsd;
4348 	if (vsd == NULL)
4349 		vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4350 
4351 	/*
4352 	 * If the vsd was just allocated, vs_nkeys will be 0, so the following
4353 	 * code won't happen and we will continue down and allocate space for
4354 	 * the vs_value array.
4355 	 * If the caller is replacing one value with another, then it is up
4356 	 * to the caller to free/rele/destroy the previous value (if needed).
4357 	 */
4358 	if (key <= vsd->vs_nkeys) {
4359 		vsd->vs_value[key - 1] = value;
4360 		return (0);
4361 	}
4362 
4363 	ASSERT(key <= vsd_nkeys);
4364 
4365 	if (vsd->vs_nkeys == 0) {
4366 		mutex_enter(&vsd_lock);	/* lock out vsd_destroy() */
4367 		/*
4368 		 * Link onto list of all VSD nodes.
4369 		 */
4370 		list_insert_head(vsd_list, vsd);
4371 		mutex_exit(&vsd_lock);
4372 	}
4373 
4374 	/*
4375 	 * Allocate vnode local storage and set the value for key
4376 	 */
4377 	vsd->vs_value = vsd_realloc(vsd->vs_value,
4378 	    vsd->vs_nkeys * sizeof (void *),
4379 	    key * sizeof (void *));
4380 	vsd->vs_nkeys = key;
4381 	vsd->vs_value[key - 1] = value;
4382 
4383 	return (0);
4384 }
4385 
4386 /*
4387  * Called from vn_free() to run the destructor function for each vsd
4388  *	Locks out vsd_create and vsd_destroy
4389  *	Assumes that the destructor *DOES NOT* use vsd
4390  */
4391 void
4392 vsd_free(vnode_t *vp)
4393 {
4394 	int i;
4395 	struct vsd_node *vsd = vp->v_vsd;
4396 
4397 	if (vsd == NULL)
4398 		return;
4399 
4400 	if (vsd->vs_nkeys == 0) {
4401 		kmem_free(vsd, sizeof (*vsd));
4402 		vp->v_vsd = NULL;
4403 		return;
4404 	}
4405 
4406 	/*
4407 	 * lock out vsd_create and vsd_destroy, call
4408 	 * the destructor, and mark the value as destroyed.
4409 	 */
4410 	mutex_enter(&vsd_lock);
4411 
4412 	for (i = 0; i < vsd->vs_nkeys; i++) {
4413 		if (vsd->vs_value[i] && vsd_destructor[i])
4414 			(*vsd_destructor[i])(vsd->vs_value[i]);
4415 		vsd->vs_value[i] = NULL;
4416 	}
4417 
4418 	/*
4419 	 * remove from linked list of VSD nodes
4420 	 */
4421 	list_remove(vsd_list, vsd);
4422 
4423 	mutex_exit(&vsd_lock);
4424 
4425 	/*
4426 	 * free up the VSD
4427 	 */
4428 	kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4429 	kmem_free(vsd, sizeof (struct vsd_node));
4430 	vp->v_vsd = NULL;
4431 }
4432 
4433 /*
4434  * realloc
4435  */
4436 static void *
4437 vsd_realloc(void *old, size_t osize, size_t nsize)
4438 {
4439 	void *new;
4440 
4441 	new = kmem_zalloc(nsize, KM_SLEEP);
4442 	if (old) {
4443 		bcopy(old, new, osize);
4444 		kmem_free(old, osize);
4445 	}
4446 	return (new);
4447 }
4448 
4449 /*
4450  * Setup the extensible system attribute for creating a reparse point.
4451  * The symlink data 'target' is validated for proper format of a reparse
4452  * string and a check also made to make sure the symlink data does not
4453  * point to an existing file.
4454  *
4455  * return 0 if ok else -1.
4456  */
4457 static int
4458 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4459 {
4460 	xoptattr_t *xoap;
4461 
4462 	if ((!target) || (!vap) || (!xvattr))
4463 		return (-1);
4464 
4465 	/* validate reparse string */
4466 	if (reparse_validate((const char *)target))
4467 		return (-1);
4468 
4469 	xva_init(xvattr);
4470 	xvattr->xva_vattr = *vap;
4471 	xvattr->xva_vattr.va_mask |= AT_XVATTR;
4472 	xoap = xva_getxoptattr(xvattr);
4473 	ASSERT(xoap);
4474 	XVA_SET_REQ(xvattr, XAT_REPARSE);
4475 	xoap->xoa_reparse = 1;
4476 
4477 	return (0);
4478 }
4479 
4480 /*
4481  * Function to check whether a symlink is a reparse point.
4482  * Return B_TRUE if it is a reparse point, else return B_FALSE
4483  */
4484 boolean_t
4485 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4486 {
4487 	xvattr_t xvattr;
4488 	xoptattr_t *xoap;
4489 
4490 	if ((vp->v_type != VLNK) ||
4491 	    !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4492 		return (B_FALSE);
4493 
4494 	xva_init(&xvattr);
4495 	xoap = xva_getxoptattr(&xvattr);
4496 	ASSERT(xoap);
4497 	XVA_SET_REQ(&xvattr, XAT_REPARSE);
4498 
4499 	if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
4500 		return (B_FALSE);
4501 
4502 	if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4503 	    (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4504 		return (B_FALSE);
4505 
4506 	return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
4507 }
4508