xref: /illumos-gate/usr/src/uts/common/fs/vnode.c (revision 133d96a86dd791c469e03dc4c80ace8f7775aa93)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2020 Joyent, Inc.
25  * Copyright 2022 Spencer Evans-Cole.
26  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
27  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
28  */
29 
30 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
31 /*	  All Rights Reserved	*/
32 
33 /*
34  * University Copyright- Copyright (c) 1982, 1986, 1988
35  * The Regents of the University of California
36  * All Rights Reserved
37  *
38  * University Acknowledgment- Portions of this document are derived from
39  * software developed by the University of California, Berkeley, and its
40  * contributors.
41  */
42 
43 #include <sys/types.h>
44 #include <sys/param.h>
45 #include <sys/t_lock.h>
46 #include <sys/errno.h>
47 #include <sys/cred.h>
48 #include <sys/user.h>
49 #include <sys/uio.h>
50 #include <sys/file.h>
51 #include <sys/pathname.h>
52 #include <sys/vfs.h>
53 #include <sys/vfs_opreg.h>
54 #include <sys/vnode.h>
55 #include <sys/filio.h>
56 #include <sys/rwstlock.h>
57 #include <sys/fem.h>
58 #include <sys/stat.h>
59 #include <sys/mode.h>
60 #include <sys/conf.h>
61 #include <sys/sysmacros.h>
62 #include <sys/cmn_err.h>
63 #include <sys/systm.h>
64 #include <sys/kmem.h>
65 #include <sys/debug.h>
66 #include <c2/audit.h>
67 #include <sys/acl.h>
68 #include <sys/nbmlock.h>
69 #include <sys/fcntl.h>
70 #include <fs/fs_subr.h>
71 #include <sys/taskq.h>
72 #include <fs/fs_reparse.h>
73 #include <sys/time.h>
74 #include <sys/sdt.h>
75 
/*
 * True when vp is neither a character device, a block device nor a FIFO
 * and vn_is_readonly() reports it as read-only.  The argument is expanded
 * more than once.
 */
#define	ISROFILE(vp)	\
	(!((vp)->v_type == VCHR || (vp)->v_type == VBLK || \
	    (vp)->v_type == VFIFO) && vn_is_readonly(vp))
80 
/* Tunable via /etc/system; used only by admin/install */
int nfs_global_client_only;

/*
 * Array of vopstats_t for per-FS-type vopstats.  This array has the same
 * number of entries as and parallel to the vfssw table.  (Arguably, it could
 * be part of the vfssw table.)  Once it's initialized, it's accessed using
 * the same fstype index that is used to index into the vfssw table.
 *
 * The array of pointers itself is allocated zero-filled, with nfstype
 * entries, by vopstats_startup(); get_fstype_vopstats() reads from it.
 */
vopstats_t **vopstats_fstype;

/*
 * vopstats initialization template used for fast initialization via bcopy().
 * Built once by create_vopstats_template() (called from vopstats_startup())
 * and copied into a caller's vopstats_t by initialize_vopstats().
 */
static vopstats_t *vs_templatep;

/*
 * Kmem cache handle for vsk_anchor_t allocations; the cache is created in
 * vopstats_startup().
 */
kmem_cache_t *vsk_anchor_cache;

/* file events cleanup routine */
extern void free_fopdata(vnode_t *);

/*
 * Root of AVL tree for the kstats associated with vopstats.  Lock protects
 * updates to vskstat_tree.  Nodes are vsk_anchor_t structures ordered by
 * vsk_fsid (see vska_compar()).
 */
avl_tree_t	vskstat_tree;
kmutex_t	vskstat_tree_lock;

/*
 * Global variable which enables/disables the vopstats collection.  When it
 * is zero, vopstats_startup() sets nothing up and new_vskstat(),
 * get_fstype_vopstats(), get_vskstat_anchor() and teardown_vopstats()
 * return without doing any work.
 */
int vopstats_enabled = 1;

/* Global used for empty/invalid v_path */
char *vn_vpath_empty = "";
113 
/*
 * forward declarations for internal vnode specific data (vsd)
 * NOTE(review): the definition is not in this part of the file; presumably
 * it follows later -- confirm.
 */
static void *vsd_realloc(void *, size_t, size_t);

/*
 * forward declarations for reparse point functions
 * NOTE(review): as above, the definition is expected later in the file.
 */
static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
123 
124 /*
125  * VSD -- VNODE SPECIFIC DATA
126  * The v_data pointer is typically used by a file system to store a
127  * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
128  * However, there are times when additional project private data needs
129  * to be stored separately from the data (node) pointed to by v_data.
130  * This additional data could be stored by the file system itself or
131  * by a completely different kernel entity.  VSD provides a way for
132  * callers to obtain a key and store a pointer to private data associated
133  * with a vnode.
134  *
135  * Callers are responsible for protecting the vsd by holding v_vsd_lock
136  * for calls to vsd_set() and vsd_get().
137  */
138 
/*
 * vsd_lock protects:
 *   vsd_nkeys - creation and deletion of vsd keys
 *   vsd_list - insertion and deletion of vsd_node in the vsd_list
 *   vsd_destructor - adding and removing destructors to the list
 *
 * Note that this is a different lock from the per-vnode v_vsd_lock that
 * callers hold around vsd_set() and vsd_get().
 */
static kmutex_t		vsd_lock;
static uint_t		vsd_nkeys;	 /* size of destructor array */
/* list of vsd_node's; NULL until first set up */
static list_t *vsd_list = NULL;
/* per-key destructor funcs: an array of vsd_nkeys function pointers */
static void		(**vsd_destructor)(void *);
151 
/*
 * The following is the common set of actions needed to update the
 * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 * recording of the bytes transferred.  Since the code is similar
 * but small, it is nearly a duplicate.  Consequently any changes
 * to one may need to be reflected in the other.
 * Rundown of the variables:
 * vp - Pointer to the vnode
 * counter - Partial name structure member to update in vopstats for counts
 *	(the macro pastes it onto "n", e.g. "read" selects nread)
 * bytecounter - Partial name structure member to update in vopstats for bytes
 * bytesval - Value to update in vopstats for bytes
 * vfsp - The vnode's vfs, taken from vp->v_vfsp
 * vsp - Pointer to vopstats structure (first the per-vfs vfs_vopstats,
 *	then the per-fstype structure at vfs_fstypevsp, if there is one)
 * stataddr - Address of the per-vfs 64-bit counter being bumped
 *
 * NOTE(review): the original rundown also listed "fstype" as an index into
 * "vsanchor_fstype[]"; neither name is used by these macros, so that entry
 * looks stale -- confirm.
 */

/*
 * Count one occurrence of the vnode op named by `counter' against vp.
 *
 * Nothing happens unless the vnode has a vfs with a non-NULL vfs_implp,
 * VFS_STATS is set in vfs_flag, and the vnode type is not VBAD.  Otherwise
 * the __dtrace_probe___fsinfo_<counter>() probe routine is called with
 * (vp, 0, address of the per-vfs counter) before that counter is
 * incremented, and then the per-fstype counter is incremented if
 * vfs_fstypevsp is non-NULL.
 *
 * The expansion is a bare brace block that declares its own vfsp, vsp and
 * stataddr, and it evaluates vp more than once.
 */
#define	VOPSTATS_UPDATE(vp, counter) {					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
		(*stataddr)++;						\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
		}							\
	}								\
}
183 
/*
 * I/O flavour of VOPSTATS_UPDATE(): under the same conditions it bumps the
 * n<counter> operation count and also adds bytesval to the `bytecounter'
 * member, in both the per-vfs vopstats and (when vfs_fstypevsp is non-NULL)
 * the per-fstype vopstats.  The dtrace probe routine receives bytesval as
 * its second argument.
 *
 * bytesval is expanded up to three times and is not parenthesized, so
 * callers should pass a simple expression with no side effects.
 */
#define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
		(*stataddr)++;						\
		vsp->bytecounter.value.ui64 += bytesval;		\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
			vsp->bytecounter.value.ui64 += bytesval;	\
		}							\
	}								\
}
201 
/*
 * If the file system does not support XIDs (VFS_XID is clear in its
 * vfs_flag), replace cr with the credential returned by crgetmapped(cr).
 * If the vfsp is NULL the credential is left alone; perhaps we should
 * also map in that case?
 *
 * cr must be a modifiable lvalue: the macro assigns to the caller's
 * variable.  The expansion is a bare brace block that declares its own vfsp.
 */
#define	VOPXID_MAP_CR(vp, cr)	{					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)		\
		cr = crgetmapped(cr);					\
	}
211 
/*
 * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 * numerical order of S_IFMT and vnode types.)
 *
 * iftovt_tab has 16 entries, one per possible value of the file-type
 * nibble of a mode; values with no vnode type listed here map to VNON.
 * NOTE(review): presumably indexed with (mode & S_IFMT) >> 12 by a macro in
 * <sys/vnode.h> -- confirm.
 */
enum vtype iftovt_tab[] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};
220 
/*
 * Reverse mapping: S_IF* format bits for each vnode type, 0 where there is
 * no corresponding stat(2) format.
 * NOTE(review): the entry order must match the declaration order of
 * enum vtype, which is not visible here -- confirm against <sys/vnode.h>.
 */
ushort_t vttoif_tab[] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
	S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
};
225 
/*
 * The system vnode cache.
 * NOTE(review): the code that creates this kmem cache is not in this part
 * of the file.
 */

kmem_cache_t *vn_cache;
231 
232 
/*
 * Vnode operations vector.
 *
 * Translation table with one entry per vnode operation.  Each entry
 * supplies four initializers: the operation name (VOPNAME_*), the byte
 * offset of the matching member within struct vnodeops, and two function
 * pointers.  The table ends with an all-NULL/zero entry.
 *
 * NOTE(review): the two function columns look like (1) the default used
 * when a file system does not supply the operation and (2) the routine
 * installed when the operation must fail; confirm against the definition
 * of fs_operation_trans_def_t.  Entries marked "no errors allowed" use the
 * same routine in both columns.
 */

static const fs_operation_trans_def_t vn_ops_table[] = {
	VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
	    fs_nosys, fs_nosys,

	VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
	    fs_nosys, fs_nosys,

	VOPNAME_READ, offsetof(struct vnodeops, vop_read),
	    fs_nosys, fs_nosys,

	VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
	    fs_nosys, fs_nosys,

	VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
	    fs_nosys, fs_nosys,

	VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
	    fs_setfl, fs_nosys,

	VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
	    fs_nosys, fs_nosys,

	VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
	    fs_nosys, fs_nosys,

	VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
	    fs_nosys, fs_nosys,

	VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
	    fs_nosys, fs_nosys,

	VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
	    fs_nosys, fs_nosys,

	VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
	    fs_nosys, fs_nosys,

	VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
	    fs_nosys, fs_nosys,

	VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
	    fs_nosys, fs_nosys,

	VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
	    fs_nosys, fs_nosys,

	VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
	    fs_nosys, fs_nosys,

	VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
	    fs_nosys, fs_nosys,

	VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
	    fs_nosys, fs_nosys,

	VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
	    fs_nosys, fs_nosys,

	VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
	    fs_nosys, fs_nosys,

	VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
	    fs_nosys, fs_nosys,

	VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
	    fs_nosys, fs_nosys,

	VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
	    fs_rwlock, fs_rwlock,

	VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
	    (fs_generic_func_p)(uintptr_t)fs_rwunlock,
	    (fs_generic_func_p)(uintptr_t)fs_rwunlock,	/* no errors allowed */

	VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
	    fs_nosys, fs_nosys,

	VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
	    fs_cmp, fs_cmp,		/* no errors allowed */

	VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
	    fs_frlock, fs_nosys,

	VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
	    fs_nosys, fs_nosys,

	VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
	    fs_nosys, fs_nosys,

	VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
	    fs_nosys, fs_nosys,

	VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
	    fs_nosys, fs_nosys,

	VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
	    (fs_generic_func_p) fs_nosys_map,
	    (fs_generic_func_p) fs_nosys_map,

	VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
	    (fs_generic_func_p) fs_nosys_addmap,
	    (fs_generic_func_p) fs_nosys_addmap,

	VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
	    fs_nosys, fs_nosys,

	VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
	    (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,

	VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
	    fs_nosys, fs_nosys,

	VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
	    fs_pathconf, fs_nosys,

	VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
	    fs_nosys, fs_nosys,

	VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
	    fs_nosys, fs_nosys,

	VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
	    (fs_generic_func_p)(uintptr_t)fs_dispose,
	    (fs_generic_func_p)(uintptr_t)fs_nodispose,

	VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
	    fs_nosys, fs_nosys,

	VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
	    fs_fab_acl, fs_nosys,

	VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
	    fs_shrlock, fs_nosys,

	VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
	    (fs_generic_func_p) fs_vnevent_nosupport,
	    (fs_generic_func_p) fs_vnevent_nosupport,

	VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
	    fs_nosys, fs_nosys,

	VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
	    fs_nosys, fs_nosys,

	/* Terminator */
	NULL, 0, NULL, NULL
};
383 
384 /* Extensible attribute (xva) routines. */
385 
386 /*
387  * Zero out the structure, set the size of the requested/returned bitmaps,
388  * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
389  * to the returned attributes array.
390  */
391 void
392 xva_init(xvattr_t *xvap)
393 {
394 	bzero(xvap, sizeof (xvattr_t));
395 	xvap->xva_mapsize = XVA_MAPSIZE;
396 	xvap->xva_magic = XVA_MAGIC;
397 	xvap->xva_vattr.va_mask = AT_XVATTR;
398 	xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
399 }
400 
401 /*
402  * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
403  * structure.  Otherwise, returns NULL.
404  */
405 xoptattr_t *
406 xva_getxoptattr(xvattr_t *xvap)
407 {
408 	xoptattr_t *xoap = NULL;
409 	if (xvap->xva_vattr.va_mask & AT_XVATTR)
410 		xoap = &xvap->xva_xoptattrs;
411 	return (xoap);
412 }
413 
414 /*
415  * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
416  * We use the f_fsid reported by VFS_STATVFS() since we use that for the
417  * kstat name.
418  */
419 static int
420 vska_compar(const void *n1, const void *n2)
421 {
422 	int ret;
423 	ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
424 	ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
425 
426 	if (p1 < p2) {
427 		ret = -1;
428 	} else if (p1 > p2) {
429 		ret = 1;
430 	} else {
431 		ret = 0;
432 	}
433 
434 	return (ret);
435 }
436 
437 /*
438  * Used to create a single template which will be bcopy()ed to a newly
439  * allocated vsanchor_combo_t structure in new_vsanchor(), below.
440  */
441 static vopstats_t *
442 create_vopstats_template()
443 {
444 	vopstats_t		*vsp;
445 
446 	vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
447 	bzero(vsp, sizeof (*vsp));	/* Start fresh */
448 
449 	/* VOP_OPEN */
450 	kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
451 	/* VOP_CLOSE */
452 	kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
453 	/* VOP_READ I/O */
454 	kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
455 	kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
456 	/* VOP_WRITE I/O */
457 	kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
458 	kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
459 	/* VOP_IOCTL */
460 	kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
461 	/* VOP_SETFL */
462 	kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
463 	/* VOP_GETATTR */
464 	kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
465 	/* VOP_SETATTR */
466 	kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
467 	/* VOP_ACCESS */
468 	kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
469 	/* VOP_LOOKUP */
470 	kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
471 	/* VOP_CREATE */
472 	kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
473 	/* VOP_REMOVE */
474 	kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
475 	/* VOP_LINK */
476 	kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
477 	/* VOP_RENAME */
478 	kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
479 	/* VOP_MKDIR */
480 	kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
481 	/* VOP_RMDIR */
482 	kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
483 	/* VOP_READDIR I/O */
484 	kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
485 	kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
486 	    KSTAT_DATA_UINT64);
487 	/* VOP_SYMLINK */
488 	kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
489 	/* VOP_READLINK */
490 	kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
491 	/* VOP_FSYNC */
492 	kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
493 	/* VOP_INACTIVE */
494 	kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
495 	/* VOP_FID */
496 	kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
497 	/* VOP_RWLOCK */
498 	kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
499 	/* VOP_RWUNLOCK */
500 	kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
501 	/* VOP_SEEK */
502 	kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
503 	/* VOP_CMP */
504 	kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
505 	/* VOP_FRLOCK */
506 	kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
507 	/* VOP_SPACE */
508 	kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
509 	/* VOP_REALVP */
510 	kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
511 	/* VOP_GETPAGE */
512 	kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
513 	/* VOP_PUTPAGE */
514 	kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
515 	/* VOP_MAP */
516 	kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
517 	/* VOP_ADDMAP */
518 	kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
519 	/* VOP_DELMAP */
520 	kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
521 	/* VOP_POLL */
522 	kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
523 	/* VOP_DUMP */
524 	kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
525 	/* VOP_PATHCONF */
526 	kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
527 	/* VOP_PAGEIO */
528 	kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
529 	/* VOP_DUMPCTL */
530 	kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
531 	/* VOP_DISPOSE */
532 	kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
533 	/* VOP_SETSECATTR */
534 	kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
535 	/* VOP_GETSECATTR */
536 	kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
537 	/* VOP_SHRLOCK */
538 	kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
539 	/* VOP_VNEVENT */
540 	kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
541 	/* VOP_REQZCBUF */
542 	kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
543 	/* VOP_RETZCBUF */
544 	kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
545 
546 	return (vsp);
547 }
548 
549 /*
550  * Creates a kstat structure associated with a vopstats structure.
551  */
552 kstat_t *
553 new_vskstat(char *ksname, vopstats_t *vsp)
554 {
555 	kstat_t		*ksp;
556 
557 	if (!vopstats_enabled) {
558 		return (NULL);
559 	}
560 
561 	ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
562 	    sizeof (vopstats_t)/sizeof (kstat_named_t),
563 	    KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
564 	if (ksp) {
565 		ksp->ks_data = vsp;
566 		kstat_install(ksp);
567 	}
568 
569 	return (ksp);
570 }
571 
572 /*
573  * Called from vfsinit() to initialize the support mechanisms for vopstats
574  */
575 void
576 vopstats_startup()
577 {
578 	if (!vopstats_enabled)
579 		return;
580 
581 	/*
582 	 * Creates the AVL tree which holds per-vfs vopstat anchors.  This
583 	 * is necessary since we need to check if a kstat exists before we
584 	 * attempt to create it.  Also, initialize its lock.
585 	 */
586 	avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
587 	    offsetof(vsk_anchor_t, vsk_node));
588 	mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
589 
590 	vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
591 	    sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
592 	    NULL, NULL, 0);
593 
594 	/*
595 	 * Set up the array of pointers for the vopstats-by-FS-type.
596 	 * The entries will be allocated/initialized as each file system
597 	 * goes through modload/mod_installfs.
598 	 */
599 	vopstats_fstype = (vopstats_t **)kmem_zalloc(
600 	    (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
601 
602 	/* Set up the global vopstats initialization template */
603 	vs_templatep = create_vopstats_template();
604 }
605 
606 /*
607  * We need to have the all of the counters zeroed.
608  * The initialization of the vopstats_t includes on the order of
609  * 50 calls to kstat_named_init().  Rather that do that on every call,
610  * we do it once in a template (vs_templatep) then bcopy it over.
611  */
612 void
613 initialize_vopstats(vopstats_t *vsp)
614 {
615 	if (vsp == NULL)
616 		return;
617 
618 	bcopy(vs_templatep, vsp, sizeof (vopstats_t));
619 }
620 
621 /*
622  * If possible, determine which vopstats by fstype to use and
623  * return a pointer to the caller.
624  */
625 vopstats_t *
626 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
627 {
628 	int		fstype = 0;	/* Index into vfssw[] */
629 	vopstats_t	*vsp = NULL;
630 
631 	if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
632 	    !vopstats_enabled)
633 		return (NULL);
634 	/*
635 	 * Set up the fstype.  We go to so much trouble because all versions
636 	 * of NFS use the same fstype in their vfs even though they have
637 	 * distinct entries in the vfssw[] table.
638 	 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
639 	 */
640 	if (vswp) {
641 		fstype = vswp - vfssw;	/* Gets us the index */
642 	} else {
643 		fstype = vfsp->vfs_fstype;
644 	}
645 
646 	/*
647 	 * Point to the per-fstype vopstats. The only valid values are
648 	 * non-zero positive values less than the number of vfssw[] table
649 	 * entries.
650 	 */
651 	if (fstype > 0 && fstype < nfstype) {
652 		vsp = vopstats_fstype[fstype];
653 	}
654 
655 	return (vsp);
656 }
657 
658 /*
659  * Generate a kstat name, create the kstat structure, and allocate a
660  * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
661  * to the caller.  This must only be called from a mount.
662  */
663 vsk_anchor_t *
664 get_vskstat_anchor(vfs_t *vfsp)
665 {
666 	char		kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
667 	statvfs64_t	statvfsbuf;		/* Needed to find f_fsid */
668 	vsk_anchor_t	*vskp = NULL;		/* vfs <--> kstat anchor */
669 	kstat_t		*ksp;			/* Ptr to new kstat */
670 	avl_index_t	where;			/* Location in the AVL tree */
671 
672 	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
673 	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
674 		return (NULL);
675 
676 	/* Need to get the fsid to build a kstat name */
677 	if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
678 		/* Create a name for our kstats based on fsid */
679 		(void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
680 		    VOPSTATS_STR, statvfsbuf.f_fsid);
681 
682 		/* Allocate and initialize the vsk_anchor_t */
683 		vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
684 		bzero(vskp, sizeof (*vskp));
685 		vskp->vsk_fsid = statvfsbuf.f_fsid;
686 
687 		mutex_enter(&vskstat_tree_lock);
688 		if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
689 			avl_insert(&vskstat_tree, vskp, where);
690 			mutex_exit(&vskstat_tree_lock);
691 
692 			/*
693 			 * Now that we've got the anchor in the AVL
694 			 * tree, we can create the kstat.
695 			 */
696 			ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
697 			if (ksp) {
698 				vskp->vsk_ksp = ksp;
699 			}
700 		} else {
701 			/* Oops, found one! Release memory and lock. */
702 			mutex_exit(&vskstat_tree_lock);
703 			kmem_cache_free(vsk_anchor_cache, vskp);
704 			vskp = NULL;
705 		}
706 	}
707 	return (vskp);
708 }
709 
710 /*
711  * We're in the process of tearing down the vfs and need to cleanup
712  * the data structures associated with the vopstats. Must only be called
713  * from dounmount().
714  */
715 void
716 teardown_vopstats(vfs_t *vfsp)
717 {
718 	vsk_anchor_t	*vskap;
719 	avl_index_t	where;
720 
721 	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
722 	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
723 		return;
724 
725 	/* This is a safe check since VFS_STATS must be set (see above) */
726 	if ((vskap = vfsp->vfs_vskap) == NULL)
727 		return;
728 
729 	/* Whack the pointer right away */
730 	vfsp->vfs_vskap = NULL;
731 
732 	/* Lock the tree, remove the node, and delete the kstat */
733 	mutex_enter(&vskstat_tree_lock);
734 	if (avl_find(&vskstat_tree, vskap, &where)) {
735 		avl_remove(&vskstat_tree, vskap);
736 	}
737 
738 	if (vskap->vsk_ksp) {
739 		kstat_delete(vskap->vsk_ksp);
740 	}
741 	mutex_exit(&vskstat_tree_lock);
742 
743 	kmem_cache_free(vsk_anchor_cache, vskap);
744 }
745 
746 /*
747  * Read or write a vnode.  Called from kernel code.
748  */
749 int
750 vn_rdwr(
751 	enum uio_rw rw,
752 	struct vnode *vp,
753 	caddr_t base,
754 	ssize_t len,
755 	offset_t offset,
756 	enum uio_seg seg,
757 	int ioflag,
758 	rlim64_t ulimit,	/* meaningful only if rw is UIO_WRITE */
759 	cred_t *cr,
760 	ssize_t *residp)
761 {
762 	struct uio uio;
763 	struct iovec iov;
764 	int error;
765 	int in_crit = 0;
766 
767 	if (rw == UIO_WRITE && ISROFILE(vp))
768 		return (EROFS);
769 
770 	if (len < 0)
771 		return (EIO);
772 
773 	VOPXID_MAP_CR(vp, cr);
774 
775 	iov.iov_base = base;
776 	iov.iov_len = len;
777 	uio.uio_iov = &iov;
778 	uio.uio_iovcnt = 1;
779 	uio.uio_loffset = offset;
780 	uio.uio_segflg = (short)seg;
781 	uio.uio_resid = len;
782 	uio.uio_llimit = ulimit;
783 
784 	/*
785 	 * We have to enter the critical region before calling VOP_RWLOCK
786 	 * to avoid a deadlock with ufs.
787 	 */
788 	if (nbl_need_check(vp)) {
789 		int svmand;
790 
791 		nbl_start_crit(vp, RW_READER);
792 		in_crit = 1;
793 		error = nbl_svmand(vp, cr, &svmand);
794 		if (error != 0)
795 			goto done;
796 		if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
797 		    uio.uio_offset, uio.uio_resid, svmand, NULL)) {
798 			error = EACCES;
799 			goto done;
800 		}
801 	}
802 
803 	(void) VOP_RWLOCK(vp,
804 	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
805 	if (rw == UIO_WRITE) {
806 		uio.uio_fmode = FWRITE;
807 		uio.uio_extflg = UIO_COPY_DEFAULT;
808 		error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
809 	} else {
810 		uio.uio_fmode = FREAD;
811 		uio.uio_extflg = UIO_COPY_CACHED;
812 		error = VOP_READ(vp, &uio, ioflag, cr, NULL);
813 	}
814 	VOP_RWUNLOCK(vp,
815 	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
816 	if (residp)
817 		*residp = uio.uio_resid;
818 	else if (uio.uio_resid)
819 		error = EIO;
820 
821 done:
822 	if (in_crit)
823 		nbl_end_crit(vp);
824 	return (error);
825 }
826 
827 /*
828  * Release a vnode.  Call VOP_INACTIVE on last reference or
829  * decrement reference count.
830  *
831  * To avoid race conditions, the v_count is left at 1 for
832  * the call to VOP_INACTIVE. This prevents another thread
833  * from reclaiming and releasing the vnode *before* the
834  * VOP_INACTIVE routine has a chance to destroy the vnode.
835  * We can't have more than 1 thread calling VOP_INACTIVE
836  * on a vnode.
837  */
838 void
839 vn_rele(vnode_t *vp)
840 {
841 	mutex_enter(&vp->v_lock);
842 	if (vp->v_count == 1) {
843 		mutex_exit(&vp->v_lock);
844 		VOP_INACTIVE(vp, CRED(), NULL);
845 		return;
846 	}
847 	VERIFY(vp->v_count > 0);
848 	VN_RELE_LOCKED(vp);
849 	mutex_exit(&vp->v_lock);
850 }
851 
852 /*
853  * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
854  * as a single reference, so v_count is not decremented until the last DNLC hold
855  * is released. This makes it possible to distinguish vnodes that are referenced
856  * only by the DNLC.
857  */
858 void
859 vn_rele_dnlc(vnode_t *vp)
860 {
861 	mutex_enter(&vp->v_lock);
862 	VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
863 	if (--vp->v_count_dnlc == 0) {
864 		if (vp->v_count == 1) {
865 			mutex_exit(&vp->v_lock);
866 			VOP_INACTIVE(vp, CRED(), NULL);
867 			return;
868 		}
869 		VN_RELE_LOCKED(vp);
870 	}
871 	mutex_exit(&vp->v_lock);
872 }
873 
874 /*
875  * Like vn_rele() except that it clears v_stream under v_lock.
876  * This is used by sockfs when it dismantles the association between
877  * the sockfs node and the vnode in the underlying file system.
878  * v_lock has to be held to prevent a thread coming through the lookupname
879  * path from accessing a stream head that is going away.
880  */
881 void
882 vn_rele_stream(vnode_t *vp)
883 {
884 	mutex_enter(&vp->v_lock);
885 	vp->v_stream = NULL;
886 	if (vp->v_count == 1) {
887 		mutex_exit(&vp->v_lock);
888 		VOP_INACTIVE(vp, CRED(), NULL);
889 		return;
890 	}
891 	VERIFY(vp->v_count > 0);
892 	VN_RELE_LOCKED(vp);
893 	mutex_exit(&vp->v_lock);
894 }
895 
896 static void
897 vn_rele_inactive(vnode_t *vp)
898 {
899 	VOP_INACTIVE(vp, CRED(), NULL);
900 }
901 
902 /*
903  * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
904  * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
905  * the file system as a result of releasing the vnode. Note, file systems
906  * already have to handle the race where the vnode is incremented before the
907  * inactive routine is called and does its locking.
908  *
909  * Warning: Excessive use of this routine can lead to performance problems.
910  * This is because taskqs throttle back allocation if too many are created.
911  */
912 void
913 vn_rele_async(vnode_t *vp, taskq_t *taskq)
914 {
915 	mutex_enter(&vp->v_lock);
916 	if (vp->v_count == 1) {
917 		mutex_exit(&vp->v_lock);
918 		VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
919 		    vp, TQ_SLEEP) != TASKQID_INVALID);
920 		return;
921 	}
922 	VERIFY(vp->v_count > 0);
923 	VN_RELE_LOCKED(vp);
924 	mutex_exit(&vp->v_lock);
925 }
926 
927 int
928 vn_open(
929 	char *pnamep,
930 	enum uio_seg seg,
931 	int filemode,
932 	int createmode,
933 	struct vnode **vpp,
934 	enum create crwhy,
935 	mode_t umask)
936 {
937 	return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
938 	    umask, NULL, -1));
939 }
940 
941 
942 /*
943  * Open/create a vnode.
944  * This may be callable by the kernel, the only known use
945  * of user context being that the current user credentials
946  * are used for permissions.  crwhy is defined iff filemode & FCREAT.
947  */
948 int
949 vn_openat(
950 	char *pnamep,
951 	enum uio_seg seg,
952 	int filemode,
953 	int createmode,
954 	struct vnode **vpp,
955 	enum create crwhy,
956 	mode_t umask,
957 	struct vnode *startvp,
958 	int fd)
959 {
960 	struct vnode *vp;
961 	int mode;
962 	int accessflags;
963 	int error;
964 	int in_crit = 0;
965 	int open_done = 0;
966 	int shrlock_done = 0;
967 	struct vattr vattr;
968 	enum symfollow follow;
969 	int estale_retry = 0;
970 	struct shrlock shr;
971 	struct shr_locowner shr_own;
972 	boolean_t create;
973 
974 	mode = 0;
975 	accessflags = 0;
976 	if (filemode & FREAD)
977 		mode |= VREAD;
978 	if (filemode & (FWRITE|FTRUNC))
979 		mode |= VWRITE;
980 	if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
981 		mode |= VEXEC;
982 
983 	/* symlink interpretation */
984 	if (filemode & FNOFOLLOW)
985 		follow = NO_FOLLOW;
986 	else
987 		follow = FOLLOW;
988 
989 	if (filemode & FAPPEND)
990 		accessflags |= V_APPEND;
991 
992 	/*
993 	 * We need to handle the case of FCREAT | FDIRECTORY and the case of
994 	 * FEXCL. If all three are specified, then we always fail because we
995 	 * cannot create a directory through this interface and FEXCL says we
996 	 * need to fail the request if we can't create it. If, however, only
997 	 * FCREAT | FDIRECTORY are specified, then we can treat this as the case
998 	 * of opening a file that already exists. If it exists, we can do
999 	 * something and if not, we fail. Effectively FCREAT | FDIRECTORY is
1000 	 * treated as FDIRECTORY.
1001 	 */
1002 	if ((filemode & (FCREAT | FDIRECTORY | FEXCL)) ==
1003 	    (FCREAT | FDIRECTORY | FEXCL)) {
1004 		return (EINVAL);
1005 	}
1006 
1007 	if ((filemode & (FCREAT | FDIRECTORY)) == (FCREAT | FDIRECTORY)) {
1008 		create = B_FALSE;
1009 	} else if ((filemode & FCREAT) != 0) {
1010 		create = B_TRUE;
1011 	} else {
1012 		create = B_FALSE;
1013 	}
1014 
1015 top:
1016 	if (create) {
1017 		enum vcexcl excl;
1018 
1019 		/*
1020 		 * Wish to create a file.
1021 		 */
1022 		vattr.va_type = VREG;
1023 		vattr.va_mode = createmode;
1024 		vattr.va_mask = AT_TYPE|AT_MODE;
1025 		if (filemode & FTRUNC) {
1026 			vattr.va_size = 0;
1027 			vattr.va_mask |= AT_SIZE;
1028 		}
1029 		if (filemode & FEXCL)
1030 			excl = EXCL;
1031 		else
1032 			excl = NONEXCL;
1033 
1034 		if (error =
1035 		    vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
1036 		    (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
1037 			return (error);
1038 	} else {
1039 		/*
1040 		 * Wish to open a file.  Just look it up.
1041 		 */
1042 		if (error = lookupnameat(pnamep, seg, follow,
1043 		    NULLVPP, &vp, startvp)) {
1044 			if ((error == ESTALE) &&
1045 			    fs_need_estale_retry(estale_retry++))
1046 				goto top;
1047 			return (error);
1048 		}
1049 
1050 		/*
1051 		 * Get the attributes to check whether file is large.
1052 		 * We do this only if the FOFFMAX flag is not set and
1053 		 * only for regular files.
1054 		 */
1055 
1056 		if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
1057 			vattr.va_mask = AT_SIZE;
1058 			if ((error = VOP_GETATTR(vp, &vattr, 0,
1059 			    CRED(), NULL))) {
1060 				goto out;
1061 			}
1062 			if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
1063 				/*
1064 				 * Large File API - regular open fails
1065 				 * if FOFFMAX flag is set in file mode
1066 				 */
1067 				error = EOVERFLOW;
1068 				goto out;
1069 			}
1070 		}
1071 		/*
1072 		 * Can't write directories, active texts, or
1073 		 * read-only filesystems.  Can't truncate files
1074 		 * on which mandatory locking is in effect.
1075 		 */
1076 		if (filemode & (FWRITE|FTRUNC)) {
1077 			/*
1078 			 * Allow writable directory if VDIROPEN flag is set.
1079 			 */
1080 			if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
1081 				error = EISDIR;
1082 				goto out;
1083 			}
1084 			if (ISROFILE(vp)) {
1085 				error = EROFS;
1086 				goto out;
1087 			}
1088 			/*
1089 			 * Can't truncate files on which
1090 			 * sysv mandatory locking is in effect.
1091 			 */
1092 			if (filemode & FTRUNC) {
1093 				vnode_t *rvp;
1094 
1095 				if (VOP_REALVP(vp, &rvp, NULL) != 0)
1096 					rvp = vp;
1097 				if (rvp->v_filocks != NULL) {
1098 					vattr.va_mask = AT_MODE;
1099 					if ((error = VOP_GETATTR(vp,
1100 					    &vattr, 0, CRED(), NULL)) == 0 &&
1101 					    MANDLOCK(vp, vattr.va_mode))
1102 						error = EAGAIN;
1103 				}
1104 			}
1105 			if (error)
1106 				goto out;
1107 		}
1108 		/*
1109 		 * Check permissions.
1110 		 */
1111 		if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
1112 			goto out;
1113 
1114 		/*
1115 		 * Require FSEARCH and FDIRECTORY to return a directory. Require
1116 		 * FEXEC to return a regular file.
1117 		 */
1118 		if ((filemode & (FSEARCH|FDIRECTORY)) != 0 &&
1119 		    vp->v_type != VDIR) {
1120 			error = ENOTDIR;
1121 			goto out;
1122 		}
1123 		if ((filemode & FEXEC) && vp->v_type != VREG) {
1124 			error = ENOEXEC;	/* XXX: error code? */
1125 			goto out;
1126 		}
1127 	}
1128 
1129 	/*
1130 	 * Do remaining checks for FNOFOLLOW and FNOLINKS.
1131 	 */
1132 	if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
1133 		error = ELOOP;
1134 		goto out;
1135 	}
1136 	if (filemode & FNOLINKS) {
1137 		vattr.va_mask = AT_NLINK;
1138 		if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
1139 			goto out;
1140 		}
1141 		if (vattr.va_nlink != 1) {
1142 			error = EMLINK;
1143 			goto out;
1144 		}
1145 	}
1146 
1147 	/*
1148 	 * Opening a socket corresponding to the AF_UNIX pathname
1149 	 * in the filesystem name space is not supported.
1150 	 * However, VSOCK nodes in namefs are supported in order
1151 	 * to make fattach work for sockets.
1152 	 *
1153 	 * XXX This uses VOP_REALVP to distinguish between
1154 	 * an unopened namefs node (where VOP_REALVP returns a
1155 	 * different VSOCK vnode) and a VSOCK created by vn_create
1156 	 * in some file system (where VOP_REALVP would never return
1157 	 * a different vnode).
1158 	 */
1159 	if (vp->v_type == VSOCK) {
1160 		struct vnode *nvp;
1161 
1162 		error = VOP_REALVP(vp, &nvp, NULL);
1163 		if (error != 0 || nvp == NULL || nvp == vp ||
1164 		    nvp->v_type != VSOCK) {
1165 			error = EOPNOTSUPP;
1166 			goto out;
1167 		}
1168 	}
1169 
1170 	if ((vp->v_type == VREG) && nbl_need_check(vp)) {
1171 		/* get share reservation */
1172 		shr.s_access = 0;
1173 		if (filemode & FWRITE)
1174 			shr.s_access |= F_WRACC;
1175 		if (filemode & FREAD)
1176 			shr.s_access |= F_RDACC;
1177 		shr.s_deny = 0;
1178 		shr.s_sysid = 0;
1179 		shr.s_pid = ttoproc(curthread)->p_pid;
1180 		shr_own.sl_pid = shr.s_pid;
1181 		shr_own.sl_id = fd;
1182 		shr.s_own_len = sizeof (shr_own);
1183 		shr.s_owner = (caddr_t)&shr_own;
1184 		error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
1185 		    NULL);
1186 		if (error)
1187 			goto out;
1188 		shrlock_done = 1;
1189 
1190 		/* nbmand conflict check if truncating file */
1191 		if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1192 			nbl_start_crit(vp, RW_READER);
1193 			in_crit = 1;
1194 
1195 			vattr.va_mask = AT_SIZE;
1196 			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
1197 				goto out;
1198 			if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
1199 			    NULL)) {
1200 				error = EACCES;
1201 				goto out;
1202 			}
1203 		}
1204 	}
1205 
1206 	/*
1207 	 * Do opening protocol.
1208 	 */
1209 	error = VOP_OPEN(&vp, filemode, CRED(), NULL);
1210 	if (error)
1211 		goto out;
1212 	open_done = 1;
1213 
1214 	/*
1215 	 * Truncate if required.
1216 	 */
1217 	if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1218 		vattr.va_size = 0;
1219 		vattr.va_mask = AT_SIZE;
1220 		if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
1221 			goto out;
1222 	}
1223 
1224 	/*
1225 	 * Turn on directio, if requested.
1226 	 */
1227 	if (filemode & FDIRECT) {
1228 		if ((error = VOP_IOCTL(vp, _FIODIRECTIO, DIRECTIO_ON, 0,
1229 		    CRED(), NULL, NULL)) != 0) {
1230 			/*
1231 			 * On Linux, O_DIRECT returns EINVAL when the file
1232 			 * system does not support directio, so we'll do the
1233 			 * same.
1234 			 */
1235 			error = EINVAL;
1236 			goto out;
1237 		}
1238 	}
1239 out:
1240 	ASSERT(vp->v_count > 0);
1241 
1242 	if (in_crit) {
1243 		nbl_end_crit(vp);
1244 		in_crit = 0;
1245 	}
1246 	if (error) {
1247 		if (open_done) {
1248 			(void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
1249 			    NULL);
1250 			open_done = 0;
1251 			shrlock_done = 0;
1252 		}
1253 		if (shrlock_done) {
1254 			(void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
1255 			    NULL);
1256 			shrlock_done = 0;
1257 		}
1258 
1259 		/*
1260 		 * The following clause was added to handle a problem
1261 		 * with NFS consistency.  It is possible that a lookup
1262 		 * of the file to be opened succeeded, but the file
1263 		 * itself doesn't actually exist on the server.  This
1264 		 * is chiefly due to the DNLC containing an entry for
1265 		 * the file which has been removed on the server.  In
1266 		 * this case, we just start over.  If there was some
1267 		 * other cause for the ESTALE error, then the lookup
1268 		 * of the file will fail and the error will be returned
1269 		 * above instead of looping around from here.
1270 		 */
1271 		VN_RELE(vp);
1272 		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1273 			goto top;
1274 	} else
1275 		*vpp = vp;
1276 	return (error);
1277 }
1278 
1279 /*
1280  * The following two accessor functions are for the NFSv4 server.  Since there
1281  * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1282  * vnode open counts correct when a client "upgrades" an open or does an
1283  * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
1284  * open mode (add or subtract read or write), but also change the share/deny
1285  * modes.  However, share reservations are not integrated with OPEN, yet, so
1286  * we need to handle each separately.  These functions are cleaner than having
1287  * the NFS server manipulate the counts directly, however, nobody else should
1288  * use these functions.
1289  */
1290 void
1291 vn_open_upgrade(
1292 	vnode_t *vp,
1293 	int filemode)
1294 {
1295 	ASSERT(vp->v_type == VREG);
1296 
1297 	if (filemode & FREAD)
1298 		atomic_inc_32(&vp->v_rdcnt);
1299 	if (filemode & FWRITE)
1300 		atomic_inc_32(&vp->v_wrcnt);
1301 
1302 }
1303 
1304 void
1305 vn_open_downgrade(
1306 	vnode_t *vp,
1307 	int filemode)
1308 {
1309 	ASSERT(vp->v_type == VREG);
1310 
1311 	if (filemode & FREAD) {
1312 		ASSERT(vp->v_rdcnt > 0);
1313 		atomic_dec_32(&vp->v_rdcnt);
1314 	}
1315 	if (filemode & FWRITE) {
1316 		ASSERT(vp->v_wrcnt > 0);
1317 		atomic_dec_32(&vp->v_wrcnt);
1318 	}
1319 
1320 }
1321 
1322 int
1323 vn_create(
1324 	char *pnamep,
1325 	enum uio_seg seg,
1326 	struct vattr *vap,
1327 	enum vcexcl excl,
1328 	int mode,
1329 	struct vnode **vpp,
1330 	enum create why,
1331 	int flag,
1332 	mode_t umask)
1333 {
1334 	return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1335 	    umask, NULL));
1336 }
1337 
1338 /*
1339  * Create a vnode (makenode).
1340  */
1341 int
1342 vn_createat(
1343 	char *pnamep,
1344 	enum uio_seg seg,
1345 	struct vattr *vap,
1346 	enum vcexcl excl,
1347 	int mode,
1348 	struct vnode **vpp,
1349 	enum create why,
1350 	int flag,
1351 	mode_t umask,
1352 	struct vnode *startvp)
1353 {
1354 	struct vnode *dvp;	/* ptr to parent dir vnode */
1355 	struct vnode *vp = NULL;
1356 	struct pathname pn;
1357 	int error;
1358 	int in_crit = 0;
1359 	struct vattr vattr;
1360 	enum symfollow follow;
1361 	int estale_retry = 0;
1362 	uint32_t auditing = AU_AUDITING();
1363 
1364 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1365 
1366 	/* symlink interpretation */
1367 	if ((flag & FNOFOLLOW) || excl == EXCL)
1368 		follow = NO_FOLLOW;
1369 	else
1370 		follow = FOLLOW;
1371 	flag &= ~(FNOFOLLOW|FNOLINKS);
1372 
1373 top:
1374 	/*
1375 	 * Lookup directory.
1376 	 * If new object is a file, call lower level to create it.
1377 	 * Note that it is up to the lower level to enforce exclusive
1378 	 * creation, if the file is already there.
1379 	 * This allows the lower level to do whatever
1380 	 * locking or protocol that is needed to prevent races.
1381 	 * If the new object is directory call lower level to make
1382 	 * the new directory, with "." and "..".
1383 	 */
1384 	if (error = pn_get(pnamep, seg, &pn))
1385 		return (error);
1386 	if (auditing)
1387 		audit_vncreate_start();
1388 	dvp = NULL;
1389 	*vpp = NULL;
1390 	/*
1391 	 * lookup will find the parent directory for the vnode.
1392 	 * When it is done the pn holds the name of the entry
1393 	 * in the directory.
1394 	 * If this is a non-exclusive create we also find the node itself.
1395 	 */
1396 	error = lookuppnat(&pn, NULL, follow, &dvp,
1397 	    (excl == EXCL) ? NULLVPP : vpp, startvp);
1398 	if (error) {
1399 		pn_free(&pn);
1400 		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1401 			goto top;
1402 		if (why == CRMKDIR && error == EINVAL)
1403 			error = EEXIST;		/* SVID */
1404 		return (error);
1405 	}
1406 
1407 	if (why != CRMKNOD)
1408 		vap->va_mode &= ~VSVTX;
1409 
1410 	/*
1411 	 * If default ACLs are defined for the directory don't apply the
1412 	 * umask if umask is passed.
1413 	 */
1414 
1415 	if (umask) {
1416 
1417 		vsecattr_t vsec;
1418 
1419 		vsec.vsa_aclcnt = 0;
1420 		vsec.vsa_aclentp = NULL;
1421 		vsec.vsa_dfaclcnt = 0;
1422 		vsec.vsa_dfaclentp = NULL;
1423 		vsec.vsa_mask = VSA_DFACLCNT;
1424 		error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
1425 		/*
1426 		 * If error is ENOSYS then treat it as no error
1427 		 * Don't want to force all file systems to support
1428 		 * aclent_t style of ACL's.
1429 		 */
1430 		if (error == ENOSYS)
1431 			error = 0;
1432 		if (error) {
1433 			if (*vpp != NULL)
1434 				VN_RELE(*vpp);
1435 			goto out;
1436 		} else {
1437 			/*
1438 			 * Apply the umask if no default ACLs.
1439 			 */
1440 			if (vsec.vsa_dfaclcnt == 0)
1441 				vap->va_mode &= ~umask;
1442 
1443 			/*
1444 			 * VOP_GETSECATTR() may have allocated memory for
1445 			 * ACLs we didn't request, so double-check and
1446 			 * free it if necessary.
1447 			 */
1448 			if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1449 				kmem_free((caddr_t)vsec.vsa_aclentp,
1450 				    vsec.vsa_aclcnt * sizeof (aclent_t));
1451 			if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1452 				kmem_free((caddr_t)vsec.vsa_dfaclentp,
1453 				    vsec.vsa_dfaclcnt * sizeof (aclent_t));
1454 		}
1455 	}
1456 
1457 	/*
1458 	 * In general we want to generate EROFS if the file system is
1459 	 * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
1460 	 * documents the open system call, and it says that O_CREAT has no
1461 	 * effect if the file already exists.  Bug 1119649 states
1462 	 * that open(path, O_CREAT, ...) fails when attempting to open an
1463 	 * existing file on a read only file system.  Thus, the first part
1464 	 * of the following if statement has 3 checks:
1465 	 *	if the file exists &&
1466 	 *		it is being open with write access &&
1467 	 *		the file system is read only
1468 	 *	then generate EROFS
1469 	 */
1470 	if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1471 	    (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1472 		if (*vpp)
1473 			VN_RELE(*vpp);
1474 		error = EROFS;
1475 	} else if (excl == NONEXCL && *vpp != NULL) {
1476 		vnode_t *rvp;
1477 
1478 		/*
1479 		 * File already exists.  If a mandatory lock has been
1480 		 * applied, return error.
1481 		 */
1482 		vp = *vpp;
1483 		if (VOP_REALVP(vp, &rvp, NULL) != 0)
1484 			rvp = vp;
1485 		if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1486 			nbl_start_crit(vp, RW_READER);
1487 			in_crit = 1;
1488 		}
1489 		if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1490 			vattr.va_mask = AT_MODE|AT_SIZE;
1491 			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
1492 				goto out;
1493 			}
1494 			if (MANDLOCK(vp, vattr.va_mode)) {
1495 				error = EAGAIN;
1496 				goto out;
1497 			}
1498 			/*
1499 			 * File cannot be truncated if non-blocking mandatory
1500 			 * locks are currently on the file.
1501 			 */
1502 			if ((vap->va_mask & AT_SIZE) && in_crit) {
1503 				u_offset_t offset;
1504 				ssize_t length;
1505 
1506 				offset = vap->va_size > vattr.va_size ?
1507 				    vattr.va_size : vap->va_size;
1508 				length = vap->va_size > vattr.va_size ?
1509 				    vap->va_size - vattr.va_size :
1510 				    vattr.va_size - vap->va_size;
1511 				if (nbl_conflict(vp, NBL_WRITE, offset,
1512 				    length, 0, NULL)) {
1513 					error = EACCES;
1514 					goto out;
1515 				}
1516 			}
1517 		}
1518 
1519 		/*
1520 		 * If the file is the root of a VFS, we've crossed a
1521 		 * mount point and the "containing" directory that we
1522 		 * acquired above (dvp) is irrelevant because it's in
1523 		 * a different file system.  We apply VOP_CREATE to the
1524 		 * target itself instead of to the containing directory
1525 		 * and supply a null path name to indicate (conventionally)
1526 		 * the node itself as the "component" of interest.
1527 		 *
1528 		 * The call to VOP_CREATE() is necessary to ensure
1529 		 * that the appropriate permission checks are made,
1530 		 * i.e. EISDIR, EACCES, etc.  We already know that vpp
1531 		 * exists since we are in the else condition where this
1532 		 * was checked.
1533 		 */
1534 		if (vp->v_flag & VROOT) {
1535 			ASSERT(why != CRMKDIR);
1536 			error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
1537 			    CRED(), flag, NULL, NULL);
1538 			/*
1539 			 * If the create succeeded, it will have created a
1540 			 * new reference on a new vnode (*vpp) in the child
1541 			 * file system, so we want to drop our reference on
1542 			 * the old (vp) upon exit.
1543 			 */
1544 			goto out;
1545 		}
1546 
1547 		/*
1548 		 * Large File API - non-large open (FOFFMAX flag not set)
1549 		 * of regular file fails if the file size exceeds MAXOFF32_T.
1550 		 */
1551 		if (why != CRMKDIR &&
1552 		    !(flag & FOFFMAX) &&
1553 		    (vp->v_type == VREG)) {
1554 			vattr.va_mask = AT_SIZE;
1555 			if ((error = VOP_GETATTR(vp, &vattr, 0,
1556 			    CRED(), NULL))) {
1557 				goto out;
1558 			}
1559 			if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
1560 				error = EOVERFLOW;
1561 				goto out;
1562 			}
1563 		}
1564 	}
1565 
1566 	if (error == 0) {
1567 		/*
1568 		 * Call mkdir() if specified, otherwise create().
1569 		 */
1570 		int must_be_dir = pn_fixslash(&pn);	/* trailing '/'? */
1571 
1572 		if (why == CRMKDIR)
1573 			/*
1574 			 * N.B., if vn_createat() ever requests
1575 			 * case-insensitive behavior then it will need
1576 			 * to be passed to VOP_MKDIR().  VOP_CREATE()
1577 			 * will already get it via "flag"
1578 			 */
1579 			error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
1580 			    NULL, 0, NULL);
1581 		else if (!must_be_dir)
1582 			error = VOP_CREATE(dvp, pn.pn_path, vap,
1583 			    excl, mode, vpp, CRED(), flag, NULL, NULL);
1584 		else
1585 			error = ENOTDIR;
1586 	}
1587 
1588 out:
1589 
1590 	if (auditing)
1591 		audit_vncreate_finish(*vpp, error);
1592 	if (in_crit) {
1593 		nbl_end_crit(vp);
1594 		in_crit = 0;
1595 	}
1596 	if (vp != NULL) {
1597 		VN_RELE(vp);
1598 		vp = NULL;
1599 	}
1600 	pn_free(&pn);
1601 	VN_RELE(dvp);
1602 	/*
1603 	 * The following clause was added to handle a problem
1604 	 * with NFS consistency.  It is possible that a lookup
1605 	 * of the file to be created succeeded, but the file
1606 	 * itself doesn't actually exist on the server.  This
1607 	 * is chiefly due to the DNLC containing an entry for
1608 	 * the file which has been removed on the server.  In
1609 	 * this case, we just start over.  If there was some
1610 	 * other cause for the ESTALE error, then the lookup
1611 	 * of the file will fail and the error will be returned
1612 	 * above instead of looping around from here.
1613 	 */
1614 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1615 		goto top;
1616 	return (error);
1617 }
1618 
1619 int
1620 vn_link(char *from, char *to, enum uio_seg seg)
1621 {
1622 	return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
1623 }
1624 
1625 int
1626 vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
1627     vnode_t *tstartvp, char *to, enum uio_seg seg)
1628 {
1629 	struct vnode *fvp;		/* from vnode ptr */
1630 	struct vnode *tdvp;		/* to directory vnode ptr */
1631 	struct pathname pn;
1632 	int error;
1633 	struct vattr vattr;
1634 	dev_t fsid;
1635 	int estale_retry = 0;
1636 	uint32_t auditing = AU_AUDITING();
1637 
1638 top:
1639 	fvp = tdvp = NULL;
1640 	if (error = pn_get(to, seg, &pn))
1641 		return (error);
1642 	if (auditing && fstartvp != NULL)
1643 		audit_setfsat_path(1);
1644 	if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
1645 		goto out;
1646 	if (auditing && tstartvp != NULL)
1647 		audit_setfsat_path(3);
1648 	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
1649 		goto out;
1650 	/*
1651 	 * Make sure both source vnode and target directory vnode are
1652 	 * in the same vfs and that it is writeable.
1653 	 */
1654 	vattr.va_mask = AT_FSID;
1655 	if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
1656 		goto out;
1657 	fsid = vattr.va_fsid;
1658 	vattr.va_mask = AT_FSID;
1659 	if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
1660 		goto out;
1661 	if (fsid != vattr.va_fsid) {
1662 		error = EXDEV;
1663 		goto out;
1664 	}
1665 	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
1666 		error = EROFS;
1667 		goto out;
1668 	}
1669 	/*
1670 	 * Do the link.
1671 	 */
1672 	(void) pn_fixslash(&pn);
1673 	error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
1674 out:
1675 	pn_free(&pn);
1676 	if (fvp)
1677 		VN_RELE(fvp);
1678 	if (tdvp)
1679 		VN_RELE(tdvp);
1680 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1681 		goto top;
1682 	return (error);
1683 }
1684 
1685 int
1686 vn_rename(char *from, char *to, enum uio_seg seg)
1687 {
1688 	return (vn_renameat(NULL, from, NULL, to, seg));
1689 }
1690 
1691 int
1692 vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
1693     char *tname, enum uio_seg seg)
1694 {
1695 	int error;
1696 	struct vattr vattr;
1697 	struct pathname fpn;		/* from pathname */
1698 	struct pathname tpn;		/* to pathname */
1699 	dev_t fsid;
1700 	int in_crit_src, in_crit_targ;
1701 	vnode_t *fromvp, *fvp;
1702 	vnode_t *tovp, *targvp;
1703 	int estale_retry = 0;
1704 	uint32_t auditing = AU_AUDITING();
1705 
1706 top:
1707 	fvp = fromvp = tovp = targvp = NULL;
1708 	in_crit_src = in_crit_targ = 0;
1709 	/*
1710 	 * Get to and from pathnames.
1711 	 */
1712 	if (error = pn_get(fname, seg, &fpn))
1713 		return (error);
1714 	if (error = pn_get(tname, seg, &tpn)) {
1715 		pn_free(&fpn);
1716 		return (error);
1717 	}
1718 
1719 	/*
1720 	 * First we need to resolve the correct directories
1721 	 * The passed in directories may only be a starting point,
1722 	 * but we need the real directories the file(s) live in.
1723 	 * For example the fname may be something like usr/lib/sparc
1724 	 * and we were passed in the / directory, but we need to
1725 	 * use the lib directory for the rename.
1726 	 */
1727 
1728 	if (auditing && fdvp != NULL)
1729 		audit_setfsat_path(1);
1730 	/*
1731 	 * Lookup to and from directories.
1732 	 */
1733 	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
1734 		goto out;
1735 	}
1736 
1737 	/*
1738 	 * Make sure there is an entry.
1739 	 */
1740 	if (fvp == NULL) {
1741 		error = ENOENT;
1742 		goto out;
1743 	}
1744 
1745 	if (auditing && tdvp != NULL)
1746 		audit_setfsat_path(3);
1747 	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
1748 		goto out;
1749 	}
1750 
1751 	/*
1752 	 * Make sure both the from vnode directory and the to directory
1753 	 * are in the same vfs and the to directory is writable.
1754 	 * We check fsid's, not vfs pointers, so loopback fs works.
1755 	 */
1756 	if (fromvp != tovp) {
1757 		vattr.va_mask = AT_FSID;
1758 		if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
1759 			goto out;
1760 		fsid = vattr.va_fsid;
1761 		vattr.va_mask = AT_FSID;
1762 		if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
1763 			goto out;
1764 		if (fsid != vattr.va_fsid) {
1765 			error = EXDEV;
1766 			goto out;
1767 		}
1768 	}
1769 
1770 	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
1771 		error = EROFS;
1772 		goto out;
1773 	}
1774 
1775 	/*
1776 	 * Make sure "from" vp is not a mount point.
1777 	 * Note, lookup did traverse() already, so
1778 	 * we'll be looking at the mounted FS root.
1779 	 * (but allow files like mnttab)
1780 	 */
1781 	if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
1782 		error = EBUSY;
1783 		goto out;
1784 	}
1785 
1786 	if (targvp && (fvp != targvp)) {
1787 		nbl_start_crit(targvp, RW_READER);
1788 		in_crit_targ = 1;
1789 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1790 			error = EACCES;
1791 			goto out;
1792 		}
1793 	}
1794 
1795 	if (nbl_need_check(fvp)) {
1796 		nbl_start_crit(fvp, RW_READER);
1797 		in_crit_src = 1;
1798 		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
1799 			error = EACCES;
1800 			goto out;
1801 		}
1802 	}
1803 
1804 	/*
1805 	 * Do the rename.
1806 	 */
1807 	(void) pn_fixslash(&tpn);
1808 	error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
1809 	    NULL, 0);
1810 
1811 out:
1812 	pn_free(&fpn);
1813 	pn_free(&tpn);
1814 	if (in_crit_src)
1815 		nbl_end_crit(fvp);
1816 	if (in_crit_targ)
1817 		nbl_end_crit(targvp);
1818 	if (fromvp)
1819 		VN_RELE(fromvp);
1820 	if (tovp)
1821 		VN_RELE(tovp);
1822 	if (targvp)
1823 		VN_RELE(targvp);
1824 	if (fvp)
1825 		VN_RELE(fvp);
1826 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1827 		goto top;
1828 	return (error);
1829 }
1830 
1831 /*
1832  * Remove a file or directory.
1833  */
1834 int
1835 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1836 {
1837 	return (vn_removeat(NULL, fnamep, seg, dirflag));
1838 }
1839 
1840 int
1841 vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
1842 {
1843 	struct vnode *vp;		/* entry vnode */
1844 	struct vnode *dvp;		/* ptr to parent dir vnode */
1845 	struct vnode *coveredvp;
1846 	struct pathname pn;		/* name of entry */
1847 	enum vtype vtype;
1848 	int error;
1849 	struct vfs *vfsp;
1850 	struct vfs *dvfsp;	/* ptr to parent dir vfs */
1851 	int in_crit = 0;
1852 	int estale_retry = 0;
1853 
1854 top:
1855 	if (error = pn_get(fnamep, seg, &pn))
1856 		return (error);
1857 	dvp = vp = NULL;
1858 	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
1859 		pn_free(&pn);
1860 		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1861 			goto top;
1862 		return (error);
1863 	}
1864 
1865 	/*
1866 	 * Make sure there is an entry.
1867 	 */
1868 	if (vp == NULL) {
1869 		error = ENOENT;
1870 		goto out;
1871 	}
1872 
1873 	vfsp = vp->v_vfsp;
1874 	dvfsp = dvp->v_vfsp;
1875 
1876 	/*
1877 	 * If the named file is the root of a mounted filesystem, fail,
1878 	 * unless it's marked unlinkable.  In that case, unmount the
1879 	 * filesystem and proceed to unlink the covered vnode.  (If the
1880 	 * covered vnode is a directory, use rmdir instead of unlink,
1881 	 * to avoid file system corruption.)
1882 	 */
1883 	if (vp->v_flag & VROOT) {
1884 		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
1885 			error = EBUSY;
1886 			goto out;
1887 		}
1888 
1889 		/*
1890 		 * Namefs specific code starts here.
1891 		 */
1892 
1893 		if (dirflag == RMDIRECTORY) {
1894 			/*
1895 			 * User called rmdir(2) on a file that has
1896 			 * been namefs mounted on top of.  Since
1897 			 * namefs doesn't allow directories to
1898 			 * be mounted on other files we know
1899 			 * vp is not of type VDIR so fail to operation.
1900 			 */
1901 			error = ENOTDIR;
1902 			goto out;
1903 		}
1904 
1905 		/*
1906 		 * If VROOT is still set after grabbing vp->v_lock,
1907 		 * noone has finished nm_unmount so far and coveredvp
1908 		 * is valid.
1909 		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
1910 		 * vp->v_lock, any race window is eliminated.
1911 		 */
1912 
1913 		mutex_enter(&vp->v_lock);
1914 		if ((vp->v_flag & VROOT) == 0) {
1915 			/* Someone beat us to the unmount */
1916 			mutex_exit(&vp->v_lock);
1917 			error = EBUSY;
1918 			goto out;
1919 		}
1920 		vfsp = vp->v_vfsp;
1921 		coveredvp = vfsp->vfs_vnodecovered;
1922 		ASSERT(coveredvp);
1923 		/*
1924 		 * Note: Implementation of vn_vfswlock shows that ordering of
1925 		 * v_lock / vn_vfswlock is not an issue here.
1926 		 */
1927 		error = vn_vfswlock(coveredvp);
1928 		mutex_exit(&vp->v_lock);
1929 
1930 		if (error)
1931 			goto out;
1932 
1933 		VN_HOLD(coveredvp);
1934 		VN_RELE(vp);
1935 		error = dounmount(vfsp, 0, CRED());
1936 
1937 		/*
1938 		 * Unmounted the namefs file system; now get
1939 		 * the object it was mounted over.
1940 		 */
1941 		vp = coveredvp;
1942 		/*
1943 		 * If namefs was mounted over a directory, then
1944 		 * we want to use rmdir() instead of unlink().
1945 		 */
1946 		if (vp->v_type == VDIR)
1947 			dirflag = RMDIRECTORY;
1948 
1949 		if (error)
1950 			goto out;
1951 	}
1952 
1953 	/*
1954 	 * Make sure filesystem is writeable.
1955 	 * We check the parent directory's vfs in case this is an lofs vnode.
1956 	 */
1957 	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
1958 		error = EROFS;
1959 		goto out;
1960 	}
1961 
1962 	vtype = vp->v_type;
1963 
1964 	/*
1965 	 * If there is the possibility of an nbmand share reservation, make
1966 	 * sure it's okay to remove the file.  Keep a reference to the
1967 	 * vnode, so that we can exit the nbl critical region after
1968 	 * calling VOP_REMOVE.
1969 	 * If there is no possibility of an nbmand share reservation,
1970 	 * release the vnode reference now.  Filesystems like NFS may
1971 	 * behave differently if there is an extra reference, so get rid of
1972 	 * this one.  Fortunately, we can't have nbmand mounts on NFS
1973 	 * filesystems.
1974 	 */
1975 	if (nbl_need_check(vp)) {
1976 		nbl_start_crit(vp, RW_READER);
1977 		in_crit = 1;
1978 		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
1979 			error = EACCES;
1980 			goto out;
1981 		}
1982 	} else {
1983 		VN_RELE(vp);
1984 		vp = NULL;
1985 	}
1986 
1987 	if (dirflag == RMDIRECTORY) {
1988 		/*
1989 		 * Caller is using rmdir(2), which can only be applied to
1990 		 * directories.
1991 		 */
1992 		if (vtype != VDIR) {
1993 			error = ENOTDIR;
1994 		} else {
1995 			vnode_t *cwd;
1996 			proc_t *pp = curproc;
1997 
1998 			mutex_enter(&pp->p_lock);
1999 			cwd = PTOU(pp)->u_cdir;
2000 			VN_HOLD(cwd);
2001 			mutex_exit(&pp->p_lock);
2002 			error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
2003 			    NULL, 0);
2004 			VN_RELE(cwd);
2005 		}
2006 	} else {
2007 		/*
2008 		 * Unlink(2) can be applied to anything.
2009 		 */
2010 		error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
2011 	}
2012 
2013 out:
2014 	pn_free(&pn);
2015 	if (in_crit) {
2016 		nbl_end_crit(vp);
2017 		in_crit = 0;
2018 	}
2019 	if (vp != NULL)
2020 		VN_RELE(vp);
2021 	if (dvp != NULL)
2022 		VN_RELE(dvp);
2023 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
2024 		goto top;
2025 	return (error);
2026 }
2027 
2028 /*
2029  * Utility function to compare equality of vnodes.
2030  * Compare the underlying real vnodes, if there are underlying vnodes.
2031  * This is a more thorough comparison than the VN_CMP() macro provides.
2032  */
2033 int
2034 vn_compare(vnode_t *vp1, vnode_t *vp2)
2035 {
2036 	vnode_t *realvp;
2037 
2038 	if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
2039 		vp1 = realvp;
2040 	if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
2041 		vp2 = realvp;
2042 	return (VN_CMP(vp1, vp2));
2043 }
2044 
2045 /*
2046  * The number of locks to hash into.  This value must be a power
2047  * of 2 minus 1 and should probably also be prime.
2048  */
2049 #define	NUM_BUCKETS	1023
2050 
2051 struct  vn_vfslocks_bucket {
2052 	kmutex_t vb_lock;
2053 	vn_vfslocks_entry_t *vb_list;
2054 	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
2055 };
2056 
2057 /*
2058  * Total number of buckets will be NUM_BUCKETS + 1 .
2059  */
2060 
2061 #pragma	align	64(vn_vfslocks_buckets)
2062 static	struct vn_vfslocks_bucket	vn_vfslocks_buckets[NUM_BUCKETS + 1];
2063 
/*
 * Number of low-order pointer bits discarded before hashing.
 */
#define	VN_VFSLOCKS_SHIFT	9

/*
 * Map a vfs/vnode pointer to a bucket index in [0, NUM_BUCKETS]: drop the
 * low VN_VFSLOCKS_SHIFT bits and mask with NUM_BUCKETS.
 *
 * The pointer is converted to uintptr_t rather than intptr_t: a pointer
 * with its top bit set would become a negative intptr_t, and right-shifting
 * a negative signed value is implementation-defined in C.  The selected
 * bits, and therefore the bucket, are the same either way.
 */
#define	VN_VFSLOCKS_HASH(vfsvpptr)	\
	((((uintptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
2068 
/*
 * vn_vfslocks_getlock() hashes the vfs/vnode pointer passed to it
 * to find (or create) the rwstlock entry for that pointer, and
 * returns the entry with a reference held.
 *
 * vn_vfslocks_rele() releases a reference in the
 * hash table, which allows the entry allocated by
 * vn_vfslocks_getlock() to be freed once the
 * refcount drops to zero.
 */
2078 
2079 vn_vfslocks_entry_t *
2080 vn_vfslocks_getlock(void *vfsvpptr)
2081 {
2082 	struct vn_vfslocks_bucket *bp;
2083 	vn_vfslocks_entry_t *vep;
2084 	vn_vfslocks_entry_t *tvep;
2085 
2086 	ASSERT(vfsvpptr != NULL);
2087 	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];
2088 
2089 	mutex_enter(&bp->vb_lock);
2090 	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2091 		if (vep->ve_vpvfs == vfsvpptr) {
2092 			vep->ve_refcnt++;
2093 			mutex_exit(&bp->vb_lock);
2094 			return (vep);
2095 		}
2096 	}
2097 	mutex_exit(&bp->vb_lock);
2098 	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
2099 	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
2100 	vep->ve_vpvfs = (char *)vfsvpptr;
2101 	vep->ve_refcnt = 1;
2102 	mutex_enter(&bp->vb_lock);
2103 	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
2104 		if (tvep->ve_vpvfs == vfsvpptr) {
2105 			tvep->ve_refcnt++;
2106 			mutex_exit(&bp->vb_lock);
2107 
2108 			/*
2109 			 * There is already an entry in the hash
2110 			 * destroy what we just allocated.
2111 			 */
2112 			rwst_destroy(&vep->ve_lock);
2113 			kmem_free(vep, sizeof (*vep));
2114 			return (tvep);
2115 		}
2116 	}
2117 	vep->ve_next = bp->vb_list;
2118 	bp->vb_list = vep;
2119 	mutex_exit(&bp->vb_lock);
2120 	return (vep);
2121 }
2122 
2123 void
2124 vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
2125 {
2126 	struct vn_vfslocks_bucket *bp;
2127 	vn_vfslocks_entry_t *vep;
2128 	vn_vfslocks_entry_t *pvep;
2129 
2130 	ASSERT(vepent != NULL);
2131 	ASSERT(vepent->ve_vpvfs != NULL);
2132 
2133 	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];
2134 
2135 	mutex_enter(&bp->vb_lock);
2136 	vepent->ve_refcnt--;
2137 
2138 	if ((int32_t)vepent->ve_refcnt < 0)
2139 		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");
2140 
2141 	pvep = NULL;
2142 	if (vepent->ve_refcnt == 0) {
2143 		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2144 			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
2145 				if (pvep == NULL)
2146 					bp->vb_list = vep->ve_next;
2147 				else {
2148 					pvep->ve_next = vep->ve_next;
2149 				}
2150 				mutex_exit(&bp->vb_lock);
2151 				rwst_destroy(&vep->ve_lock);
2152 				kmem_free(vep, sizeof (*vep));
2153 				return;
2154 			}
2155 			pvep = vep;
2156 		}
2157 		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
2158 	}
2159 	mutex_exit(&bp->vb_lock);
2160 }
2161 
2162 /*
2163  * vn_vfswlock_wait is used to implement a lock which is logically a writers
2164  * lock protecting the v_vfsmountedhere field.
2165  * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
2166  * except that it blocks to acquire the lock VVFSLOCK.
2167  *
2168  * traverse() and routines re-implementing part of traverse (e.g. autofs)
2169  * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
2170  * need the non-blocking version of the writers lock i.e. vn_vfswlock
2171  */
2172 int
2173 vn_vfswlock_wait(vnode_t *vp)
2174 {
2175 	int retval;
2176 	vn_vfslocks_entry_t *vpvfsentry;
2177 	ASSERT(vp != NULL);
2178 
2179 	vpvfsentry = vn_vfslocks_getlock(vp);
2180 	retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2181 
2182 	if (retval == EINTR) {
2183 		vn_vfslocks_rele(vpvfsentry);
2184 		return (EINTR);
2185 	}
2186 	return (retval);
2187 }
2188 
2189 int
2190 vn_vfsrlock_wait(vnode_t *vp)
2191 {
2192 	int retval;
2193 	vn_vfslocks_entry_t *vpvfsentry;
2194 	ASSERT(vp != NULL);
2195 
2196 	vpvfsentry = vn_vfslocks_getlock(vp);
2197 	retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2198 
2199 	if (retval == EINTR) {
2200 		vn_vfslocks_rele(vpvfsentry);
2201 		return (EINTR);
2202 	}
2203 
2204 	return (retval);
2205 }
2206 
2207 
2208 /*
2209  * vn_vfswlock is used to implement a lock which is logically a writers lock
2210  * protecting the v_vfsmountedhere field.
2211  */
2212 int
2213 vn_vfswlock(vnode_t *vp)
2214 {
2215 	vn_vfslocks_entry_t *vpvfsentry;
2216 
2217 	/*
2218 	 * If vp is NULL then somebody is trying to lock the covered vnode
2219 	 * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2220 	 * only happen when unmounting /.  Since that operation will fail
2221 	 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2222 	 */
2223 	if (vp == NULL)
2224 		return (EBUSY);
2225 
2226 	vpvfsentry = vn_vfslocks_getlock(vp);
2227 
2228 	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2229 		return (0);
2230 
2231 	vn_vfslocks_rele(vpvfsentry);
2232 	return (EBUSY);
2233 }
2234 
2235 int
2236 vn_vfsrlock(vnode_t *vp)
2237 {
2238 	vn_vfslocks_entry_t *vpvfsentry;
2239 
2240 	/*
2241 	 * If vp is NULL then somebody is trying to lock the covered vnode
2242 	 * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2243 	 * only happen when unmounting /.  Since that operation will fail
2244 	 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2245 	 */
2246 	if (vp == NULL)
2247 		return (EBUSY);
2248 
2249 	vpvfsentry = vn_vfslocks_getlock(vp);
2250 
2251 	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2252 		return (0);
2253 
2254 	vn_vfslocks_rele(vpvfsentry);
2255 	return (EBUSY);
2256 }
2257 
2258 void
2259 vn_vfsunlock(vnode_t *vp)
2260 {
2261 	vn_vfslocks_entry_t *vpvfsentry;
2262 
2263 	/*
2264 	 * ve_refcnt needs to be decremented twice.
2265 	 * 1. To release refernce after a call to vn_vfslocks_getlock()
2266 	 * 2. To release the reference from the locking routines like
2267 	 *    vn_vfsrlock/vn_vfswlock etc,.
2268 	 */
2269 	vpvfsentry = vn_vfslocks_getlock(vp);
2270 	vn_vfslocks_rele(vpvfsentry);
2271 
2272 	rwst_exit(&vpvfsentry->ve_lock);
2273 	vn_vfslocks_rele(vpvfsentry);
2274 }
2275 
2276 int
2277 vn_vfswlock_held(vnode_t *vp)
2278 {
2279 	int held;
2280 	vn_vfslocks_entry_t *vpvfsentry;
2281 
2282 	ASSERT(vp != NULL);
2283 
2284 	vpvfsentry = vn_vfslocks_getlock(vp);
2285 	held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2286 
2287 	vn_vfslocks_rele(vpvfsentry);
2288 	return (held);
2289 }
2290 
2291 
2292 int
2293 vn_make_ops(
2294 	const char *name,			/* Name of file system */
2295 	const fs_operation_def_t *templ,	/* Operation specification */
2296 	vnodeops_t **actual)			/* Return the vnodeops */
2297 {
2298 	int unused_ops;
2299 	int error;
2300 
2301 	*actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2302 
2303 	(*actual)->vnop_name = name;
2304 
2305 	error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2306 	if (error) {
2307 		kmem_free(*actual, sizeof (vnodeops_t));
2308 	}
2309 
2310 #if DEBUG
2311 	if (unused_ops != 0)
2312 		cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2313 		    "but not used", name, unused_ops);
2314 #endif
2315 
2316 	return (error);
2317 }
2318 
2319 /*
2320  * Free the vnodeops created as a result of vn_make_ops()
2321  */
2322 void
2323 vn_freevnodeops(vnodeops_t *vnops)
2324 {
2325 	kmem_free(vnops, sizeof (vnodeops_t));
2326 }
2327 
2328 /*
2329  * Vnode cache.
2330  */
2331 
2332 /* ARGSUSED */
2333 static int
2334 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2335 {
2336 	struct vnode *vp;
2337 
2338 	vp = buf;
2339 
2340 	mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2341 	mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2342 	cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2343 	rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2344 	vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
2345 	vp->v_path = vn_vpath_empty;
2346 	vp->v_path_stamp = 0;
2347 	vp->v_mpssdata = NULL;
2348 	vp->v_vsd = NULL;
2349 	vp->v_fopdata = NULL;
2350 
2351 	return (0);
2352 }
2353 
2354 /* ARGSUSED */
2355 static void
2356 vn_cache_destructor(void *buf, void *cdrarg)
2357 {
2358 	struct vnode *vp;
2359 
2360 	vp = buf;
2361 
2362 	rw_destroy(&vp->v_nbllock);
2363 	cv_destroy(&vp->v_cv);
2364 	mutex_destroy(&vp->v_vsd_lock);
2365 	mutex_destroy(&vp->v_lock);
2366 }
2367 
2368 void
2369 vn_create_cache(void)
2370 {
2371 	/* LINTED */
2372 	ASSERT((1 << VNODE_ALIGN_LOG2) ==
2373 	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2374 	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2375 	    VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2376 	    NULL, 0);
2377 }
2378 
/*
 * Destroy the vnode kmem cache (vn_cache) created by vn_create_cache().
 */
void
vn_destroy_cache(void)
{
	kmem_cache_destroy(vn_cache);
}
2384 
2385 /*
2386  * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2387  * cached by the file system and vnodes remain associated.
2388  */
2389 void
2390 vn_recycle(vnode_t *vp)
2391 {
2392 	ASSERT(vp->v_pages == NULL);
2393 	VERIFY(vp->v_path != NULL);
2394 
2395 	/*
2396 	 * XXX - This really belongs in vn_reinit(), but we have some issues
2397 	 * with the counts.  Best to have it here for clean initialization.
2398 	 */
2399 	vp->v_rdcnt = 0;
2400 	vp->v_wrcnt = 0;
2401 	vp->v_mmap_read = 0;
2402 	vp->v_mmap_write = 0;
2403 
2404 	/*
2405 	 * If FEM was in use, make sure everything gets cleaned up
2406 	 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2407 	 * constructor.
2408 	 */
2409 	if (vp->v_femhead) {
2410 		/* XXX - There should be a free_femhead() that does all this */
2411 		ASSERT(vp->v_femhead->femh_list == NULL);
2412 		mutex_destroy(&vp->v_femhead->femh_lock);
2413 		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2414 		vp->v_femhead = NULL;
2415 	}
2416 	if (vp->v_path != vn_vpath_empty) {
2417 		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2418 		vp->v_path = vn_vpath_empty;
2419 	}
2420 	vp->v_path_stamp = 0;
2421 
2422 	if (vp->v_fopdata != NULL) {
2423 		free_fopdata(vp);
2424 	}
2425 	vp->v_mpssdata = NULL;
2426 	vsd_free(vp);
2427 }
2428 
2429 /*
2430  * Used to reset the vnode fields including those that are directly accessible
2431  * as well as those which require an accessor function.
2432  *
2433  * Does not initialize:
2434  *	synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2435  *	v_data (since FS-nodes and vnodes point to each other and should
2436  *		be updated simultaneously)
2437  *	v_op (in case someone needs to make a VOP call on this object)
2438  */
2439 void
2440 vn_reinit(vnode_t *vp)
2441 {
2442 	vp->v_count = 1;
2443 	vp->v_count_dnlc = 0;
2444 	vp->v_vfsp = NULL;
2445 	vp->v_stream = NULL;
2446 	vp->v_vfsmountedhere = NULL;
2447 	vp->v_flag = 0;
2448 	vp->v_type = VNON;
2449 	vp->v_rdev = NODEV;
2450 
2451 	vp->v_filocks = NULL;
2452 	vp->v_shrlocks = NULL;
2453 	vp->v_pages = NULL;
2454 
2455 	vp->v_locality = NULL;
2456 	vp->v_xattrdir = NULL;
2457 
2458 	/*
2459 	 * In a few specific instances, vn_reinit() is used to initialize
2460 	 * locally defined vnode_t instances.  Lacking the construction offered
2461 	 * by vn_alloc(), these vnodes require v_path initialization.
2462 	 */
2463 	if (vp->v_path == NULL) {
2464 		vp->v_path = vn_vpath_empty;
2465 	}
2466 
2467 	/* Handles v_femhead, v_path, and the r/w/map counts */
2468 	vn_recycle(vp);
2469 }
2470 
2471 vnode_t *
2472 vn_alloc(int kmflag)
2473 {
2474 	vnode_t *vp;
2475 
2476 	vp = kmem_cache_alloc(vn_cache, kmflag);
2477 
2478 	if (vp != NULL) {
2479 		vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
2480 		vp->v_fopdata = NULL;
2481 		vn_reinit(vp);
2482 	}
2483 
2484 	return (vp);
2485 }
2486 
2487 void
2488 vn_free(vnode_t *vp)
2489 {
2490 	ASSERT(vp->v_shrlocks == NULL);
2491 	ASSERT(vp->v_filocks == NULL);
2492 
2493 	/*
2494 	 * Some file systems call vn_free() with v_count of zero,
2495 	 * some with v_count of 1.  In any case, the value should
2496 	 * never be anything else.
2497 	 */
2498 	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2499 	ASSERT(vp->v_count_dnlc == 0);
2500 	VERIFY(vp->v_path != NULL);
2501 	if (vp->v_path != vn_vpath_empty) {
2502 		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2503 		vp->v_path = vn_vpath_empty;
2504 	}
2505 
2506 	/* If FEM was in use, make sure everything gets cleaned up */
2507 	if (vp->v_femhead) {
2508 		/* XXX - There should be a free_femhead() that does all this */
2509 		ASSERT(vp->v_femhead->femh_list == NULL);
2510 		mutex_destroy(&vp->v_femhead->femh_lock);
2511 		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2512 		vp->v_femhead = NULL;
2513 	}
2514 
2515 	if (vp->v_fopdata != NULL) {
2516 		free_fopdata(vp);
2517 	}
2518 	vp->v_mpssdata = NULL;
2519 	vsd_free(vp);
2520 	kmem_cache_free(vn_cache, vp);
2521 }
2522 
2523 /*
2524  * vnode status changes, should define better states than 1, 0.
2525  */
2526 void
2527 vn_reclaim(vnode_t *vp)
2528 {
2529 	vfs_t   *vfsp = vp->v_vfsp;
2530 
2531 	if (vfsp == NULL ||
2532 	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2533 		return;
2534 	}
2535 	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2536 }
2537 
2538 void
2539 vn_idle(vnode_t *vp)
2540 {
2541 	vfs_t   *vfsp = vp->v_vfsp;
2542 
2543 	if (vfsp == NULL ||
2544 	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2545 		return;
2546 	}
2547 	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2548 }
2549 void
2550 vn_exists(vnode_t *vp)
2551 {
2552 	vfs_t   *vfsp = vp->v_vfsp;
2553 
2554 	if (vfsp == NULL ||
2555 	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2556 		return;
2557 	}
2558 	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2559 }
2560 
2561 void
2562 vn_invalid(vnode_t *vp)
2563 {
2564 	vfs_t   *vfsp = vp->v_vfsp;
2565 
2566 	if (vfsp == NULL ||
2567 	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2568 		return;
2569 	}
2570 	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2571 }
2572 
2573 /* Vnode event notification */
2574 
2575 int
2576 vnevent_support(vnode_t *vp, caller_context_t *ct)
2577 {
2578 	if (vp == NULL)
2579 		return (EINVAL);
2580 
2581 	return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2582 }
2583 
2584 void
2585 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2586 {
2587 	if (vp == NULL || vp->v_femhead == NULL) {
2588 		return;
2589 	}
2590 	(void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2591 }
2592 
2593 void
2594 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2595     caller_context_t *ct)
2596 {
2597 	if (vp == NULL || vp->v_femhead == NULL) {
2598 		return;
2599 	}
2600 	(void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2601 }
2602 
2603 void
2604 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
2605 {
2606 	if (vp == NULL || vp->v_femhead == NULL) {
2607 		return;
2608 	}
2609 	(void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
2610 }
2611 
2612 void
2613 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2614 {
2615 	if (vp == NULL || vp->v_femhead == NULL) {
2616 		return;
2617 	}
2618 	(void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2619 }
2620 
2621 void
2622 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2623 {
2624 	if (vp == NULL || vp->v_femhead == NULL) {
2625 		return;
2626 	}
2627 	(void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2628 }
2629 
2630 void
2631 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2632     caller_context_t *ct)
2633 {
2634 	if (vp == NULL || vp->v_femhead == NULL) {
2635 		return;
2636 	}
2637 	(void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2638 }
2639 
2640 void
2641 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2642     caller_context_t *ct)
2643 {
2644 	if (vp == NULL || vp->v_femhead == NULL) {
2645 		return;
2646 	}
2647 	(void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2648 }
2649 
2650 void
2651 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2652     caller_context_t *ct)
2653 {
2654 	if (vp == NULL || vp->v_femhead == NULL) {
2655 		return;
2656 	}
2657 	(void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2658 }
2659 
2660 void
2661 vnevent_create(vnode_t *vp, caller_context_t *ct)
2662 {
2663 	if (vp == NULL || vp->v_femhead == NULL) {
2664 		return;
2665 	}
2666 	(void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2667 }
2668 
2669 void
2670 vnevent_link(vnode_t *vp, caller_context_t *ct)
2671 {
2672 	if (vp == NULL || vp->v_femhead == NULL) {
2673 		return;
2674 	}
2675 	(void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2676 }
2677 
2678 void
2679 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2680 {
2681 	if (vp == NULL || vp->v_femhead == NULL) {
2682 		return;
2683 	}
2684 	(void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2685 }
2686 
2687 void
2688 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2689 {
2690 	if (vp == NULL || vp->v_femhead == NULL) {
2691 		return;
2692 	}
2693 	(void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
2694 }
2695 
2696 /*
2697  * Vnode accessors.
2698  */
2699 
2700 int
2701 vn_is_readonly(vnode_t *vp)
2702 {
2703 	return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2704 }
2705 
2706 int
2707 vn_has_flocks(vnode_t *vp)
2708 {
2709 	return (vp->v_filocks != NULL);
2710 }
2711 
2712 int
2713 vn_has_mandatory_locks(vnode_t *vp, int mode)
2714 {
2715 	return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2716 }
2717 
2718 int
2719 vn_has_cached_data(vnode_t *vp)
2720 {
2721 	return (vp->v_pages != NULL);
2722 }
2723 
2724 /*
2725  * Return 0 if the vnode in question shouldn't be permitted into a zone via
2726  * zone_enter(2).
2727  */
2728 int
2729 vn_can_change_zones(vnode_t *vp)
2730 {
2731 	struct vfssw *vswp;
2732 	int allow = 1;
2733 	vnode_t *rvp;
2734 
2735 	if (nfs_global_client_only != 0)
2736 		return (1);
2737 
2738 	/*
2739 	 * We always want to look at the underlying vnode if there is one.
2740 	 */
2741 	if (VOP_REALVP(vp, &rvp, NULL) != 0)
2742 		rvp = vp;
2743 	/*
2744 	 * Some pseudo filesystems (including doorfs) don't actually register
2745 	 * their vfsops_t, so the following may return NULL; we happily let
2746 	 * such vnodes switch zones.
2747 	 */
2748 	vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2749 	if (vswp != NULL) {
2750 		if (vswp->vsw_flag & VSW_NOTZONESAFE)
2751 			allow = 0;
2752 		vfs_unrefvfssw(vswp);
2753 	}
2754 	return (allow);
2755 }
2756 
2757 /*
2758  * Return nonzero if the vnode is a mount point, zero if not.
2759  */
2760 int
2761 vn_ismntpt(vnode_t *vp)
2762 {
2763 	return (vp->v_vfsmountedhere != NULL);
2764 }
2765 
2766 /* Retrieve the vfs (if any) mounted on this vnode */
2767 vfs_t *
2768 vn_mountedvfs(vnode_t *vp)
2769 {
2770 	return (vp->v_vfsmountedhere);
2771 }
2772 
2773 /*
2774  * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2775  */
2776 int
2777 vn_in_dnlc(vnode_t *vp)
2778 {
2779 	return (vp->v_count_dnlc > 0);
2780 }
2781 
2782 /*
2783  * vn_has_other_opens() checks whether a particular file is opened by more than
2784  * just the caller and whether the open is for read and/or write.
2785  * This routine is for calling after the caller has already called VOP_OPEN()
2786  * and the caller wishes to know if they are the only one with it open for
2787  * the mode(s) specified.
2788  *
2789  * Vnode counts are only kept on regular files (v_type=VREG).
2790  */
2791 int
2792 vn_has_other_opens(
2793 	vnode_t *vp,
2794 	v_mode_t mode)
2795 {
2796 
2797 	ASSERT(vp != NULL);
2798 
2799 	switch (mode) {
2800 	case V_WRITE:
2801 		if (vp->v_wrcnt > 1)
2802 			return (V_TRUE);
2803 		break;
2804 	case V_RDORWR:
2805 		if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2806 			return (V_TRUE);
2807 		break;
2808 	case V_RDANDWR:
2809 		if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2810 			return (V_TRUE);
2811 		break;
2812 	case V_READ:
2813 		if (vp->v_rdcnt > 1)
2814 			return (V_TRUE);
2815 		break;
2816 	}
2817 
2818 	return (V_FALSE);
2819 }
2820 
2821 /*
2822  * vn_is_opened() checks whether a particular file is opened and
2823  * whether the open is for read and/or write.
2824  *
2825  * Vnode counts are only kept on regular files (v_type=VREG).
2826  */
2827 int
2828 vn_is_opened(
2829 	vnode_t *vp,
2830 	v_mode_t mode)
2831 {
2832 
2833 	ASSERT(vp != NULL);
2834 
2835 	switch (mode) {
2836 	case V_WRITE:
2837 		if (vp->v_wrcnt)
2838 			return (V_TRUE);
2839 		break;
2840 	case V_RDANDWR:
2841 		if (vp->v_rdcnt && vp->v_wrcnt)
2842 			return (V_TRUE);
2843 		break;
2844 	case V_RDORWR:
2845 		if (vp->v_rdcnt || vp->v_wrcnt)
2846 			return (V_TRUE);
2847 		break;
2848 	case V_READ:
2849 		if (vp->v_rdcnt)
2850 			return (V_TRUE);
2851 		break;
2852 	}
2853 
2854 	return (V_FALSE);
2855 }
2856 
2857 /*
2858  * vn_is_mapped() checks whether a particular file is mapped and whether
2859  * the file is mapped read and/or write.
2860  */
2861 int
2862 vn_is_mapped(
2863 	vnode_t *vp,
2864 	v_mode_t mode)
2865 {
2866 
2867 	ASSERT(vp != NULL);
2868 
2869 #if !defined(_LP64)
2870 	switch (mode) {
2871 	/*
2872 	 * The atomic_add_64_nv functions force atomicity in the
2873 	 * case of 32 bit architectures. Otherwise the 64 bit values
2874 	 * require two fetches. The value of the fields may be
2875 	 * (potentially) changed between the first fetch and the
2876 	 * second
2877 	 */
2878 	case V_WRITE:
2879 		if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2880 			return (V_TRUE);
2881 		break;
2882 	case V_RDANDWR:
2883 		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2884 		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2885 			return (V_TRUE);
2886 		break;
2887 	case V_RDORWR:
2888 		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2889 		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2890 			return (V_TRUE);
2891 		break;
2892 	case V_READ:
2893 		if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2894 			return (V_TRUE);
2895 		break;
2896 	}
2897 #else
2898 	switch (mode) {
2899 	case V_WRITE:
2900 		if (vp->v_mmap_write)
2901 			return (V_TRUE);
2902 		break;
2903 	case V_RDANDWR:
2904 		if (vp->v_mmap_read && vp->v_mmap_write)
2905 			return (V_TRUE);
2906 		break;
2907 	case V_RDORWR:
2908 		if (vp->v_mmap_read || vp->v_mmap_write)
2909 			return (V_TRUE);
2910 		break;
2911 	case V_READ:
2912 		if (vp->v_mmap_read)
2913 			return (V_TRUE);
2914 		break;
2915 	}
2916 #endif
2917 
2918 	return (V_FALSE);
2919 }
2920 
/*
 * Set the operations vector for a vnode.
 *
 * FEM ensures that the v_femhead pointer is filled in before the
 * v_op pointer is changed.  This means that if the v_femhead pointer
 * is NULL, and the v_op field has not changed since the snapshot taken
 * before v_femhead was checked, then our update is ok - we are not
 * racing with FEM.
 *
 * vp and vnodeops must both be non-NULL.
 */
void
vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
{
	vnodeops_t	*op;

	ASSERT(vp != NULL);
	ASSERT(vnodeops != NULL);

	/* Snapshot v_op first; the barrier orders it before the femhead read */
	op = vp->v_op;
	membar_consumer();
	/*
	 * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
	 * the compare-and-swap on vp->v_op.  If either fails, then FEM is
	 * in effect on the vnode and we need to have FEM deal with it.
	 */
	if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
	    op) {
		fem_setvnops(vp, vnodeops);
	}
}
2950 
/*
 * Retrieve the operations vector for a vnode.
 * As with vn_setops() (above), make sure we aren't racing with FEM.
 * FEM sets the v_op to a special, internal, vnodeops that wouldn't
 * make sense to the callers of this routine.
 *
 * Returns the snapshot of v_op when no FEM head is present and v_op is
 * unchanged after the snapshot; otherwise defers to fem_getvnops().
 */
vnodeops_t *
vn_getops(vnode_t *vp)
{
	vnodeops_t	*op;

	ASSERT(vp != NULL);

	/* Snapshot v_op first; the barrier orders it before the femhead read */
	op = vp->v_op;
	membar_consumer();
	if (vp->v_femhead == NULL && op == vp->v_op) {
		return (op);
	} else {
		return (fem_getvnops(vp));
	}
}
2972 
2973 /*
2974  * Returns non-zero (1) if the vnodeops matches that of the vnode.
2975  * Returns zero (0) if not.
2976  */
2977 int
2978 vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
2979 {
2980 	return (vn_getops(vp) == vnodeops);
2981 }
2982 
2983 /*
2984  * Returns non-zero (1) if the specified operation matches the
2985  * corresponding operation for that the vnode.
2986  * Returns zero (0) if not.
2987  */
2988 
2989 #define	MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
2990 
2991 int
2992 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
2993 {
2994 	const fs_operation_trans_def_t *otdp;
2995 	fs_generic_func_p *loc = NULL;
2996 	vnodeops_t	*vop = vn_getops(vp);
2997 
2998 	ASSERT(vopname != NULL);
2999 
3000 	for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
3001 		if (MATCHNAME(otdp->name, vopname)) {
3002 			loc = (fs_generic_func_p *)
3003 			    ((char *)(vop) + otdp->offset);
3004 			break;
3005 		}
3006 	}
3007 
3008 	return ((loc != NULL) && (*loc == funcp));
3009 }
3010 
3011 /*
3012  * fs_new_caller_id() needs to return a unique ID on a given local system.
3013  * The IDs do not need to survive across reboots.  These are primarily
3014  * used so that (FEM) monitors can detect particular callers (such as
3015  * the NFS server) to a given vnode/vfs operation.
3016  */
3017 u_longlong_t
3018 fs_new_caller_id()
3019 {
3020 	static uint64_t next_caller_id = 0LL; /* First call returns 1 */
3021 
3022 	return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
3023 }
3024 
/*
 * The value stored in v_path is relative to rootdir, located in the global
 * zone.  Zones or chroot environments which reside deeper inside the VFS
 * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
 * what lies below their perceived root.  In order to keep v_path usable for
 * these child environments, its allocations are allowed to exceed MAXPATHLEN.
 *
 * An upper bound of max_vnode_path is placed upon v_path allocations to
 * prevent the system from going too wild at the behest of pathological
 * behavior from the operator.  The bound is in bytes and includes the
 * terminating NUL; vn_setpath_common() fires a DTrace probe and leaves
 * v_path alone when a candidate path would exceed it.
 */
size_t max_vnode_path = 4 * MAXPATHLEN;
3037 
3038 
3039 void
3040 vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
3041 {
3042 	char *buf;
3043 
3044 	mutex_enter(&vp->v_lock);
3045 	/*
3046 	 * If the snapshot of v_path_stamp passed in via compare_stamp does not
3047 	 * match the present value on the vnode, it indicates that subsequent
3048 	 * changes have occurred.  The v_path value is not cleared in this case
3049 	 * since the new value may be valid.
3050 	 */
3051 	if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
3052 		mutex_exit(&vp->v_lock);
3053 		return;
3054 	}
3055 	buf = vp->v_path;
3056 	vp->v_path = vn_vpath_empty;
3057 	vp->v_path_stamp = 0;
3058 	mutex_exit(&vp->v_lock);
3059 	if (buf != vn_vpath_empty) {
3060 		kmem_free(buf, strlen(buf) + 1);
3061 	}
3062 }
3063 
/*
 * Common worker behind vn_setpath(), vn_setpath_str(), vn_updatepath() and
 * vn_renamepath(): compute a new v_path for vp and install it.
 *
 *	pvp		parent directory vnode, or NULL when name is to be
 *			installed verbatim as the complete path
 *	vp		vnode whose v_path is to be set
 *	name		path component (or whole path when pvp is NULL)
 *	len		length of name, not counting any terminator
 *	is_rename	B_TRUE forces replacement of an existing path even
 *			when vp's stamp is not older than the parent's
 *
 * With a parent, the new path is "<parent v_path>/<name>".  Allocation is
 * done with KM_SLEEP while no vnode lock is held, so the parent's path is
 * measured, the lock dropped, the buffer allocated, and the parent
 * re-checked (via v_path_stamp) before copying; a length change forces a
 * retry.  Paths that would exceed max_vnode_path are silently skipped
 * after firing a DTrace probe.
 *
 * NOTE(review): the path buffers are later freed using strlen() + 1 as the
 * size, and the child-path case copies len + 1 bytes from name, so name is
 * presumably NUL-terminated at name[len] with no embedded NULs -- confirm
 * against callers.
 */
static void
vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
    boolean_t is_rename)
{
	char *buf, *oldbuf;
	hrtime_t pstamp;
	size_t baselen, buflen = 0;	/* buflen != 0 means buf is allocated */

	/* Handle the vn_setpath_str case. */
	if (pvp == NULL) {
		if (len + 1 > max_vnode_path) {
			DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
			    vnode_t *, vp, char *, name, size_t, len + 1);
			return;
		}
		buf = kmem_alloc(len + 1, KM_SLEEP);
		bcopy(name, buf, len);
		buf[len] = '\0';

		/* Swap in the new path unconditionally with a fresh stamp. */
		mutex_enter(&vp->v_lock);
		oldbuf = vp->v_path;
		vp->v_path = buf;
		vp->v_path_stamp = gethrtime();
		mutex_exit(&vp->v_lock);
		if (oldbuf != vn_vpath_empty) {
			kmem_free(oldbuf, strlen(oldbuf) + 1);
		}
		return;
	}

	/* Take snapshot of parent dir */
	mutex_enter(&pvp->v_lock);

	if ((pvp->v_flag & VTRAVERSE) != 0) {
		/*
		 * When the parent vnode has VTRAVERSE set in its flags, normal
		 * assumptions about v_path calculation no longer apply.  The
		 * primary situation where this occurs is via the VFS tricks
		 * which procfs plays in order to allow /proc/PID/(root|cwd) to
		 * yield meaningful results.
		 *
		 * When this flag is set, v_path on the child must not be
		 * updated since the calculated value is likely to be
		 * incorrect, given the current context.
		 */
		mutex_exit(&pvp->v_lock);
		return;
	}

	/* Entered (and re-entered on retry) with pvp->v_lock held. */
retrybuf:
	if (pvp->v_path == vn_vpath_empty) {
		/*
		 * Without v_path from the parent directory, generating a child
		 * path from the name is impossible.
		 */
		if (len > 0) {
			pstamp = pvp->v_path_stamp;
			mutex_exit(&pvp->v_lock);
			vn_clearpath(vp, pstamp);
			return;
		}

		/*
		 * The only feasible case here is where a NUL lookup is being
		 * performed on rootdir prior to its v_path being populated.
		 */
		ASSERT(pvp->v_path_stamp == 0);
		baselen = 0;
		pstamp = 0;
	} else {
		pstamp = pvp->v_path_stamp;
		baselen = strlen(pvp->v_path);
		/* ignore a trailing slash if present */
		if (pvp->v_path[baselen - 1] == '/') {
			/* This should only be the case for rootdir */
			ASSERT(baselen == 1 && pvp == rootdir);
			baselen--;
		}
	}
	mutex_exit(&pvp->v_lock);

	if (buflen != 0) {
		/* Free the existing (mis-sized) buffer in case of retry */
		kmem_free(buf, buflen);
	}
	/* base, '/', name and trailing NUL */
	buflen = baselen + len + 2;
	if (buflen > max_vnode_path) {
		DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
		    vnode_t *, vp, char *, name, size_t, buflen);
		return;
	}
	buf = kmem_alloc(buflen, KM_SLEEP);

	/* Re-validate the parent now that the allocation is done. */
	mutex_enter(&pvp->v_lock);
	if (pvp->v_path_stamp != pstamp) {
		size_t vlen;

		/*
		 * Since v_path_stamp changed on the parent, it is likely that
		 * v_path has been altered as well.  If the length does not
		 * exactly match what was previously measured, the buffer
		 * allocation must be repeated for proper sizing.
		 */
		if (pvp->v_path == vn_vpath_empty) {
			/* Give up if parent lacks v_path */
			mutex_exit(&pvp->v_lock);
			kmem_free(buf, buflen);
			return;
		}
		vlen = strlen(pvp->v_path);
		if (pvp->v_path[vlen - 1] == '/') {
			vlen--;
		}
		if (vlen != baselen) {
			goto retrybuf;
		}
	}
	bcopy(pvp->v_path, buf, baselen);
	mutex_exit(&pvp->v_lock);

	/* Append the separator and the name (including its terminator). */
	buf[baselen] = '/';
	baselen++;
	bcopy(name, &buf[baselen], len + 1);

	mutex_enter(&vp->v_lock);
	if (vp->v_path_stamp == 0) {
		/* never-visited vnode can inherit stamp from parent */
		ASSERT(vp->v_path == vn_vpath_empty);
		vp->v_path_stamp = pstamp;
		vp->v_path = buf;
		mutex_exit(&vp->v_lock);
	} else if (vp->v_path_stamp < pstamp || is_rename) {
		/*
		 * Install the updated path and stamp, ensuring that the v_path
		 * pointer is valid at all times for dtrace.
		 *
		 * NOTE(review): oldbuf is freed without a check against
		 * vn_vpath_empty; this relies on a nonzero v_path_stamp
		 * implying an allocated v_path -- confirm that invariant.
		 */
		oldbuf = vp->v_path;
		vp->v_path = buf;
		vp->v_path_stamp = gethrtime();
		mutex_exit(&vp->v_lock);
		kmem_free(oldbuf, strlen(oldbuf) + 1);
	} else {
		/*
		 * If the timestamp matches or is greater, it means another
		 * thread performed the update first while locks were dropped
		 * here to make the allocation.  We defer to the newer value.
		 */
		mutex_exit(&vp->v_lock);
		kmem_free(buf, buflen);
	}
	ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
}
3217 
3218 void
3219 vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
3220 {
3221 	size_t len;
3222 
3223 	/*
3224 	 * If the parent is older or empty, there's nothing further to do.
3225 	 */
3226 	if (pvp->v_path == vn_vpath_empty ||
3227 	    pvp->v_path_stamp <= vp->v_path_stamp) {
3228 		return;
3229 	}
3230 
3231 	/*
3232 	 * Given the lack of appropriate context, meaningful updates to v_path
3233 	 * cannot be made for during lookups for the '.' or '..' entries.
3234 	 */
3235 	len = strlen(name);
3236 	if (len == 0 || (len == 1 && name[0] == '.') ||
3237 	    (len == 2 && name[0] == '.' && name[1] == '.')) {
3238 		return;
3239 	}
3240 
3241 	vn_setpath_common(pvp, vp, name, len, B_FALSE);
3242 }
3243 
3244 /*
3245  * Given a starting vnode and a path, updates the path in the target vnode in
3246  * a safe manner.  If the vnode already has path information embedded, then the
3247  * cached path is left untouched.
3248  */
3249 /* ARGSUSED */
3250 void
3251 vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
3252     size_t len)
3253 {
3254 	vn_setpath_common(pvp, vp, name, len, B_FALSE);
3255 }
3256 
3257 /*
3258  * Sets the path to the vnode to be the given string, regardless of current
3259  * context.  The string must be a complete path from rootdir.  This is only used
3260  * by fsop_root() for setting the path based on the mountpoint.
3261  */
3262 void
3263 vn_setpath_str(vnode_t *vp, const char *str, size_t len)
3264 {
3265 	vn_setpath_common(NULL, vp, str, len, B_FALSE);
3266 }
3267 
3268 /*
3269  * Called from within filesystem's vop_rename() to handle renames once the
3270  * target vnode is available.
3271  */
3272 void
3273 vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
3274 {
3275 	vn_setpath_common(pvp, vp, name, len, B_TRUE);
3276 }
3277 
3278 /*
3279  * Similar to vn_setpath_str(), this function sets the path of the destination
3280  * vnode to the be the same as the source vnode.
3281  */
3282 void
3283 vn_copypath(struct vnode *src, struct vnode *dst)
3284 {
3285 	char *buf;
3286 	hrtime_t stamp;
3287 	size_t buflen;
3288 
3289 	mutex_enter(&src->v_lock);
3290 	if (src->v_path == vn_vpath_empty) {
3291 		mutex_exit(&src->v_lock);
3292 		return;
3293 	}
3294 	buflen = strlen(src->v_path) + 1;
3295 	mutex_exit(&src->v_lock);
3296 
3297 	buf = kmem_alloc(buflen, KM_SLEEP);
3298 
3299 	mutex_enter(&src->v_lock);
3300 	if (src->v_path == vn_vpath_empty ||
3301 	    strlen(src->v_path) + 1 != buflen) {
3302 		mutex_exit(&src->v_lock);
3303 		kmem_free(buf, buflen);
3304 		return;
3305 	}
3306 	bcopy(src->v_path, buf, buflen);
3307 	stamp = src->v_path_stamp;
3308 	mutex_exit(&src->v_lock);
3309 
3310 	mutex_enter(&dst->v_lock);
3311 	if (dst->v_path != vn_vpath_empty) {
3312 		mutex_exit(&dst->v_lock);
3313 		kmem_free(buf, buflen);
3314 		return;
3315 	}
3316 	dst->v_path = buf;
3317 	dst->v_path_stamp = stamp;
3318 	mutex_exit(&dst->v_lock);
3319 }
3320 
3321 
3322 /*
3323  * XXX Private interface for segvn routines that handle vnode
3324  * large page segments.
3325  *
3326  * return 1 if vp's file system VOP_PAGEIO() implementation
3327  * can be safely used instead of VOP_GETPAGE() for handling
3328  * pagefaults against regular non swap files. VOP_PAGEIO()
3329  * interface is considered safe here if its implementation
3330  * is very close to VOP_GETPAGE() implementation.
3331  * e.g. It zero's out the part of the page beyond EOF. Doesn't
3332  * panic if there're file holes but instead returns an error.
3333  * Doesn't assume file won't be changed by user writes, etc.
3334  *
3335  * return 0 otherwise.
3336  *
3337  * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3338  */
3339 int
3340 vn_vmpss_usepageio(vnode_t *vp)
3341 {
3342 	vfs_t   *vfsp = vp->v_vfsp;
3343 	char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3344 	char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3345 	char **fsok = pageio_ok_fss;
3346 
3347 	if (fsname == NULL) {
3348 		return (0);
3349 	}
3350 
3351 	for (; *fsok; fsok++) {
3352 		if (strcmp(*fsok, fsname) == 0) {
3353 			return (1);
3354 		}
3355 	}
3356 	return (0);
3357 }
3358 
3359 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3360 
3361 int
3362 fop_open(
3363 	vnode_t **vpp,
3364 	int mode,
3365 	cred_t *cr,
3366 	caller_context_t *ct)
3367 {
3368 	int ret;
3369 	vnode_t *vp = *vpp;
3370 
3371 	VN_HOLD(vp);
3372 	/*
3373 	 * Adding to the vnode counts before calling open
3374 	 * avoids the need for a mutex. It circumvents a race
3375 	 * condition where a query made on the vnode counts results in a
3376 	 * false negative. The inquirer goes away believing the file is
3377 	 * not open when there is an open on the file already under way.
3378 	 *
3379 	 * The counts are meant to prevent NFS from granting a delegation
3380 	 * when it would be dangerous to do so.
3381 	 *
3382 	 * The vnode counts are only kept on regular files
3383 	 */
3384 	if ((*vpp)->v_type == VREG) {
3385 		if (mode & FREAD)
3386 			atomic_inc_32(&(*vpp)->v_rdcnt);
3387 		if (mode & FWRITE)
3388 			atomic_inc_32(&(*vpp)->v_wrcnt);
3389 	}
3390 
3391 	VOPXID_MAP_CR(vp, cr);
3392 
3393 	ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);
3394 
3395 	if (ret) {
3396 		/*
3397 		 * Use the saved vp just in case the vnode ptr got trashed
3398 		 * by the error.
3399 		 */
3400 		VOPSTATS_UPDATE(vp, open);
3401 		if ((vp->v_type == VREG) && (mode & FREAD))
3402 			atomic_dec_32(&vp->v_rdcnt);
3403 		if ((vp->v_type == VREG) && (mode & FWRITE))
3404 			atomic_dec_32(&vp->v_wrcnt);
3405 	} else {
3406 		/*
3407 		 * Some filesystems will return a different vnode,
3408 		 * but the same path was still used to open it.
3409 		 * So if we do change the vnode and need to
3410 		 * copy over the path, do so here, rather than special
3411 		 * casing each filesystem. Adjust the vnode counts to
3412 		 * reflect the vnode switch.
3413 		 */
3414 		VOPSTATS_UPDATE(*vpp, open);
3415 		if (*vpp != vp) {
3416 			vn_copypath(vp, *vpp);
3417 			if (((*vpp)->v_type == VREG) && (mode & FREAD))
3418 				atomic_inc_32(&(*vpp)->v_rdcnt);
3419 			if ((vp->v_type == VREG) && (mode & FREAD))
3420 				atomic_dec_32(&vp->v_rdcnt);
3421 			if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3422 				atomic_inc_32(&(*vpp)->v_wrcnt);
3423 			if ((vp->v_type == VREG) && (mode & FWRITE))
3424 				atomic_dec_32(&vp->v_wrcnt);
3425 		}
3426 	}
3427 	VN_RELE(vp);
3428 	return (ret);
3429 }
3430 
3431 int
3432 fop_close(
3433 	vnode_t *vp,
3434 	int flag,
3435 	int count,
3436 	offset_t offset,
3437 	cred_t *cr,
3438 	caller_context_t *ct)
3439 {
3440 	int err;
3441 
3442 	VOPXID_MAP_CR(vp, cr);
3443 
3444 	err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
3445 	VOPSTATS_UPDATE(vp, close);
3446 	/*
3447 	 * Check passed in count to handle possible dups. Vnode counts are only
3448 	 * kept on regular files
3449 	 */
3450 	if ((vp->v_type == VREG) && (count == 1))  {
3451 		if (flag & FREAD) {
3452 			ASSERT(vp->v_rdcnt > 0);
3453 			atomic_dec_32(&vp->v_rdcnt);
3454 		}
3455 		if (flag & FWRITE) {
3456 			ASSERT(vp->v_wrcnt > 0);
3457 			atomic_dec_32(&vp->v_wrcnt);
3458 		}
3459 	}
3460 	return (err);
3461 }
3462 
3463 int
3464 fop_read(
3465 	vnode_t *vp,
3466 	uio_t *uiop,
3467 	int ioflag,
3468 	cred_t *cr,
3469 	caller_context_t *ct)
3470 {
3471 	int	err;
3472 	ssize_t	resid_start = uiop->uio_resid;
3473 
3474 	VOPXID_MAP_CR(vp, cr);
3475 
3476 	err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3477 	VOPSTATS_UPDATE_IO(vp, read,
3478 	    read_bytes, (resid_start - uiop->uio_resid));
3479 	return (err);
3480 }
3481 
3482 int
3483 fop_write(
3484 	vnode_t *vp,
3485 	uio_t *uiop,
3486 	int ioflag,
3487 	cred_t *cr,
3488 	caller_context_t *ct)
3489 {
3490 	int	err;
3491 	ssize_t	resid_start = uiop->uio_resid;
3492 
3493 	VOPXID_MAP_CR(vp, cr);
3494 
3495 	err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3496 	VOPSTATS_UPDATE_IO(vp, write,
3497 	    write_bytes, (resid_start - uiop->uio_resid));
3498 	return (err);
3499 }
3500 
3501 int
3502 fop_ioctl(
3503 	vnode_t *vp,
3504 	int cmd,
3505 	intptr_t arg,
3506 	int flag,
3507 	cred_t *cr,
3508 	int *rvalp,
3509 	caller_context_t *ct)
3510 {
3511 	int	err;
3512 
3513 	VOPXID_MAP_CR(vp, cr);
3514 
3515 	err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3516 	VOPSTATS_UPDATE(vp, ioctl);
3517 	return (err);
3518 }
3519 
3520 int
3521 fop_setfl(
3522 	vnode_t *vp,
3523 	int oflags,
3524 	int nflags,
3525 	cred_t *cr,
3526 	caller_context_t *ct)
3527 {
3528 	int	err;
3529 
3530 	VOPXID_MAP_CR(vp, cr);
3531 
3532 	err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
3533 	VOPSTATS_UPDATE(vp, setfl);
3534 	return (err);
3535 }
3536 
3537 int
3538 fop_getattr(
3539 	vnode_t *vp,
3540 	vattr_t *vap,
3541 	int flags,
3542 	cred_t *cr,
3543 	caller_context_t *ct)
3544 {
3545 	int	err;
3546 
3547 	VOPXID_MAP_CR(vp, cr);
3548 
3549 	/*
3550 	 * If this file system doesn't understand the xvattr extensions
3551 	 * then turn off the xvattr bit.
3552 	 */
3553 	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3554 		vap->va_mask &= ~AT_XVATTR;
3555 	}
3556 
3557 	/*
3558 	 * We're only allowed to skip the ACL check iff we used a 32 bit
3559 	 * ACE mask with VOP_ACCESS() to determine permissions.
3560 	 */
3561 	if ((flags & ATTR_NOACLCHECK) &&
3562 	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3563 		return (EINVAL);
3564 	}
3565 	err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
3566 	VOPSTATS_UPDATE(vp, getattr);
3567 	return (err);
3568 }
3569 
3570 int
3571 fop_setattr(
3572 	vnode_t *vp,
3573 	vattr_t *vap,
3574 	int flags,
3575 	cred_t *cr,
3576 	caller_context_t *ct)
3577 {
3578 	int	err;
3579 
3580 	VOPXID_MAP_CR(vp, cr);
3581 
3582 	/*
3583 	 * If this file system doesn't understand the xvattr extensions
3584 	 * then turn off the xvattr bit.
3585 	 */
3586 	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3587 		vap->va_mask &= ~AT_XVATTR;
3588 	}
3589 
3590 	/*
3591 	 * We're only allowed to skip the ACL check iff we used a 32 bit
3592 	 * ACE mask with VOP_ACCESS() to determine permissions.
3593 	 */
3594 	if ((flags & ATTR_NOACLCHECK) &&
3595 	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3596 		return (EINVAL);
3597 	}
3598 	err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
3599 	VOPSTATS_UPDATE(vp, setattr);
3600 	return (err);
3601 }
3602 
3603 int
3604 fop_access(
3605 	vnode_t *vp,
3606 	int mode,
3607 	int flags,
3608 	cred_t *cr,
3609 	caller_context_t *ct)
3610 {
3611 	int	err;
3612 
3613 	if ((flags & V_ACE_MASK) &&
3614 	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3615 		return (EINVAL);
3616 	}
3617 
3618 	VOPXID_MAP_CR(vp, cr);
3619 
3620 	err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
3621 	VOPSTATS_UPDATE(vp, access);
3622 	return (err);
3623 }
3624 
3625 int
3626 fop_lookup(
3627 	vnode_t *dvp,
3628 	char *nm,
3629 	vnode_t **vpp,
3630 	pathname_t *pnp,
3631 	int flags,
3632 	vnode_t *rdir,
3633 	cred_t *cr,
3634 	caller_context_t *ct,
3635 	int *deflags,		/* Returned per-dirent flags */
3636 	pathname_t *ppnp)	/* Returned case-preserved name in directory */
3637 {
3638 	int ret;
3639 
3640 	/*
3641 	 * If this file system doesn't support case-insensitive access
3642 	 * and said access is requested, fail quickly.  It is required
3643 	 * that if the vfs supports case-insensitive lookup, it also
3644 	 * supports extended dirent flags.
3645 	 */
3646 	if (flags & FIGNORECASE &&
3647 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3648 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3649 		return (EINVAL);
3650 
3651 	VOPXID_MAP_CR(dvp, cr);
3652 
3653 	if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3654 		ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3655 	} else {
3656 		ret = (*(dvp)->v_op->vop_lookup)
3657 		    (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
3658 	}
3659 	if (ret == 0 && *vpp) {
3660 		VOPSTATS_UPDATE(*vpp, lookup);
3661 		vn_updatepath(dvp, *vpp, nm);
3662 	}
3663 
3664 	return (ret);
3665 }
3666 
3667 int
3668 fop_create(
3669 	vnode_t *dvp,
3670 	char *name,
3671 	vattr_t *vap,
3672 	vcexcl_t excl,
3673 	int mode,
3674 	vnode_t **vpp,
3675 	cred_t *cr,
3676 	int flags,
3677 	caller_context_t *ct,
3678 	vsecattr_t *vsecp)	/* ACL to set during create */
3679 {
3680 	int ret;
3681 
3682 	if (vsecp != NULL &&
3683 	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3684 		return (EINVAL);
3685 	}
3686 	/*
3687 	 * If this file system doesn't support case-insensitive access
3688 	 * and said access is requested, fail quickly.
3689 	 */
3690 	if (flags & FIGNORECASE &&
3691 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3692 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3693 		return (EINVAL);
3694 
3695 	VOPXID_MAP_CR(dvp, cr);
3696 
3697 	ret = (*(dvp)->v_op->vop_create)
3698 	    (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3699 	if (ret == 0 && *vpp) {
3700 		VOPSTATS_UPDATE(*vpp, create);
3701 		vn_updatepath(dvp, *vpp, name);
3702 	}
3703 
3704 	return (ret);
3705 }
3706 
3707 int
3708 fop_remove(
3709 	vnode_t *dvp,
3710 	char *nm,
3711 	cred_t *cr,
3712 	caller_context_t *ct,
3713 	int flags)
3714 {
3715 	int	err;
3716 
3717 	/*
3718 	 * If this file system doesn't support case-insensitive access
3719 	 * and said access is requested, fail quickly.
3720 	 */
3721 	if (flags & FIGNORECASE &&
3722 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3723 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3724 		return (EINVAL);
3725 
3726 	VOPXID_MAP_CR(dvp, cr);
3727 
3728 	err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
3729 	VOPSTATS_UPDATE(dvp, remove);
3730 	return (err);
3731 }
3732 
3733 int
3734 fop_link(
3735 	vnode_t *tdvp,
3736 	vnode_t *svp,
3737 	char *tnm,
3738 	cred_t *cr,
3739 	caller_context_t *ct,
3740 	int flags)
3741 {
3742 	int	err;
3743 
3744 	/*
3745 	 * If the target file system doesn't support case-insensitive access
3746 	 * and said access is requested, fail quickly.
3747 	 */
3748 	if (flags & FIGNORECASE &&
3749 	    (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3750 	    vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3751 		return (EINVAL);
3752 
3753 	VOPXID_MAP_CR(tdvp, cr);
3754 
3755 	err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
3756 	VOPSTATS_UPDATE(tdvp, link);
3757 	return (err);
3758 }
3759 
3760 int
3761 fop_rename(
3762 	vnode_t *sdvp,
3763 	char *snm,
3764 	vnode_t *tdvp,
3765 	char *tnm,
3766 	cred_t *cr,
3767 	caller_context_t *ct,
3768 	int flags)
3769 {
3770 	int	err;
3771 
3772 	/*
3773 	 * If the file system involved does not support
3774 	 * case-insensitive access and said access is requested, fail
3775 	 * quickly.
3776 	 */
3777 	if (flags & FIGNORECASE &&
3778 	    ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3779 	    vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3780 		return (EINVAL);
3781 
3782 	VOPXID_MAP_CR(tdvp, cr);
3783 
3784 	err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
3785 	VOPSTATS_UPDATE(sdvp, rename);
3786 	return (err);
3787 }
3788 
3789 int
3790 fop_mkdir(
3791 	vnode_t *dvp,
3792 	char *dirname,
3793 	vattr_t *vap,
3794 	vnode_t **vpp,
3795 	cred_t *cr,
3796 	caller_context_t *ct,
3797 	int flags,
3798 	vsecattr_t *vsecp)	/* ACL to set during create */
3799 {
3800 	int ret;
3801 
3802 	if (vsecp != NULL &&
3803 	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3804 		return (EINVAL);
3805 	}
3806 	/*
3807 	 * If this file system doesn't support case-insensitive access
3808 	 * and said access is requested, fail quickly.
3809 	 */
3810 	if (flags & FIGNORECASE &&
3811 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3812 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3813 		return (EINVAL);
3814 
3815 	VOPXID_MAP_CR(dvp, cr);
3816 
3817 	ret = (*(dvp)->v_op->vop_mkdir)
3818 	    (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3819 	if (ret == 0 && *vpp) {
3820 		VOPSTATS_UPDATE(*vpp, mkdir);
3821 		vn_updatepath(dvp, *vpp, dirname);
3822 	}
3823 
3824 	return (ret);
3825 }
3826 
3827 int
3828 fop_rmdir(
3829 	vnode_t *dvp,
3830 	char *nm,
3831 	vnode_t *cdir,
3832 	cred_t *cr,
3833 	caller_context_t *ct,
3834 	int flags)
3835 {
3836 	int	err;
3837 
3838 	/*
3839 	 * If this file system doesn't support case-insensitive access
3840 	 * and said access is requested, fail quickly.
3841 	 */
3842 	if (flags & FIGNORECASE &&
3843 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3844 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3845 		return (EINVAL);
3846 
3847 	VOPXID_MAP_CR(dvp, cr);
3848 
3849 	err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
3850 	VOPSTATS_UPDATE(dvp, rmdir);
3851 	return (err);
3852 }
3853 
3854 int
3855 fop_readdir(
3856 	vnode_t *vp,
3857 	uio_t *uiop,
3858 	cred_t *cr,
3859 	int *eofp,
3860 	caller_context_t *ct,
3861 	int flags)
3862 {
3863 	int	err;
3864 	ssize_t	resid_start = uiop->uio_resid;
3865 
3866 	/*
3867 	 * If this file system doesn't support retrieving directory
3868 	 * entry flags and said access is requested, fail quickly.
3869 	 */
3870 	if (flags & V_RDDIR_ENTFLAGS &&
3871 	    vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3872 		return (EINVAL);
3873 
3874 	VOPXID_MAP_CR(vp, cr);
3875 
3876 	err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
3877 	VOPSTATS_UPDATE_IO(vp, readdir,
3878 	    readdir_bytes, (resid_start - uiop->uio_resid));
3879 	return (err);
3880 }
3881 
3882 int
3883 fop_symlink(
3884 	vnode_t *dvp,
3885 	char *linkname,
3886 	vattr_t *vap,
3887 	char *target,
3888 	cred_t *cr,
3889 	caller_context_t *ct,
3890 	int flags)
3891 {
3892 	int	err;
3893 	xvattr_t xvattr;
3894 
3895 	/*
3896 	 * If this file system doesn't support case-insensitive access
3897 	 * and said access is requested, fail quickly.
3898 	 */
3899 	if (flags & FIGNORECASE &&
3900 	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3901 	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3902 		return (EINVAL);
3903 
3904 	VOPXID_MAP_CR(dvp, cr);
3905 
3906 	/* check for reparse point */
3907 	if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
3908 	    (strncmp(target, FS_REPARSE_TAG_STR,
3909 	    strlen(FS_REPARSE_TAG_STR)) == 0)) {
3910 		if (!fs_reparse_mark(target, vap, &xvattr))
3911 			vap = (vattr_t *)&xvattr;
3912 	}
3913 
3914 	err = (*(dvp)->v_op->vop_symlink)
3915 	    (dvp, linkname, vap, target, cr, ct, flags);
3916 	VOPSTATS_UPDATE(dvp, symlink);
3917 	return (err);
3918 }
3919 
3920 int
3921 fop_readlink(
3922 	vnode_t *vp,
3923 	uio_t *uiop,
3924 	cred_t *cr,
3925 	caller_context_t *ct)
3926 {
3927 	int	err;
3928 
3929 	VOPXID_MAP_CR(vp, cr);
3930 
3931 	err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
3932 	VOPSTATS_UPDATE(vp, readlink);
3933 	return (err);
3934 }
3935 
3936 int
3937 fop_fsync(
3938 	vnode_t *vp,
3939 	int syncflag,
3940 	cred_t *cr,
3941 	caller_context_t *ct)
3942 {
3943 	int	err;
3944 
3945 	VOPXID_MAP_CR(vp, cr);
3946 
3947 	err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
3948 	VOPSTATS_UPDATE(vp, fsync);
3949 	return (err);
3950 }
3951 
3952 void
3953 fop_inactive(
3954 	vnode_t *vp,
3955 	cred_t *cr,
3956 	caller_context_t *ct)
3957 {
3958 	/* Need to update stats before vop call since we may lose the vnode */
3959 	VOPSTATS_UPDATE(vp, inactive);
3960 
3961 	VOPXID_MAP_CR(vp, cr);
3962 
3963 	(*(vp)->v_op->vop_inactive)(vp, cr, ct);
3964 }
3965 
3966 int
3967 fop_fid(
3968 	vnode_t *vp,
3969 	fid_t *fidp,
3970 	caller_context_t *ct)
3971 {
3972 	int	err;
3973 
3974 	err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
3975 	VOPSTATS_UPDATE(vp, fid);
3976 	return (err);
3977 }
3978 
3979 int
3980 fop_rwlock(
3981 	vnode_t *vp,
3982 	int write_lock,
3983 	caller_context_t *ct)
3984 {
3985 	int	ret;
3986 
3987 	ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
3988 	VOPSTATS_UPDATE(vp, rwlock);
3989 	return (ret);
3990 }
3991 
3992 void
3993 fop_rwunlock(
3994 	vnode_t *vp,
3995 	int write_lock,
3996 	caller_context_t *ct)
3997 {
3998 	(*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
3999 	VOPSTATS_UPDATE(vp, rwunlock);
4000 }
4001 
4002 int
4003 fop_seek(
4004 	vnode_t *vp,
4005 	offset_t ooff,
4006 	offset_t *noffp,
4007 	caller_context_t *ct)
4008 {
4009 	int	err;
4010 
4011 	err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
4012 	VOPSTATS_UPDATE(vp, seek);
4013 	return (err);
4014 }
4015 
4016 int
4017 fop_cmp(
4018 	vnode_t *vp1,
4019 	vnode_t *vp2,
4020 	caller_context_t *ct)
4021 {
4022 	int	err;
4023 
4024 	err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
4025 	VOPSTATS_UPDATE(vp1, cmp);
4026 	return (err);
4027 }
4028 
4029 int
4030 fop_frlock(
4031 	vnode_t *vp,
4032 	int cmd,
4033 	flock64_t *bfp,
4034 	int flag,
4035 	offset_t offset,
4036 	struct flk_callback *flk_cbp,
4037 	cred_t *cr,
4038 	caller_context_t *ct)
4039 {
4040 	int	err;
4041 
4042 	VOPXID_MAP_CR(vp, cr);
4043 
4044 	err = (*(vp)->v_op->vop_frlock)
4045 	    (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
4046 	VOPSTATS_UPDATE(vp, frlock);
4047 	return (err);
4048 }
4049 
4050 int
4051 fop_space(
4052 	vnode_t *vp,
4053 	int cmd,
4054 	flock64_t *bfp,
4055 	int flag,
4056 	offset_t offset,
4057 	cred_t *cr,
4058 	caller_context_t *ct)
4059 {
4060 	int	err;
4061 
4062 	VOPXID_MAP_CR(vp, cr);
4063 
4064 	err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
4065 	VOPSTATS_UPDATE(vp, space);
4066 	return (err);
4067 }
4068 
4069 int
4070 fop_realvp(
4071 	vnode_t *vp,
4072 	vnode_t **vpp,
4073 	caller_context_t *ct)
4074 {
4075 	int	err;
4076 
4077 	err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
4078 	VOPSTATS_UPDATE(vp, realvp);
4079 	return (err);
4080 }
4081 
4082 int
4083 fop_getpage(
4084 	vnode_t *vp,
4085 	offset_t off,
4086 	size_t len,
4087 	uint_t *protp,
4088 	page_t **plarr,
4089 	size_t plsz,
4090 	struct seg *seg,
4091 	caddr_t addr,
4092 	enum seg_rw rw,
4093 	cred_t *cr,
4094 	caller_context_t *ct)
4095 {
4096 	int	err;
4097 
4098 	VOPXID_MAP_CR(vp, cr);
4099 
4100 	err = (*(vp)->v_op->vop_getpage)
4101 	    (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
4102 	VOPSTATS_UPDATE(vp, getpage);
4103 	return (err);
4104 }
4105 
4106 int
4107 fop_putpage(
4108 	vnode_t *vp,
4109 	offset_t off,
4110 	size_t len,
4111 	int flags,
4112 	cred_t *cr,
4113 	caller_context_t *ct)
4114 {
4115 	int	err;
4116 
4117 	VOPXID_MAP_CR(vp, cr);
4118 
4119 	err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
4120 	VOPSTATS_UPDATE(vp, putpage);
4121 	return (err);
4122 }
4123 
4124 int
4125 fop_map(
4126 	vnode_t *vp,
4127 	offset_t off,
4128 	struct as *as,
4129 	caddr_t *addrp,
4130 	size_t len,
4131 	uchar_t prot,
4132 	uchar_t maxprot,
4133 	uint_t flags,
4134 	cred_t *cr,
4135 	caller_context_t *ct)
4136 {
4137 	int	err;
4138 
4139 	VOPXID_MAP_CR(vp, cr);
4140 
4141 	err = (*(vp)->v_op->vop_map)
4142 	    (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
4143 	VOPSTATS_UPDATE(vp, map);
4144 	return (err);
4145 }
4146 
4147 int
4148 fop_addmap(
4149 	vnode_t *vp,
4150 	offset_t off,
4151 	struct as *as,
4152 	caddr_t addr,
4153 	size_t len,
4154 	uchar_t prot,
4155 	uchar_t maxprot,
4156 	uint_t flags,
4157 	cred_t *cr,
4158 	caller_context_t *ct)
4159 {
4160 	int error;
4161 	u_longlong_t delta;
4162 
4163 	VOPXID_MAP_CR(vp, cr);
4164 
4165 	error = (*(vp)->v_op->vop_addmap)
4166 	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
4167 
4168 	if ((!error) && (vp->v_type == VREG)) {
4169 		delta = (u_longlong_t)btopr(len);
4170 		/*
4171 		 * If file is declared MAP_PRIVATE, it can't be written back
4172 		 * even if open for write. Handle as read.
4173 		 */
4174 		if (flags & MAP_PRIVATE) {
4175 			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4176 			    (int64_t)delta);
4177 		} else {
4178 			/*
4179 			 * atomic_add_64 forces the fetch of a 64 bit value to
4180 			 * be atomic on 32 bit machines
4181 			 */
4182 			if (maxprot & PROT_WRITE)
4183 				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
4184 				    (int64_t)delta);
4185 			if (maxprot & PROT_READ)
4186 				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4187 				    (int64_t)delta);
4188 			if (maxprot & PROT_EXEC)
4189 				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4190 				    (int64_t)delta);
4191 		}
4192 	}
4193 	VOPSTATS_UPDATE(vp, addmap);
4194 	return (error);
4195 }
4196 
4197 int
4198 fop_delmap(
4199 	vnode_t *vp,
4200 	offset_t off,
4201 	struct as *as,
4202 	caddr_t addr,
4203 	size_t len,
4204 	uint_t prot,
4205 	uint_t maxprot,
4206 	uint_t flags,
4207 	cred_t *cr,
4208 	caller_context_t *ct)
4209 {
4210 	int error;
4211 	u_longlong_t delta;
4212 
4213 	VOPXID_MAP_CR(vp, cr);
4214 
4215 	error = (*(vp)->v_op->vop_delmap)
4216 	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
4217 
4218 	/*
4219 	 * NFS calls into delmap twice, the first time
4220 	 * it simply establishes a callback mechanism and returns EAGAIN
4221 	 * while the real work is being done upon the second invocation.
4222 	 * We have to detect this here and only decrement the counts upon
4223 	 * the second delmap request.
4224 	 */
4225 	if ((error != EAGAIN) && (vp->v_type == VREG)) {
4226 
4227 		delta = (u_longlong_t)btopr(len);
4228 
4229 		if (flags & MAP_PRIVATE) {
4230 			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4231 			    (int64_t)(-delta));
4232 		} else {
4233 			/*
4234 			 * atomic_add_64 forces the fetch of a 64 bit value
4235 			 * to be atomic on 32 bit machines
4236 			 */
4237 			if (maxprot & PROT_WRITE)
4238 				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
4239 				    (int64_t)(-delta));
4240 			if (maxprot & PROT_READ)
4241 				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4242 				    (int64_t)(-delta));
4243 			if (maxprot & PROT_EXEC)
4244 				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4245 				    (int64_t)(-delta));
4246 		}
4247 	}
4248 	VOPSTATS_UPDATE(vp, delmap);
4249 	return (error);
4250 }
4251 
4252 
4253 int
4254 fop_poll(
4255 	vnode_t *vp,
4256 	short events,
4257 	int anyyet,
4258 	short *reventsp,
4259 	struct pollhead **phpp,
4260 	caller_context_t *ct)
4261 {
4262 	int	err;
4263 
4264 	err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
4265 	VOPSTATS_UPDATE(vp, poll);
4266 	return (err);
4267 }
4268 
4269 int
4270 fop_dump(
4271 	vnode_t *vp,
4272 	caddr_t addr,
4273 	offset_t lbdn,
4274 	offset_t dblks,
4275 	caller_context_t *ct)
4276 {
4277 	int	err;
4278 
4279 	/* ensure lbdn and dblks can be passed safely to bdev_dump */
4280 	if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
4281 		return (EIO);
4282 
4283 	err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
4284 	VOPSTATS_UPDATE(vp, dump);
4285 	return (err);
4286 }
4287 
4288 int
4289 fop_pathconf(
4290 	vnode_t *vp,
4291 	int cmd,
4292 	ulong_t *valp,
4293 	cred_t *cr,
4294 	caller_context_t *ct)
4295 {
4296 	int	err;
4297 
4298 	VOPXID_MAP_CR(vp, cr);
4299 
4300 	err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
4301 	VOPSTATS_UPDATE(vp, pathconf);
4302 	return (err);
4303 }
4304 
4305 int
4306 fop_pageio(
4307 	vnode_t *vp,
4308 	struct page *pp,
4309 	u_offset_t io_off,
4310 	size_t io_len,
4311 	int flags,
4312 	cred_t *cr,
4313 	caller_context_t *ct)
4314 {
4315 	int	err;
4316 
4317 	VOPXID_MAP_CR(vp, cr);
4318 
4319 	err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
4320 	VOPSTATS_UPDATE(vp, pageio);
4321 	return (err);
4322 }
4323 
4324 int
4325 fop_dumpctl(
4326 	vnode_t *vp,
4327 	int action,
4328 	offset_t *blkp,
4329 	caller_context_t *ct)
4330 {
4331 	int	err;
4332 	err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
4333 	VOPSTATS_UPDATE(vp, dumpctl);
4334 	return (err);
4335 }
4336 
4337 void
4338 fop_dispose(
4339 	vnode_t *vp,
4340 	page_t *pp,
4341 	int flag,
4342 	int dn,
4343 	cred_t *cr,
4344 	caller_context_t *ct)
4345 {
4346 	/* Must do stats first since it's possible to lose the vnode */
4347 	VOPSTATS_UPDATE(vp, dispose);
4348 
4349 	VOPXID_MAP_CR(vp, cr);
4350 
4351 	(*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
4352 }
4353 
4354 int
4355 fop_setsecattr(
4356 	vnode_t *vp,
4357 	vsecattr_t *vsap,
4358 	int flag,
4359 	cred_t *cr,
4360 	caller_context_t *ct)
4361 {
4362 	int	err;
4363 
4364 	VOPXID_MAP_CR(vp, cr);
4365 
4366 	/*
4367 	 * We're only allowed to skip the ACL check iff we used a 32 bit
4368 	 * ACE mask with VOP_ACCESS() to determine permissions.
4369 	 */
4370 	if ((flag & ATTR_NOACLCHECK) &&
4371 	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4372 		return (EINVAL);
4373 	}
4374 	err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
4375 	VOPSTATS_UPDATE(vp, setsecattr);
4376 	return (err);
4377 }
4378 
4379 int
4380 fop_getsecattr(
4381 	vnode_t *vp,
4382 	vsecattr_t *vsap,
4383 	int flag,
4384 	cred_t *cr,
4385 	caller_context_t *ct)
4386 {
4387 	int	err;
4388 
4389 	/*
4390 	 * We're only allowed to skip the ACL check iff we used a 32 bit
4391 	 * ACE mask with VOP_ACCESS() to determine permissions.
4392 	 */
4393 	if ((flag & ATTR_NOACLCHECK) &&
4394 	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4395 		return (EINVAL);
4396 	}
4397 
4398 	VOPXID_MAP_CR(vp, cr);
4399 
4400 	err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
4401 	VOPSTATS_UPDATE(vp, getsecattr);
4402 	return (err);
4403 }
4404 
4405 int
4406 fop_shrlock(
4407 	vnode_t *vp,
4408 	int cmd,
4409 	struct shrlock *shr,
4410 	int flag,
4411 	cred_t *cr,
4412 	caller_context_t *ct)
4413 {
4414 	int	err;
4415 
4416 	VOPXID_MAP_CR(vp, cr);
4417 
4418 	err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
4419 	VOPSTATS_UPDATE(vp, shrlock);
4420 	return (err);
4421 }
4422 
4423 int
4424 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4425     caller_context_t *ct)
4426 {
4427 	int	err;
4428 
4429 	err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
4430 	VOPSTATS_UPDATE(vp, vnevent);
4431 	return (err);
4432 }
4433 
4434 int
4435 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4436     caller_context_t *ct)
4437 {
4438 	int err;
4439 
4440 	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4441 		return (ENOTSUP);
4442 	err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
4443 	VOPSTATS_UPDATE(vp, reqzcbuf);
4444 	return (err);
4445 }
4446 
4447 int
4448 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4449 {
4450 	int err;
4451 
4452 	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4453 		return (ENOTSUP);
4454 	err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
4455 	VOPSTATS_UPDATE(vp, retzcbuf);
4456 	return (err);
4457 }
4458 
/*
 * Default destructor
 *	A destructor that does nothing with the value it is handed.  It is
 *	needed because a NULL destructor means that the key is unused.
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{
}
4467 
4468 /*
4469  * Create a key (index into per vnode array)
4470  *	Locks out vsd_create, vsd_destroy, and vsd_free
4471  *	May allocate memory with lock held
4472  */
4473 void
4474 vsd_create(uint_t *keyp, void (*destructor)(void *))
4475 {
4476 	int	i;
4477 	uint_t	nkeys;
4478 
4479 	/*
4480 	 * if key is allocated, do nothing
4481 	 */
4482 	mutex_enter(&vsd_lock);
4483 	if (*keyp) {
4484 		mutex_exit(&vsd_lock);
4485 		return;
4486 	}
4487 	/*
4488 	 * find an unused key
4489 	 */
4490 	if (destructor == NULL)
4491 		destructor = vsd_defaultdestructor;
4492 
4493 	for (i = 0; i < vsd_nkeys; ++i)
4494 		if (vsd_destructor[i] == NULL)
4495 			break;
4496 
4497 	/*
4498 	 * if no unused keys, increase the size of the destructor array
4499 	 */
4500 	if (i == vsd_nkeys) {
4501 		if ((nkeys = (vsd_nkeys << 1)) == 0)
4502 			nkeys = 1;
4503 		vsd_destructor =
4504 		    (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4505 		    (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4506 		    (size_t)(nkeys * sizeof (void (*)(void *))));
4507 		vsd_nkeys = nkeys;
4508 	}
4509 
4510 	/*
4511 	 * allocate the next available unused key
4512 	 */
4513 	vsd_destructor[i] = destructor;
4514 	*keyp = i + 1;
4515 
4516 	/* create vsd_list, if it doesn't exist */
4517 	if (vsd_list == NULL) {
4518 		vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4519 		list_create(vsd_list, sizeof (struct vsd_node),
4520 		    offsetof(struct vsd_node, vs_nodes));
4521 	}
4522 
4523 	mutex_exit(&vsd_lock);
4524 }
4525 
4526 /*
4527  * Destroy a key
4528  *
4529  * Assumes that the caller is preventing vsd_set and vsd_get
4530  * Locks out vsd_create, vsd_destroy, and vsd_free
4531  * May free memory with lock held
4532  */
4533 void
4534 vsd_destroy(uint_t *keyp)
4535 {
4536 	uint_t key;
4537 	struct vsd_node *vsd;
4538 
4539 	/*
4540 	 * protect the key namespace and our destructor lists
4541 	 */
4542 	mutex_enter(&vsd_lock);
4543 	key = *keyp;
4544 	*keyp = 0;
4545 
4546 	ASSERT(key <= vsd_nkeys);
4547 
4548 	/*
4549 	 * if the key is valid
4550 	 */
4551 	if (key != 0) {
4552 		uint_t k = key - 1;
4553 		/*
4554 		 * for every vnode with VSD, call key's destructor
4555 		 */
4556 		for (vsd = list_head(vsd_list); vsd != NULL;
4557 		    vsd = list_next(vsd_list, vsd)) {
4558 			/*
4559 			 * no VSD for key in this vnode
4560 			 */
4561 			if (key > vsd->vs_nkeys)
4562 				continue;
4563 			/*
4564 			 * call destructor for key
4565 			 */
4566 			if (vsd->vs_value[k] && vsd_destructor[k])
4567 				(*vsd_destructor[k])(vsd->vs_value[k]);
4568 			/*
4569 			 * reset value for key
4570 			 */
4571 			vsd->vs_value[k] = NULL;
4572 		}
4573 		/*
4574 		 * actually free the key (NULL destructor == unused)
4575 		 */
4576 		vsd_destructor[k] = NULL;
4577 	}
4578 
4579 	mutex_exit(&vsd_lock);
4580 }
4581 
4582 /*
4583  * Quickly return the per vnode value that was stored with the specified key
4584  * Assumes the caller is protecting key from vsd_create and vsd_destroy
4585  * Assumes the caller is holding v_vsd_lock to protect the vsd.
4586  */
4587 void *
4588 vsd_get(vnode_t *vp, uint_t key)
4589 {
4590 	struct vsd_node *vsd;
4591 
4592 	ASSERT(vp != NULL);
4593 	ASSERT(mutex_owned(&vp->v_vsd_lock));
4594 
4595 	vsd = vp->v_vsd;
4596 
4597 	if (key && vsd != NULL && key <= vsd->vs_nkeys)
4598 		return (vsd->vs_value[key - 1]);
4599 	return (NULL);
4600 }
4601 
4602 /*
4603  * Set a per vnode value indexed with the specified key
4604  * Assumes the caller is holding v_vsd_lock to protect the vsd.
4605  */
4606 int
4607 vsd_set(vnode_t *vp, uint_t key, void *value)
4608 {
4609 	struct vsd_node *vsd;
4610 
4611 	ASSERT(vp != NULL);
4612 	ASSERT(mutex_owned(&vp->v_vsd_lock));
4613 
4614 	if (key == 0)
4615 		return (EINVAL);
4616 
4617 	vsd = vp->v_vsd;
4618 	if (vsd == NULL)
4619 		vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4620 
4621 	/*
4622 	 * If the vsd was just allocated, vs_nkeys will be 0, so the following
4623 	 * code won't happen and we will continue down and allocate space for
4624 	 * the vs_value array.
4625 	 * If the caller is replacing one value with another, then it is up
4626 	 * to the caller to free/rele/destroy the previous value (if needed).
4627 	 */
4628 	if (key <= vsd->vs_nkeys) {
4629 		vsd->vs_value[key - 1] = value;
4630 		return (0);
4631 	}
4632 
4633 	ASSERT(key <= vsd_nkeys);
4634 
4635 	if (vsd->vs_nkeys == 0) {
4636 		mutex_enter(&vsd_lock);	/* lock out vsd_destroy() */
4637 		/*
4638 		 * Link onto list of all VSD nodes.
4639 		 */
4640 		list_insert_head(vsd_list, vsd);
4641 		mutex_exit(&vsd_lock);
4642 	}
4643 
4644 	/*
4645 	 * Allocate vnode local storage and set the value for key
4646 	 */
4647 	vsd->vs_value = vsd_realloc(vsd->vs_value,
4648 	    vsd->vs_nkeys * sizeof (void *),
4649 	    key * sizeof (void *));
4650 	vsd->vs_nkeys = key;
4651 	vsd->vs_value[key - 1] = value;
4652 
4653 	return (0);
4654 }
4655 
4656 /*
4657  * Called from vn_free() to run the destructor function for each vsd
4658  *	Locks out vsd_create and vsd_destroy
4659  *	Assumes that the destructor *DOES NOT* use vsd
4660  */
4661 void
4662 vsd_free(vnode_t *vp)
4663 {
4664 	int i;
4665 	struct vsd_node *vsd = vp->v_vsd;
4666 
4667 	if (vsd == NULL)
4668 		return;
4669 
4670 	if (vsd->vs_nkeys == 0) {
4671 		kmem_free(vsd, sizeof (*vsd));
4672 		vp->v_vsd = NULL;
4673 		return;
4674 	}
4675 
4676 	/*
4677 	 * lock out vsd_create and vsd_destroy, call
4678 	 * the destructor, and mark the value as destroyed.
4679 	 */
4680 	mutex_enter(&vsd_lock);
4681 
4682 	for (i = 0; i < vsd->vs_nkeys; i++) {
4683 		if (vsd->vs_value[i] && vsd_destructor[i])
4684 			(*vsd_destructor[i])(vsd->vs_value[i]);
4685 		vsd->vs_value[i] = NULL;
4686 	}
4687 
4688 	/*
4689 	 * remove from linked list of VSD nodes
4690 	 */
4691 	list_remove(vsd_list, vsd);
4692 
4693 	mutex_exit(&vsd_lock);
4694 
4695 	/*
4696 	 * free up the VSD
4697 	 */
4698 	kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4699 	kmem_free(vsd, sizeof (struct vsd_node));
4700 	vp->v_vsd = NULL;
4701 }
4702 
4703 /*
4704  * realloc
4705  */
4706 static void *
4707 vsd_realloc(void *old, size_t osize, size_t nsize)
4708 {
4709 	void *new;
4710 
4711 	new = kmem_zalloc(nsize, KM_SLEEP);
4712 	if (old) {
4713 		bcopy(old, new, osize);
4714 		kmem_free(old, osize);
4715 	}
4716 	return (new);
4717 }
4718 
4719 /*
4720  * Setup the extensible system attribute for creating a reparse point.
4721  * The symlink data 'target' is validated for proper format of a reparse
4722  * string and a check also made to make sure the symlink data does not
4723  * point to an existing file.
4724  *
4725  * return 0 if ok else -1.
4726  */
4727 static int
4728 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4729 {
4730 	xoptattr_t *xoap;
4731 
4732 	if ((!target) || (!vap) || (!xvattr))
4733 		return (-1);
4734 
4735 	/* validate reparse string */
4736 	if (reparse_validate((const char *)target))
4737 		return (-1);
4738 
4739 	xva_init(xvattr);
4740 	xvattr->xva_vattr = *vap;
4741 	xvattr->xva_vattr.va_mask |= AT_XVATTR;
4742 	xoap = xva_getxoptattr(xvattr);
4743 	ASSERT(xoap);
4744 	XVA_SET_REQ(xvattr, XAT_REPARSE);
4745 	xoap->xoa_reparse = 1;
4746 
4747 	return (0);
4748 }
4749 
4750 /*
4751  * Function to check whether a symlink is a reparse point.
4752  * Return B_TRUE if it is a reparse point, else return B_FALSE
4753  */
4754 boolean_t
4755 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4756 {
4757 	xvattr_t xvattr;
4758 	xoptattr_t *xoap;
4759 
4760 	if ((vp->v_type != VLNK) ||
4761 	    !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4762 		return (B_FALSE);
4763 
4764 	xva_init(&xvattr);
4765 	xoap = xva_getxoptattr(&xvattr);
4766 	ASSERT(xoap);
4767 	XVA_SET_REQ(&xvattr, XAT_REPARSE);
4768 
4769 	if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
4770 		return (B_FALSE);
4771 
4772 	if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4773 	    (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4774 		return (B_FALSE);
4775 
4776 	return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
4777 }
4778