xref: /titanic_50/usr/src/uts/common/fs/dev/sdev_subr.c (revision 4b3b7fc6e1f62f5e2bee41aafc52e9234c484bc0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
24  */
25 
26 /*
27  * utility routines for the /dev fs
28  */
29 
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/t_lock.h>
33 #include <sys/systm.h>
34 #include <sys/sysmacros.h>
35 #include <sys/user.h>
36 #include <sys/time.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/file.h>
40 #include <sys/fcntl.h>
41 #include <sys/flock.h>
42 #include <sys/kmem.h>
43 #include <sys/uio.h>
44 #include <sys/errno.h>
45 #include <sys/stat.h>
46 #include <sys/cred.h>
47 #include <sys/dirent.h>
48 #include <sys/pathname.h>
49 #include <sys/cmn_err.h>
50 #include <sys/debug.h>
51 #include <sys/mode.h>
52 #include <sys/policy.h>
53 #include <fs/fs_subr.h>
54 #include <sys/mount.h>
55 #include <sys/fs/snode.h>
56 #include <sys/fs/dv_node.h>
57 #include <sys/fs/sdev_impl.h>
58 #include <sys/sunndi.h>
59 #include <sys/sunmdi.h>
60 #include <sys/conf.h>
61 #include <sys/proc.h>
62 #include <sys/user.h>
63 #include <sys/modctl.h>
64 
#ifdef DEBUG
/*
 * Debug message mask.
 * NOTE(review): presumably tested by the sdcmn_err*() macros; confirm in
 * sdev_impl.h.
 */
int sdev_debug = 0x00000001;
/* kmem cache flags passed to kmem_cache_create() by sdev_node_cache_init() */
int sdev_debug_cache_flags = 0;
#endif
69 
70 /*
71  * globals
72  */
/*
 * Prototype in-memory vattrs, one per node type.  The initializers are
 * positional, so field order must match struct vattr.  Within this file
 * sdev_attr_update() copies only the fields selected by va_mask.
 */
/* prototype vattr for a directory node */
vattr_t sdev_vattr_dir = {
	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
	VDIR,					/* va_type */
	SDEV_DIRMODE_DEFAULT,			/* va_mode */
	SDEV_UID_DEFAULT,			/* va_uid */
	SDEV_GID_DEFAULT,			/* va_gid */
	0,					/* va_fsid */
	0,					/* va_nodeid */
	0,					/* va_nlink */
	0,					/* va_size */
	0,					/* va_atime */
	0,					/* va_mtime */
	0,					/* va_ctime */
	0,					/* va_rdev */
	0,					/* va_blksize */
	0,					/* va_nblocks */
	0					/* va_vcode */
};
92 
/*
 * Prototype vattr for a symbolic link node.  Unlike the other prototypes,
 * va_mask does not include AT_UID|AT_GID even though default uid/gid values
 * are supplied below.
 */
vattr_t sdev_vattr_lnk = {
	AT_TYPE|AT_MODE,			/* va_mask */
	VLNK,					/* va_type */
	SDEV_LNKMODE_DEFAULT,			/* va_mode */
	SDEV_UID_DEFAULT,			/* va_uid */
	SDEV_GID_DEFAULT,			/* va_gid */
	0,					/* va_fsid */
	0,					/* va_nodeid */
	0,					/* va_nlink */
	0,					/* va_size */
	0,					/* va_atime */
	0,					/* va_mtime */
	0,					/* va_ctime */
	0,					/* va_rdev */
	0,					/* va_blksize */
	0,					/* va_nblocks */
	0					/* va_vcode */
};
111 
/*
 * Prototype vattr for a block device node; the mode carries the S_IFBLK
 * file-type bits in addition to the default device permissions.
 */
vattr_t sdev_vattr_blk = {
	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
	VBLK,					/* va_type */
	S_IFBLK | SDEV_DEVMODE_DEFAULT,		/* va_mode */
	SDEV_UID_DEFAULT,			/* va_uid */
	SDEV_GID_DEFAULT,			/* va_gid */
	0,					/* va_fsid */
	0,					/* va_nodeid */
	0,					/* va_nlink */
	0,					/* va_size */
	0,					/* va_atime */
	0,					/* va_mtime */
	0,					/* va_ctime */
	0,					/* va_rdev */
	0,					/* va_blksize */
	0,					/* va_nblocks */
	0					/* va_vcode */
};
130 
/*
 * Prototype vattr for a character device node; the mode carries the S_IFCHR
 * file-type bits in addition to the default device permissions.
 */
vattr_t sdev_vattr_chr = {
	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
	VCHR,					/* va_type */
	S_IFCHR | SDEV_DEVMODE_DEFAULT,		/* va_mode */
	SDEV_UID_DEFAULT,			/* va_uid */
	SDEV_GID_DEFAULT,			/* va_gid */
	0,					/* va_fsid */
	0,					/* va_nodeid */
	0,					/* va_nlink */
	0,					/* va_size */
	0,					/* va_atime */
	0,					/* va_mtime */
	0,					/* va_ctime */
	0,					/* va_rdev */
	0,					/* va_blksize */
	0,					/* va_nblocks */
	0					/* va_vcode */
};
149 
/*
 * sdev_node_cache is created by sdev_node_cache_init() and torn down (and
 * reset to NULL) by sdev_node_cache_fini(); every sdev_node in this file is
 * allocated from it.
 */
kmem_cache_t	*sdev_node_cache;	/* sdev_node cache */
int		devtype;		/* fstype */
152 
153 /* static */
154 static struct vnodeops *sdev_get_vop(struct sdev_node *);
155 static void sdev_set_no_negcache(struct sdev_node *);
156 static fs_operation_def_t *sdev_merge_vtab(const fs_operation_def_t []);
157 static void sdev_free_vtab(fs_operation_def_t *);
158 
/*
 * Free the profile nvlists attached to a non-global sdev_node and zero the
 * whole profile so that no stale pointers remain.  Must not be called on a
 * global node (asserted).
 */
static void
sdev_prof_free(struct sdev_node *dv)
{
	ASSERT(!SDEV_IS_GLOBAL(dv));
	nvlist_free(dv->sdev_prof.dev_name);
	nvlist_free(dv->sdev_prof.dev_map);
	nvlist_free(dv->sdev_prof.dev_symlink);
	nvlist_free(dv->sdev_prof.dev_glob_incdir);
	nvlist_free(dv->sdev_prof.dev_glob_excdir);
	/* clear the now-dangling nvlist pointers */
	bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
}
170 
171 /* sdev_node cache constructor */
172 /*ARGSUSED1*/
173 static int
174 i_sdev_node_ctor(void *buf, void *cfarg, int flag)
175 {
176 	struct sdev_node *dv = (struct sdev_node *)buf;
177 	struct vnode *vp;
178 
179 	bzero(buf, sizeof (struct sdev_node));
180 	vp = dv->sdev_vnode = vn_alloc(flag);
181 	if (vp == NULL) {
182 		return (-1);
183 	}
184 	vp->v_data = dv;
185 	rw_init(&dv->sdev_contents, NULL, RW_DEFAULT, NULL);
186 	return (0);
187 }
188 
189 /* sdev_node cache destructor */
190 /*ARGSUSED1*/
191 static void
192 i_sdev_node_dtor(void *buf, void *arg)
193 {
194 	struct sdev_node *dv = (struct sdev_node *)buf;
195 	struct vnode *vp = SDEVTOV(dv);
196 
197 	rw_destroy(&dv->sdev_contents);
198 	vn_free(vp);
199 }
200 
201 /* initialize sdev_node cache */
202 void
203 sdev_node_cache_init()
204 {
205 	int flags = 0;
206 
207 #ifdef	DEBUG
208 	flags = sdev_debug_cache_flags;
209 	if (flags)
210 		sdcmn_err(("cache debug flags 0x%x\n", flags));
211 #endif	/* DEBUG */
212 
213 	ASSERT(sdev_node_cache == NULL);
214 	sdev_node_cache = kmem_cache_create("sdev_node_cache",
215 	    sizeof (struct sdev_node), 0, i_sdev_node_ctor, i_sdev_node_dtor,
216 	    NULL, NULL, NULL, flags);
217 }
218 
/*
 * Destroy the global sdev_node kmem cache created by sdev_node_cache_init().
 * The cache must exist (asserted); the global pointer is reset to NULL so a
 * later sdev_node_cache_init() passes its own assertion.
 */
void
sdev_node_cache_fini()
{
	ASSERT(sdev_node_cache != NULL);
	kmem_cache_destroy(sdev_node_cache);
	sdev_node_cache = NULL;
}
227 
228 /*
229  * Compare two nodes lexographically to balance avl tree
230  */
231 static int
232 sdev_compare_nodes(const struct sdev_node *dv1, const struct sdev_node *dv2)
233 {
234 	int rv;
235 	if ((rv = strcmp(dv1->sdev_name, dv2->sdev_name)) == 0)
236 		return (0);
237 	return ((rv < 0) ? -1 : 1);
238 }
239 
/*
 * Set the state of an sdev_node.
 *
 * The caller must hold dv->sdev_contents as writer (asserted).
 */
void
sdev_set_nodestate(struct sdev_node *dv, sdev_node_state_t state)
{
	ASSERT(dv);
	ASSERT(RW_WRITE_HELD(&dv->sdev_contents));
	dv->sdev_state = state;
}
247 
248 static void
249 sdev_attr_update(struct sdev_node *dv, vattr_t *vap)
250 {
251 	timestruc_t	now;
252 	struct vattr	*attrp;
253 	uint_t		mask;
254 
255 	ASSERT(dv->sdev_attr);
256 	ASSERT(vap);
257 
258 	attrp = dv->sdev_attr;
259 	mask = vap->va_mask;
260 	if (mask & AT_TYPE)
261 		attrp->va_type = vap->va_type;
262 	if (mask & AT_MODE)
263 		attrp->va_mode = vap->va_mode;
264 	if (mask & AT_UID)
265 		attrp->va_uid = vap->va_uid;
266 	if (mask & AT_GID)
267 		attrp->va_gid = vap->va_gid;
268 	if (mask & AT_RDEV)
269 		attrp->va_rdev = vap->va_rdev;
270 
271 	gethrestime(&now);
272 	attrp->va_atime = (mask & AT_ATIME) ? vap->va_atime : now;
273 	attrp->va_mtime = (mask & AT_MTIME) ? vap->va_mtime : now;
274 	attrp->va_ctime = (mask & AT_CTIME) ? vap->va_ctime : now;
275 }
276 
/*
 * Allocate the in-memory attribute structure of dv and fill it from vap.
 *
 * dv must not already have attributes, and vap must select at least
 * AT_TYPE and AT_MODE (all asserted).  The allocation is zeroed and sleeps
 * until memory is available.
 */
static void
sdev_attr_alloc(struct sdev_node *dv, vattr_t *vap)
{
	ASSERT(dv->sdev_attr == NULL);
	ASSERT(vap->va_mask & AT_TYPE);
	ASSERT(vap->va_mask & AT_MODE);

	dv->sdev_attr = kmem_zalloc(sizeof (struct vattr), KM_SLEEP);
	sdev_attr_update(dv, vap);
}
287 
/*
 * Allocate and initialize a sdev_node named nm below the parent ddv.
 *
 * arguments:
 *	- ddv (parent; supplies the path prefix, vfs, vnodeops and flags)
 *	- nm (child name)
 *	- newdv (the new node is returned here; set to NULL on failure)
 *	- vap (optional attributes; when non-NULL the vnode type is taken from
 *	  it and in-memory attributes are allocated)
 *
 * Returns 0 on success, or ENAMETOOLONG when the name, including its
 * terminating '\0', does not fit in MAXNAMELEN.
 *
 * The node is returned in SDEV_INIT state with SDEV_BUILD set.  It is not
 * linked into the parent here: sdev_dotdot is NULL and sdev_nlink is 0.
 */
int
sdev_nodeinit(struct sdev_node *ddv, char *nm, struct sdev_node **newdv,
    vattr_t *vap)
{
	struct sdev_node *dv = NULL;
	struct vnode *vp;
	size_t nmlen, len;
	devname_handle_t  *dhl;

	nmlen = strlen(nm) + 1;
	if (nmlen > MAXNAMELEN) {
		sdcmn_err9(("sdev_nodeinit: node name %s"
		    " too long\n", nm));
		*newdv = NULL;
		return (ENAMETOOLONG);
	}

	dv = kmem_cache_alloc(sdev_node_cache, KM_SLEEP);

	dv->sdev_name = kmem_alloc(nmlen, KM_SLEEP);
	bcopy(nm, dv->sdev_name, nmlen);
	dv->sdev_namelen = nmlen - 1;	/* '\0' not included */
	/* "<parent path>/<nm>": one byte for '/' and one for '\0' */
	len = strlen(ddv->sdev_path) + strlen(nm) + 2;
	dv->sdev_path = kmem_alloc(len, KM_SLEEP);
	(void) snprintf(dv->sdev_path, len, "%s/%s", ddv->sdev_path, nm);
	/* overwritten for VLNK nodes */
	dv->sdev_symlink = NULL;

	vp = SDEVTOV(dv);
	vn_reinit(vp);
	vp->v_vfsp = SDEVTOV(ddv)->v_vfsp;
	if (vap)
		vp->v_type = vap->va_type;

	/*
	 * initialized to the parent's vnodeops.
	 * may be overwritten for a VDIR
	 */
	vn_setops(vp, vn_getops(SDEVTOV(ddv)));
	vn_exists(vp);

	dv->sdev_dotdot = NULL;
	dv->sdev_attrvp = NULL;
	if (vap) {
		sdev_attr_alloc(dv, vap);
	} else {
		dv->sdev_attr = NULL;
	}

	dv->sdev_ino = sdev_mkino(dv);
	dv->sdev_nlink = 0;		/* updated on insert */
	dv->sdev_flags = ddv->sdev_flags; /* inherit from the parent first */
	dv->sdev_flags |= SDEV_BUILD;
	mutex_init(&dv->sdev_lookup_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dv->sdev_lookup_cv, NULL, CV_DEFAULT, NULL);
	if (SDEV_IS_GLOBAL(ddv)) {
		/* global node: set up the devname handle and negcache flag */
		dv->sdev_flags |= SDEV_GLOBAL;
		dhl = &(dv->sdev_handle);
		dhl->dh_data = dv;
		dhl->dh_args = NULL;
		sdev_set_no_negcache(dv);
		dv->sdev_gdir_gen = 0;
	} else {
		/* non-global node: start with an empty profile */
		dv->sdev_flags &= ~SDEV_GLOBAL;
		dv->sdev_origin = NULL; /* set later */
		bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
		dv->sdev_ldir_gen = 0;
		dv->sdev_devtree_gen = 0;
	}

	/* sdev_set_nodestate() requires the contents lock held as writer */
	rw_enter(&dv->sdev_contents, RW_WRITER);
	sdev_set_nodestate(dv, SDEV_INIT);
	rw_exit(&dv->sdev_contents);
	*newdv = dv;

	return (0);
}
366 
/*
 * Transition a sdev_node into SDEV_READY state. If this fails, it is up to the
 * caller to transition the node to the SDEV_ZOMBIE state.
 *
 * arguments:
 *	- dv (node to make ready; must not already be SDEV_READY)
 *	- vap (attributes; va_type and va_rdev are applied to the vnode)
 *	- avp (attribute vnode, or NULL to keep attributes in memory)
 *	- args (symlink target string for VLNK; on non-global nodes it is
 *	  also stored as sdev_origin)
 *	- cred (used when a shadow node has to be created)
 *
 * Returns 0 on success, otherwise the error from sdev_shadow_node(); in the
 * error case the node state is left unchanged.
 */
int
sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp,
    void *args, struct cred *cred)
{
	int error = 0;
	struct vnode *vp = SDEVTOV(dv);
	vtype_t type;

	ASSERT(dv && (dv->sdev_state != SDEV_READY) && vap);

	type = vap->va_type;
	vp->v_type = type;
	vp->v_rdev = vap->va_rdev;
	rw_enter(&dv->sdev_contents, RW_WRITER);
	if (type == VDIR) {
		/* a directory starts with two links */
		dv->sdev_nlink = 2;
		/*
		 * clear the inherited flags first; sdev_get_vop() sets them
		 * again as appropriate for this directory
		 */
		dv->sdev_flags &= ~SDEV_PERSIST;
		dv->sdev_flags &= ~SDEV_DYNAMIC;
		vn_setops(vp, sdev_get_vop(dv)); /* from internal vtab */
		ASSERT(dv->sdev_dotdot);
		ASSERT(SDEVTOV(dv->sdev_dotdot)->v_type == VDIR);
		/* a directory takes its v_rdev from the parent directory */
		vp->v_rdev = SDEVTOV(dv->sdev_dotdot)->v_rdev;
		avl_create(&dv->sdev_entries,
		    (int (*)(const void *, const void *))sdev_compare_nodes,
		    sizeof (struct sdev_node),
		    offsetof(struct sdev_node, sdev_avllink));
	} else if (type == VLNK) {
		ASSERT(args);
		dv->sdev_nlink = 1;
		dv->sdev_symlink = i_ddi_strdup((char *)args, KM_SLEEP);
	} else {
		dv->sdev_nlink = 1;
	}

	if (!(SDEV_IS_GLOBAL(dv))) {
		dv->sdev_origin = (struct sdev_node *)args;
		dv->sdev_flags &= ~SDEV_PERSIST;
	}

	/*
	 * shadow node is created here OR
	 * if failed (indicated by dv->sdev_attrvp == NULL),
	 * created later in sdev_setattr
	 */
	if (avp) {
		dv->sdev_attrvp = avp;
	} else {
		if (dv->sdev_attr == NULL) {
			sdev_attr_alloc(dv, vap);
		} else {
			sdev_attr_update(dv, vap);
		}

		if ((dv->sdev_attrvp == NULL) && SDEV_IS_PERSIST(dv))
			error = sdev_shadow_node(dv, cred);
	}

	if (error == 0) {
		/* transition to READY state */
		sdev_set_nodestate(dv, SDEV_READY);
		sdev_nc_node_exists(dv);
	}
	rw_exit(&dv->sdev_contents);
	return (error);
}
436 
/*
 * Build the VROOT sdev_node.
 *
 * arguments:
 *	- vfsp (the vfs being mounted; its mount point, when known, becomes
 *	  the node name, otherwise "/dev" is used)
 *	- devdev (stored as the root vnode's v_rdev)
 *	- mvp (not used here)
 *	- avp (attribute vnode for the root; must be non-NULL)
 *	- cred (not used here)
 *
 * The root is its own parent, has a link count of 2, and is returned in
 * SDEV_READY state.  A root named "/dev" is flagged global and persistent;
 * any other root is non-global and non-persistent with an empty profile.
 */
/*ARGSUSED*/
struct sdev_node *
sdev_mkroot(struct vfs *vfsp, dev_t devdev, struct vnode *mvp,
    struct vnode *avp, struct cred *cred)
{
	struct sdev_node *dv;
	struct vnode *vp;
	char devdir[] = "/dev";

	ASSERT(sdev_node_cache != NULL);
	ASSERT(avp);
	dv = kmem_cache_alloc(sdev_node_cache, KM_SLEEP);
	vp = SDEVTOV(dv);
	vn_reinit(vp);
	vp->v_flag |= VROOT;
	vp->v_vfsp = vfsp;
	vp->v_type = VDIR;
	vp->v_rdev = devdev;
	vn_setops(vp, sdev_vnodeops); /* apply the default vnodeops at /dev */
	vn_exists(vp);

	if (vfsp->vfs_mntpt)
		dv->sdev_name = i_ddi_strdup(
		    (char *)refstr_value(vfsp->vfs_mntpt), KM_SLEEP);
	else
		/* vfs_mountdev1 set mount point later */
		dv->sdev_name = i_ddi_strdup("/dev", KM_SLEEP);
	dv->sdev_namelen = strlen(dv->sdev_name); /* '\0' not included */
	/* the path is always "/dev", whatever the mount point name is */
	dv->sdev_path = i_ddi_strdup(devdir, KM_SLEEP);
	dv->sdev_ino = SDEV_ROOTINO;
	dv->sdev_nlink = 2;		/* name + . (no sdev_insert) */
	dv->sdev_dotdot = dv;		/* .. == self */
	dv->sdev_attrvp = avp;
	dv->sdev_attr = NULL;
	mutex_init(&dv->sdev_lookup_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dv->sdev_lookup_cv, NULL, CV_DEFAULT, NULL);
	if (strcmp(dv->sdev_name, "/dev") == 0) {
		dv->sdev_flags = SDEV_BUILD|SDEV_GLOBAL|SDEV_PERSIST;
		bzero(&dv->sdev_handle, sizeof (dv->sdev_handle));
		dv->sdev_gdir_gen = 0;
	} else {
		dv->sdev_flags = SDEV_BUILD;
		dv->sdev_flags &= ~SDEV_PERSIST;
		bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
		dv->sdev_ldir_gen = 0;
		dv->sdev_devtree_gen = 0;
	}

	avl_create(&dv->sdev_entries,
	    (int (*)(const void *, const void *))sdev_compare_nodes,
	    sizeof (struct sdev_node),
	    offsetof(struct sdev_node, sdev_avllink));

	/* sdev_set_nodestate() requires the contents lock held as writer */
	rw_enter(&dv->sdev_contents, RW_WRITER);
	sdev_set_nodestate(dv, SDEV_READY);
	rw_exit(&dv->sdev_contents);
	sdev_nc_node_exists(dv);
	return (dv);
}
499 
/*
 * directory dependent vop table
 *
 * One entry describes a directory below /dev that needs its own vnode
 * operations, validator and/or flags.  Entries are looked up by
 * sdev_match().
 */
struct sdev_vop_table {
	char *vt_name;				/* subdirectory name */
	const fs_operation_def_t *vt_service;	/* vnodeops table */
	struct vnodeops *vt_vops;		/* constructed vop */
	struct vnodeops **vt_global_vops;	/* global container for vop */
	int (*vt_vtor)(struct sdev_node *);	/* validate sdev_node */
	int vt_flags;		/* SDEV_* flags OR-ed into matching nodes */
};
509 
/*
 * Table of /dev subdirectories with directory-specific behavior.
 *
 * vt_name is compared against the node path relative to "/dev/".  vt_vops
 * starts out NULL and is filled in by sdev_get_vop() the first time the
 * ops are built from vt_service.  The table is terminated by an entry with
 * a NULL vt_name.
 *
 * A nice improvement would be to provide a plug-in mechanism
 * for this table instead of a const table.
 */
static struct sdev_vop_table vtab[] =
{
	{ "pts", devpts_vnodeops_tbl, NULL, &devpts_vnodeops, devpts_validate,
	SDEV_DYNAMIC | SDEV_VTOR },

	{ "vt", devvt_vnodeops_tbl, NULL, &devvt_vnodeops, devvt_validate,
	SDEV_DYNAMIC | SDEV_VTOR },

	/* SDEV_SUBDIR: also matches paths below /dev/zvol/ (see sdev_match) */
	{ "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops,
	devzvol_validate, SDEV_ZONED | SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },

	{ "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE },

	{ "net", devnet_vnodeops_tbl, NULL, &devnet_vnodeops, devnet_validate,
	SDEV_DYNAMIC | SDEV_VTOR },

	{ "ipnet", devipnet_vnodeops_tbl, NULL, &devipnet_vnodeops,
	devipnet_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_NO_NCACHE },

	/*
	 * SDEV_DYNAMIC: prevent calling out to devfsadm, since only the
	 * lofi driver controls child nodes.
	 *
	 * SDEV_PERSIST: ensure devfsadm knows to clean up any persisted
	 * stale nodes (e.g. from devfsadm -R).
	 *
	 * In addition, devfsadm knows not to attempt a rmdir: a zone
	 * may hold a reference, which would zombify the node,
	 * preventing a mkdir.
	 */

	{ "lofi", NULL, NULL, NULL, NULL,
	    SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },
	{ "rlofi", NULL, NULL, NULL, NULL,
	    SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },

	/* terminator */
	{ NULL, NULL, NULL, NULL, NULL, 0}
};
552 
553 /*
554  * We need to match off of the sdev_path, not the sdev_name. We are only allowed
555  * to exist directly under /dev.
556  */
557 struct sdev_vop_table *
558 sdev_match(struct sdev_node *dv)
559 {
560 	int vlen;
561 	int i;
562 	const char *path;
563 
564 	if (strlen(dv->sdev_path) <= 5)
565 		return (NULL);
566 
567 	if (strncmp(dv->sdev_path, "/dev/", 5) != 0)
568 		return (NULL);
569 	path = dv->sdev_path + 5;
570 
571 	for (i = 0; vtab[i].vt_name; i++) {
572 		if (strcmp(vtab[i].vt_name, path) == 0)
573 			return (&vtab[i]);
574 		if (vtab[i].vt_flags & SDEV_SUBDIR) {
575 			vlen = strlen(vtab[i].vt_name);
576 			if ((strncmp(vtab[i].vt_name, path,
577 			    vlen - 1) == 0) && path[vlen] == '/')
578 				return (&vtab[i]);
579 		}
580 
581 	}
582 	return (NULL);
583 }
584 
585 /*
586  *  sets a directory's vnodeops if the directory is in the vtab;
587  */
588 static struct vnodeops *
589 sdev_get_vop(struct sdev_node *dv)
590 {
591 	struct sdev_vop_table *vtp;
592 	char *path;
593 
594 	path = dv->sdev_path;
595 	ASSERT(path);
596 
597 	/* gets the relative path to /dev/ */
598 	path += 5;
599 
600 	/* gets the vtab entry it matches */
601 	if ((vtp = sdev_match(dv)) != NULL) {
602 		dv->sdev_flags |= vtp->vt_flags;
603 		if (SDEV_IS_PERSIST(dv->sdev_dotdot) &&
604 		    (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv)))
605 			dv->sdev_flags |= SDEV_PERSIST;
606 
607 		if (vtp->vt_vops) {
608 			if (vtp->vt_global_vops)
609 				*(vtp->vt_global_vops) = vtp->vt_vops;
610 
611 			return (vtp->vt_vops);
612 		}
613 
614 		if (vtp->vt_service) {
615 			fs_operation_def_t *templ;
616 			templ = sdev_merge_vtab(vtp->vt_service);
617 			if (vn_make_ops(vtp->vt_name,
618 			    (const fs_operation_def_t *)templ,
619 			    &vtp->vt_vops) != 0) {
620 				cmn_err(CE_PANIC, "%s: malformed vnode ops\n",
621 				    vtp->vt_name);
622 				/*NOTREACHED*/
623 			}
624 			if (vtp->vt_global_vops) {
625 				*(vtp->vt_global_vops) = vtp->vt_vops;
626 			}
627 			sdev_free_vtab(templ);
628 
629 			return (vtp->vt_vops);
630 		}
631 
632 		return (sdev_vnodeops);
633 	}
634 
635 	/* child inherits the persistence of the parent */
636 	if (SDEV_IS_PERSIST(dv->sdev_dotdot))
637 		dv->sdev_flags |= SDEV_PERSIST;
638 
639 	return (sdev_vnodeops);
640 }
641 
642 static void
643 sdev_set_no_negcache(struct sdev_node *dv)
644 {
645 	int i;
646 	char *path;
647 
648 	ASSERT(dv->sdev_path);
649 	path = dv->sdev_path + strlen("/dev/");
650 
651 	for (i = 0; vtab[i].vt_name; i++) {
652 		if (strcmp(vtab[i].vt_name, path) == 0) {
653 			if (vtab[i].vt_flags & SDEV_NO_NCACHE)
654 				dv->sdev_flags |= SDEV_NO_NCACHE;
655 			break;
656 		}
657 	}
658 }
659 
660 void *
661 sdev_get_vtor(struct sdev_node *dv)
662 {
663 	struct sdev_vop_table *vtp;
664 
665 	vtp = sdev_match(dv);
666 	if (vtp)
667 		return ((void *)vtp->vt_vtor);
668 	else
669 		return (NULL);
670 }
671 
672 /*
673  * Build the base root inode
674  */
675 ino_t
676 sdev_mkino(struct sdev_node *dv)
677 {
678 	ino_t	ino;
679 
680 	/*
681 	 * for now, follow the lead of tmpfs here
682 	 * need to someday understand the requirements here
683 	 */
684 	ino = (ino_t)(uint32_t)((uintptr_t)dv >> 3);
685 	ino += SDEV_ROOTINO + 1;
686 
687 	return (ino);
688 }
689 
690 int
691 sdev_getlink(struct vnode *linkvp, char **link)
692 {
693 	int err;
694 	char *buf;
695 	struct uio uio = {0};
696 	struct iovec iov = {0};
697 
698 	if (linkvp == NULL)
699 		return (ENOENT);
700 	ASSERT(linkvp->v_type == VLNK);
701 
702 	buf = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
703 	iov.iov_base = buf;
704 	iov.iov_len = MAXPATHLEN;
705 	uio.uio_iov = &iov;
706 	uio.uio_iovcnt = 1;
707 	uio.uio_resid = MAXPATHLEN;
708 	uio.uio_segflg = UIO_SYSSPACE;
709 	uio.uio_llimit = MAXOFFSET_T;
710 
711 	err = VOP_READLINK(linkvp, &uio, kcred, NULL);
712 	if (err) {
713 		cmn_err(CE_WARN, "readlink %s failed in dev\n", buf);
714 		kmem_free(buf, MAXPATHLEN);
715 		return (ENOENT);
716 	}
717 
718 	/* mission complete */
719 	*link = i_ddi_strdup(buf, KM_SLEEP);
720 	kmem_free(buf, MAXPATHLEN);
721 	return (0);
722 }
723 
724 /*
725  * A convenient wrapper to get the devfs node vnode for a device
726  * minor functionality: readlink() of a /dev symlink
727  * Place the link into dv->sdev_symlink
728  */
729 static int
730 sdev_follow_link(struct sdev_node *dv)
731 {
732 	int err;
733 	struct vnode *linkvp;
734 	char *link = NULL;
735 
736 	linkvp = SDEVTOV(dv);
737 	if (linkvp == NULL)
738 		return (ENOENT);
739 	ASSERT(linkvp->v_type == VLNK);
740 	err = sdev_getlink(linkvp, &link);
741 	if (err) {
742 		dv->sdev_symlink = NULL;
743 		return (ENOENT);
744 	}
745 
746 	ASSERT(link != NULL);
747 	dv->sdev_symlink = link;
748 	return (0);
749 }
750 
751 static int
752 sdev_node_check(struct sdev_node *dv, struct vattr *nvap, void *nargs)
753 {
754 	vtype_t otype = SDEVTOV(dv)->v_type;
755 
756 	/*
757 	 * existing sdev_node has a different type.
758 	 */
759 	if (otype != nvap->va_type) {
760 		sdcmn_err9(("sdev_node_check: existing node "
761 		    "  %s type %d does not match new node type %d\n",
762 		    dv->sdev_name, otype, nvap->va_type));
763 		return (EEXIST);
764 	}
765 
766 	/*
767 	 * For a symlink, the target should be the same.
768 	 */
769 	if (otype == VLNK) {
770 		ASSERT(nargs != NULL);
771 		ASSERT(dv->sdev_symlink != NULL);
772 		if (strcmp(dv->sdev_symlink, (char *)nargs) != 0) {
773 			sdcmn_err9(("sdev_node_check: existing node "
774 			    " %s has different symlink %s as new node "
775 			    " %s\n", dv->sdev_name, dv->sdev_symlink,
776 			    (char *)nargs));
777 			return (EEXIST);
778 		}
779 	}
780 
781 	return (0);
782 }
783 
/*
 * sdev_mknode - a wrapper for sdev_nodeinit(), sdev_nodeready()
 *
 * arguments:
 *	- ddv (parent)
 *	- nm (child name)
 *	- newdv (in: an existing node for nm, or NULL to have one allocated;
 *	  out: the resulting node, or NULL on error)
 *	- vap (vattr for the node to be created, va_type should be set;
 *	  the defaults should be used if unknown)
 *	- avp (attribute vnode)
 *	- args
 *	    . tnm (for VLNK)
 *	    . global sdev_node (for !SDEV_GLOBAL)
 *	- cred
 *	- state: SDEV_INIT, SDEV_READY
 *
 * only ddv, nm, newdv, vap, cred are required for sdev_mknode(SDEV_INIT)
 *
 * Returns 0 on success.  Otherwise returns ENOENT when a node has to be
 * allocated under a zombie parent, the error from sdev_nodeinit(), the
 * error from sdev_nodeready(), or EEXIST from sdev_node_check() when an
 * already-ready node does not match the request.
 *
 * NOTE:  directory contents writers lock needs to be held before
 *	  calling this routine.
 */
int
sdev_mknode(struct sdev_node *ddv, char *nm, struct sdev_node **newdv,
    struct vattr *vap, struct vnode *avp, void *args, struct cred *cred,
    sdev_node_state_t state)
{
	int error = 0;
	sdev_node_state_t node_state;
	struct sdev_node *dv = NULL;

	ASSERT(state != SDEV_ZOMBIE);
	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));

	if (*newdv) {
		/* the caller supplied an existing node */
		dv = *newdv;
	} else {
		/* allocate and initialize a sdev_node */
		if (ddv->sdev_state == SDEV_ZOMBIE) {
			sdcmn_err9(("sdev_mknode: parent %s ZOMBIEd\n",
			    ddv->sdev_path));
			return (ENOENT);
		}

		error = sdev_nodeinit(ddv, nm, &dv, vap);
		if (error != 0) {
			sdcmn_err9(("sdev_mknode: error %d,"
			    " name %s can not be initialized\n",
			    error, nm));
			return (error);
		}
		ASSERT(dv);

		/* insert into the directory cache */
		sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_ADD);
	}

	ASSERT(dv);
	node_state = dv->sdev_state;
	ASSERT(node_state != SDEV_ZOMBIE);

	if (state == SDEV_READY) {
		switch (node_state) {
		case SDEV_INIT:
			error = sdev_nodeready(dv, vap, avp, args, cred);
			if (error) {
				sdcmn_err9(("sdev_mknode: node %s can NOT"
				    " be transitioned into READY state, "
				    "error %d\n", nm, error));
			}
			break;
		case SDEV_READY:
			/*
			 * Do some sanity checking to make sure
			 * the existing sdev_node is what has been
			 * asked for.
			 */
			error = sdev_node_check(dv, vap, args);
			break;
		default:
			break;
		}
	}

	if (!error) {
		*newdv = dv;
		ASSERT((*newdv)->sdev_state != SDEV_ZOMBIE);
	} else {
		/* on any failure the node is removed from the directory */
		sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_DELETE);
		/*
		 * We created this node, it wasn't passed into us. Therefore it
		 * is up to us to delete it.
		 */
		if (*newdv == NULL)
			SDEV_SIMPLE_RELE(dv);
		*newdv = NULL;
	}

	return (error);
}
883 
884 /*
885  * convenient wrapper to change vp's ATIME, CTIME and MTIME
886  */
887 void
888 sdev_update_timestamps(struct vnode *vp, cred_t *cred, uint_t mask)
889 {
890 	struct vattr attr;
891 	timestruc_t now;
892 	int err;
893 
894 	ASSERT(vp);
895 	gethrestime(&now);
896 	if (mask & AT_CTIME)
897 		attr.va_ctime = now;
898 	if (mask & AT_MTIME)
899 		attr.va_mtime = now;
900 	if (mask & AT_ATIME)
901 		attr.va_atime = now;
902 
903 	attr.va_mask = (mask & AT_TIMES);
904 	err = VOP_SETATTR(vp, &attr, 0, cred, NULL);
905 	if (err && (err != EROFS)) {
906 		sdcmn_err(("update timestamps error %d\n", err));
907 	}
908 }
909 
/*
 * Tear down a sdev_node and return it to the node cache.
 *
 * the backing store vnode is released here
 *
 * The node must have no links left (asserted).  Every resource attached to
 * the node is released: the attribute vnode hold, the in-memory attributes,
 * the name, symlink and path strings, the profile of a non-global node, the
 * entry tree of a directory, and the lookup mutex/cv.  flags is not used.
 */
/*ARGSUSED1*/
void
sdev_nodedestroy(struct sdev_node *dv, uint_t flags)
{
	/* no references */
	ASSERT(dv->sdev_nlink == 0);

	if (dv->sdev_attrvp != NULLVP) {
		VN_RELE(dv->sdev_attrvp);
		/*
		 * reset the attrvp so that no more
		 * references can be made on this already
		 * vn_rele() vnode
		 */
		dv->sdev_attrvp = NULLVP;
	}

	if (dv->sdev_attr != NULL) {
		kmem_free(dv->sdev_attr, sizeof (struct vattr));
		dv->sdev_attr = NULL;
	}

	if (dv->sdev_name != NULL) {
		/* sdev_namelen excludes the terminating '\0' */
		kmem_free(dv->sdev_name, dv->sdev_namelen + 1);
		dv->sdev_name = NULL;
	}

	if (dv->sdev_symlink != NULL) {
		kmem_free(dv->sdev_symlink, strlen(dv->sdev_symlink) + 1);
		dv->sdev_symlink = NULL;
	}

	if (dv->sdev_path) {
		kmem_free(dv->sdev_path, strlen(dv->sdev_path) + 1);
		dv->sdev_path = NULL;
	}

	if (!SDEV_IS_GLOBAL(dv))
		sdev_prof_free(dv);

	if (SDEVTOV(dv)->v_type == VDIR) {
		/* a directory must be empty by the time it is destroyed */
		ASSERT(SDEV_FIRST_ENTRY(dv) == NULL);
		avl_destroy(&dv->sdev_entries);
	}

	mutex_destroy(&dv->sdev_lookup_lock);
	cv_destroy(&dv->sdev_lookup_cv);

	/* return node to initial state as per constructor */
	(void) memset((void *)&dv->sdev_instance_data, 0,
	    sizeof (dv->sdev_instance_data));
	vn_invalid(SDEVTOV(dv));
	kmem_cache_free(sdev_node_cache, dv);
}
967 
968 /*
969  * DIRECTORY CACHE lookup
970  */
971 struct sdev_node *
972 sdev_findbyname(struct sdev_node *ddv, char *nm)
973 {
974 	struct sdev_node *dv;
975 	struct sdev_node dvtmp;
976 	avl_index_t	where;
977 
978 	ASSERT(RW_LOCK_HELD(&ddv->sdev_contents));
979 
980 	dvtmp.sdev_name = nm;
981 	dv = avl_find(&ddv->sdev_entries, &dvtmp, &where);
982 	if (dv) {
983 		ASSERT(dv->sdev_dotdot == ddv);
984 		ASSERT(strcmp(dv->sdev_name, nm) == 0);
985 		ASSERT(dv->sdev_state != SDEV_ZOMBIE);
986 		SDEV_HOLD(dv);
987 		return (dv);
988 	}
989 	return (NULL);
990 }
991 
/*
 * Inserts a new sdev_node in a parent directory
 *
 * The caller must hold ddv->sdev_contents as writer.  ddv must be a
 * directory with a link count of at least 2, and dv must be an unlinked,
 * non-zombie node (all asserted).  dv's ".." is pointed at ddv and ddv's
 * link count is incremented; dv's own link count is not changed here.
 * Panics (VERIFY) if an entry comparing equal to dv is already present.
 */
void
sdev_direnter(struct sdev_node *ddv, struct sdev_node *dv)
{
	avl_index_t where;

	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
	ASSERT(ddv->sdev_nlink >= 2);
	ASSERT(dv->sdev_nlink == 0);
	ASSERT(dv->sdev_state != SDEV_ZOMBIE);

	dv->sdev_dotdot = ddv;
	/* avl_find() also yields the insertion point used below */
	VERIFY(avl_find(&ddv->sdev_entries, dv, &where) == NULL);
	avl_insert(&ddv->sdev_entries, dv, where);
	ddv->sdev_nlink++;
}
1011 
1012 /*
1013  * The following check is needed because while sdev_nodes are linked
1014  * in SDEV_INIT state, they have their link counts incremented only
1015  * in SDEV_READY state.
1016  */
1017 static void
1018 decr_link(struct sdev_node *dv)
1019 {
1020 	VERIFY(RW_WRITE_HELD(&dv->sdev_contents));
1021 	if (dv->sdev_state != SDEV_INIT) {
1022 		VERIFY(dv->sdev_nlink >= 1);
1023 		dv->sdev_nlink--;
1024 	} else {
1025 		VERIFY(dv->sdev_nlink == 0);
1026 	}
1027 }
1028 
/*
 * Delete an existing dv from directory cache
 *
 * In the case of a node is still held by non-zero reference count, the node is
 * put into ZOMBIE state. The node is always unlinked from its parent, but it is
 * not destroyed via sdev_inactive until its reference count reaches "0".
 *
 * The caller must hold ddv->sdev_contents as writer (asserted).  dv must
 * not already be a zombie.  While dv is updated, its vnode's v_lock and its
 * own contents lock (as writer) are held, taken in that order.
 */
static void
sdev_dirdelete(struct sdev_node *ddv, struct sdev_node *dv)
{
	struct vnode *vp;
	sdev_node_state_t os;

	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));

	vp = SDEVTOV(dv);
	mutex_enter(&vp->v_lock);
	rw_enter(&dv->sdev_contents, RW_WRITER);
	/* remember the previous state for the link fix-up below */
	os = dv->sdev_state;
	ASSERT(os != SDEV_ZOMBIE);
	dv->sdev_state = SDEV_ZOMBIE;

	/*
	 * unlink ourselves from the parent directory now to take care of the ..
	 * link. However, if we're a directory, we don't remove our reference to
	 * ourself eg. '.' until we are torn down in the inactive callback.
	 */
	decr_link(ddv);
	avl_remove(&ddv->sdev_entries, dv);
	/*
	 * sdev_inactive expects nodes to have a link to themselves when we're
	 * tearing them down. If we're transitioning from the initial state to
	 * zombie and not via ready, then we're not going to have this link that
	 * comes from the node being ready. As a result, we need to increment
	 * our link count by one to account for this.
	 */
	if (os == SDEV_INIT && dv->sdev_nlink == 0)
		dv->sdev_nlink++;
	rw_exit(&dv->sdev_contents);
	mutex_exit(&vp->v_lock);
}
1070 
1071 /*
1072  * check if the source is in the path of the target
1073  *
1074  * source and target are different
1075  */
1076 /*ARGSUSED2*/
1077 static int
1078 sdev_checkpath(struct sdev_node *sdv, struct sdev_node *tdv, struct cred *cred)
1079 {
1080 	int error = 0;
1081 	struct sdev_node *dotdot, *dir;
1082 
1083 	dotdot = tdv->sdev_dotdot;
1084 	ASSERT(dotdot);
1085 
1086 	/* fs root */
1087 	if (dotdot == tdv) {
1088 		return (0);
1089 	}
1090 
1091 	for (;;) {
1092 		/*
1093 		 * avoid error cases like
1094 		 *	mv a a/b
1095 		 *	mv a a/b/c
1096 		 *	etc.
1097 		 */
1098 		if (dotdot == sdv) {
1099 			error = EINVAL;
1100 			break;
1101 		}
1102 
1103 		dir = dotdot;
1104 		dotdot = dir->sdev_dotdot;
1105 
1106 		/* done checking because root is reached */
1107 		if (dir == dotdot) {
1108 			break;
1109 		}
1110 	}
1111 	return (error);
1112 }
1113 
1114 int
1115 sdev_rnmnode(struct sdev_node *oddv, struct sdev_node *odv,
1116     struct sdev_node *nddv, struct sdev_node **ndvp, char *nnm,
1117     struct cred *cred)
1118 {
1119 	int error = 0;
1120 	struct vnode *ovp = SDEVTOV(odv);
1121 	struct vnode *nvp;
1122 	struct vattr vattr;
1123 	int doingdir = (ovp->v_type == VDIR);
1124 	char *link = NULL;
1125 	int samedir = (oddv == nddv) ? 1 : 0;
1126 	int bkstore = 0;
1127 	struct sdev_node *idv = NULL;
1128 	struct sdev_node *ndv = NULL;
1129 	timestruc_t now;
1130 
1131 	vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
1132 	error = VOP_GETATTR(ovp, &vattr, 0, cred, NULL);
1133 	if (error)
1134 		return (error);
1135 
1136 	if (!samedir)
1137 		rw_enter(&oddv->sdev_contents, RW_WRITER);
1138 	rw_enter(&nddv->sdev_contents, RW_WRITER);
1139 
1140 	/*
1141 	 * the source may have been deleted by another thread before
1142 	 * we gets here.
1143 	 */
1144 	if (odv->sdev_state != SDEV_READY) {
1145 		error = ENOENT;
1146 		goto err_out;
1147 	}
1148 
1149 	if (doingdir && (odv == nddv)) {
1150 		error = EINVAL;
1151 		goto err_out;
1152 	}
1153 
1154 	/*
1155 	 * If renaming a directory, and the parents are different (".." must be
1156 	 * changed) then the source dir must not be in the dir hierarchy above
1157 	 * the target since it would orphan everything below the source dir.
1158 	 */
1159 	if (doingdir && (oddv != nddv)) {
1160 		error = sdev_checkpath(odv, nddv, cred);
1161 		if (error)
1162 			goto err_out;
1163 	}
1164 
1165 	/* fix the source for a symlink */
1166 	if (vattr.va_type == VLNK) {
1167 		if (odv->sdev_symlink == NULL) {
1168 			error = sdev_follow_link(odv);
1169 			if (error) {
1170 				/*
1171 				 * The underlying symlink doesn't exist. This
1172 				 * node probably shouldn't even exist. While
1173 				 * it's a bit jarring to consumers, we're going
1174 				 * to remove the node from /dev.
1175 				 */
1176 				if (SDEV_IS_PERSIST((*ndvp)))
1177 					bkstore = 1;
1178 				sdev_dirdelete(oddv, odv);
1179 				if (bkstore) {
1180 					ASSERT(nddv->sdev_attrvp);
1181 					error = VOP_REMOVE(nddv->sdev_attrvp,
1182 					    nnm, cred, NULL, 0);
1183 					if (error)
1184 						goto err_out;
1185 				}
1186 				error = ENOENT;
1187 				goto err_out;
1188 			}
1189 		}
1190 		ASSERT(odv->sdev_symlink);
1191 		link = i_ddi_strdup(odv->sdev_symlink, KM_SLEEP);
1192 	}
1193 
1194 	/* destination existing */
1195 	if (*ndvp) {
1196 		nvp = SDEVTOV(*ndvp);
1197 		ASSERT(nvp);
1198 
1199 		/* handling renaming to itself */
1200 		if (odv == *ndvp) {
1201 			error = 0;
1202 			goto err_out;
1203 		}
1204 
1205 		if (nvp->v_type == VDIR) {
1206 			if (!doingdir) {
1207 				error = EISDIR;
1208 				goto err_out;
1209 			}
1210 
1211 			if (vn_vfswlock(nvp)) {
1212 				error = EBUSY;
1213 				goto err_out;
1214 			}
1215 
1216 			if (vn_mountedvfs(nvp) != NULL) {
1217 				vn_vfsunlock(nvp);
1218 				error = EBUSY;
1219 				goto err_out;
1220 			}
1221 
1222 			/* in case dir1 exists in dir2 and "mv dir1 dir2" */
1223 			if ((*ndvp)->sdev_nlink > 2) {
1224 				vn_vfsunlock(nvp);
1225 				error = EEXIST;
1226 				goto err_out;
1227 			}
1228 			vn_vfsunlock(nvp);
1229 
1230 			/*
1231 			 * We did not place the hold on *ndvp, so even though
1232 			 * we're deleting the node, we should not get rid of our
1233 			 * reference.
1234 			 */
1235 			sdev_dirdelete(nddv, *ndvp);
1236 			*ndvp = NULL;
1237 			ASSERT(nddv->sdev_attrvp);
1238 			error = VOP_RMDIR(nddv->sdev_attrvp, nnm,
1239 			    nddv->sdev_attrvp, cred, NULL, 0);
1240 			if (error)
1241 				goto err_out;
1242 		} else {
1243 			if (doingdir) {
1244 				error = ENOTDIR;
1245 				goto err_out;
1246 			}
1247 
1248 			if (SDEV_IS_PERSIST((*ndvp))) {
1249 				bkstore = 1;
1250 			}
1251 
1252 			/*
1253 			 * Get rid of the node from the directory cache note.
1254 			 * Don't forget that it's not up to us to remove the vn
1255 			 * ref on the sdev node, as we did not place it.
1256 			 */
1257 			sdev_dirdelete(nddv, *ndvp);
1258 			*ndvp = NULL;
1259 			if (bkstore) {
1260 				ASSERT(nddv->sdev_attrvp);
1261 				error = VOP_REMOVE(nddv->sdev_attrvp,
1262 				    nnm, cred, NULL, 0);
1263 				if (error)
1264 					goto err_out;
1265 			}
1266 		}
1267 	}
1268 
1269 	/*
1270 	 * make a fresh node from the source attrs
1271 	 */
1272 	ASSERT(RW_WRITE_HELD(&nddv->sdev_contents));
1273 	error = sdev_mknode(nddv, nnm, ndvp, &vattr,
1274 	    NULL, (void *)link, cred, SDEV_READY);
1275 
1276 	if (link != NULL) {
1277 		kmem_free(link, strlen(link) + 1);
1278 		link = NULL;
1279 	}
1280 
1281 	if (error)
1282 		goto err_out;
1283 	ASSERT(*ndvp);
1284 	ASSERT((*ndvp)->sdev_state == SDEV_READY);
1285 
1286 	/* move dir contents */
1287 	if (doingdir) {
1288 		for (idv = SDEV_FIRST_ENTRY(odv); idv;
1289 		    idv = SDEV_NEXT_ENTRY(odv, idv)) {
1290 			SDEV_HOLD(idv);
1291 			error = sdev_rnmnode(odv, idv,
1292 			    (struct sdev_node *)(*ndvp), &ndv,
1293 			    idv->sdev_name, cred);
1294 			SDEV_RELE(idv);
1295 			if (error)
1296 				goto err_out;
1297 			ndv = NULL;
1298 		}
1299 	}
1300 
1301 	if ((*ndvp)->sdev_attrvp) {
1302 		sdev_update_timestamps((*ndvp)->sdev_attrvp, kcred,
1303 		    AT_CTIME|AT_ATIME);
1304 	} else {
1305 		ASSERT((*ndvp)->sdev_attr);
1306 		gethrestime(&now);
1307 		(*ndvp)->sdev_attr->va_ctime = now;
1308 		(*ndvp)->sdev_attr->va_atime = now;
1309 	}
1310 
1311 	if (nddv->sdev_attrvp) {
1312 		sdev_update_timestamps(nddv->sdev_attrvp, kcred,
1313 		    AT_MTIME|AT_ATIME);
1314 	} else {
1315 		ASSERT(nddv->sdev_attr);
1316 		gethrestime(&now);
1317 		nddv->sdev_attr->va_mtime = now;
1318 		nddv->sdev_attr->va_atime = now;
1319 	}
1320 	rw_exit(&nddv->sdev_contents);
1321 	if (!samedir)
1322 		rw_exit(&oddv->sdev_contents);
1323 
1324 	SDEV_RELE(*ndvp);
1325 	return (error);
1326 
1327 err_out:
1328 	if (link != NULL) {
1329 		kmem_free(link, strlen(link) + 1);
1330 		link = NULL;
1331 	}
1332 
1333 	rw_exit(&nddv->sdev_contents);
1334 	if (!samedir)
1335 		rw_exit(&oddv->sdev_contents);
1336 	return (error);
1337 }
1338 
1339 /*
1340  * Merge sdev_node specific information into an attribute structure.
1341  *
1342  * note: sdev_node is not locked here
1343  */
1344 void
1345 sdev_vattr_merge(struct sdev_node *dv, struct vattr *vap)
1346 {
1347 	struct vnode *vp = SDEVTOV(dv);
1348 
1349 	vap->va_nlink = dv->sdev_nlink;
1350 	vap->va_nodeid = dv->sdev_ino;
1351 	vap->va_fsid = SDEVTOV(dv->sdev_dotdot)->v_rdev;
1352 	vap->va_type = vp->v_type;
1353 
1354 	if (vp->v_type == VDIR) {
1355 		vap->va_rdev = 0;
1356 		vap->va_fsid = vp->v_rdev;
1357 	} else if (vp->v_type == VLNK) {
1358 		vap->va_rdev = 0;
1359 		vap->va_mode  &= ~S_IFMT;
1360 		vap->va_mode |= S_IFLNK;
1361 	} else if ((vp->v_type == VCHR) || (vp->v_type == VBLK)) {
1362 		vap->va_rdev = vp->v_rdev;
1363 		vap->va_mode &= ~S_IFMT;
1364 		if (vap->va_type == VCHR)
1365 			vap->va_mode |= S_IFCHR;
1366 		else
1367 			vap->va_mode |= S_IFBLK;
1368 	} else {
1369 		vap->va_rdev = 0;
1370 	}
1371 }
1372 
1373 struct vattr *
1374 sdev_getdefault_attr(enum vtype type)
1375 {
1376 	if (type == VDIR)
1377 		return (&sdev_vattr_dir);
1378 	else if (type == VCHR)
1379 		return (&sdev_vattr_chr);
1380 	else if (type == VBLK)
1381 		return (&sdev_vattr_blk);
1382 	else if (type == VLNK)
1383 		return (&sdev_vattr_lnk);
1384 	else
1385 		return (NULL);
1386 }
1387 int
1388 sdev_to_vp(struct sdev_node *dv, struct vnode **vpp)
1389 {
1390 	int rv = 0;
1391 	struct vnode *vp = SDEVTOV(dv);
1392 
1393 	switch (vp->v_type) {
1394 	case VCHR:
1395 	case VBLK:
1396 		/*
1397 		 * If vnode is a device, return special vnode instead
1398 		 * (though it knows all about -us- via sp->s_realvp)
1399 		 */
1400 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, kcred);
1401 		VN_RELE(vp);
1402 		if (*vpp == NULLVP)
1403 			rv = ENOSYS;
1404 		break;
1405 	default:	/* most types are returned as is */
1406 		*vpp = vp;
1407 		break;
1408 	}
1409 	return (rv);
1410 }
1411 
1412 /*
1413  * junction between devname and root file system, e.g. ufs
1414  */
1415 int
1416 devname_backstore_lookup(struct sdev_node *ddv, char *nm, struct vnode **rvp)
1417 {
1418 	struct vnode *rdvp = ddv->sdev_attrvp;
1419 	int rval = 0;
1420 
1421 	ASSERT(rdvp);
1422 
1423 	rval = VOP_LOOKUP(rdvp, nm, rvp, NULL, 0, NULL, kcred, NULL, NULL,
1424 	    NULL);
1425 	return (rval);
1426 }
1427 
/*
 * Populate the directory cache of ddv from its backing store directory.
 *
 *	ddv	directory node to fill
 *	dlen	size, in bytes, of the buffer used for each VOP_READDIR() call
 *	cred	credentials used for VOP_GETATTR() and node creation
 *
 * Nothing is done (and 0 is returned) when ddv has no backing store vnode
 * or when SDEV_BUILD is not set on it.  Otherwise the backing directory is
 * read in dlen-sized pieces and a READY node is made for every entry that
 * is not already in the cache.
 *
 * The routine upgrades and downgrades ddv->sdev_contents around node
 * creation, so it presumably expects the caller to hold that lock as
 * reader on entry - TODO confirm against callers.
 *
 * Returns whatever value `error' holds when the loops finish; see the note
 * inside the entry loop.
 */
static int
sdev_filldir_from_store(struct sdev_node *ddv, int dlen, struct cred *cred)
{
	struct sdev_node *dv = NULL;
	char	*nm;
	struct vnode *dirvp;
	int	error;
	vnode_t	*vp;
	int eof;
	struct iovec iov;
	struct uio uio;
	struct dirent64 *dp;
	dirent64_t *dbuf;
	size_t dbuflen;
	struct vattr vattr;
	char *link = NULL;

	if (ddv->sdev_attrvp == NULL)
		return (0);
	if (!(ddv->sdev_flags & SDEV_BUILD))
		return (0);

	/* hold the backing directory for the duration of the scan */
	dirvp = ddv->sdev_attrvp;
	VN_HOLD(dirvp);
	dbuf = kmem_zalloc(dlen, KM_SLEEP);

	/* kernel-space uio describing dbuf, starting at directory offset 0 */
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_fmode = 0;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = 0;
	uio.uio_llimit = MAXOFFSET_T;

	eof = 0;
	error = 0;
	while (!error && !eof) {
		/* reset the buffer description; uio_loffset carries over */
		uio.uio_resid = dlen;
		iov.iov_base = (char *)dbuf;
		iov.iov_len = dlen;
		(void) VOP_RWLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
		error = VOP_READDIR(dirvp, &uio, kcred, &eof, NULL, 0);
		VOP_RWUNLOCK(dirvp, V_WRITELOCK_FALSE, NULL);

		/* number of bytes of directory entries actually returned */
		dbuflen = dlen - uio.uio_resid;
		if (error || dbuflen == 0)
			break;

		/* stop if the directory no longer needs building */
		if (!(ddv->sdev_flags & SDEV_BUILD))
			break;

		/* walk the variable-length records using d_reclen */
		for (dp = dbuf; ((intptr_t)dp <
		    (intptr_t)dbuf + dbuflen);
		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
			nm = dp->d_name;

			if (strcmp(nm, ".") == 0 ||
			    strcmp(nm, "..") == 0)
				continue;

			vp = NULLVP;
			dv = sdev_cache_lookup(ddv, nm);
			if (dv) {
				/* already cached: drop the lookup's hold */
				VERIFY(dv->sdev_state != SDEV_ZOMBIE);
				SDEV_SIMPLE_RELE(dv);
				continue;
			}

			/*
			 * NOTE(review): the `continue' statements below leave
			 * `error' set.  It is overwritten when a later entry
			 * is processed, but if this was the last entry of the
			 * buffer the outer loop ends and the value is
			 * returned.  Also, vp obtained from the lookup is not
			 * released here on the VOP_GETATTR()/sdev_getlink()
			 * failure paths - confirm who owns that hold.
			 */
			/* refill the cache if not already */
			error = devname_backstore_lookup(ddv, nm, &vp);
			if (error)
				continue;

			vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
			error = VOP_GETATTR(vp, &vattr, 0, cred, NULL);
			if (error)
				continue;

			if (vattr.va_type == VLNK) {
				error = sdev_getlink(vp, &link);
				if (error) {
					continue;
				}
				ASSERT(link != NULL);
			}

			/*
			 * Node creation needs the writer lock.  If the
			 * in-place upgrade fails the lock is dropped and
			 * re-acquired as writer.
			 */
			if (!rw_tryupgrade(&ddv->sdev_contents)) {
				rw_exit(&ddv->sdev_contents);
				rw_enter(&ddv->sdev_contents, RW_WRITER);
			}
			error = sdev_mknode(ddv, nm, &dv, &vattr, vp, link,
			    cred, SDEV_READY);
			rw_downgrade(&ddv->sdev_contents);

			/* link was sized as strlen + 1 when it was allocated */
			if (link != NULL) {
				kmem_free(link, strlen(link) + 1);
				link = NULL;
			}

			if (!error) {
				ASSERT(dv);
				ASSERT(dv->sdev_state != SDEV_ZOMBIE);
				SDEV_SIMPLE_RELE(dv);
			}
			vp = NULL;
			dv = NULL;
		}
	}

	/* no goto in this function targets this label */
done:
	VN_RELE(dirvp);
	kmem_free(dbuf, dlen);

	return (error);
}
1543 
1544 void
1545 sdev_filldir_dynamic(struct sdev_node *ddv)
1546 {
1547 	int error;
1548 	int i;
1549 	struct vattr vattr;
1550 	struct vattr *vap = &vattr;
1551 	char *nm = NULL;
1552 	struct sdev_node *dv = NULL;
1553 
1554 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1555 	ASSERT((ddv->sdev_flags & SDEV_BUILD));
1556 
1557 	*vap = *sdev_getdefault_attr(VDIR);	/* note structure copy here */
1558 	gethrestime(&vap->va_atime);
1559 	vap->va_mtime = vap->va_atime;
1560 	vap->va_ctime = vap->va_atime;
1561 	for (i = 0; vtab[i].vt_name != NULL; i++) {
1562 		/*
1563 		 * This early, we may be in a read-only /dev environment: leave
1564 		 * the creation of any nodes we'd attempt to persist to
1565 		 * devfsadm. Because /dev itself is normally persistent, any
1566 		 * node which is not marked dynamic will end up being marked
1567 		 * persistent. However, some nodes are both dynamic and
1568 		 * persistent, mostly lofi and rlofi, so we need to be careful
1569 		 * in our check.
1570 		 */
1571 		if ((vtab[i].vt_flags & SDEV_PERSIST) ||
1572 		    !(vtab[i].vt_flags & SDEV_DYNAMIC))
1573 			continue;
1574 		nm = vtab[i].vt_name;
1575 		ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1576 		dv = NULL;
1577 		error = sdev_mknode(ddv, nm, &dv, vap, NULL,
1578 		    NULL, kcred, SDEV_READY);
1579 		if (error) {
1580 			cmn_err(CE_WARN, "%s/%s: error %d\n",
1581 			    ddv->sdev_name, nm, error);
1582 		} else {
1583 			ASSERT(dv);
1584 			ASSERT(dv->sdev_state != SDEV_ZOMBIE);
1585 			SDEV_SIMPLE_RELE(dv);
1586 		}
1587 	}
1588 }
1589 
/*
 * Creating a backing store entry based on sdev_attr.
 * This is called either as part of node creation in a persistent directory
 * or from setattr/setsecattr to persist access attributes across reboot.
 *
 *	dv	node to persist; its sdev_contents lock must be held as
 *		writer and it must not have a backing store vnode yet
 *	cred	credentials used for the backing store operations
 *
 * The parent directory of dv must have a backing store vnode.  If an entry
 * named after dv already exists there, it is adopted: the in-core sdev_attr
 * is freed and sdev_attrvp is set instead.  Otherwise an entry of the
 * matching type is created from sdev_attr and the lookup is repeated.
 *
 * Returns 0 on success, or the error from the create operation.
 */
int
sdev_shadow_node(struct sdev_node *dv, struct cred *cred)
{
	int error = 0;
	struct vnode *dvp = SDEVTOV(dv->sdev_dotdot);
	struct vnode *rdvp = VTOSDEV(dvp)->sdev_attrvp;
	struct vattr *vap = dv->sdev_attr;
	char *nm = dv->sdev_name;
	struct vnode *tmpvp, **rvp = &tmpvp, *rrvp = NULL;

	/* NOTE(review): dv has already been dereferenced by the initializers */
	ASSERT(dv && dv->sdev_name && rdvp);
	ASSERT(RW_WRITE_HELD(&dv->sdev_contents) && dv->sdev_attrvp == NULL);

lookup:
	/* try to find it in the backing store */
	error = VOP_LOOKUP(rdvp, nm, rvp, NULL, 0, NULL, cred, NULL, NULL,
	    NULL);
	if (error == 0) {
		/*
		 * If the vnode found has a real vnode behind it, keep a
		 * hold on the real one and release the one from the lookup.
		 */
		if (VOP_REALVP(*rvp, &rrvp, NULL) == 0) {
			VN_HOLD(rrvp);
			VN_RELE(*rvp);
			*rvp = rrvp;
		}

		/* attributes now live in the backing store, not in core */
		kmem_free(dv->sdev_attr, sizeof (vattr_t));
		dv->sdev_attr = NULL;
		dv->sdev_attrvp = *rvp;
		return (0);
	}

	/* let's try to persist the node */
	gethrestime(&vap->va_atime);
	vap->va_mtime = vap->va_atime;
	vap->va_ctime = vap->va_atime;
	vap->va_mask |= AT_TYPE|AT_MODE;
	switch (vap->va_type) {
	case VDIR:
		error = VOP_MKDIR(rdvp, nm, vap, rvp, cred, NULL, 0, NULL);
		sdcmn_err9(("sdev_shadow_node: mkdir vp %p error %d\n",
		    (void *)(*rvp), error));
		/* the vnode is released here and looked up again below */
		if (!error)
			VN_RELE(*rvp);
		break;
	case VCHR:
	case VBLK:
	case VREG:
	case VDOOR:
		error = VOP_CREATE(rdvp, nm, vap, NONEXCL, VREAD|VWRITE,
		    rvp, cred, 0, NULL, NULL);
		sdcmn_err9(("sdev_shadow_node: create vp %p, error %d\n",
		    (void *)(*rvp), error));
		/* the vnode is released here and looked up again below */
		if (!error)
			VN_RELE(*rvp);
		break;
	case VLNK:
		ASSERT(dv->sdev_symlink);
		error = VOP_SYMLINK(rdvp, nm, vap, dv->sdev_symlink, cred,
		    NULL, 0);
		sdcmn_err9(("sdev_shadow_node: create symlink error %d\n",
		    error));
		break;
	default:
		cmn_err(CE_PANIC, "dev: %s: sdev_shadow_node "
		    "create\n", nm);
		/*NOTREACHED*/
	}

	/*
	 * go back to lookup to factor out spec node and set attrvp
	 *
	 * NOTE(review): nothing bounds this retry; if a create keeps
	 * succeeding while the lookup keeps failing, the loop does not end.
	 */
	if (error == 0)
		goto lookup;

	sdcmn_err(("cannot persist %s - error %d\n", dv->sdev_path, error));
	return (error);
}
1669 
1670 static void
1671 sdev_cache_add(struct sdev_node *ddv, struct sdev_node **dv, char *nm)
1672 {
1673 	struct sdev_node *dup = NULL;
1674 
1675 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1676 	if ((dup = sdev_findbyname(ddv, nm)) == NULL) {
1677 		sdev_direnter(ddv, *dv);
1678 	} else {
1679 		VERIFY(dup->sdev_state != SDEV_ZOMBIE);
1680 		SDEV_SIMPLE_RELE(*dv);
1681 		sdev_nodedestroy(*dv, 0);
1682 		*dv = dup;
1683 	}
1684 }
1685 
1686 static void
1687 sdev_cache_delete(struct sdev_node *ddv, struct sdev_node **dv)
1688 {
1689 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1690 	sdev_dirdelete(ddv, *dv);
1691 }
1692 
1693 /*
1694  * update the in-core directory cache
1695  */
1696 void
1697 sdev_cache_update(struct sdev_node *ddv, struct sdev_node **dv, char *nm,
1698     sdev_cache_ops_t ops)
1699 {
1700 	ASSERT((SDEV_HELD(*dv)));
1701 
1702 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1703 	switch (ops) {
1704 	case SDEV_CACHE_ADD:
1705 		sdev_cache_add(ddv, dv, nm);
1706 		break;
1707 	case SDEV_CACHE_DELETE:
1708 		sdev_cache_delete(ddv, dv);
1709 		break;
1710 	default:
1711 		break;
1712 	}
1713 }
1714 
1715 /*
1716  * retrieve the named entry from the directory cache
1717  */
1718 struct sdev_node *
1719 sdev_cache_lookup(struct sdev_node *ddv, char *nm)
1720 {
1721 	struct sdev_node *dv = NULL;
1722 
1723 	ASSERT(RW_LOCK_HELD(&ddv->sdev_contents));
1724 	dv = sdev_findbyname(ddv, nm);
1725 
1726 	return (dv);
1727 }
1728 
1729 /*
1730  * Implicit reconfig for nodes constructed by a link generator
1731  * Start devfsadm if needed, or if devfsadm is in progress,
1732  * prepare to block on devfsadm either completing or
1733  * constructing the desired node.  As devfsadmd is global
1734  * in scope, constructing all necessary nodes, we only
1735  * need to initiate it once.
1736  */
1737 static int
1738 sdev_call_devfsadmd(struct sdev_node *ddv, struct sdev_node *dv, char *nm)
1739 {
1740 	int error = 0;
1741 
1742 	if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state)) {
1743 		sdcmn_err6(("lookup: waiting for %s/%s, 0x%x\n",
1744 		    ddv->sdev_name, nm, devfsadm_state));
1745 		mutex_enter(&dv->sdev_lookup_lock);
1746 		SDEV_BLOCK_OTHERS(dv, (SDEV_LOOKUP | SDEV_LGWAITING));
1747 		mutex_exit(&dv->sdev_lookup_lock);
1748 		error = 0;
1749 	} else if (!DEVNAME_DEVFSADM_HAS_RUN(devfsadm_state)) {
1750 		sdcmn_err6(("lookup %s/%s starting devfsadm, 0x%x\n",
1751 		    ddv->sdev_name, nm, devfsadm_state));
1752 
1753 		sdev_devfsadmd_thread(ddv, dv, kcred);
1754 		mutex_enter(&dv->sdev_lookup_lock);
1755 		SDEV_BLOCK_OTHERS(dv,
1756 		    (SDEV_LOOKUP | SDEV_LGWAITING));
1757 		mutex_exit(&dv->sdev_lookup_lock);
1758 		error = 0;
1759 	} else {
1760 		error = -1;
1761 	}
1762 
1763 	return (error);
1764 }
1765 
1766 /*
1767  *  Support for specialized device naming construction mechanisms
1768  */
1769 static int
1770 sdev_call_dircallback(struct sdev_node *ddv, struct sdev_node **dvp, char *nm,
1771     int (*callback)(struct sdev_node *, char *, void **, struct cred *,
1772     void *, char *), int flags, struct cred *cred)
1773 {
1774 	int rv = 0;
1775 	char *physpath = NULL;
1776 	struct vattr vattr;
1777 	struct vattr *vap = &vattr;
1778 	struct sdev_node *dv = NULL;
1779 
1780 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1781 	if (flags & SDEV_VLINK) {
1782 		physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1783 		rv = callback(ddv, nm, (void *)&physpath, kcred, NULL,
1784 		    NULL);
1785 		if (rv) {
1786 			kmem_free(physpath, MAXPATHLEN);
1787 			return (-1);
1788 		}
1789 
1790 		*vap = *sdev_getdefault_attr(VLNK);	/* structure copy */
1791 		vap->va_size = strlen(physpath);
1792 		gethrestime(&vap->va_atime);
1793 		vap->va_mtime = vap->va_atime;
1794 		vap->va_ctime = vap->va_atime;
1795 
1796 		rv = sdev_mknode(ddv, nm, &dv, vap, NULL,
1797 		    (void *)physpath, cred, SDEV_READY);
1798 		kmem_free(physpath, MAXPATHLEN);
1799 		if (rv)
1800 			return (rv);
1801 	} else if (flags & SDEV_VATTR) {
1802 		/*
1803 		 * /dev/pts
1804 		 *
1805 		 * callback is responsible to set the basic attributes,
1806 		 * e.g. va_type/va_uid/va_gid/
1807 		 *    dev_t if VCHR or VBLK/
1808 		 */
1809 		ASSERT(callback);
1810 		rv = callback(ddv, nm, (void *)&vattr, kcred, NULL, NULL);
1811 		if (rv) {
1812 			sdcmn_err3(("devname_lookup_func: SDEV_NONE "
1813 			    "callback failed \n"));
1814 			return (-1);
1815 		}
1816 
1817 		rv = sdev_mknode(ddv, nm, &dv, &vattr, NULL, NULL,
1818 		    cred, SDEV_READY);
1819 
1820 		if (rv)
1821 			return (rv);
1822 
1823 	} else {
1824 		impossible(("lookup: %s/%s by %s not supported (%d)\n",
1825 		    SDEVTOV(ddv)->v_path, nm, curproc->p_user.u_comm,
1826 		    __LINE__));
1827 		rv = -1;
1828 	}
1829 
1830 	*dvp = dv;
1831 	return (rv);
1832 }
1833 
/*
 * Tell whether the given executable name is that of devfsadm.
 * Returns 1 if it is, 0 otherwise.
 */
static int
is_devfsadm_thread(char *exec_name)
{
	/*
	 * note: because devfsadmd -> /usr/sbin/devfsadm
	 * it is safe to use "devfsadm" to capture the lookups
	 * from devfsadm and its daemon version.
	 */
	return (strcmp(exec_name, "devfsadm") == 0);
}
1846 
/*
 * Look up the name nm in the /dev directory ddv.
 *
 * Lookup Order:
 *	sdev_node cache;
 *	backing store (SDEV_PERSIST);
 *	DBNR: a. dir_ops implemented in the loadable modules;
 *	      b. vnode ops in vtab.
 *
 *	ddv		directory to search; must be a VDIR
 *	nm		name to look up
 *	vpp		out: resulting vnode, or NULL on failure
 *	cred		caller's credentials
 *	callback	optional constructor used to make the node on demand
 *	flags		passed to the callback machinery (SDEV_VLINK/SDEV_VATTR)
 *
 * Returns ENOTDIR if ddv is not a directory; 0 with a held vnode for the
 * empty name, "." and ".."; ENOENT when the name cannot be resolved;
 * otherwise the result of sdev_to_vp() on the node found.
 *
 * ddv->sdev_contents is taken as reader here and is always dropped before
 * returning.  It is temporarily upgraded to writer around cache updates.
 */
int
devname_lookup_func(struct sdev_node *ddv, char *nm, struct vnode **vpp,
    struct cred *cred, int (*callback)(struct sdev_node *, char *, void **,
    struct cred *, void *, char *), int flags)
{
	int rv = 0, nmlen;
	struct vnode *rvp = NULL;
	struct sdev_node *dv = NULL;
	int	retried = 0;
	int	error = 0;
	struct vattr vattr;
	char *lookup_thread = curproc->p_user.u_comm;
	int failed_flags = 0;
	int (*vtor)(struct sdev_node *) = NULL;
	int state;
	int parent_state;
	char *link = NULL;

	if (SDEVTOV(ddv)->v_type != VDIR)
		return (ENOTDIR);

	/*
	 * Empty name or ., return node itself.
	 */
	nmlen = strlen(nm);
	if ((nmlen == 0) || ((nmlen == 1) && (nm[0] == '.'))) {
		*vpp = SDEVTOV(ddv);
		VN_HOLD(*vpp);
		return (0);
	}

	/*
	 * .., return the parent directory
	 */
	if ((nmlen == 2) && (strcmp(nm, "..") == 0)) {
		*vpp = SDEVTOV(ddv->sdev_dotdot);
		VN_HOLD(*vpp);
		return (0);
	}

	rw_enter(&ddv->sdev_contents, RW_READER);
	/* directories flagged SDEV_VTOR supply a validator for found nodes */
	if (ddv->sdev_flags & SDEV_VTOR) {
		vtor = (int (*)(struct sdev_node *))sdev_get_vtor(ddv);
		ASSERT(vtor);
	}

	/* re-entered after a devfsadm reconfig has been requested (b1) */
tryagain:
	/*
	 * (a) directory cache lookup:
	 */
	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
	parent_state = ddv->sdev_state;
	dv = sdev_cache_lookup(ddv, nm);
	if (dv) {
		state = dv->sdev_state;
		switch (state) {
		case SDEV_INIT:
			/* devfsadm's own lookups never wait on the node */
			if (is_devfsadm_thread(lookup_thread))
				break;

			/* ZOMBIED parent won't allow node creation */
			if (parent_state == SDEV_ZOMBIE) {
				SD_TRACE_FAILED_LOOKUP(ddv, nm,
				    retried);
				goto nolock_notfound;
			}

			mutex_enter(&dv->sdev_lookup_lock);
			/* compensate the threads started after devfsadm */
			if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state) &&
			    !(SDEV_IS_LOOKUP(dv)))
				SDEV_BLOCK_OTHERS(dv,
				    (SDEV_LOOKUP | SDEV_LGWAITING));

			if (SDEV_IS_LOOKUP(dv)) {
				/*
				 * Another lookup is in progress on this node.
				 * Drop the directory lock while waiting for
				 * it, then take it again as reader.
				 */
				failed_flags |= SLF_REBUILT;
				rw_exit(&ddv->sdev_contents);
				error = sdev_wait4lookup(dv, SDEV_LOOKUP);
				mutex_exit(&dv->sdev_lookup_lock);
				rw_enter(&ddv->sdev_contents, RW_READER);

				if (error != 0) {
					SD_TRACE_FAILED_LOOKUP(ddv, nm,
					    retried);
					goto nolock_notfound;
				}

				/* the node state may have changed meanwhile */
				state = dv->sdev_state;
				if (state == SDEV_INIT) {
					SD_TRACE_FAILED_LOOKUP(ddv, nm,
					    retried);
					goto nolock_notfound;
				} else if (state == SDEV_READY) {
					goto found;
				} else if (state == SDEV_ZOMBIE) {
					rw_exit(&ddv->sdev_contents);
					SD_TRACE_FAILED_LOOKUP(ddv, nm,
					    retried);
					SDEV_RELE(dv);
					goto lookup_failed;
				}
			} else {
				mutex_exit(&dv->sdev_lookup_lock);
			}
			break;
		case SDEV_READY:
			goto found;
		case SDEV_ZOMBIE:
			rw_exit(&ddv->sdev_contents);
			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
			SDEV_RELE(dv);
			goto lookup_failed;
		default:
			/* NOTE(review): dv is not released on this path */
			rw_exit(&ddv->sdev_contents);
			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
			sdev_lookup_failed(ddv, nm, failed_flags);
			*vpp = NULLVP;
			return (ENOENT);
		}
	}
	ASSERT(RW_READ_HELD(&ddv->sdev_contents));

	/*
	 * ZOMBIED parent does not allow new node creation.
	 * bail out early
	 */
	if (parent_state == SDEV_ZOMBIE) {
		rw_exit(&ddv->sdev_contents);
		*vpp = NULLVP;
		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
		return (ENOENT);
	}

	/*
	 * (b0): backing store lookup
	 *	SDEV_PERSIST is default except:
	 *		1) pts nodes
	 *		2) non-chmod'ed local nodes
	 *		3) zvol nodes
	 *
	 * NOTE(review): rvp returned by the backing store lookup is not
	 * released in the VOP_GETATTR()/sdev_getlink() failure branches
	 * visible here - confirm who owns that hold.
	 */
	if (SDEV_IS_PERSIST(ddv)) {
		error = devname_backstore_lookup(ddv, nm, &rvp);

		if (!error) {

			vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
			error = VOP_GETATTR(rvp, &vattr, 0, cred, NULL);
			if (error) {
				rw_exit(&ddv->sdev_contents);
				if (dv)
					SDEV_RELE(dv);
				SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
				sdev_lookup_failed(ddv, nm, failed_flags);
				*vpp = NULLVP;
				return (ENOENT);
			}

			if (vattr.va_type == VLNK) {
				error = sdev_getlink(rvp, &link);
				if (error) {
					rw_exit(&ddv->sdev_contents);
					if (dv)
						SDEV_RELE(dv);
					SD_TRACE_FAILED_LOOKUP(ddv, nm,
					    retried);
					sdev_lookup_failed(ddv, nm,
					    failed_flags);
					*vpp = NULLVP;
					return (ENOENT);
				}
				ASSERT(link != NULL);
			}

			/* the writer lock is needed to make the node */
			if (!rw_tryupgrade(&ddv->sdev_contents)) {
				rw_exit(&ddv->sdev_contents);
				rw_enter(&ddv->sdev_contents, RW_WRITER);
			}
			error = sdev_mknode(ddv, nm, &dv, &vattr,
			    rvp, link, cred, SDEV_READY);
			rw_downgrade(&ddv->sdev_contents);

			if (link != NULL) {
				kmem_free(link, strlen(link) + 1);
				link = NULL;
			}

			if (error) {
				SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
				rw_exit(&ddv->sdev_contents);
				if (dv)
					SDEV_RELE(dv);
				goto lookup_failed;
			} else {
				goto found;
			}
		} else if (retried) {
			/* devfsadm was already tried and the name is absent */
			rw_exit(&ddv->sdev_contents);
			sdcmn_err3(("retry of lookup of %s/%s: failed\n",
			    ddv->sdev_name, nm));
			if (dv)
				SDEV_RELE(dv);
			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
			sdev_lookup_failed(ddv, nm, failed_flags);
			*vpp = NULLVP;
			return (ENOENT);
		}
	}

lookup_create_node:
	/* first thread that is doing the lookup on this node */
	if (callback) {
		ASSERT(dv == NULL);
		if (!rw_tryupgrade(&ddv->sdev_contents)) {
			rw_exit(&ddv->sdev_contents);
			rw_enter(&ddv->sdev_contents, RW_WRITER);
		}
		error = sdev_call_dircallback(ddv, &dv, nm, callback,
		    flags, cred);
		rw_downgrade(&ddv->sdev_contents);
		if (error == 0) {
			goto found;
		} else {
			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
			rw_exit(&ddv->sdev_contents);
			goto lookup_failed;
		}
	}
	if (!dv) {
		/* make a node in SDEV_INIT state to track this lookup */
		if (!rw_tryupgrade(&ddv->sdev_contents)) {
			rw_exit(&ddv->sdev_contents);
			rw_enter(&ddv->sdev_contents, RW_WRITER);
		}
		error = sdev_mknode(ddv, nm, &dv, NULL, NULL, NULL,
		    cred, SDEV_INIT);
		if (!dv) {
			rw_exit(&ddv->sdev_contents);
			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
			sdev_lookup_failed(ddv, nm, failed_flags);
			*vpp = NULLVP;
			return (ENOENT);
		}
		rw_downgrade(&ddv->sdev_contents);
	}

	/*
	 * (b1) invoking devfsadm once per life time for devfsadm nodes
	 */
	ASSERT(SDEV_HELD(dv));

	if (SDEV_IS_NO_NCACHE(dv))
		failed_flags |= SLF_NO_NCACHE;
	/* conditions under which no reconfig is attempted for this lookup */
	if (sdev_reconfig_boot || !i_ddi_io_initialized() ||
	    SDEV_IS_DYNAMIC(ddv) || SDEV_IS_NO_NCACHE(dv) ||
	    ((moddebug & MODDEBUG_FINI_EBUSY) != 0)) {
		ASSERT(SDEV_HELD(dv));
		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
		goto nolock_notfound;
	}

	/*
	 * filter out known non-existent devices recorded
	 * during initial reconfiguration boot for which
	 * reconfig should not be done and lookup may
	 * be short-circuited now.
	 */
	if (sdev_lookup_filter(ddv, nm)) {
		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
		goto nolock_notfound;
	}

	/* bypassing devfsadm internal nodes */
	if (is_devfsadm_thread(lookup_thread)) {
		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
		goto nolock_notfound;
	}

	if (sdev_reconfig_disable) {
		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
		goto nolock_notfound;
	}

	error = sdev_call_devfsadmd(ddv, dv, nm);
	if (error == 0) {
		sdcmn_err8(("lookup of %s/%s by %s: reconfig\n",
		    ddv->sdev_name, nm, curproc->p_user.u_comm));
		if (sdev_reconfig_verbose) {
			cmn_err(CE_CONT,
			    "?lookup of %s/%s by %s: reconfig\n",
			    ddv->sdev_name, nm, curproc->p_user.u_comm);
		}
		/* redo the lookup from the cache step, marked as a retry */
		retried = 1;
		failed_flags |= SLF_REBUILT;
		ASSERT(dv->sdev_state != SDEV_ZOMBIE);
		SDEV_SIMPLE_RELE(dv);
		goto tryagain;
	} else {
		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
		goto nolock_notfound;
	}

found:
	ASSERT(dv->sdev_state == SDEV_READY);
	if (vtor) {
		/*
		 * Check validity of returned node
		 */
		switch (vtor(dv)) {
		case SDEV_VTOR_VALID:
			break;
		case SDEV_VTOR_STALE:
			/*
			 * The name exists, but the cache entry is
			 * stale and needs to be re-created.
			 */
			ASSERT(RW_READ_HELD(&ddv->sdev_contents));
			if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
				rw_exit(&ddv->sdev_contents);
				rw_enter(&ddv->sdev_contents, RW_WRITER);
			}
			sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_DELETE);
			rw_downgrade(&ddv->sdev_contents);
			SDEV_RELE(dv);
			dv = NULL;
			/* the goto leaves the switch; no fall through occurs */
			goto lookup_create_node;
			/* FALLTHRU */
		case SDEV_VTOR_INVALID:
			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
			sdcmn_err7(("lookup: destroy invalid "
			    "node: %s(%p)\n", dv->sdev_name, (void *)dv));
			goto nolock_notfound;
		case SDEV_VTOR_SKIP:
			sdcmn_err7(("lookup: node not applicable - "
			    "skipping: %s(%p)\n", dv->sdev_name, (void *)dv));
			rw_exit(&ddv->sdev_contents);
			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
			SDEV_RELE(dv);
			goto lookup_failed;
		default:
			cmn_err(CE_PANIC,
			    "dev fs: validator failed: %s(%p)\n",
			    dv->sdev_name, (void *)dv);
			break;
		}
	}

	/* success: drop the directory lock and convert the node to a vnode */
	rw_exit(&ddv->sdev_contents);
	rv = sdev_to_vp(dv, vpp);
	sdcmn_err3(("devname_lookup_func: returning vp %p v_count %d state %d "
	    "for nm %s, error %d\n", (void *)*vpp, (*vpp)->v_count,
	    dv->sdev_state, nm, rv));
	return (rv);

	/* despite the name, this label is reached with the reader lock held */
nolock_notfound:
	/*
	 * Destroy the node that is created for synchronization purposes.
	 */
	sdcmn_err3(("devname_lookup_func: %s with state %d\n",
	    nm, dv->sdev_state));
	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
	if (dv->sdev_state == SDEV_INIT) {
		if (!rw_tryupgrade(&ddv->sdev_contents)) {
			rw_exit(&ddv->sdev_contents);
			rw_enter(&ddv->sdev_contents, RW_WRITER);
		}

		/*
		 * Node state may have changed during the lock
		 * changes. Re-check.
		 */
		if (dv->sdev_state == SDEV_INIT) {
			sdev_dirdelete(ddv, dv);
			rw_exit(&ddv->sdev_contents);
			sdev_lookup_failed(ddv, nm, failed_flags);
			SDEV_RELE(dv);
			*vpp = NULL;
			return (ENOENT);
		}
	}

	rw_exit(&ddv->sdev_contents);
	SDEV_RELE(dv);

	/* reached with the directory lock and the hold on dv already dropped */
lookup_failed:
	sdev_lookup_failed(ddv, nm, failed_flags);
	*vpp = NULL;
	return (ENOENT);
}
2241 
2242 /*
2243  * Given a directory node, mark all nodes beneath as
2244  * STALE, i.e. nodes that don't exist as far as new
2245  * consumers are concerned.  Remove them from the
2246  * list of directory entries so that no lookup or
2247  * directory traversal will find them.  The node
2248  * not deallocated so existing holds are not affected.
2249  */
2250 void
2251 sdev_stale(struct sdev_node *ddv)
2252 {
2253 	struct sdev_node *dv;
2254 	struct vnode *vp;
2255 
2256 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
2257 
2258 	rw_enter(&ddv->sdev_contents, RW_WRITER);
2259 	while ((dv = SDEV_FIRST_ENTRY(ddv)) != NULL) {
2260 		vp = SDEVTOV(dv);
2261 		SDEV_HOLD(dv);
2262 		if (vp->v_type == VDIR)
2263 			sdev_stale(dv);
2264 
2265 		sdev_dirdelete(ddv, dv);
2266 		SDEV_RELE(dv);
2267 	}
2268 	ddv->sdev_flags |= SDEV_BUILD;
2269 	rw_exit(&ddv->sdev_contents);
2270 }
2271 
2272 /*
2273  * Given a directory node, clean out all the nodes beneath.
2274  * If expr is specified, clean node with names matching expr.
2275  * If SDEV_ENFORCE is specified in flags, busy nodes are made stale,
2276  *	so they are excluded from future lookups.
2277  */
2278 int
2279 sdev_cleandir(struct sdev_node *ddv, char *expr, uint_t flags)
2280 {
2281 	int error = 0;
2282 	int busy = 0;
2283 	struct vnode *vp;
2284 	struct sdev_node *dv;
2285 	int bkstore = 0;
2286 	int len = 0;
2287 	char *bks_name = NULL;
2288 
2289 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
2290 
2291 	/*
2292 	 * We try our best to destroy all unused sdev_node's
2293 	 */
2294 	rw_enter(&ddv->sdev_contents, RW_WRITER);
2295 	while ((dv = SDEV_FIRST_ENTRY(ddv)) != NULL) {
2296 		vp = SDEVTOV(dv);
2297 
2298 		if (expr && gmatch(dv->sdev_name, expr) == 0)
2299 			continue;
2300 
2301 		if (vp->v_type == VDIR &&
2302 		    sdev_cleandir(dv, NULL, flags) != 0) {
2303 			sdcmn_err9(("sdev_cleandir: dir %s busy\n",
2304 			    dv->sdev_name));
2305 			busy++;
2306 			continue;
2307 		}
2308 
2309 		if (vp->v_count > 0 && (flags & SDEV_ENFORCE) == 0) {
2310 			sdcmn_err9(("sdev_cleandir: dir %s busy\n",
2311 			    dv->sdev_name));
2312 			busy++;
2313 			continue;
2314 		}
2315 
2316 		/*
2317 		 * at this point, either dv is not held or SDEV_ENFORCE
2318 		 * is specified. In either case, dv needs to be deleted
2319 		 */
2320 		SDEV_HOLD(dv);
2321 
2322 		bkstore = SDEV_IS_PERSIST(dv) ? 1 : 0;
2323 		if (bkstore && (vp->v_type == VDIR))
2324 			bkstore += 1;
2325 
2326 		if (bkstore) {
2327 			len = strlen(dv->sdev_name) + 1;
2328 			bks_name = kmem_alloc(len, KM_SLEEP);
2329 			bcopy(dv->sdev_name, bks_name, len);
2330 		}
2331 
2332 		sdev_dirdelete(ddv, dv);
2333 
2334 		/* take care the backing store clean up */
2335 		if (bkstore) {
2336 			ASSERT(bks_name);
2337 			ASSERT(ddv->sdev_attrvp);
2338 
2339 			if (bkstore == 1) {
2340 				error = VOP_REMOVE(ddv->sdev_attrvp,
2341 				    bks_name, kcred, NULL, 0);
2342 			} else if (bkstore == 2) {
2343 				error = VOP_RMDIR(ddv->sdev_attrvp,
2344 				    bks_name, ddv->sdev_attrvp, kcred, NULL, 0);
2345 			}
2346 
2347 			/* do not propagate the backing store errors */
2348 			if (error) {
2349 				sdcmn_err9(("sdev_cleandir: backing store"
2350 				    "not cleaned\n"));
2351 				error = 0;
2352 			}
2353 
2354 			bkstore = 0;
2355 			kmem_free(bks_name, len);
2356 			bks_name = NULL;
2357 			len = 0;
2358 		}
2359 
2360 		ddv->sdev_flags |= SDEV_BUILD;
2361 		SDEV_RELE(dv);
2362 	}
2363 
2364 	ddv->sdev_flags |= SDEV_BUILD;
2365 	rw_exit(&ddv->sdev_contents);
2366 
2367 	if (busy) {
2368 		error = EBUSY;
2369 	}
2370 
2371 	return (error);
2372 }
2373 
2374 /*
2375  * a convenient wrapper for readdir() funcs
2376  */
2377 size_t
2378 add_dir_entry(dirent64_t *de, char *nm, size_t size, ino_t ino, offset_t off)
2379 {
2380 	size_t reclen = DIRENT64_RECLEN(strlen(nm));
2381 	if (reclen > size)
2382 		return (0);
2383 
2384 	de->d_ino = (ino64_t)ino;
2385 	de->d_off = (off64_t)off + 1;
2386 	de->d_reclen = (ushort_t)reclen;
2387 	(void) strncpy(de->d_name, nm, DIRENT64_NAMELEN(reclen));
2388 	return (reclen);
2389 }
2390 
2391 /*
2392  * sdev_mount service routines
2393  */
2394 int
2395 sdev_copyin_mountargs(struct mounta *uap, struct sdev_mountargs *args)
2396 {
2397 	int	error;
2398 
2399 	if (uap->datalen != sizeof (*args))
2400 		return (EINVAL);
2401 
2402 	if (error = copyin(uap->dataptr, args, sizeof (*args))) {
2403 		cmn_err(CE_WARN, "sdev_copyin_mountargs: can not"
2404 		    "get user data. error %d\n", error);
2405 		return (EFAULT);
2406 	}
2407 
2408 	return (0);
2409 }
2410 
/*
 * nextdp(dp) yields a pointer to the dirent64 record that starts
 * dp->d_reclen bytes after dp.  Any earlier definition of nextdp is
 * discarded first so that this one is used for the rest of the file.
 */
#ifdef nextdp
#undef nextdp
#endif
#define	nextdp(dp)	((struct dirent64 *) \
			    (intptr_t)((char *)(dp) + (dp)->d_reclen))
2416 
/*
 * readdir helper func
 *
 * Fills the caller's single iovec with dirent64 records for directory vp:
 * "." at offset 0, ".." at offset 1, then the cached entries of the
 * directory starting at offset 2.  The d_off of each record is the offset
 * of the entry that follows it.
 *
 * vp		- directory being read
 * uiop		- destination; uio_loffset is the starting entry offset and
 *		  is advanced on success
 * cred		- passed to sdev_filldir_from_store() when the directory
 *		  has to be rebuilt from its backing store
 * eofp		- if not NULL, set to 1 when the last entry has been
 *		  returned, 0 otherwise
 * flags	- SDEV_BROWSE is one of the conditions for triggering a
 *		  devfsadm reconfiguration from a global directory
 *
 * The caller must hold ddv->sdev_contents as reader.  For a global
 * directory the lock is dropped and re-taken as reader inside this
 * function, so the contents may change across the call.
 *
 * Returns 0 on success, EINVAL if more than one iovec is supplied or the
 * buffer cannot hold the "." / ".." records, ENOTDIR if vp is not a
 * directory, or the error from uiomove().
 */
int
devname_readdir_func(vnode_t *vp, uio_t *uiop, cred_t *cred, int *eofp,
    int flags)
{
	struct sdev_node *ddv = VTOSDEV(vp);
	struct sdev_node *dv;
	dirent64_t	*dp;
	ulong_t		outcount = 0;
	size_t		namelen;
	ulong_t		alloc_count;
	void		*outbuf;
	struct iovec	*iovp;
	int		error = 0;
	size_t		reclen;
	offset_t	diroff;
	offset_t	soff;
	int		this_reclen;
	int (*vtor)(struct sdev_node *) = NULL;
	struct vattr attr;
	timestruc_t now;

	ASSERT(ddv->sdev_attr || ddv->sdev_attrvp);
	ASSERT(RW_READ_HELD(&ddv->sdev_contents));

	/* an offset at or beyond MAXOFF_T is reported as end of directory */
	if (uiop->uio_loffset >= MAXOFF_T) {
		if (eofp)
			*eofp = 1;
		return (0);
	}

	if (uiop->uio_iovcnt != 1)
		return (EINVAL);

	if (vp->v_type != VDIR)
		return (ENOTDIR);

	/* pick up the directory's validator, if it has one */
	if (ddv->sdev_flags & SDEV_VTOR) {
		vtor = (int (*)(struct sdev_node *))sdev_get_vtor(ddv);
		ASSERT(vtor);
	}

	if (eofp != NULL)
		*eofp = 0;

	/*
	 * Records are assembled in a kernel buffer the size of the
	 * caller's iovec and copied out in one uiomove() at the end.
	 */
	soff = uiop->uio_loffset;
	iovp = uiop->uio_iov;
	alloc_count = iovp->iov_len;
	dp = outbuf = kmem_alloc(alloc_count, KM_SLEEP);
	outcount = 0;

	/* a zombie directory is served straight from the cache */
	if (ddv->sdev_state == SDEV_ZOMBIE)
		goto get_cache;

	if (SDEV_IS_GLOBAL(ddv)) {

		if ((sdev_boot_state == SDEV_BOOT_STATE_COMPLETE) &&
		    !sdev_reconfig_boot && (flags & SDEV_BROWSE) &&
		    !SDEV_IS_DYNAMIC(ddv) && !SDEV_IS_NO_NCACHE(ddv) &&
		    ((moddebug & MODDEBUG_FINI_EBUSY) == 0) &&
		    !DEVNAME_DEVFSADM_HAS_RUN(devfsadm_state) &&
		    !DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state) &&
		    !sdev_reconfig_disable) {
			/*
			 * invoking "devfsadm" to do system device reconfig
			 */
			mutex_enter(&ddv->sdev_lookup_lock);
			SDEV_BLOCK_OTHERS(ddv,
			    (SDEV_READDIR|SDEV_LGWAITING));
			mutex_exit(&ddv->sdev_lookup_lock);

			sdcmn_err8(("readdir of %s by %s: reconfig\n",
			    ddv->sdev_path, curproc->p_user.u_comm));
			if (sdev_reconfig_verbose) {
				cmn_err(CE_CONT,
				    "?readdir of %s by %s: reconfig\n",
				    ddv->sdev_path, curproc->p_user.u_comm);
			}

			sdev_devfsadmd_thread(ddv, NULL, kcred);
		} else if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state)) {
			/*
			 * compensate the "ls" started later than "devfsadm"
			 */
			mutex_enter(&ddv->sdev_lookup_lock);
			SDEV_BLOCK_OTHERS(ddv, (SDEV_READDIR|SDEV_LGWAITING));
			mutex_exit(&ddv->sdev_lookup_lock);
		}

		/*
		 * release the contents lock so that
		 * the cache may be updated by devfsadmd
		 */
		rw_exit(&ddv->sdev_contents);
		mutex_enter(&ddv->sdev_lookup_lock);
		if (SDEV_IS_READDIR(ddv))
			(void) sdev_wait4lookup(ddv, SDEV_READDIR);
		mutex_exit(&ddv->sdev_lookup_lock);
		rw_enter(&ddv->sdev_contents, RW_READER);

		sdcmn_err4(("readdir of directory %s by %s\n",
		    ddv->sdev_name, curproc->p_user.u_comm));
		/*
		 * A directory flagged SDEV_BUILD is repopulated from its
		 * backing store (persistent directories only) and the flag
		 * is then cleared.
		 *
		 * NOTE(review): a failure from sdev_filldir_from_store() is
		 * not acted on here; error is overwritten by uiomove() below
		 * whenever any record is produced.
		 */
		if (ddv->sdev_flags & SDEV_BUILD) {
			if (SDEV_IS_PERSIST(ddv)) {
				error = sdev_filldir_from_store(ddv,
				    alloc_count, cred);
			}
			ddv->sdev_flags &= ~SDEV_BUILD;
		}
	}

get_cache:
	/* handle "." and ".." */
	diroff = 0;
	if (soff == 0) {
		/* first time */
		this_reclen = DIRENT64_RECLEN(1);
		if (alloc_count < this_reclen) {
			error = EINVAL;
			goto done;
		}

		dp->d_ino = (ino64_t)ddv->sdev_ino;
		dp->d_off = (off64_t)1;
		dp->d_reclen = (ushort_t)this_reclen;

		(void) strncpy(dp->d_name, ".",
		    DIRENT64_NAMELEN(this_reclen));
		outcount += dp->d_reclen;
		dp = nextdp(dp);
	}

	/* ".." lives at offset 1 and reports the parent's inode number */
	diroff++;
	if (soff <= 1) {
		this_reclen = DIRENT64_RECLEN(2);
		if (alloc_count < outcount + this_reclen) {
			error = EINVAL;
			goto done;
		}

		dp->d_reclen = (ushort_t)this_reclen;
		dp->d_ino = (ino64_t)ddv->sdev_dotdot->sdev_ino;
		dp->d_off = (off64_t)2;

		(void) strncpy(dp->d_name, "..",
		    DIRENT64_NAMELEN(this_reclen));
		outcount += dp->d_reclen;

		dp = nextdp(dp);
	}


	/* gets the cache */
	diroff++;
	for (dv = SDEV_FIRST_ENTRY(ddv); dv;
	    dv = SDEV_NEXT_ENTRY(ddv, dv), diroff++) {
		sdcmn_err3(("sdev_readdir: diroff %lld soff %lld for '%s' \n",
		    diroff, soff, dv->sdev_name));

		/*
		 * bypassing pre-matured nodes, as well as entries that lie
		 * before the requested starting offset
		 */
		if (diroff < soff || (dv->sdev_state != SDEV_READY)) {
			sdcmn_err3(("sdev_readdir: pre-mature node  "
			    "%s %d\n", dv->sdev_name, dv->sdev_state));
			continue;
		}

		/*
		 * Check validity of node
		 * Drop invalid and nodes to be skipped.
		 * A node the validator indicates as stale needs
		 * to be returned as presumably the node name itself
		 * is valid and the node data itself will be refreshed
		 * on lookup.  An application performing a readdir then
		 * stat on each entry should thus always see consistent
		 * data.  In any case, it is not possible to synchronize
		 * with dynamic kernel state, and any view we return can
		 * never be anything more than a snapshot at a point in time.
		 */
		if (vtor) {
			switch (vtor(dv)) {
			case SDEV_VTOR_VALID:
				break;
			case SDEV_VTOR_INVALID:
			case SDEV_VTOR_SKIP:
				continue;
			case SDEV_VTOR_STALE:
				sdcmn_err3(("sdev_readir: %s stale\n",
				    dv->sdev_name));
				break;
			default:
				cmn_err(CE_PANIC,
				    "dev fs: validator failed: %s(%p)\n",
				    dv->sdev_name, (void *)dv);
				break;
			/*NOTREACHED*/
			}
		}

		/*
		 * Stop as soon as the next record would overflow the buffer;
		 * dv is left non-NULL so that EOF is not reported and diroff
		 * names the entry to resume from.
		 */
		namelen = strlen(dv->sdev_name);
		reclen = DIRENT64_RECLEN(namelen);
		if (outcount + reclen > alloc_count) {
			goto full;
		}
		dp->d_reclen = (ushort_t)reclen;
		dp->d_ino = (ino64_t)dv->sdev_ino;
		dp->d_off = (off64_t)diroff + 1;
		(void) strncpy(dp->d_name, dv->sdev_name,
		    DIRENT64_NAMELEN(reclen));
		outcount += reclen;
		dp = nextdp(dp);
	}

full:
	sdcmn_err4(("sdev_readdir: moving %lu bytes: "
	    "diroff %lld, soff %lld, dv %p\n", outcount, diroff, soff,
	    (void *)dv));

	if (outcount)
		error = uiomove(outbuf, outcount, UIO_READ, uiop);

	/* dv is NULL only when the walk ran off the end of the directory */
	if (!error) {
		uiop->uio_loffset = diroff;
		if (eofp)
			*eofp = dv ? 0 : 1;
	}


	/* stamp the backing-store node with the current time; best effort */
	if (ddv->sdev_attrvp) {
		gethrestime(&now);
		attr.va_ctime = now;
		attr.va_atime = now;
		attr.va_mask = AT_CTIME|AT_ATIME;

		(void) VOP_SETATTR(ddv->sdev_attrvp, &attr, 0, kcred, NULL);
	}
done:
	kmem_free(outbuf, alloc_count);
	return (error);
}
2658 
2659 static int
2660 sdev_modctl_lookup(const char *path, vnode_t **r_vp)
2661 {
2662 	vnode_t *vp;
2663 	vnode_t *cvp;
2664 	struct sdev_node *svp;
2665 	char *nm;
2666 	struct pathname pn;
2667 	int error;
2668 	int persisted = 0;
2669 
2670 	ASSERT(INGLOBALZONE(curproc));
2671 
2672 	if (error = pn_get((char *)path, UIO_SYSSPACE, &pn))
2673 		return (error);
2674 	nm = kmem_alloc(MAXNAMELEN, KM_SLEEP);
2675 
2676 	vp = rootdir;
2677 	VN_HOLD(vp);
2678 
2679 	while (pn_pathleft(&pn)) {
2680 		ASSERT(vp->v_type == VDIR || vp->v_type == VLNK);
2681 		(void) pn_getcomponent(&pn, nm);
2682 
2683 		/*
2684 		 * Deal with the .. special case where we may be
2685 		 * traversing up across a mount point, to the
2686 		 * root of this filesystem or global root.
2687 		 */
2688 		if (nm[0] == '.' && nm[1] == '.' && nm[2] == 0) {
2689 checkforroot:
2690 			if (VN_CMP(vp, rootdir)) {
2691 				nm[1] = 0;
2692 			} else if (vp->v_flag & VROOT) {
2693 				vfs_t *vfsp;
2694 				cvp = vp;
2695 				vfsp = cvp->v_vfsp;
2696 				vfs_rlock_wait(vfsp);
2697 				vp = cvp->v_vfsp->vfs_vnodecovered;
2698 				if (vp == NULL ||
2699 				    (cvp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
2700 					vfs_unlock(vfsp);
2701 					VN_RELE(cvp);
2702 					error = EIO;
2703 					break;
2704 				}
2705 				VN_HOLD(vp);
2706 				vfs_unlock(vfsp);
2707 				VN_RELE(cvp);
2708 				cvp = NULL;
2709 				goto checkforroot;
2710 			}
2711 		}
2712 
2713 		error = VOP_LOOKUP(vp, nm, &cvp, NULL, 0, NULL, kcred, NULL,
2714 		    NULL, NULL);
2715 		if (error) {
2716 			VN_RELE(vp);
2717 			break;
2718 		}
2719 
2720 		/* traverse mount points encountered on our journey */
2721 		if (vn_ismntpt(cvp) && (error = traverse(&cvp)) != 0) {
2722 			VN_RELE(vp);
2723 			VN_RELE(cvp);
2724 			break;
2725 		}
2726 
2727 		/*
2728 		 * symbolic link, can be either relative and absolute
2729 		 */
2730 		if ((cvp->v_type == VLNK) && pn_pathleft(&pn)) {
2731 			struct pathname linkpath;
2732 			pn_alloc(&linkpath);
2733 			if (error = pn_getsymlink(cvp, &linkpath, kcred)) {
2734 				pn_free(&linkpath);
2735 				break;
2736 			}
2737 			if (pn_pathleft(&linkpath) == 0)
2738 				(void) pn_set(&linkpath, ".");
2739 			error = pn_insert(&pn, &linkpath, strlen(nm));
2740 			pn_free(&linkpath);
2741 			if (pn.pn_pathlen == 0) {
2742 				VN_RELE(vp);
2743 				return (ENOENT);
2744 			}
2745 			if (pn.pn_path[0] == '/') {
2746 				pn_skipslash(&pn);
2747 				VN_RELE(vp);
2748 				VN_RELE(cvp);
2749 				vp = rootdir;
2750 				VN_HOLD(vp);
2751 			} else {
2752 				VN_RELE(cvp);
2753 			}
2754 			continue;
2755 		}
2756 
2757 		VN_RELE(vp);
2758 
2759 		/*
2760 		 * Direct the operation to the persisting filesystem
2761 		 * underlying /dev.  Bail if we encounter a
2762 		 * non-persistent dev entity here.
2763 		 */
2764 		if (cvp->v_vfsp->vfs_fstype == devtype) {
2765 
2766 			if ((VTOSDEV(cvp)->sdev_flags & SDEV_PERSIST) == 0) {
2767 				error = ENOENT;
2768 				VN_RELE(cvp);
2769 				break;
2770 			}
2771 
2772 			if (VTOSDEV(cvp) == NULL) {
2773 				error = ENOENT;
2774 				VN_RELE(cvp);
2775 				break;
2776 			}
2777 			svp = VTOSDEV(cvp);
2778 			if ((vp = svp->sdev_attrvp) == NULL) {
2779 				error = ENOENT;
2780 				VN_RELE(cvp);
2781 				break;
2782 			}
2783 			persisted = 1;
2784 			VN_HOLD(vp);
2785 			VN_RELE(cvp);
2786 			cvp = vp;
2787 		}
2788 
2789 		vp = cvp;
2790 		pn_skipslash(&pn);
2791 	}
2792 
2793 	kmem_free(nm, MAXNAMELEN);
2794 	pn_free(&pn);
2795 
2796 	if (error)
2797 		return (error);
2798 
2799 	/*
2800 	 * Only return persisted nodes in the filesystem underlying /dev.
2801 	 */
2802 	if (!persisted) {
2803 		VN_RELE(vp);
2804 		return (ENOENT);
2805 	}
2806 
2807 	*r_vp = vp;
2808 	return (0);
2809 }
2810 
2811 int
2812 sdev_modctl_readdir(const char *dir, char ***dirlistp,
2813 	int *npathsp, int *npathsp_alloc, int checking_empty)
2814 {
2815 	char	**pathlist = NULL;
2816 	char	**newlist = NULL;
2817 	int	npaths = 0;
2818 	int	npaths_alloc = 0;
2819 	dirent64_t *dbuf = NULL;
2820 	int	n;
2821 	char	*s;
2822 	int error;
2823 	vnode_t *vp;
2824 	int eof;
2825 	struct iovec iov;
2826 	struct uio uio;
2827 	struct dirent64 *dp;
2828 	size_t dlen;
2829 	size_t dbuflen;
2830 	int ndirents = 64;
2831 	char *nm;
2832 
2833 	error = sdev_modctl_lookup(dir, &vp);
2834 	sdcmn_err11(("modctl readdir: %s by %s: %s\n",
2835 	    dir, curproc->p_user.u_comm,
2836 	    (error == 0) ? "ok" : "failed"));
2837 	if (error)
2838 		return (error);
2839 
2840 	dlen = ndirents * (sizeof (*dbuf));
2841 	dbuf = kmem_alloc(dlen, KM_SLEEP);
2842 
2843 	uio.uio_iov = &iov;
2844 	uio.uio_iovcnt = 1;
2845 	uio.uio_segflg = UIO_SYSSPACE;
2846 	uio.uio_fmode = 0;
2847 	uio.uio_extflg = UIO_COPY_CACHED;
2848 	uio.uio_loffset = 0;
2849 	uio.uio_llimit = MAXOFFSET_T;
2850 
2851 	eof = 0;
2852 	error = 0;
2853 	while (!error && !eof) {
2854 		uio.uio_resid = dlen;
2855 		iov.iov_base = (char *)dbuf;
2856 		iov.iov_len = dlen;
2857 
2858 		(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2859 		error = VOP_READDIR(vp, &uio, kcred, &eof, NULL, 0);
2860 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2861 
2862 		dbuflen = dlen - uio.uio_resid;
2863 
2864 		if (error || dbuflen == 0)
2865 			break;
2866 
2867 		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
2868 		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
2869 
2870 			nm = dp->d_name;
2871 
2872 			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
2873 				continue;
2874 			if (npaths == npaths_alloc) {
2875 				npaths_alloc += 64;
2876 				newlist = (char **)
2877 				    kmem_zalloc((npaths_alloc + 1) *
2878 				    sizeof (char *), KM_SLEEP);
2879 				if (pathlist) {
2880 					bcopy(pathlist, newlist,
2881 					    npaths * sizeof (char *));
2882 					kmem_free(pathlist,
2883 					    (npaths + 1) * sizeof (char *));
2884 				}
2885 				pathlist = newlist;
2886 			}
2887 			n = strlen(nm) + 1;
2888 			s = kmem_alloc(n, KM_SLEEP);
2889 			bcopy(nm, s, n);
2890 			pathlist[npaths++] = s;
2891 			sdcmn_err11(("  %s/%s\n", dir, s));
2892 
2893 			/* if checking empty, one entry is as good as many */
2894 			if (checking_empty) {
2895 				eof = 1;
2896 				break;
2897 			}
2898 		}
2899 	}
2900 
2901 exit:
2902 	VN_RELE(vp);
2903 
2904 	if (dbuf)
2905 		kmem_free(dbuf, dlen);
2906 
2907 	if (error)
2908 		return (error);
2909 
2910 	*dirlistp = pathlist;
2911 	*npathsp = npaths;
2912 	*npathsp_alloc = npaths_alloc;
2913 
2914 	return (0);
2915 }
2916 
/*
 * Release a name list produced by sdev_modctl_readdir().
 *
 * pathlist	- array of npaths strings, or NULL
 * npaths	- number of strings stored in pathlist
 * npaths_alloc	- slot count reported by sdev_modctl_readdir(); the array
 *		  itself holds one more (terminating) slot
 *
 * sdev_modctl_readdir() hands back a NULL list when the directory has no
 * entries, so a NULL pathlist is accepted and nothing is freed.
 */
void
sdev_modctl_readdir_free(char **pathlist, int npaths, int npaths_alloc)
{
	int	i, n;

	if (pathlist == NULL)
		return;

	for (i = 0; i < npaths; i++) {
		/* each name was allocated as strlen + 1 bytes */
		n = strlen(pathlist[i]) + 1;
		kmem_free(pathlist[i], n);
	}

	kmem_free(pathlist, (npaths_alloc + 1) * sizeof (char *));
}
2929 
2930 int
2931 sdev_modctl_devexists(const char *path)
2932 {
2933 	vnode_t *vp;
2934 	int error;
2935 
2936 	error = sdev_modctl_lookup(path, &vp);
2937 	sdcmn_err11(("modctl dev exists: %s by %s: %s\n",
2938 	    path, curproc->p_user.u_comm,
2939 	    (error == 0) ? "ok" : "failed"));
2940 	if (error == 0)
2941 		VN_RELE(vp);
2942 
2943 	return (error);
2944 }
2945 
2946 extern int sdev_vnodeops_tbl_size;
2947 
2948 /*
2949  * construct a new template with overrides from vtab
2950  */
2951 static fs_operation_def_t *
2952 sdev_merge_vtab(const fs_operation_def_t tab[])
2953 {
2954 	fs_operation_def_t *new;
2955 	const fs_operation_def_t *tab_entry;
2956 
2957 	/* make a copy of standard vnode ops table */
2958 	new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP);
2959 	bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size);
2960 
2961 	/* replace the overrides from tab */
2962 	for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) {
2963 		fs_operation_def_t *std_entry = new;
2964 		while (std_entry->name) {
2965 			if (strcmp(tab_entry->name, std_entry->name) == 0) {
2966 				std_entry->func = tab_entry->func;
2967 				break;
2968 			}
2969 			std_entry++;
2970 		}
2971 		if (std_entry->name == NULL)
2972 			cmn_err(CE_NOTE, "sdev_merge_vtab: entry %s unused.",
2973 			    tab_entry->name);
2974 	}
2975 
2976 	return (new);
2977 }
2978 
2979 /* free memory allocated by sdev_merge_vtab */
2980 static void
2981 sdev_free_vtab(fs_operation_def_t *new)
2982 {
2983 	kmem_free(new, sdev_vnodeops_tbl_size);
2984 }
2985 
/*
 * a generic setattr() function
 *
 * vp		- vnode whose attributes are being changed
 * vap		- new attribute values; va_mask selects which ones
 * flags	- passed through to VOP_SETATTR() and
 *		  secpolicy_vnode_setattr()
 * cred		- credentials of the caller
 * callback	- invoked after an in-memory uid/gid change when protocol
 *		  has AT_UID or AT_GID set; must then be non-NULL
 * protocol	- selects whether callback is invoked (AT_UID / AT_GID)
 *
 * Returns 0 on success (including the no-op case of a zombie node),
 * EINVAL if va_mask contains AT_NOSET bits, EISDIR for AT_SIZE on a
 * directory, or the error from the backing store, the security policy
 * check or the callback.
 *
 * note: protocol only supports AT_UID and AT_GID.
 *	 Future enhancements can be done for other types, e.g. AT_MODE
 */
int
devname_setattr_func(struct vnode *vp, struct vattr *vap, int flags,
    struct cred *cred, int (*callback)(struct sdev_node *, struct vattr *,
    int), int protocol)
{
	struct sdev_node	*dv = VTOSDEV(vp);
	struct sdev_node	*parent = dv->sdev_dotdot;
	struct vattr		*get;
	uint_t			mask = vap->va_mask;
	int 			error;

	/* some sanity checks */
	if (vap->va_mask & AT_NOSET)
		return (EINVAL);

	if (vap->va_mask & AT_SIZE) {
		if (vp->v_type == VDIR) {
			return (EISDIR);
		}
	}

	/*
	 * The parent's contents lock is held as reader for the rest of
	 * the function (it is dropped on every return path below).
	 */
	/* no need to set attribute, but do not fail either */
	ASSERT(parent);
	rw_enter(&parent->sdev_contents, RW_READER);
	if (dv->sdev_state == SDEV_ZOMBIE) {
		rw_exit(&parent->sdev_contents);
		return (0);
	}

	/* If backing store exists, just set it. */
	if (dv->sdev_attrvp) {
		rw_exit(&parent->sdev_contents);
		return (VOP_SETATTR(dv->sdev_attrvp, vap, flags, cred, NULL));
	}

	/*
	 * Otherwise, for nodes with the persistence attribute, create it.
	 * The same is done for a non-dynamic node when anything other than
	 * timestamps is being changed.
	 *
	 * NOTE(review): sdev_vattr_merge() runs here with only the parent's
	 * contents lock held as reader, before dv's own lock is taken;
	 * confirm that is sufficient.
	 */
	ASSERT(dv->sdev_attr);
	if (SDEV_IS_PERSIST(dv) ||
	    ((vap->va_mask & ~AT_TIMES) != 0 && !SDEV_IS_DYNAMIC(dv))) {
		sdev_vattr_merge(dv, vap);
		rw_enter(&dv->sdev_contents, RW_WRITER);
		error = sdev_shadow_node(dv, cred);
		rw_exit(&dv->sdev_contents);
		rw_exit(&parent->sdev_contents);

		if (error)
			return (error);
		return (VOP_SETATTR(dv->sdev_attrvp, vap, flags, cred, NULL));
	}


	/*
	 * sdev_attr was allocated in sdev_mknode
	 *
	 * From here on the attributes are kept in memory only: check the
	 * request against the security policy, then update dv->sdev_attr
	 * under dv's contents lock held as writer.
	 */
	rw_enter(&dv->sdev_contents, RW_WRITER);
	error = secpolicy_vnode_setattr(cred, vp, vap,
	    dv->sdev_attr, flags, sdev_unlocked_access, dv);
	if (error) {
		rw_exit(&dv->sdev_contents);
		rw_exit(&parent->sdev_contents);
		return (error);
	}

	get = dv->sdev_attr;
	/* replace the permission bits, keep the file type bits */
	if (mask & AT_MODE) {
		get->va_mode &= S_IFMT;
		get->va_mode |= vap->va_mode & ~S_IFMT;
	}

	if ((mask & AT_UID) || (mask & AT_GID)) {
		if (mask & AT_UID)
			get->va_uid = vap->va_uid;
		if (mask & AT_GID)
			get->va_gid = vap->va_gid;
		/*
		 * a callback must be provided if the protocol is set
		 *
		 * The in-memory uid/gid have already been updated when the
		 * callback runs; they are not restored if it fails.
		 */
		if ((protocol & AT_UID) || (protocol & AT_GID)) {
			ASSERT(callback);
			error = callback(dv, get, protocol);
			if (error) {
				rw_exit(&dv->sdev_contents);
				rw_exit(&parent->sdev_contents);
				return (error);
			}
		}
	}

	if (mask & AT_ATIME)
		get->va_atime = vap->va_atime;
	if (mask & AT_MTIME)
		get->va_mtime = vap->va_mtime;
	/* ctime is always set to the current time, never to vap's value */
	if (mask & (AT_MODE | AT_UID | AT_GID | AT_CTIME)) {
		gethrestime(&get->va_ctime);
	}

	sdev_vattr_merge(dv, get);
	rw_exit(&dv->sdev_contents);
	rw_exit(&parent->sdev_contents);
	return (0);
}
3095 
3096 /*
3097  * a generic inactive() function
3098  */
3099 /*ARGSUSED*/
3100 void
3101 devname_inactive_func(struct vnode *vp, struct cred *cred,
3102     void (*callback)(struct vnode *))
3103 {
3104 	int clean;
3105 	struct sdev_node *dv = VTOSDEV(vp);
3106 	int state;
3107 
3108 	mutex_enter(&vp->v_lock);
3109 	ASSERT(vp->v_count >= 1);
3110 
3111 
3112 	if (vp->v_count == 1 && callback != NULL)
3113 		callback(vp);
3114 
3115 	rw_enter(&dv->sdev_contents, RW_WRITER);
3116 	state = dv->sdev_state;
3117 
3118 	clean = (vp->v_count == 1) && (state == SDEV_ZOMBIE);
3119 
3120 	/*
3121 	 * sdev is a rather bad public citizen. It violates the general
3122 	 * agreement that in memory nodes should always have a valid reference
3123 	 * count on their vnode. But that's not the case here. This means that
3124 	 * we do actually have to distinguish between getting inactive callbacks
3125 	 * for zombies and otherwise. This should probably be fixed.
3126 	 */
3127 	if (clean) {
3128 		/* Remove the . entry to ourselves */
3129 		if (vp->v_type == VDIR) {
3130 			decr_link(dv);
3131 		}
3132 		VERIFY(dv->sdev_nlink == 1);
3133 		decr_link(dv);
3134 		--vp->v_count;
3135 		rw_exit(&dv->sdev_contents);
3136 		mutex_exit(&vp->v_lock);
3137 		sdev_nodedestroy(dv, 0);
3138 	} else {
3139 		--vp->v_count;
3140 		rw_exit(&dv->sdev_contents);
3141 		mutex_exit(&vp->v_lock);
3142 	}
3143 }
3144