xref: /illumos-gate/usr/src/uts/common/fs/dev/sdev_subr.c (revision 8c69cc8fbe729fa7b091e901c4b50508ccc6bb33)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2013, 2016 Joyent, Inc. All rights reserved.
24  * Copyright (c) 2017 by Delphix. All rights reserved.
25  */
26 
27 /*
28  * utility routines for the /dev fs
29  */
30 
31 #include <sys/types.h>
32 #include <sys/param.h>
33 #include <sys/t_lock.h>
34 #include <sys/systm.h>
35 #include <sys/sysmacros.h>
36 #include <sys/user.h>
37 #include <sys/time.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/file.h>
41 #include <sys/fcntl.h>
42 #include <sys/flock.h>
43 #include <sys/kmem.h>
44 #include <sys/uio.h>
45 #include <sys/errno.h>
46 #include <sys/stat.h>
47 #include <sys/cred.h>
48 #include <sys/dirent.h>
49 #include <sys/pathname.h>
50 #include <sys/cmn_err.h>
51 #include <sys/debug.h>
52 #include <sys/mode.h>
53 #include <sys/policy.h>
54 #include <fs/fs_subr.h>
55 #include <sys/mount.h>
56 #include <sys/fs/snode.h>
57 #include <sys/fs/dv_node.h>
58 #include <sys/fs/sdev_impl.h>
59 #include <sys/sunndi.h>
60 #include <sys/sunmdi.h>
61 #include <sys/conf.h>
62 #include <sys/proc.h>
63 #include <sys/user.h>
64 #include <sys/modctl.h>
65 
#ifdef DEBUG
/*
 * Debug-build tunables.
 *
 * sdev_debug: presumably the message mask consulted by the sdcmn_err*()
 * macros -- NOTE(review): confirm against the macro definitions.
 * sdev_debug_cache_flags: passed as the flags argument of
 * kmem_cache_create() when sdev_node_cache_init() builds the node cache.
 */
int sdev_debug = 0x00000001;
int sdev_debug_cache_flags = 0;
#endif
70 
/*
 * globals
 */
/* prototype memory vattrs */

/*
 * Default attributes for a directory node: type, mode, uid and gid are
 * flagged as valid in va_mask; every other initializer is zero.
 */
vattr_t sdev_vattr_dir = {
	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
	VDIR,					/* va_type */
	SDEV_DIRMODE_DEFAULT,			/* va_mode */
	SDEV_UID_DEFAULT,			/* va_uid */
	SDEV_GID_DEFAULT,			/* va_gid */
	0,					/* va_fsid */
	0,					/* va_nodeid */
	0,					/* va_nlink */
	0,					/* va_size */
	0,					/* va_atime */
	0,					/* va_mtime */
	0,					/* va_ctime */
	0,					/* va_rdev */
	0,					/* va_blksize */
	0,					/* va_nblocks */
	0					/* va_vcode */
};
93 
/*
 * Default attributes for a symlink node.  Unlike the other prototypes,
 * va_mask flags only type and mode as valid; the uid/gid defaults are
 * filled in but are not marked in the mask.
 */
vattr_t sdev_vattr_lnk = {
	AT_TYPE|AT_MODE,			/* va_mask */
	VLNK,					/* va_type */
	SDEV_LNKMODE_DEFAULT,			/* va_mode */
	SDEV_UID_DEFAULT,			/* va_uid */
	SDEV_GID_DEFAULT,			/* va_gid */
	0,					/* va_fsid */
	0,					/* va_nodeid */
	0,					/* va_nlink */
	0,					/* va_size */
	0,					/* va_atime */
	0,					/* va_mtime */
	0,					/* va_ctime */
	0,					/* va_rdev */
	0,					/* va_blksize */
	0,					/* va_nblocks */
	0					/* va_vcode */
};
112 
/*
 * Default attributes for a block device node.  The mode carries the
 * S_IFBLK file-type bits in addition to the default device permissions.
 */
vattr_t sdev_vattr_blk = {
	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
	VBLK,					/* va_type */
	S_IFBLK | SDEV_DEVMODE_DEFAULT,		/* va_mode */
	SDEV_UID_DEFAULT,			/* va_uid */
	SDEV_GID_DEFAULT,			/* va_gid */
	0,					/* va_fsid */
	0,					/* va_nodeid */
	0,					/* va_nlink */
	0,					/* va_size */
	0,					/* va_atime */
	0,					/* va_mtime */
	0,					/* va_ctime */
	0,					/* va_rdev */
	0,					/* va_blksize */
	0,					/* va_nblocks */
	0					/* va_vcode */
};
131 
/*
 * Default attributes for a character device node.  The mode carries the
 * S_IFCHR file-type bits in addition to the default device permissions.
 */
vattr_t sdev_vattr_chr = {
	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
	VCHR,					/* va_type */
	S_IFCHR | SDEV_DEVMODE_DEFAULT,		/* va_mode */
	SDEV_UID_DEFAULT,			/* va_uid */
	SDEV_GID_DEFAULT,			/* va_gid */
	0,					/* va_fsid */
	0,					/* va_nodeid */
	0,					/* va_nlink */
	0,					/* va_size */
	0,					/* va_atime */
	0,					/* va_mtime */
	0,					/* va_ctime */
	0,					/* va_rdev */
	0,					/* va_blksize */
	0,					/* va_nblocks */
	0					/* va_vcode */
};
150 
/*
 * sdev_node_cache is created by sdev_node_cache_init() and torn down by
 * sdev_node_cache_fini(); it is NULL whenever the cache does not exist.
 */
kmem_cache_t	*sdev_node_cache;	/* sdev_node cache */
int		devtype;		/* fstype */

/* forward declarations of file-local helpers defined later in this file */
static struct vnodeops *sdev_get_vop(struct sdev_node *);
static void sdev_set_no_negcache(struct sdev_node *);
static fs_operation_def_t *sdev_merge_vtab(const fs_operation_def_t []);
static void sdev_free_vtab(fs_operation_def_t *);
159 
160 static void
161 sdev_prof_free(struct sdev_node *dv)
162 {
163 	ASSERT(!SDEV_IS_GLOBAL(dv));
164 	nvlist_free(dv->sdev_prof.dev_name);
165 	nvlist_free(dv->sdev_prof.dev_map);
166 	nvlist_free(dv->sdev_prof.dev_symlink);
167 	nvlist_free(dv->sdev_prof.dev_glob_incdir);
168 	nvlist_free(dv->sdev_prof.dev_glob_excdir);
169 	bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
170 }
171 
172 /* sdev_node cache constructor */
173 /*ARGSUSED1*/
174 static int
175 i_sdev_node_ctor(void *buf, void *cfarg, int flag)
176 {
177 	struct sdev_node *dv = (struct sdev_node *)buf;
178 	struct vnode *vp;
179 
180 	bzero(buf, sizeof (struct sdev_node));
181 	vp = dv->sdev_vnode = vn_alloc(flag);
182 	if (vp == NULL) {
183 		return (-1);
184 	}
185 	vp->v_data = dv;
186 	rw_init(&dv->sdev_contents, NULL, RW_DEFAULT, NULL);
187 	return (0);
188 }
189 
190 /* sdev_node cache destructor */
191 /*ARGSUSED1*/
192 static void
193 i_sdev_node_dtor(void *buf, void *arg)
194 {
195 	struct sdev_node *dv = (struct sdev_node *)buf;
196 	struct vnode *vp = SDEVTOV(dv);
197 
198 	rw_destroy(&dv->sdev_contents);
199 	vn_free(vp);
200 }
201 
202 /* initialize sdev_node cache */
203 void
204 sdev_node_cache_init()
205 {
206 	int flags = 0;
207 
208 #ifdef	DEBUG
209 	flags = sdev_debug_cache_flags;
210 	if (flags)
211 		sdcmn_err(("cache debug flags 0x%x\n", flags));
212 #endif	/* DEBUG */
213 
214 	ASSERT(sdev_node_cache == NULL);
215 	sdev_node_cache = kmem_cache_create("sdev_node_cache",
216 	    sizeof (struct sdev_node), 0, i_sdev_node_ctor, i_sdev_node_dtor,
217 	    NULL, NULL, NULL, flags);
218 }
219 
220 /* destroy sdev_node cache */
221 void
222 sdev_node_cache_fini()
223 {
224 	ASSERT(sdev_node_cache != NULL);
225 	kmem_cache_destroy(sdev_node_cache);
226 	sdev_node_cache = NULL;
227 }
228 
229 /*
230  * Compare two nodes lexographically to balance avl tree
231  */
232 static int
233 sdev_compare_nodes(const struct sdev_node *dv1, const struct sdev_node *dv2)
234 {
235 	int rv;
236 	if ((rv = strcmp(dv1->sdev_name, dv2->sdev_name)) == 0)
237 		return (0);
238 	return ((rv < 0) ? -1 : 1);
239 }
240 
241 void
242 sdev_set_nodestate(struct sdev_node *dv, sdev_node_state_t state)
243 {
244 	ASSERT(dv);
245 	ASSERT(RW_WRITE_HELD(&dv->sdev_contents));
246 	dv->sdev_state = state;
247 }
248 
249 static void
250 sdev_attr_update(struct sdev_node *dv, vattr_t *vap)
251 {
252 	timestruc_t	now;
253 	struct vattr	*attrp;
254 	uint_t		mask;
255 
256 	ASSERT(dv->sdev_attr);
257 	ASSERT(vap);
258 
259 	attrp = dv->sdev_attr;
260 	mask = vap->va_mask;
261 	if (mask & AT_TYPE)
262 		attrp->va_type = vap->va_type;
263 	if (mask & AT_MODE)
264 		attrp->va_mode = vap->va_mode;
265 	if (mask & AT_UID)
266 		attrp->va_uid = vap->va_uid;
267 	if (mask & AT_GID)
268 		attrp->va_gid = vap->va_gid;
269 	if (mask & AT_RDEV)
270 		attrp->va_rdev = vap->va_rdev;
271 
272 	gethrestime(&now);
273 	attrp->va_atime = (mask & AT_ATIME) ? vap->va_atime : now;
274 	attrp->va_mtime = (mask & AT_MTIME) ? vap->va_mtime : now;
275 	attrp->va_ctime = (mask & AT_CTIME) ? vap->va_ctime : now;
276 }
277 
278 static void
279 sdev_attr_alloc(struct sdev_node *dv, vattr_t *vap)
280 {
281 	ASSERT(dv->sdev_attr == NULL);
282 	ASSERT(vap->va_mask & AT_TYPE);
283 	ASSERT(vap->va_mask & AT_MODE);
284 
285 	dv->sdev_attr = kmem_zalloc(sizeof (struct vattr), KM_SLEEP);
286 	sdev_attr_update(dv, vap);
287 }
288 
/*
 * Allocate and initialize a sdev_node named nm under parent ddv.
 *
 * ddv   - parent directory node; supplies the path prefix, the vfs, the
 *	   initial vnodeops and the initial flags
 * nm    - name of the new node
 * newdv - receives the new node on success, NULL on failure
 * vap   - optional attribute template; when non-NULL it sets the vnode
 *	   type and seeds the in-memory vattr
 *
 * Returns 0 on success, or ENAMETOOLONG when nm (including its '\0') does
 * not fit in MAXNAMELEN.  The node is left in SDEV_INIT state with a link
 * count of 0; this routine does not insert it into the parent.
 */
int
sdev_nodeinit(struct sdev_node *ddv, char *nm, struct sdev_node **newdv,
    vattr_t *vap)
{
	struct sdev_node *dv = NULL;
	struct vnode *vp;
	size_t nmlen, len;
	devname_handle_t  *dhl;

	/* nmlen counts the terminating '\0' */
	nmlen = strlen(nm) + 1;
	if (nmlen > MAXNAMELEN) {
		sdcmn_err9(("sdev_nodeinit: node name %s"
		    " too long\n", nm));
		*newdv = NULL;
		return (ENAMETOOLONG);
	}

	dv = kmem_cache_alloc(sdev_node_cache, KM_SLEEP);

	dv->sdev_name = kmem_alloc(nmlen, KM_SLEEP);
	bcopy(nm, dv->sdev_name, nmlen);
	dv->sdev_namelen = nmlen - 1;	/* '\0' not included */
	/* room for "<parent path>/<nm>": one '/' plus the trailing '\0' */
	len = strlen(ddv->sdev_path) + strlen(nm) + 2;
	dv->sdev_path = kmem_alloc(len, KM_SLEEP);
	(void) snprintf(dv->sdev_path, len, "%s/%s", ddv->sdev_path, nm);
	/* overwritten for VLNK nodes */
	dv->sdev_symlink = NULL;

	vp = SDEVTOV(dv);
	vn_reinit(vp);
	vp->v_vfsp = SDEVTOV(ddv)->v_vfsp;
	if (vap)
		vp->v_type = vap->va_type;

	/*
	 * initialized to the parent's vnodeops.
	 * maybe overwriten for a VDIR
	 */
	vn_setops(vp, vn_getops(SDEVTOV(ddv)));
	vn_exists(vp);

	dv->sdev_dotdot = NULL;
	dv->sdev_attrvp = NULL;
	if (vap) {
		sdev_attr_alloc(dv, vap);
	} else {
		dv->sdev_attr = NULL;
	}

	dv->sdev_ino = sdev_mkino(dv);
	dv->sdev_nlink = 0;		/* updated on insert */
	dv->sdev_flags = ddv->sdev_flags; /* inherit from the parent first */
	dv->sdev_flags |= SDEV_BUILD;
	mutex_init(&dv->sdev_lookup_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dv->sdev_lookup_cv, NULL, CV_DEFAULT, NULL);
	if (SDEV_IS_GLOBAL(ddv)) {
		/* global instance: set up the handle and negative-cache flag */
		dv->sdev_flags |= SDEV_GLOBAL;
		dhl = &(dv->sdev_handle);
		dhl->dh_data = dv;
		dhl->dh_args = NULL;
		sdev_set_no_negcache(dv);
		dv->sdev_gdir_gen = 0;
	} else {
		/* non-global instance: start with an empty profile */
		dv->sdev_flags &= ~SDEV_GLOBAL;
		dv->sdev_origin = NULL; /* set later */
		bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
		dv->sdev_ldir_gen = 0;
		dv->sdev_devtree_gen = 0;
	}

	/* sdev_set_nodestate() requires the contents lock held as writer */
	rw_enter(&dv->sdev_contents, RW_WRITER);
	sdev_set_nodestate(dv, SDEV_INIT);
	rw_exit(&dv->sdev_contents);
	*newdv = dv;

	return (0);
}
367 
/*
 * Transition a sdev_node into SDEV_READY state. If this fails, it is up to the
 * caller to transition the node to the SDEV_ZOMBIE state.
 *
 * dv   - node to make ready; must not already be SDEV_READY
 * vap  - attributes for the node; va_type and va_rdev are applied to the
 *	  vnode
 * avp  - optional backing-store (attribute) vnode
 * args - symlink target string for VLNK nodes; for non-global nodes it is
 *	  also stored as sdev_origin
 * cred - credentials passed to sdev_shadow_node()
 *
 * Returns 0 on success or the error from sdev_shadow_node().
 *
 * NOTE(review): for a non-global VLNK node, args is both duplicated as the
 * link target and stored as the origin node pointer -- confirm that callers
 * never hit that combination.
 */
int
sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp,
    void *args, struct cred *cred)
{
	int error = 0;
	struct vnode *vp = SDEVTOV(dv);
	vtype_t type;

	ASSERT(dv && (dv->sdev_state != SDEV_READY) && vap);

	type = vap->va_type;
	vp->v_type = type;
	vp->v_rdev = vap->va_rdev;
	rw_enter(&dv->sdev_contents, RW_WRITER);
	if (type == VDIR) {
		/*
		 * A directory starts with two links.  PERSIST and DYNAMIC are
		 * cleared first because sdev_get_vop() re-derives them from
		 * the vtab entry and the parent.
		 */
		dv->sdev_nlink = 2;
		dv->sdev_flags &= ~SDEV_PERSIST;
		dv->sdev_flags &= ~SDEV_DYNAMIC;
		vn_setops(vp, sdev_get_vop(dv)); /* from internal vtab */
		ASSERT(dv->sdev_dotdot);
		ASSERT(SDEVTOV(dv->sdev_dotdot)->v_type == VDIR);
		/* a directory takes its v_rdev from the parent directory */
		vp->v_rdev = SDEVTOV(dv->sdev_dotdot)->v_rdev;
		avl_create(&dv->sdev_entries,
		    (int (*)(const void *, const void *))sdev_compare_nodes,
		    sizeof (struct sdev_node),
		    offsetof(struct sdev_node, sdev_avllink));
	} else if (type == VLNK) {
		ASSERT(args);
		dv->sdev_nlink = 1;
		/* keep a private copy of the link target */
		dv->sdev_symlink = i_ddi_strdup((char *)args, KM_SLEEP);
	} else {
		dv->sdev_nlink = 1;
	}

	/* non-global nodes remember their origin and are never persisted */
	if (!(SDEV_IS_GLOBAL(dv))) {
		dv->sdev_origin = (struct sdev_node *)args;
		dv->sdev_flags &= ~SDEV_PERSIST;
	}

	/*
	 * shadow node is created here OR
	 * if failed (indicated by dv->sdev_attrvp == NULL),
	 * created later in sdev_setattr
	 */
	if (avp) {
		dv->sdev_attrvp = avp;
	} else {
		/* no backing vnode supplied: keep the attributes in memory */
		if (dv->sdev_attr == NULL) {
			sdev_attr_alloc(dv, vap);
		} else {
			sdev_attr_update(dv, vap);
		}

		if ((dv->sdev_attrvp == NULL) && SDEV_IS_PERSIST(dv))
			error = sdev_shadow_node(dv, cred);
	}

	if (error == 0) {
		/* transition to READY state */
		sdev_set_nodestate(dv, SDEV_READY);
		sdev_nc_node_exists(dv);
	}
	rw_exit(&dv->sdev_contents);
	return (error);
}
437 
/*
 * Build the VROOT sdev_node.
 *
 * vfsp   - vfs the root belongs to; its mount point, when already set,
 *	    becomes the node name
 * devdev - device number stored in the root vnode's v_rdev
 * mvp    - unused here
 * avp    - backing-store (attribute) vnode; must be non-NULL
 * cred   - unused here
 *
 * The returned node is its own parent (".." == self), starts with a link
 * count of 2, and is already in SDEV_READY state.  A root named "/dev" is
 * flagged as the global, persistent instance; any other name yields a
 * non-global, non-persistent instance.
 */
/*ARGSUSED*/
struct sdev_node *
sdev_mkroot(struct vfs *vfsp, dev_t devdev, struct vnode *mvp,
    struct vnode *avp, struct cred *cred)
{
	struct sdev_node *dv;
	struct vnode *vp;
	char devdir[] = "/dev";

	ASSERT(sdev_node_cache != NULL);
	ASSERT(avp);
	dv = kmem_cache_alloc(sdev_node_cache, KM_SLEEP);
	vp = SDEVTOV(dv);
	vn_reinit(vp);
	vp->v_flag |= VROOT;
	vp->v_vfsp = vfsp;
	vp->v_type = VDIR;
	vp->v_rdev = devdev;
	vn_setops(vp, sdev_vnodeops); /* apply the default vnodeops at /dev */
	vn_exists(vp);

	if (vfsp->vfs_mntpt)
		dv->sdev_name = i_ddi_strdup(
		    (char *)refstr_value(vfsp->vfs_mntpt), KM_SLEEP);
	else
		/* vfs_mountdev1 set mount point later */
		dv->sdev_name = i_ddi_strdup("/dev", KM_SLEEP);
	dv->sdev_namelen = strlen(dv->sdev_name); /* '\0' not included */
	/* the path is always "/dev", whatever the mount point name is */
	dv->sdev_path = i_ddi_strdup(devdir, KM_SLEEP);
	dv->sdev_ino = SDEV_ROOTINO;
	dv->sdev_nlink = 2;		/* name + . (no sdev_insert) */
	dv->sdev_dotdot = dv;		/* .. == self */
	dv->sdev_attrvp = avp;
	dv->sdev_attr = NULL;
	mutex_init(&dv->sdev_lookup_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dv->sdev_lookup_cv, NULL, CV_DEFAULT, NULL);
	if (strcmp(dv->sdev_name, "/dev") == 0) {
		/* the global /dev instance */
		dv->sdev_flags = SDEV_BUILD|SDEV_GLOBAL|SDEV_PERSIST;
		bzero(&dv->sdev_handle, sizeof (dv->sdev_handle));
		dv->sdev_gdir_gen = 0;
	} else {
		/* a non-global instance mounted elsewhere */
		dv->sdev_flags = SDEV_BUILD;
		dv->sdev_flags &= ~SDEV_PERSIST;
		bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
		dv->sdev_ldir_gen = 0;
		dv->sdev_devtree_gen = 0;
	}

	avl_create(&dv->sdev_entries,
	    (int (*)(const void *, const void *))sdev_compare_nodes,
	    sizeof (struct sdev_node),
	    offsetof(struct sdev_node, sdev_avllink));

	/* sdev_set_nodestate() requires the contents lock held as writer */
	rw_enter(&dv->sdev_contents, RW_WRITER);
	sdev_set_nodestate(dv, SDEV_READY);
	rw_exit(&dv->sdev_contents);
	sdev_nc_node_exists(dv);
	return (dv);
}
500 
/*
 * directory dependent vop table
 *
 * One entry describes a subdirectory of /dev that needs its own vnodeops,
 * node validator and/or node flags.  The entries of vtab[] are positional
 * initializers, so the field order here must not change.
 */
struct sdev_vop_table {
	char *vt_name;				/* subdirectory name */
	const fs_operation_def_t *vt_service;	/* vnodeops table */
	/* built on first use by sdev_get_vop() and then reused */
	struct vnodeops *vt_vops;		/* constructed vop */
	struct vnodeops **vt_global_vops;	/* global container for vop */
	int (*vt_vtor)(struct sdev_node *);	/* validate sdev_node */
	int vt_flags;				/* SDEV_* flags for the dir */
};
510 
/*
 * A nice improvement would be to provide a plug-in mechanism
 * for this table instead of a static table.
 *
 * Each initializer follows the field order of struct sdev_vop_table:
 * name, vnodeops template, constructed vnodeops (starts NULL and is filled
 * in by sdev_get_vop()), global vnodeops pointer, validator, flags.  The
 * table ends with an entry whose vt_name is NULL.
 */
static struct sdev_vop_table vtab[] =
{
	{ "pts", devpts_vnodeops_tbl, NULL, &devpts_vnodeops, devpts_validate,
	SDEV_DYNAMIC | SDEV_VTOR },

	{ "vt", devvt_vnodeops_tbl, NULL, &devvt_vnodeops, devvt_validate,
	SDEV_DYNAMIC | SDEV_VTOR },

	/* the only entry that sdev_match() also applies to subdirectories */
	{ "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops,
	devzvol_validate, SDEV_ZONED | SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },

	{ "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE },

	{ "net", devnet_vnodeops_tbl, NULL, &devnet_vnodeops, devnet_validate,
	SDEV_DYNAMIC | SDEV_VTOR },

	{ "ipnet", devipnet_vnodeops_tbl, NULL, &devipnet_vnodeops,
	devipnet_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_NO_NCACHE },

	/*
	 * SDEV_DYNAMIC: prevent calling out to devfsadm, since only the
	 * lofi driver controls child nodes.
	 *
	 * SDEV_PERSIST: ensure devfsadm knows to clean up any persisted
	 * stale nodes (e.g. from devfsadm -R).
	 *
	 * In addition, devfsadm knows not to attempt a rmdir: a zone
	 * may hold a reference, which would zombify the node,
	 * preventing a mkdir.
	 */

	{ "lofi", NULL, NULL, NULL, NULL,
	    SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },
	{ "rlofi", NULL, NULL, NULL, NULL,
	    SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },

	{ NULL, NULL, NULL, NULL, NULL, 0}
};
553 
554 /*
555  * We need to match off of the sdev_path, not the sdev_name. We are only allowed
556  * to exist directly under /dev.
557  */
558 struct sdev_vop_table *
559 sdev_match(struct sdev_node *dv)
560 {
561 	int vlen;
562 	int i;
563 	const char *path;
564 
565 	if (strlen(dv->sdev_path) <= 5)
566 		return (NULL);
567 
568 	if (strncmp(dv->sdev_path, "/dev/", 5) != 0)
569 		return (NULL);
570 	path = dv->sdev_path + 5;
571 
572 	for (i = 0; vtab[i].vt_name; i++) {
573 		if (strcmp(vtab[i].vt_name, path) == 0)
574 			return (&vtab[i]);
575 		if (vtab[i].vt_flags & SDEV_SUBDIR) {
576 			vlen = strlen(vtab[i].vt_name);
577 			if ((strncmp(vtab[i].vt_name, path,
578 			    vlen - 1) == 0) && path[vlen] == '/')
579 				return (&vtab[i]);
580 		}
581 
582 	}
583 	return (NULL);
584 }
585 
586 /*
587  *  sets a directory's vnodeops if the directory is in the vtab;
588  */
589 static struct vnodeops *
590 sdev_get_vop(struct sdev_node *dv)
591 {
592 	struct sdev_vop_table *vtp;
593 	char *path;
594 
595 	path = dv->sdev_path;
596 	ASSERT(path);
597 
598 	/* gets the relative path to /dev/ */
599 	path += 5;
600 
601 	/* gets the vtab entry it matches */
602 	if ((vtp = sdev_match(dv)) != NULL) {
603 		dv->sdev_flags |= vtp->vt_flags;
604 		if (SDEV_IS_PERSIST(dv->sdev_dotdot) &&
605 		    (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv)))
606 			dv->sdev_flags |= SDEV_PERSIST;
607 
608 		if (vtp->vt_vops) {
609 			if (vtp->vt_global_vops)
610 				*(vtp->vt_global_vops) = vtp->vt_vops;
611 
612 			return (vtp->vt_vops);
613 		}
614 
615 		if (vtp->vt_service) {
616 			fs_operation_def_t *templ;
617 			templ = sdev_merge_vtab(vtp->vt_service);
618 			if (vn_make_ops(vtp->vt_name,
619 			    (const fs_operation_def_t *)templ,
620 			    &vtp->vt_vops) != 0) {
621 				cmn_err(CE_PANIC, "%s: malformed vnode ops\n",
622 				    vtp->vt_name);
623 				/*NOTREACHED*/
624 			}
625 			if (vtp->vt_global_vops) {
626 				*(vtp->vt_global_vops) = vtp->vt_vops;
627 			}
628 			sdev_free_vtab(templ);
629 
630 			return (vtp->vt_vops);
631 		}
632 
633 		return (sdev_vnodeops);
634 	}
635 
636 	/* child inherits the persistence of the parent */
637 	if (SDEV_IS_PERSIST(dv->sdev_dotdot))
638 		dv->sdev_flags |= SDEV_PERSIST;
639 
640 	return (sdev_vnodeops);
641 }
642 
643 static void
644 sdev_set_no_negcache(struct sdev_node *dv)
645 {
646 	int i;
647 	char *path;
648 
649 	ASSERT(dv->sdev_path);
650 	path = dv->sdev_path + strlen("/dev/");
651 
652 	for (i = 0; vtab[i].vt_name; i++) {
653 		if (strcmp(vtab[i].vt_name, path) == 0) {
654 			if (vtab[i].vt_flags & SDEV_NO_NCACHE)
655 				dv->sdev_flags |= SDEV_NO_NCACHE;
656 			break;
657 		}
658 	}
659 }
660 
661 void *
662 sdev_get_vtor(struct sdev_node *dv)
663 {
664 	struct sdev_vop_table *vtp;
665 
666 	vtp = sdev_match(dv);
667 	if (vtp)
668 		return ((void *)vtp->vt_vtor);
669 	else
670 		return (NULL);
671 }
672 
673 /*
674  * Build the base root inode
675  */
676 ino_t
677 sdev_mkino(struct sdev_node *dv)
678 {
679 	ino_t	ino;
680 
681 	/*
682 	 * for now, follow the lead of tmpfs here
683 	 * need to someday understand the requirements here
684 	 */
685 	ino = (ino_t)(uint32_t)((uintptr_t)dv >> 3);
686 	ino += SDEV_ROOTINO + 1;
687 
688 	return (ino);
689 }
690 
691 int
692 sdev_getlink(struct vnode *linkvp, char **link)
693 {
694 	int err;
695 	char *buf;
696 	struct uio uio = {0};
697 	struct iovec iov = {0};
698 
699 	if (linkvp == NULL)
700 		return (ENOENT);
701 	ASSERT(linkvp->v_type == VLNK);
702 
703 	buf = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
704 	iov.iov_base = buf;
705 	iov.iov_len = MAXPATHLEN;
706 	uio.uio_iov = &iov;
707 	uio.uio_iovcnt = 1;
708 	uio.uio_resid = MAXPATHLEN;
709 	uio.uio_segflg = UIO_SYSSPACE;
710 	uio.uio_llimit = MAXOFFSET_T;
711 
712 	err = VOP_READLINK(linkvp, &uio, kcred, NULL);
713 	if (err) {
714 		cmn_err(CE_WARN, "readlink %s failed in dev\n", buf);
715 		kmem_free(buf, MAXPATHLEN);
716 		return (ENOENT);
717 	}
718 
719 	/* mission complete */
720 	*link = i_ddi_strdup(buf, KM_SLEEP);
721 	kmem_free(buf, MAXPATHLEN);
722 	return (0);
723 }
724 
725 /*
726  * A convenient wrapper to get the devfs node vnode for a device
727  * minor functionality: readlink() of a /dev symlink
728  * Place the link into dv->sdev_symlink
729  */
730 static int
731 sdev_follow_link(struct sdev_node *dv)
732 {
733 	int err;
734 	struct vnode *linkvp;
735 	char *link = NULL;
736 
737 	linkvp = SDEVTOV(dv);
738 	if (linkvp == NULL)
739 		return (ENOENT);
740 	ASSERT(linkvp->v_type == VLNK);
741 	err = sdev_getlink(linkvp, &link);
742 	if (err) {
743 		dv->sdev_symlink = NULL;
744 		return (ENOENT);
745 	}
746 
747 	ASSERT(link != NULL);
748 	dv->sdev_symlink = link;
749 	return (0);
750 }
751 
752 static int
753 sdev_node_check(struct sdev_node *dv, struct vattr *nvap, void *nargs)
754 {
755 	vtype_t otype = SDEVTOV(dv)->v_type;
756 
757 	/*
758 	 * existing sdev_node has a different type.
759 	 */
760 	if (otype != nvap->va_type) {
761 		sdcmn_err9(("sdev_node_check: existing node "
762 		    "  %s type %d does not match new node type %d\n",
763 		    dv->sdev_name, otype, nvap->va_type));
764 		return (EEXIST);
765 	}
766 
767 	/*
768 	 * For a symlink, the target should be the same.
769 	 */
770 	if (otype == VLNK) {
771 		ASSERT(nargs != NULL);
772 		ASSERT(dv->sdev_symlink != NULL);
773 		if (strcmp(dv->sdev_symlink, (char *)nargs) != 0) {
774 			sdcmn_err9(("sdev_node_check: existing node "
775 			    " %s has different symlink %s as new node "
776 			    " %s\n", dv->sdev_name, dv->sdev_symlink,
777 			    (char *)nargs));
778 			return (EEXIST);
779 		}
780 	}
781 
782 	return (0);
783 }
784 
/*
 * sdev_mknode - a wrapper for sdev_nodeinit(), sdev_nodeready()
 *
 * arguments:
 *	- ddv (parent)
 *	- nm (child name)
 *	- newdv (in/out: an existing node may be passed in through *newdv;
 *	  the sdev_node for nm is returned here, NULL on failure)
 *	- vap (vattr for the node to be created, va_type should be set.
 *	  the defaults should be used if unknown)
 *	- avp (attribute vnode)
 *	- args
 *	    . tnm (for VLNK)
 *	    . global sdev_node (for !SDEV_GLOBAL)
 *	- cred
 *	- state: SDEV_INIT, SDEV_READY
 *
 * only ddv, nm, newdv, vap, cred are required for sdev_mknode(SDEV_INIT)
 *
 * returns 0 on success; ENOENT when a new node is needed but the parent is
 * a zombie; otherwise the error from sdev_nodeinit(), sdev_nodeready() or
 * sdev_node_check().
 *
 * NOTE:  directory contents writers lock needs to be held before
 *	  calling this routine.
 */
int
sdev_mknode(struct sdev_node *ddv, char *nm, struct sdev_node **newdv,
    struct vattr *vap, struct vnode *avp, void *args, struct cred *cred,
    sdev_node_state_t state)
{
	int error = 0;
	sdev_node_state_t node_state;
	struct sdev_node *dv = NULL;

	ASSERT(state != SDEV_ZOMBIE);
	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));

	if (*newdv) {
		/* the caller handed us an existing node to work on */
		dv = *newdv;
	} else {
		/* allocate and initialize a sdev_node */
		if (ddv->sdev_state == SDEV_ZOMBIE) {
			sdcmn_err9(("sdev_mknode: parent %s ZOMBIEd\n",
			    ddv->sdev_path));
			return (ENOENT);
		}

		error = sdev_nodeinit(ddv, nm, &dv, vap);
		if (error != 0) {
			sdcmn_err9(("sdev_mknode: error %d,"
			    " name %s can not be initialized\n",
			    error, nm));
			return (error);
		}
		ASSERT(dv);

		/* insert into the directory cache */
		sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_ADD);
	}

	ASSERT(dv);
	node_state = dv->sdev_state;
	ASSERT(node_state != SDEV_ZOMBIE);

	if (state == SDEV_READY) {
		switch (node_state) {
		case SDEV_INIT:
			error = sdev_nodeready(dv, vap, avp, args, cred);
			if (error) {
				sdcmn_err9(("sdev_mknode: node %s can NOT"
				    " be transitioned into READY state, "
				    "error %d\n", nm, error));
			}
			break;
		case SDEV_READY:
			/*
			 * Do some sanity checking to make sure
			 * the existing sdev_node is what has been
			 * asked for.
			 */
			error = sdev_node_check(dv, vap, args);
			break;
		default:
			break;
		}
	}

	if (!error) {
		*newdv = dv;
		ASSERT((*newdv)->sdev_state != SDEV_ZOMBIE);
	} else {
		/* on any failure the node is taken out of the directory cache */
		sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_DELETE);
		/*
		 * We created this node, it wasn't passed into us. Therefore it
		 * is up to us to delete it.
		 */
		if (*newdv == NULL)
			SDEV_SIMPLE_RELE(dv);
		*newdv = NULL;
	}

	return (error);
}
884 
885 /*
886  * convenient wrapper to change vp's ATIME, CTIME and MTIME
887  */
888 void
889 sdev_update_timestamps(struct vnode *vp, cred_t *cred, uint_t mask)
890 {
891 	struct vattr attr;
892 	timestruc_t now;
893 	int err;
894 
895 	ASSERT(vp);
896 	gethrestime(&now);
897 	if (mask & AT_CTIME)
898 		attr.va_ctime = now;
899 	if (mask & AT_MTIME)
900 		attr.va_mtime = now;
901 	if (mask & AT_ATIME)
902 		attr.va_atime = now;
903 
904 	attr.va_mask = (mask & AT_TIMES);
905 	err = VOP_SETATTR(vp, &attr, 0, cred, NULL);
906 	if (err && (err != EROFS)) {
907 		sdcmn_err(("update timestamps error %d\n", err));
908 	}
909 }
910 
/*
 * Tear down a sdev_node whose link count has dropped to zero and return it
 * to the node cache.
 *
 * the backing store vnode is released here
 *
 * Also freed here: the in-memory vattr, the name, symlink and path strings,
 * the profile of a non-global node, the directory AVL tree (which must be
 * empty) and the lookup mutex/cv.  The flags argument is unused.
 */
/*ARGSUSED1*/
void
sdev_nodedestroy(struct sdev_node *dv, uint_t flags)
{
	/* no references */
	ASSERT(dv->sdev_nlink == 0);

	if (dv->sdev_attrvp != NULLVP) {
		VN_RELE(dv->sdev_attrvp);
		/*
		 * reset the attrvp so that no more
		 * references can be made on this already
		 * vn_rele() vnode
		 */
		dv->sdev_attrvp = NULLVP;
	}

	if (dv->sdev_attr != NULL) {
		kmem_free(dv->sdev_attr, sizeof (struct vattr));
		dv->sdev_attr = NULL;
	}

	/* the name was allocated with room for its '\0' */
	if (dv->sdev_name != NULL) {
		kmem_free(dv->sdev_name, dv->sdev_namelen + 1);
		dv->sdev_name = NULL;
	}

	if (dv->sdev_symlink != NULL) {
		kmem_free(dv->sdev_symlink, strlen(dv->sdev_symlink) + 1);
		dv->sdev_symlink = NULL;
	}

	if (dv->sdev_path) {
		kmem_free(dv->sdev_path, strlen(dv->sdev_path) + 1);
		dv->sdev_path = NULL;
	}

	if (!SDEV_IS_GLOBAL(dv))
		sdev_prof_free(dv);

	if (SDEVTOV(dv)->v_type == VDIR) {
		ASSERT(SDEV_FIRST_ENTRY(dv) == NULL);
		avl_destroy(&dv->sdev_entries);
	}

	mutex_destroy(&dv->sdev_lookup_lock);
	cv_destroy(&dv->sdev_lookup_cv);

	/* return node to initial state as per constructor */
	(void) memset((void *)&dv->sdev_instance_data, 0,
	    sizeof (dv->sdev_instance_data));
	vn_invalid(SDEVTOV(dv));
	kmem_cache_free(sdev_node_cache, dv);
}
968 
969 /*
970  * DIRECTORY CACHE lookup
971  */
972 struct sdev_node *
973 sdev_findbyname(struct sdev_node *ddv, char *nm)
974 {
975 	struct sdev_node *dv;
976 	struct sdev_node dvtmp;
977 	avl_index_t	where;
978 
979 	ASSERT(RW_LOCK_HELD(&ddv->sdev_contents));
980 
981 	dvtmp.sdev_name = nm;
982 	dv = avl_find(&ddv->sdev_entries, &dvtmp, &where);
983 	if (dv) {
984 		ASSERT(dv->sdev_dotdot == ddv);
985 		ASSERT(strcmp(dv->sdev_name, nm) == 0);
986 		ASSERT(dv->sdev_state != SDEV_ZOMBIE);
987 		SDEV_HOLD(dv);
988 		return (dv);
989 	}
990 	return (NULL);
991 }
992 
993 /*
994  * Inserts a new sdev_node in a parent directory
995  */
996 void
997 sdev_direnter(struct sdev_node *ddv, struct sdev_node *dv)
998 {
999 	avl_index_t where;
1000 
1001 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1002 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
1003 	ASSERT(ddv->sdev_nlink >= 2);
1004 	ASSERT(dv->sdev_nlink == 0);
1005 	ASSERT(dv->sdev_state != SDEV_ZOMBIE);
1006 
1007 	dv->sdev_dotdot = ddv;
1008 	VERIFY(avl_find(&ddv->sdev_entries, dv, &where) == NULL);
1009 	avl_insert(&ddv->sdev_entries, dv, where);
1010 	ddv->sdev_nlink++;
1011 }
1012 
1013 /*
1014  * The following check is needed because while sdev_nodes are linked
1015  * in SDEV_INIT state, they have their link counts incremented only
1016  * in SDEV_READY state.
1017  */
1018 static void
1019 decr_link(struct sdev_node *dv)
1020 {
1021 	VERIFY(RW_WRITE_HELD(&dv->sdev_contents));
1022 	if (dv->sdev_state != SDEV_INIT) {
1023 		VERIFY(dv->sdev_nlink >= 1);
1024 		dv->sdev_nlink--;
1025 	} else {
1026 		VERIFY(dv->sdev_nlink == 0);
1027 	}
1028 }
1029 
/*
 * Delete an existing dv from directory cache
 *
 * In the case of a node is still held by non-zero reference count, the node is
 * put into ZOMBIE state. The node is always unlinked from its parent, but it is
 * not destroyed via sdev_inactive until its reference count reaches "0".
 *
 * The caller must hold ddv's contents lock as writer.  dv must not already
 * be a zombie.
 */
static void
sdev_dirdelete(struct sdev_node *ddv, struct sdev_node *dv)
{
	struct vnode *vp;
	sdev_node_state_t os;

	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));

	vp = SDEVTOV(dv);
	/* the vnode lock is taken before, and dropped after, dv's contents lock */
	mutex_enter(&vp->v_lock);
	rw_enter(&dv->sdev_contents, RW_WRITER);
	/* remember the old state; it decides the link fix-up below */
	os = dv->sdev_state;
	ASSERT(os != SDEV_ZOMBIE);
	dv->sdev_state = SDEV_ZOMBIE;

	/*
	 * unlink ourselves from the parent directory now to take care of the ..
	 * link. However, if we're a directory, we don't remove our reference to
	 * ourself eg. '.' until we are torn down in the inactive callback.
	 */
	decr_link(ddv);
	avl_remove(&ddv->sdev_entries, dv);
	/*
	 * sdev_inactive expects nodes to have a link to themselves when we're
	 * tearing them down. If we're transitioning from the initial state to
	 * zombie and not via ready, then we're not going to have this link that
	 * comes from the node being ready. As a result, we need to increment
	 * our link count by one to account for this.
	 */
	if (os == SDEV_INIT && dv->sdev_nlink == 0)
		dv->sdev_nlink++;
	rw_exit(&dv->sdev_contents);
	mutex_exit(&vp->v_lock);
}
1071 
1072 /*
1073  * check if the source is in the path of the target
1074  *
1075  * source and target are different
1076  */
1077 /*ARGSUSED2*/
1078 static int
1079 sdev_checkpath(struct sdev_node *sdv, struct sdev_node *tdv, struct cred *cred)
1080 {
1081 	int error = 0;
1082 	struct sdev_node *dotdot, *dir;
1083 
1084 	dotdot = tdv->sdev_dotdot;
1085 	ASSERT(dotdot);
1086 
1087 	/* fs root */
1088 	if (dotdot == tdv) {
1089 		return (0);
1090 	}
1091 
1092 	for (;;) {
1093 		/*
1094 		 * avoid error cases like
1095 		 *	mv a a/b
1096 		 *	mv a a/b/c
1097 		 *	etc.
1098 		 */
1099 		if (dotdot == sdv) {
1100 			error = EINVAL;
1101 			break;
1102 		}
1103 
1104 		dir = dotdot;
1105 		dotdot = dir->sdev_dotdot;
1106 
1107 		/* done checking because root is reached */
1108 		if (dir == dotdot) {
1109 			break;
1110 		}
1111 	}
1112 	return (error);
1113 }
1114 
1115 int
1116 sdev_rnmnode(struct sdev_node *oddv, struct sdev_node *odv,
1117     struct sdev_node *nddv, struct sdev_node **ndvp, char *nnm,
1118     struct cred *cred)
1119 {
1120 	int error = 0;
1121 	struct vnode *ovp = SDEVTOV(odv);
1122 	struct vnode *nvp;
1123 	struct vattr vattr;
1124 	int doingdir = (ovp->v_type == VDIR);
1125 	char *link = NULL;
1126 	int samedir = (oddv == nddv) ? 1 : 0;
1127 	int bkstore = 0;
1128 	struct sdev_node *idv = NULL;
1129 	struct sdev_node *ndv = NULL;
1130 	timestruc_t now;
1131 
1132 	vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
1133 	error = VOP_GETATTR(ovp, &vattr, 0, cred, NULL);
1134 	if (error)
1135 		return (error);
1136 
1137 	if (!samedir)
1138 		rw_enter(&oddv->sdev_contents, RW_WRITER);
1139 	rw_enter(&nddv->sdev_contents, RW_WRITER);
1140 
1141 	/*
1142 	 * the source may have been deleted by another thread before
1143 	 * we gets here.
1144 	 */
1145 	if (odv->sdev_state != SDEV_READY) {
1146 		error = ENOENT;
1147 		goto err_out;
1148 	}
1149 
1150 	if (doingdir && (odv == nddv)) {
1151 		error = EINVAL;
1152 		goto err_out;
1153 	}
1154 
1155 	/*
1156 	 * If renaming a directory, and the parents are different (".." must be
1157 	 * changed) then the source dir must not be in the dir hierarchy above
1158 	 * the target since it would orphan everything below the source dir.
1159 	 */
1160 	if (doingdir && (oddv != nddv)) {
1161 		error = sdev_checkpath(odv, nddv, cred);
1162 		if (error)
1163 			goto err_out;
1164 	}
1165 
1166 	/* fix the source for a symlink */
1167 	if (vattr.va_type == VLNK) {
1168 		if (odv->sdev_symlink == NULL) {
1169 			error = sdev_follow_link(odv);
1170 			if (error) {
1171 				/*
1172 				 * The underlying symlink doesn't exist. This
1173 				 * node probably shouldn't even exist. While
1174 				 * it's a bit jarring to consumers, we're going
1175 				 * to remove the node from /dev.
1176 				 */
1177 				if (SDEV_IS_PERSIST((*ndvp)))
1178 					bkstore = 1;
1179 				sdev_dirdelete(oddv, odv);
1180 				if (bkstore) {
1181 					ASSERT(nddv->sdev_attrvp);
1182 					error = VOP_REMOVE(nddv->sdev_attrvp,
1183 					    nnm, cred, NULL, 0);
1184 					if (error)
1185 						goto err_out;
1186 				}
1187 				error = ENOENT;
1188 				goto err_out;
1189 			}
1190 		}
1191 		ASSERT(odv->sdev_symlink);
1192 		link = i_ddi_strdup(odv->sdev_symlink, KM_SLEEP);
1193 	}
1194 
1195 	/* destination existing */
1196 	if (*ndvp) {
1197 		nvp = SDEVTOV(*ndvp);
1198 		ASSERT(nvp);
1199 
1200 		/* handling renaming to itself */
1201 		if (odv == *ndvp) {
1202 			error = 0;
1203 			goto err_out;
1204 		}
1205 
1206 		if (nvp->v_type == VDIR) {
1207 			if (!doingdir) {
1208 				error = EISDIR;
1209 				goto err_out;
1210 			}
1211 
1212 			if (vn_vfswlock(nvp)) {
1213 				error = EBUSY;
1214 				goto err_out;
1215 			}
1216 
1217 			if (vn_mountedvfs(nvp) != NULL) {
1218 				vn_vfsunlock(nvp);
1219 				error = EBUSY;
1220 				goto err_out;
1221 			}
1222 
1223 			/* in case dir1 exists in dir2 and "mv dir1 dir2" */
1224 			if ((*ndvp)->sdev_nlink > 2) {
1225 				vn_vfsunlock(nvp);
1226 				error = EEXIST;
1227 				goto err_out;
1228 			}
1229 			vn_vfsunlock(nvp);
1230 
1231 			/*
1232 			 * We did not place the hold on *ndvp, so even though
1233 			 * we're deleting the node, we should not get rid of our
1234 			 * reference.
1235 			 */
1236 			sdev_dirdelete(nddv, *ndvp);
1237 			*ndvp = NULL;
1238 			ASSERT(nddv->sdev_attrvp);
1239 			error = VOP_RMDIR(nddv->sdev_attrvp, nnm,
1240 			    nddv->sdev_attrvp, cred, NULL, 0);
1241 			if (error)
1242 				goto err_out;
1243 		} else {
1244 			if (doingdir) {
1245 				error = ENOTDIR;
1246 				goto err_out;
1247 			}
1248 
1249 			if (SDEV_IS_PERSIST((*ndvp))) {
1250 				bkstore = 1;
1251 			}
1252 
1253 			/*
1254 			 * Get rid of the node from the directory cache note.
1255 			 * Don't forget that it's not up to us to remove the vn
1256 			 * ref on the sdev node, as we did not place it.
1257 			 */
1258 			sdev_dirdelete(nddv, *ndvp);
1259 			*ndvp = NULL;
1260 			if (bkstore) {
1261 				ASSERT(nddv->sdev_attrvp);
1262 				error = VOP_REMOVE(nddv->sdev_attrvp,
1263 				    nnm, cred, NULL, 0);
1264 				if (error)
1265 					goto err_out;
1266 			}
1267 		}
1268 	}
1269 
1270 	/*
1271 	 * make a fresh node from the source attrs
1272 	 */
1273 	ASSERT(RW_WRITE_HELD(&nddv->sdev_contents));
1274 	error = sdev_mknode(nddv, nnm, ndvp, &vattr,
1275 	    NULL, (void *)link, cred, SDEV_READY);
1276 
1277 	if (link != NULL) {
1278 		kmem_free(link, strlen(link) + 1);
1279 		link = NULL;
1280 	}
1281 
1282 	if (error)
1283 		goto err_out;
1284 	ASSERT(*ndvp);
1285 	ASSERT((*ndvp)->sdev_state == SDEV_READY);
1286 
1287 	/* move dir contents */
1288 	if (doingdir) {
1289 		for (idv = SDEV_FIRST_ENTRY(odv); idv;
1290 		    idv = SDEV_NEXT_ENTRY(odv, idv)) {
1291 			SDEV_HOLD(idv);
1292 			error = sdev_rnmnode(odv, idv,
1293 			    (struct sdev_node *)(*ndvp), &ndv,
1294 			    idv->sdev_name, cred);
1295 			SDEV_RELE(idv);
1296 			if (error)
1297 				goto err_out;
1298 			ndv = NULL;
1299 		}
1300 	}
1301 
1302 	if ((*ndvp)->sdev_attrvp) {
1303 		sdev_update_timestamps((*ndvp)->sdev_attrvp, kcred,
1304 		    AT_CTIME|AT_ATIME);
1305 	} else {
1306 		ASSERT((*ndvp)->sdev_attr);
1307 		gethrestime(&now);
1308 		(*ndvp)->sdev_attr->va_ctime = now;
1309 		(*ndvp)->sdev_attr->va_atime = now;
1310 	}
1311 
1312 	if (nddv->sdev_attrvp) {
1313 		sdev_update_timestamps(nddv->sdev_attrvp, kcred,
1314 		    AT_MTIME|AT_ATIME);
1315 	} else {
1316 		ASSERT(nddv->sdev_attr);
1317 		gethrestime(&now);
1318 		nddv->sdev_attr->va_mtime = now;
1319 		nddv->sdev_attr->va_atime = now;
1320 	}
1321 	rw_exit(&nddv->sdev_contents);
1322 	if (!samedir)
1323 		rw_exit(&oddv->sdev_contents);
1324 
1325 	SDEV_RELE(*ndvp);
1326 	return (error);
1327 
1328 err_out:
1329 	if (link != NULL) {
1330 		kmem_free(link, strlen(link) + 1);
1331 		link = NULL;
1332 	}
1333 
1334 	rw_exit(&nddv->sdev_contents);
1335 	if (!samedir)
1336 		rw_exit(&oddv->sdev_contents);
1337 	return (error);
1338 }
1339 
1340 /*
1341  * Merge sdev_node specific information into an attribute structure.
1342  *
1343  * note: sdev_node is not locked here
1344  */
1345 void
1346 sdev_vattr_merge(struct sdev_node *dv, struct vattr *vap)
1347 {
1348 	struct vnode *vp = SDEVTOV(dv);
1349 
1350 	vap->va_nlink = dv->sdev_nlink;
1351 	vap->va_nodeid = dv->sdev_ino;
1352 	vap->va_fsid = SDEVTOV(dv->sdev_dotdot)->v_rdev;
1353 	vap->va_type = vp->v_type;
1354 
1355 	if (vp->v_type == VDIR) {
1356 		vap->va_rdev = 0;
1357 		vap->va_fsid = vp->v_rdev;
1358 	} else if (vp->v_type == VLNK) {
1359 		vap->va_rdev = 0;
1360 		vap->va_mode  &= ~S_IFMT;
1361 		vap->va_mode |= S_IFLNK;
1362 	} else if ((vp->v_type == VCHR) || (vp->v_type == VBLK)) {
1363 		vap->va_rdev = vp->v_rdev;
1364 		vap->va_mode &= ~S_IFMT;
1365 		if (vap->va_type == VCHR)
1366 			vap->va_mode |= S_IFCHR;
1367 		else
1368 			vap->va_mode |= S_IFBLK;
1369 	} else {
1370 		vap->va_rdev = 0;
1371 	}
1372 }
1373 
1374 struct vattr *
1375 sdev_getdefault_attr(enum vtype type)
1376 {
1377 	if (type == VDIR)
1378 		return (&sdev_vattr_dir);
1379 	else if (type == VCHR)
1380 		return (&sdev_vattr_chr);
1381 	else if (type == VBLK)
1382 		return (&sdev_vattr_blk);
1383 	else if (type == VLNK)
1384 		return (&sdev_vattr_lnk);
1385 	else
1386 		return (NULL);
1387 }
1388 int
1389 sdev_to_vp(struct sdev_node *dv, struct vnode **vpp)
1390 {
1391 	int rv = 0;
1392 	struct vnode *vp = SDEVTOV(dv);
1393 
1394 	switch (vp->v_type) {
1395 	case VCHR:
1396 	case VBLK:
1397 		/*
1398 		 * If vnode is a device, return special vnode instead
1399 		 * (though it knows all about -us- via sp->s_realvp)
1400 		 */
1401 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, kcred);
1402 		VN_RELE(vp);
1403 		if (*vpp == NULLVP)
1404 			rv = ENOSYS;
1405 		break;
1406 	default:	/* most types are returned as is */
1407 		*vpp = vp;
1408 		break;
1409 	}
1410 	return (rv);
1411 }
1412 
1413 /*
1414  * junction between devname and root file system, e.g. ufs
1415  */
1416 int
1417 devname_backstore_lookup(struct sdev_node *ddv, char *nm, struct vnode **rvp)
1418 {
1419 	struct vnode *rdvp = ddv->sdev_attrvp;
1420 	int rval = 0;
1421 
1422 	ASSERT(rdvp);
1423 
1424 	rval = VOP_LOOKUP(rdvp, nm, rvp, NULL, 0, NULL, kcred, NULL, NULL,
1425 	    NULL);
1426 	return (rval);
1427 }
1428 
1429 static int
1430 sdev_filldir_from_store(struct sdev_node *ddv, int dlen, struct cred *cred)
1431 {
1432 	struct sdev_node *dv = NULL;
1433 	char	*nm;
1434 	struct vnode *dirvp;
1435 	int	error;
1436 	vnode_t	*vp;
1437 	int eof;
1438 	struct iovec iov;
1439 	struct uio uio;
1440 	struct dirent64 *dp;
1441 	dirent64_t *dbuf;
1442 	size_t dbuflen;
1443 	struct vattr vattr;
1444 	char *link = NULL;
1445 
1446 	if (ddv->sdev_attrvp == NULL)
1447 		return (0);
1448 	if (!(ddv->sdev_flags & SDEV_BUILD))
1449 		return (0);
1450 
1451 	dirvp = ddv->sdev_attrvp;
1452 	VN_HOLD(dirvp);
1453 	dbuf = kmem_zalloc(dlen, KM_SLEEP);
1454 
1455 	uio.uio_iov = &iov;
1456 	uio.uio_iovcnt = 1;
1457 	uio.uio_segflg = UIO_SYSSPACE;
1458 	uio.uio_fmode = 0;
1459 	uio.uio_extflg = UIO_COPY_CACHED;
1460 	uio.uio_loffset = 0;
1461 	uio.uio_llimit = MAXOFFSET_T;
1462 
1463 	eof = 0;
1464 	error = 0;
1465 	while (!error && !eof) {
1466 		uio.uio_resid = dlen;
1467 		iov.iov_base = (char *)dbuf;
1468 		iov.iov_len = dlen;
1469 		(void) VOP_RWLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1470 		error = VOP_READDIR(dirvp, &uio, kcred, &eof, NULL, 0);
1471 		VOP_RWUNLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1472 
1473 		dbuflen = dlen - uio.uio_resid;
1474 		if (error || dbuflen == 0)
1475 			break;
1476 
1477 		if (!(ddv->sdev_flags & SDEV_BUILD))
1478 			break;
1479 
1480 		for (dp = dbuf; ((intptr_t)dp <
1481 		    (intptr_t)dbuf + dbuflen);
1482 		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
1483 			nm = dp->d_name;
1484 
1485 			if (strcmp(nm, ".") == 0 ||
1486 			    strcmp(nm, "..") == 0)
1487 				continue;
1488 
1489 			vp = NULLVP;
1490 			dv = sdev_cache_lookup(ddv, nm);
1491 			if (dv) {
1492 				VERIFY(dv->sdev_state != SDEV_ZOMBIE);
1493 				SDEV_SIMPLE_RELE(dv);
1494 				continue;
1495 			}
1496 
1497 			/* refill the cache if not already */
1498 			error = devname_backstore_lookup(ddv, nm, &vp);
1499 			if (error)
1500 				continue;
1501 
1502 			vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
1503 			error = VOP_GETATTR(vp, &vattr, 0, cred, NULL);
1504 			if (error)
1505 				continue;
1506 
1507 			if (vattr.va_type == VLNK) {
1508 				error = sdev_getlink(vp, &link);
1509 				if (error) {
1510 					continue;
1511 				}
1512 				ASSERT(link != NULL);
1513 			}
1514 
1515 			if (!rw_tryupgrade(&ddv->sdev_contents)) {
1516 				rw_exit(&ddv->sdev_contents);
1517 				rw_enter(&ddv->sdev_contents, RW_WRITER);
1518 			}
1519 			error = sdev_mknode(ddv, nm, &dv, &vattr, vp, link,
1520 			    cred, SDEV_READY);
1521 			rw_downgrade(&ddv->sdev_contents);
1522 
1523 			if (link != NULL) {
1524 				kmem_free(link, strlen(link) + 1);
1525 				link = NULL;
1526 			}
1527 
1528 			if (!error) {
1529 				ASSERT(dv);
1530 				ASSERT(dv->sdev_state != SDEV_ZOMBIE);
1531 				SDEV_SIMPLE_RELE(dv);
1532 			}
1533 			vp = NULL;
1534 			dv = NULL;
1535 		}
1536 	}
1537 
1538 done:
1539 	VN_RELE(dirvp);
1540 	kmem_free(dbuf, dlen);
1541 
1542 	return (error);
1543 }
1544 
1545 void
1546 sdev_filldir_dynamic(struct sdev_node *ddv)
1547 {
1548 	int error;
1549 	int i;
1550 	struct vattr vattr;
1551 	struct vattr *vap = &vattr;
1552 	char *nm = NULL;
1553 	struct sdev_node *dv = NULL;
1554 
1555 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1556 	ASSERT((ddv->sdev_flags & SDEV_BUILD));
1557 
1558 	*vap = *sdev_getdefault_attr(VDIR);	/* note structure copy here */
1559 	gethrestime(&vap->va_atime);
1560 	vap->va_mtime = vap->va_atime;
1561 	vap->va_ctime = vap->va_atime;
1562 	for (i = 0; vtab[i].vt_name != NULL; i++) {
1563 		/*
1564 		 * This early, we may be in a read-only /dev environment: leave
1565 		 * the creation of any nodes we'd attempt to persist to
1566 		 * devfsadm. Because /dev itself is normally persistent, any
1567 		 * node which is not marked dynamic will end up being marked
1568 		 * persistent. However, some nodes are both dynamic and
1569 		 * persistent, mostly lofi and rlofi, so we need to be careful
1570 		 * in our check.
1571 		 */
1572 		if ((vtab[i].vt_flags & SDEV_PERSIST) ||
1573 		    !(vtab[i].vt_flags & SDEV_DYNAMIC))
1574 			continue;
1575 		nm = vtab[i].vt_name;
1576 		ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1577 		dv = NULL;
1578 		error = sdev_mknode(ddv, nm, &dv, vap, NULL,
1579 		    NULL, kcred, SDEV_READY);
1580 		if (error) {
1581 			cmn_err(CE_WARN, "%s/%s: error %d\n",
1582 			    ddv->sdev_name, nm, error);
1583 		} else {
1584 			ASSERT(dv);
1585 			ASSERT(dv->sdev_state != SDEV_ZOMBIE);
1586 			SDEV_SIMPLE_RELE(dv);
1587 		}
1588 	}
1589 }
1590 
1591 /*
1592  * Creating a backing store entry based on sdev_attr.
1593  * This is called either as part of node creation in a persistent directory
1594  * or from setattr/setsecattr to persist access attributes across reboot.
1595  */
1596 int
1597 sdev_shadow_node(struct sdev_node *dv, struct cred *cred)
1598 {
1599 	int error = 0;
1600 	struct vnode *dvp = SDEVTOV(dv->sdev_dotdot);
1601 	struct vnode *rdvp = VTOSDEV(dvp)->sdev_attrvp;
1602 	struct vattr *vap = dv->sdev_attr;
1603 	char *nm = dv->sdev_name;
1604 	struct vnode *tmpvp, **rvp = &tmpvp, *rrvp = NULL;
1605 
1606 	ASSERT(dv && dv->sdev_name && rdvp);
1607 	ASSERT(RW_WRITE_HELD(&dv->sdev_contents) && dv->sdev_attrvp == NULL);
1608 
1609 lookup:
1610 	/* try to find it in the backing store */
1611 	error = VOP_LOOKUP(rdvp, nm, rvp, NULL, 0, NULL, cred, NULL, NULL,
1612 	    NULL);
1613 	if (error == 0) {
1614 		if (VOP_REALVP(*rvp, &rrvp, NULL) == 0) {
1615 			VN_HOLD(rrvp);
1616 			VN_RELE(*rvp);
1617 			*rvp = rrvp;
1618 		}
1619 
1620 		kmem_free(dv->sdev_attr, sizeof (vattr_t));
1621 		dv->sdev_attr = NULL;
1622 		dv->sdev_attrvp = *rvp;
1623 		return (0);
1624 	}
1625 
1626 	/* let's try to persist the node */
1627 	gethrestime(&vap->va_atime);
1628 	vap->va_mtime = vap->va_atime;
1629 	vap->va_ctime = vap->va_atime;
1630 	vap->va_mask |= AT_TYPE|AT_MODE;
1631 	switch (vap->va_type) {
1632 	case VDIR:
1633 		error = VOP_MKDIR(rdvp, nm, vap, rvp, cred, NULL, 0, NULL);
1634 		sdcmn_err9(("sdev_shadow_node: mkdir vp %p error %d\n",
1635 		    (void *)(*rvp), error));
1636 		if (!error)
1637 			VN_RELE(*rvp);
1638 		break;
1639 	case VCHR:
1640 	case VBLK:
1641 	case VREG:
1642 	case VDOOR:
1643 		error = VOP_CREATE(rdvp, nm, vap, NONEXCL, VREAD|VWRITE,
1644 		    rvp, cred, 0, NULL, NULL);
1645 		sdcmn_err9(("sdev_shadow_node: create vp %p, error %d\n",
1646 		    (void *)(*rvp), error));
1647 		if (!error)
1648 			VN_RELE(*rvp);
1649 		break;
1650 	case VLNK:
1651 		ASSERT(dv->sdev_symlink);
1652 		error = VOP_SYMLINK(rdvp, nm, vap, dv->sdev_symlink, cred,
1653 		    NULL, 0);
1654 		sdcmn_err9(("sdev_shadow_node: create symlink error %d\n",
1655 		    error));
1656 		break;
1657 	default:
1658 		cmn_err(CE_PANIC, "dev: %s: sdev_shadow_node "
1659 		    "create\n", nm);
1660 		/*NOTREACHED*/
1661 	}
1662 
1663 	/* go back to lookup to factor out spec node and set attrvp */
1664 	if (error == 0)
1665 		goto lookup;
1666 
1667 	sdcmn_err(("cannot persist %s - error %d\n", dv->sdev_path, error));
1668 	return (error);
1669 }
1670 
1671 static void
1672 sdev_cache_add(struct sdev_node *ddv, struct sdev_node **dv, char *nm)
1673 {
1674 	struct sdev_node *dup = NULL;
1675 
1676 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1677 	if ((dup = sdev_findbyname(ddv, nm)) == NULL) {
1678 		sdev_direnter(ddv, *dv);
1679 	} else {
1680 		VERIFY(dup->sdev_state != SDEV_ZOMBIE);
1681 		SDEV_SIMPLE_RELE(*dv);
1682 		sdev_nodedestroy(*dv, 0);
1683 		*dv = dup;
1684 	}
1685 }
1686 
/*
 * Remove *dv from the cache of directory ddv by way of sdev_dirdelete().
 * The caller holds ddv->sdev_contents as writer.
 */
static void
sdev_cache_delete(struct sdev_node *ddv, struct sdev_node **dv)
{
	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
	sdev_dirdelete(ddv, *dv);
}
1693 
1694 /*
1695  * update the in-core directory cache
1696  */
1697 void
1698 sdev_cache_update(struct sdev_node *ddv, struct sdev_node **dv, char *nm,
1699     sdev_cache_ops_t ops)
1700 {
1701 	ASSERT((SDEV_HELD(*dv)));
1702 
1703 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1704 	switch (ops) {
1705 	case SDEV_CACHE_ADD:
1706 		sdev_cache_add(ddv, dv, nm);
1707 		break;
1708 	case SDEV_CACHE_DELETE:
1709 		sdev_cache_delete(ddv, dv);
1710 		break;
1711 	default:
1712 		break;
1713 	}
1714 }
1715 
1716 /*
1717  * retrieve the named entry from the directory cache
1718  */
1719 struct sdev_node *
1720 sdev_cache_lookup(struct sdev_node *ddv, char *nm)
1721 {
1722 	struct sdev_node *dv = NULL;
1723 
1724 	ASSERT(RW_LOCK_HELD(&ddv->sdev_contents));
1725 	dv = sdev_findbyname(ddv, nm);
1726 
1727 	return (dv);
1728 }
1729 
1730 /*
1731  * Implicit reconfig for nodes constructed by a link generator
1732  * Start devfsadm if needed, or if devfsadm is in progress,
1733  * prepare to block on devfsadm either completing or
1734  * constructing the desired node.  As devfsadmd is global
1735  * in scope, constructing all necessary nodes, we only
1736  * need to initiate it once.
1737  */
1738 static int
1739 sdev_call_devfsadmd(struct sdev_node *ddv, struct sdev_node *dv, char *nm)
1740 {
1741 	int error = 0;
1742 
1743 	if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state)) {
1744 		sdcmn_err6(("lookup: waiting for %s/%s, 0x%x\n",
1745 		    ddv->sdev_name, nm, devfsadm_state));
1746 		mutex_enter(&dv->sdev_lookup_lock);
1747 		SDEV_BLOCK_OTHERS(dv, (SDEV_LOOKUP | SDEV_LGWAITING));
1748 		mutex_exit(&dv->sdev_lookup_lock);
1749 		error = 0;
1750 	} else if (!DEVNAME_DEVFSADM_HAS_RUN(devfsadm_state)) {
1751 		sdcmn_err6(("lookup %s/%s starting devfsadm, 0x%x\n",
1752 		    ddv->sdev_name, nm, devfsadm_state));
1753 
1754 		sdev_devfsadmd_thread(ddv, dv, kcred);
1755 		mutex_enter(&dv->sdev_lookup_lock);
1756 		SDEV_BLOCK_OTHERS(dv,
1757 		    (SDEV_LOOKUP | SDEV_LGWAITING));
1758 		mutex_exit(&dv->sdev_lookup_lock);
1759 		error = 0;
1760 	} else {
1761 		error = -1;
1762 	}
1763 
1764 	return (error);
1765 }
1766 
1767 /*
1768  *  Support for specialized device naming construction mechanisms
1769  */
1770 static int
1771 sdev_call_dircallback(struct sdev_node *ddv, struct sdev_node **dvp, char *nm,
1772     int (*callback)(struct sdev_node *, char *, void **, struct cred *,
1773     void *, char *), int flags, struct cred *cred)
1774 {
1775 	int rv = 0;
1776 	char *physpath = NULL;
1777 	struct vattr vattr;
1778 	struct vattr *vap = &vattr;
1779 	struct sdev_node *dv = NULL;
1780 
1781 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1782 	if (flags & SDEV_VLINK) {
1783 		physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1784 		rv = callback(ddv, nm, (void *)&physpath, kcred, NULL,
1785 		    NULL);
1786 		if (rv) {
1787 			kmem_free(physpath, MAXPATHLEN);
1788 			return (-1);
1789 		}
1790 
1791 		*vap = *sdev_getdefault_attr(VLNK);	/* structure copy */
1792 		vap->va_size = strlen(physpath);
1793 		gethrestime(&vap->va_atime);
1794 		vap->va_mtime = vap->va_atime;
1795 		vap->va_ctime = vap->va_atime;
1796 
1797 		rv = sdev_mknode(ddv, nm, &dv, vap, NULL,
1798 		    (void *)physpath, cred, SDEV_READY);
1799 		kmem_free(physpath, MAXPATHLEN);
1800 		if (rv)
1801 			return (rv);
1802 	} else if (flags & SDEV_VATTR) {
1803 		/*
1804 		 * /dev/pts
1805 		 *
1806 		 * callback is responsible to set the basic attributes,
1807 		 * e.g. va_type/va_uid/va_gid/
1808 		 *    dev_t if VCHR or VBLK/
1809 		 */
1810 		ASSERT(callback);
1811 		rv = callback(ddv, nm, (void *)&vattr, kcred, NULL, NULL);
1812 		if (rv) {
1813 			sdcmn_err3(("devname_lookup_func: SDEV_NONE "
1814 			    "callback failed \n"));
1815 			return (-1);
1816 		}
1817 
1818 		rv = sdev_mknode(ddv, nm, &dv, &vattr, NULL, NULL,
1819 		    cred, SDEV_READY);
1820 
1821 		if (rv)
1822 			return (rv);
1823 
1824 	} else {
1825 		impossible(("lookup: %s/%s by %s not supported (%d)\n",
1826 		    SDEVTOV(ddv)->v_path, nm, curproc->p_user.u_comm,
1827 		    __LINE__));
1828 		rv = -1;
1829 	}
1830 
1831 	*dvp = dv;
1832 	return (rv);
1833 }
1834 
/*
 * Return 1 if exec_name is exactly "devfsadm", 0 otherwise.
 */
static int
is_devfsadm_thread(char *exec_name)
{
	/*
	 * note: because devfsadmd -> /usr/sbin/devfsadm
	 * it is safe to use "devfsadm" to capture the lookups
	 * from devfsadm and its daemon version.
	 */
	return (strcmp(exec_name, "devfsadm") == 0);
}
1847 
1848 /*
1849  * Lookup Order:
1850  *	sdev_node cache;
1851  *	backing store (SDEV_PERSIST);
1852  *	DBNR: a. dir_ops implemented in the loadable modules;
1853  *	      b. vnode ops in vtab.
1854  */
1855 int
1856 devname_lookup_func(struct sdev_node *ddv, char *nm, struct vnode **vpp,
1857     struct cred *cred, int (*callback)(struct sdev_node *, char *, void **,
1858     struct cred *, void *, char *), int flags)
1859 {
1860 	int rv = 0, nmlen;
1861 	struct vnode *rvp = NULL;
1862 	struct sdev_node *dv = NULL;
1863 	int	retried = 0;
1864 	int	error = 0;
1865 	struct vattr vattr;
1866 	char *lookup_thread = curproc->p_user.u_comm;
1867 	int failed_flags = 0;
1868 	int (*vtor)(struct sdev_node *) = NULL;
1869 	int state;
1870 	int parent_state;
1871 	char *link = NULL;
1872 
1873 	if (SDEVTOV(ddv)->v_type != VDIR)
1874 		return (ENOTDIR);
1875 
1876 	/*
1877 	 * Empty name or ., return node itself.
1878 	 */
1879 	nmlen = strlen(nm);
1880 	if ((nmlen == 0) || ((nmlen == 1) && (nm[0] == '.'))) {
1881 		*vpp = SDEVTOV(ddv);
1882 		VN_HOLD(*vpp);
1883 		return (0);
1884 	}
1885 
1886 	/*
1887 	 * .., return the parent directory
1888 	 */
1889 	if ((nmlen == 2) && (strcmp(nm, "..") == 0)) {
1890 		*vpp = SDEVTOV(ddv->sdev_dotdot);
1891 		VN_HOLD(*vpp);
1892 		return (0);
1893 	}
1894 
1895 	rw_enter(&ddv->sdev_contents, RW_READER);
1896 	if (ddv->sdev_flags & SDEV_VTOR) {
1897 		vtor = (int (*)(struct sdev_node *))sdev_get_vtor(ddv);
1898 		ASSERT(vtor);
1899 	}
1900 
1901 tryagain:
1902 	/*
1903 	 * (a) directory cache lookup:
1904 	 */
1905 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
1906 	parent_state = ddv->sdev_state;
1907 	dv = sdev_cache_lookup(ddv, nm);
1908 	if (dv) {
1909 		state = dv->sdev_state;
1910 		switch (state) {
1911 		case SDEV_INIT:
1912 			if (is_devfsadm_thread(lookup_thread))
1913 				break;
1914 
1915 			/* ZOMBIED parent won't allow node creation */
1916 			if (parent_state == SDEV_ZOMBIE) {
1917 				SD_TRACE_FAILED_LOOKUP(ddv, nm,
1918 				    retried);
1919 				goto nolock_notfound;
1920 			}
1921 
1922 			mutex_enter(&dv->sdev_lookup_lock);
1923 			/* compensate the threads started after devfsadm */
1924 			if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state) &&
1925 			    !(SDEV_IS_LOOKUP(dv)))
1926 				SDEV_BLOCK_OTHERS(dv,
1927 				    (SDEV_LOOKUP | SDEV_LGWAITING));
1928 
1929 			if (SDEV_IS_LOOKUP(dv)) {
1930 				failed_flags |= SLF_REBUILT;
1931 				rw_exit(&ddv->sdev_contents);
1932 				error = sdev_wait4lookup(dv, SDEV_LOOKUP);
1933 				mutex_exit(&dv->sdev_lookup_lock);
1934 				rw_enter(&ddv->sdev_contents, RW_READER);
1935 
1936 				if (error != 0) {
1937 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
1938 					    retried);
1939 					goto nolock_notfound;
1940 				}
1941 
1942 				state = dv->sdev_state;
1943 				if (state == SDEV_INIT) {
1944 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
1945 					    retried);
1946 					goto nolock_notfound;
1947 				} else if (state == SDEV_READY) {
1948 					goto found;
1949 				} else if (state == SDEV_ZOMBIE) {
1950 					rw_exit(&ddv->sdev_contents);
1951 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
1952 					    retried);
1953 					SDEV_RELE(dv);
1954 					goto lookup_failed;
1955 				}
1956 			} else {
1957 				mutex_exit(&dv->sdev_lookup_lock);
1958 			}
1959 			break;
1960 		case SDEV_READY:
1961 			goto found;
1962 		case SDEV_ZOMBIE:
1963 			rw_exit(&ddv->sdev_contents);
1964 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
1965 			SDEV_RELE(dv);
1966 			goto lookup_failed;
1967 		default:
1968 			rw_exit(&ddv->sdev_contents);
1969 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
1970 			sdev_lookup_failed(ddv, nm, failed_flags);
1971 			*vpp = NULLVP;
1972 			return (ENOENT);
1973 		}
1974 	}
1975 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
1976 
1977 	/*
1978 	 * ZOMBIED parent does not allow new node creation.
1979 	 * bail out early
1980 	 */
1981 	if (parent_state == SDEV_ZOMBIE) {
1982 		rw_exit(&ddv->sdev_contents);
1983 		*vpp = NULLVP;
1984 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
1985 		return (ENOENT);
1986 	}
1987 
1988 	/*
1989 	 * (b0): backing store lookup
1990 	 *	SDEV_PERSIST is default except:
1991 	 *		1) pts nodes
1992 	 *		2) non-chmod'ed local nodes
1993 	 *		3) zvol nodes
1994 	 */
1995 	if (SDEV_IS_PERSIST(ddv)) {
1996 		error = devname_backstore_lookup(ddv, nm, &rvp);
1997 
1998 		if (!error) {
1999 
2000 			vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
2001 			error = VOP_GETATTR(rvp, &vattr, 0, cred, NULL);
2002 			if (error) {
2003 				rw_exit(&ddv->sdev_contents);
2004 				if (dv)
2005 					SDEV_RELE(dv);
2006 				SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2007 				sdev_lookup_failed(ddv, nm, failed_flags);
2008 				*vpp = NULLVP;
2009 				return (ENOENT);
2010 			}
2011 
2012 			if (vattr.va_type == VLNK) {
2013 				error = sdev_getlink(rvp, &link);
2014 				if (error) {
2015 					rw_exit(&ddv->sdev_contents);
2016 					if (dv)
2017 						SDEV_RELE(dv);
2018 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
2019 					    retried);
2020 					sdev_lookup_failed(ddv, nm,
2021 					    failed_flags);
2022 					*vpp = NULLVP;
2023 					return (ENOENT);
2024 				}
2025 				ASSERT(link != NULL);
2026 			}
2027 
2028 			if (!rw_tryupgrade(&ddv->sdev_contents)) {
2029 				rw_exit(&ddv->sdev_contents);
2030 				rw_enter(&ddv->sdev_contents, RW_WRITER);
2031 			}
2032 			error = sdev_mknode(ddv, nm, &dv, &vattr,
2033 			    rvp, link, cred, SDEV_READY);
2034 			rw_downgrade(&ddv->sdev_contents);
2035 
2036 			if (link != NULL) {
2037 				kmem_free(link, strlen(link) + 1);
2038 				link = NULL;
2039 			}
2040 
2041 			if (error) {
2042 				SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2043 				rw_exit(&ddv->sdev_contents);
2044 				if (dv)
2045 					SDEV_RELE(dv);
2046 				goto lookup_failed;
2047 			} else {
2048 				goto found;
2049 			}
2050 		} else if (retried) {
2051 			rw_exit(&ddv->sdev_contents);
2052 			sdcmn_err3(("retry of lookup of %s/%s: failed\n",
2053 			    ddv->sdev_name, nm));
2054 			if (dv)
2055 				SDEV_RELE(dv);
2056 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2057 			sdev_lookup_failed(ddv, nm, failed_flags);
2058 			*vpp = NULLVP;
2059 			return (ENOENT);
2060 		}
2061 	}
2062 
2063 lookup_create_node:
2064 	/* first thread that is doing the lookup on this node */
2065 	if (callback) {
2066 		ASSERT(dv == NULL);
2067 		if (!rw_tryupgrade(&ddv->sdev_contents)) {
2068 			rw_exit(&ddv->sdev_contents);
2069 			rw_enter(&ddv->sdev_contents, RW_WRITER);
2070 		}
2071 		error = sdev_call_dircallback(ddv, &dv, nm, callback,
2072 		    flags, cred);
2073 		rw_downgrade(&ddv->sdev_contents);
2074 		if (error == 0) {
2075 			goto found;
2076 		} else {
2077 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2078 			rw_exit(&ddv->sdev_contents);
2079 			goto lookup_failed;
2080 		}
2081 	}
2082 	if (!dv) {
2083 		if (!rw_tryupgrade(&ddv->sdev_contents)) {
2084 			rw_exit(&ddv->sdev_contents);
2085 			rw_enter(&ddv->sdev_contents, RW_WRITER);
2086 		}
2087 		error = sdev_mknode(ddv, nm, &dv, NULL, NULL, NULL,
2088 		    cred, SDEV_INIT);
2089 		if (!dv) {
2090 			rw_exit(&ddv->sdev_contents);
2091 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2092 			sdev_lookup_failed(ddv, nm, failed_flags);
2093 			*vpp = NULLVP;
2094 			return (ENOENT);
2095 		}
2096 		rw_downgrade(&ddv->sdev_contents);
2097 	}
2098 
2099 	/*
2100 	 * (b1) invoking devfsadm once per life time for devfsadm nodes
2101 	 */
2102 	ASSERT(SDEV_HELD(dv));
2103 
2104 	if (SDEV_IS_NO_NCACHE(dv))
2105 		failed_flags |= SLF_NO_NCACHE;
2106 	if (sdev_reconfig_boot || !i_ddi_io_initialized() ||
2107 	    SDEV_IS_DYNAMIC(ddv) || SDEV_IS_NO_NCACHE(dv) ||
2108 	    ((moddebug & MODDEBUG_FINI_EBUSY) != 0)) {
2109 		ASSERT(SDEV_HELD(dv));
2110 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2111 		goto nolock_notfound;
2112 	}
2113 
2114 	/*
2115 	 * filter out known non-existent devices recorded
2116 	 * during initial reconfiguration boot for which
2117 	 * reconfig should not be done and lookup may
2118 	 * be short-circuited now.
2119 	 */
2120 	if (sdev_lookup_filter(ddv, nm)) {
2121 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2122 		goto nolock_notfound;
2123 	}
2124 
2125 	/* bypassing devfsadm internal nodes */
2126 	if (is_devfsadm_thread(lookup_thread)) {
2127 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2128 		goto nolock_notfound;
2129 	}
2130 
2131 	if (sdev_reconfig_disable) {
2132 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2133 		goto nolock_notfound;
2134 	}
2135 
2136 	error = sdev_call_devfsadmd(ddv, dv, nm);
2137 	if (error == 0) {
2138 		sdcmn_err8(("lookup of %s/%s by %s: reconfig\n",
2139 		    ddv->sdev_name, nm, curproc->p_user.u_comm));
2140 		if (sdev_reconfig_verbose) {
2141 			cmn_err(CE_CONT,
2142 			    "?lookup of %s/%s by %s: reconfig\n",
2143 			    ddv->sdev_name, nm, curproc->p_user.u_comm);
2144 		}
2145 		retried = 1;
2146 		failed_flags |= SLF_REBUILT;
2147 		ASSERT(dv->sdev_state != SDEV_ZOMBIE);
2148 		SDEV_SIMPLE_RELE(dv);
2149 		goto tryagain;
2150 	} else {
2151 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2152 		goto nolock_notfound;
2153 	}
2154 
2155 found:
2156 	ASSERT(dv->sdev_state == SDEV_READY);
2157 	if (vtor) {
2158 		/*
2159 		 * Check validity of returned node
2160 		 */
2161 		switch (vtor(dv)) {
2162 		case SDEV_VTOR_VALID:
2163 			break;
2164 		case SDEV_VTOR_STALE:
2165 			/*
2166 			 * The name exists, but the cache entry is
2167 			 * stale and needs to be re-created.
2168 			 */
2169 			ASSERT(RW_READ_HELD(&ddv->sdev_contents));
2170 			if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
2171 				rw_exit(&ddv->sdev_contents);
2172 				rw_enter(&ddv->sdev_contents, RW_WRITER);
2173 			}
2174 			sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_DELETE);
2175 			rw_downgrade(&ddv->sdev_contents);
2176 			SDEV_RELE(dv);
2177 			dv = NULL;
2178 			goto lookup_create_node;
2179 			/* FALLTHRU */
2180 		case SDEV_VTOR_INVALID:
2181 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2182 			sdcmn_err7(("lookup: destroy invalid "
2183 			    "node: %s(%p)\n", dv->sdev_name, (void *)dv));
2184 			goto nolock_notfound;
2185 		case SDEV_VTOR_SKIP:
2186 			sdcmn_err7(("lookup: node not applicable - "
2187 			    "skipping: %s(%p)\n", dv->sdev_name, (void *)dv));
2188 			rw_exit(&ddv->sdev_contents);
2189 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2190 			SDEV_RELE(dv);
2191 			goto lookup_failed;
2192 		default:
2193 			cmn_err(CE_PANIC,
2194 			    "dev fs: validator failed: %s(%p)\n",
2195 			    dv->sdev_name, (void *)dv);
2196 			break;
2197 		}
2198 	}
2199 
2200 	rw_exit(&ddv->sdev_contents);
2201 	rv = sdev_to_vp(dv, vpp);
2202 	sdcmn_err3(("devname_lookup_func: returning vp %p v_count %d state %d "
2203 	    "for nm %s, error %d\n", (void *)*vpp, (*vpp)->v_count,
2204 	    dv->sdev_state, nm, rv));
2205 	return (rv);
2206 
2207 nolock_notfound:
2208 	/*
2209 	 * Destroy the node that is created for synchronization purposes.
2210 	 */
2211 	sdcmn_err3(("devname_lookup_func: %s with state %d\n",
2212 	    nm, dv->sdev_state));
2213 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
2214 	if (dv->sdev_state == SDEV_INIT) {
2215 		if (!rw_tryupgrade(&ddv->sdev_contents)) {
2216 			rw_exit(&ddv->sdev_contents);
2217 			rw_enter(&ddv->sdev_contents, RW_WRITER);
2218 		}
2219 
2220 		/*
2221 		 * Node state may have changed during the lock
2222 		 * changes. Re-check.
2223 		 */
2224 		if (dv->sdev_state == SDEV_INIT) {
2225 			sdev_dirdelete(ddv, dv);
2226 			rw_exit(&ddv->sdev_contents);
2227 			sdev_lookup_failed(ddv, nm, failed_flags);
2228 			SDEV_RELE(dv);
2229 			*vpp = NULL;
2230 			return (ENOENT);
2231 		}
2232 	}
2233 
2234 	rw_exit(&ddv->sdev_contents);
2235 	SDEV_RELE(dv);
2236 
2237 lookup_failed:
2238 	sdev_lookup_failed(ddv, nm, failed_flags);
2239 	*vpp = NULL;
2240 	return (ENOENT);
2241 }
2242 
2243 /*
2244  * Given a directory node, mark all nodes beneath as
2245  * STALE, i.e. nodes that don't exist as far as new
2246  * consumers are concerned.  Remove them from the
2247  * list of directory entries so that no lookup or
2248  * directory traversal will find them.  The node
2249  * not deallocated so existing holds are not affected.
2250  */
2251 void
2252 sdev_stale(struct sdev_node *ddv)
2253 {
2254 	struct sdev_node *dv;
2255 	struct vnode *vp;
2256 
2257 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
2258 
2259 	rw_enter(&ddv->sdev_contents, RW_WRITER);
2260 	while ((dv = SDEV_FIRST_ENTRY(ddv)) != NULL) {
2261 		vp = SDEVTOV(dv);
2262 		SDEV_HOLD(dv);
2263 		if (vp->v_type == VDIR)
2264 			sdev_stale(dv);
2265 
2266 		sdev_dirdelete(ddv, dv);
2267 		SDEV_RELE(dv);
2268 	}
2269 	ddv->sdev_flags |= SDEV_BUILD;
2270 	rw_exit(&ddv->sdev_contents);
2271 }
2272 
2273 /*
2274  * Given a directory node, clean out all the nodes beneath.
2275  * If expr is specified, clean node with names matching expr.
2276  * If SDEV_ENFORCE is specified in flags, busy nodes are made stale,
2277  *	so they are excluded from future lookups.
2278  */
2279 int
2280 sdev_cleandir(struct sdev_node *ddv, char *expr, uint_t flags)
2281 {
2282 	int error = 0;
2283 	int busy = 0;
2284 	struct vnode *vp;
2285 	struct sdev_node *dv, *next;
2286 	int bkstore = 0;
2287 	int len = 0;
2288 	char *bks_name = NULL;
2289 
2290 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
2291 
2292 	/*
2293 	 * We try our best to destroy all unused sdev_node's
2294 	 */
2295 	rw_enter(&ddv->sdev_contents, RW_WRITER);
2296 	for (dv = SDEV_FIRST_ENTRY(ddv); dv != NULL; dv = next) {
2297 		next = SDEV_NEXT_ENTRY(ddv, dv);
2298 		vp = SDEVTOV(dv);
2299 
2300 		if (expr && gmatch(dv->sdev_name, expr) == 0)
2301 			continue;
2302 
2303 		if (vp->v_type == VDIR &&
2304 		    sdev_cleandir(dv, NULL, flags) != 0) {
2305 			sdcmn_err9(("sdev_cleandir: dir %s busy\n",
2306 			    dv->sdev_name));
2307 			busy++;
2308 			continue;
2309 		}
2310 
2311 		if (vp->v_count > 0 && (flags & SDEV_ENFORCE) == 0) {
2312 			sdcmn_err9(("sdev_cleandir: dir %s busy\n",
2313 			    dv->sdev_name));
2314 			busy++;
2315 			continue;
2316 		}
2317 
2318 		/*
2319 		 * at this point, either dv is not held or SDEV_ENFORCE
2320 		 * is specified. In either case, dv needs to be deleted
2321 		 */
2322 		SDEV_HOLD(dv);
2323 
2324 		bkstore = SDEV_IS_PERSIST(dv) ? 1 : 0;
2325 		if (bkstore && (vp->v_type == VDIR))
2326 			bkstore += 1;
2327 
2328 		if (bkstore) {
2329 			len = strlen(dv->sdev_name) + 1;
2330 			bks_name = kmem_alloc(len, KM_SLEEP);
2331 			bcopy(dv->sdev_name, bks_name, len);
2332 		}
2333 
2334 		sdev_dirdelete(ddv, dv);
2335 
2336 		/* take care the backing store clean up */
2337 		if (bkstore) {
2338 			ASSERT(bks_name);
2339 			ASSERT(ddv->sdev_attrvp);
2340 
2341 			if (bkstore == 1) {
2342 				error = VOP_REMOVE(ddv->sdev_attrvp,
2343 				    bks_name, kcred, NULL, 0);
2344 			} else if (bkstore == 2) {
2345 				error = VOP_RMDIR(ddv->sdev_attrvp,
2346 				    bks_name, ddv->sdev_attrvp, kcred, NULL, 0);
2347 			}
2348 
2349 			/* do not propagate the backing store errors */
2350 			if (error) {
2351 				sdcmn_err9(("sdev_cleandir: backing store"
2352 				    "not cleaned\n"));
2353 				error = 0;
2354 			}
2355 
2356 			bkstore = 0;
2357 			kmem_free(bks_name, len);
2358 			bks_name = NULL;
2359 			len = 0;
2360 		}
2361 
2362 		ddv->sdev_flags |= SDEV_BUILD;
2363 		SDEV_RELE(dv);
2364 	}
2365 
2366 	ddv->sdev_flags |= SDEV_BUILD;
2367 	rw_exit(&ddv->sdev_contents);
2368 
2369 	if (busy) {
2370 		error = EBUSY;
2371 	}
2372 
2373 	return (error);
2374 }
2375 
2376 /*
2377  * a convenient wrapper for readdir() funcs
2378  */
2379 size_t
2380 add_dir_entry(dirent64_t *de, char *nm, size_t size, ino_t ino, offset_t off)
2381 {
2382 	size_t reclen = DIRENT64_RECLEN(strlen(nm));
2383 	if (reclen > size)
2384 		return (0);
2385 
2386 	de->d_ino = (ino64_t)ino;
2387 	de->d_off = (off64_t)off + 1;
2388 	de->d_reclen = (ushort_t)reclen;
2389 	(void) strncpy(de->d_name, nm, DIRENT64_NAMELEN(reclen));
2390 	return (reclen);
2391 }
2392 
2393 /*
2394  * sdev_mount service routines
2395  */
2396 int
2397 sdev_copyin_mountargs(struct mounta *uap, struct sdev_mountargs *args)
2398 {
2399 	int	error;
2400 
2401 	if (uap->datalen != sizeof (*args))
2402 		return (EINVAL);
2403 
2404 	if (error = copyin(uap->dataptr, args, sizeof (*args))) {
2405 		cmn_err(CE_WARN, "sdev_copyin_mountargs: can not"
2406 		    "get user data. error %d\n", error);
2407 		return (EFAULT);
2408 	}
2409 
2410 	return (0);
2411 }
2412 
/*
 * nextdp(dp): address of the dirent64 record that follows dp in a
 * packed buffer, found by stepping d_reclen bytes past the start of dp.
 * Any earlier definition of the name is discarded first.
 */
#undef	nextdp
#define	nextdp(dp)	\
	((struct dirent64 *)(intptr_t)((char *)(dp) + (dp)->d_reclen))
2418 
2419 /*
2420  * readdir helper func
2421  */
2422 int
2423 devname_readdir_func(vnode_t *vp, uio_t *uiop, cred_t *cred, int *eofp,
2424     int flags)
2425 {
2426 	struct sdev_node *ddv = VTOSDEV(vp);
2427 	struct sdev_node *dv;
2428 	dirent64_t	*dp;
2429 	ulong_t		outcount = 0;
2430 	size_t		namelen;
2431 	ulong_t		alloc_count;
2432 	void		*outbuf;
2433 	struct iovec	*iovp;
2434 	int		error = 0;
2435 	size_t		reclen;
2436 	offset_t	diroff;
2437 	offset_t	soff;
2438 	int		this_reclen;
2439 	int (*vtor)(struct sdev_node *) = NULL;
2440 	struct vattr attr;
2441 	timestruc_t now;
2442 
2443 	ASSERT(ddv->sdev_attr || ddv->sdev_attrvp);
2444 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
2445 
2446 	if (uiop->uio_loffset >= MAXOFF_T) {
2447 		if (eofp)
2448 			*eofp = 1;
2449 		return (0);
2450 	}
2451 
2452 	if (uiop->uio_iovcnt != 1)
2453 		return (EINVAL);
2454 
2455 	if (vp->v_type != VDIR)
2456 		return (ENOTDIR);
2457 
2458 	if (ddv->sdev_flags & SDEV_VTOR) {
2459 		vtor = (int (*)(struct sdev_node *))sdev_get_vtor(ddv);
2460 		ASSERT(vtor);
2461 	}
2462 
2463 	if (eofp != NULL)
2464 		*eofp = 0;
2465 
2466 	soff = uiop->uio_loffset;
2467 	iovp = uiop->uio_iov;
2468 	alloc_count = iovp->iov_len;
2469 	dp = outbuf = kmem_alloc(alloc_count, KM_SLEEP);
2470 	outcount = 0;
2471 
2472 	if (ddv->sdev_state == SDEV_ZOMBIE)
2473 		goto get_cache;
2474 
2475 	if (SDEV_IS_GLOBAL(ddv)) {
2476 
2477 		if ((sdev_boot_state == SDEV_BOOT_STATE_COMPLETE) &&
2478 		    !sdev_reconfig_boot && (flags & SDEV_BROWSE) &&
2479 		    !SDEV_IS_DYNAMIC(ddv) && !SDEV_IS_NO_NCACHE(ddv) &&
2480 		    ((moddebug & MODDEBUG_FINI_EBUSY) == 0) &&
2481 		    !DEVNAME_DEVFSADM_HAS_RUN(devfsadm_state) &&
2482 		    !DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state) &&
2483 		    !sdev_reconfig_disable) {
2484 			/*
2485 			 * invoking "devfsadm" to do system device reconfig
2486 			 */
2487 			mutex_enter(&ddv->sdev_lookup_lock);
2488 			SDEV_BLOCK_OTHERS(ddv,
2489 			    (SDEV_READDIR|SDEV_LGWAITING));
2490 			mutex_exit(&ddv->sdev_lookup_lock);
2491 
2492 			sdcmn_err8(("readdir of %s by %s: reconfig\n",
2493 			    ddv->sdev_path, curproc->p_user.u_comm));
2494 			if (sdev_reconfig_verbose) {
2495 				cmn_err(CE_CONT,
2496 				    "?readdir of %s by %s: reconfig\n",
2497 				    ddv->sdev_path, curproc->p_user.u_comm);
2498 			}
2499 
2500 			sdev_devfsadmd_thread(ddv, NULL, kcred);
2501 		} else if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state)) {
2502 			/*
2503 			 * compensate the "ls" started later than "devfsadm"
2504 			 */
2505 			mutex_enter(&ddv->sdev_lookup_lock);
2506 			SDEV_BLOCK_OTHERS(ddv, (SDEV_READDIR|SDEV_LGWAITING));
2507 			mutex_exit(&ddv->sdev_lookup_lock);
2508 		}
2509 
2510 		/*
2511 		 * release the contents lock so that
2512 		 * the cache may be updated by devfsadmd
2513 		 */
2514 		rw_exit(&ddv->sdev_contents);
2515 		mutex_enter(&ddv->sdev_lookup_lock);
2516 		if (SDEV_IS_READDIR(ddv))
2517 			(void) sdev_wait4lookup(ddv, SDEV_READDIR);
2518 		mutex_exit(&ddv->sdev_lookup_lock);
2519 		rw_enter(&ddv->sdev_contents, RW_READER);
2520 
2521 		sdcmn_err4(("readdir of directory %s by %s\n",
2522 		    ddv->sdev_name, curproc->p_user.u_comm));
2523 		if (ddv->sdev_flags & SDEV_BUILD) {
2524 			if (SDEV_IS_PERSIST(ddv)) {
2525 				error = sdev_filldir_from_store(ddv,
2526 				    alloc_count, cred);
2527 			}
2528 			ddv->sdev_flags &= ~SDEV_BUILD;
2529 		}
2530 	}
2531 
2532 get_cache:
2533 	/* handle "." and ".." */
2534 	diroff = 0;
2535 	if (soff == 0) {
2536 		/* first time */
2537 		this_reclen = DIRENT64_RECLEN(1);
2538 		if (alloc_count < this_reclen) {
2539 			error = EINVAL;
2540 			goto done;
2541 		}
2542 
2543 		dp->d_ino = (ino64_t)ddv->sdev_ino;
2544 		dp->d_off = (off64_t)1;
2545 		dp->d_reclen = (ushort_t)this_reclen;
2546 
2547 		(void) strncpy(dp->d_name, ".",
2548 		    DIRENT64_NAMELEN(this_reclen));
2549 		outcount += dp->d_reclen;
2550 		dp = nextdp(dp);
2551 	}
2552 
2553 	diroff++;
2554 	if (soff <= 1) {
2555 		this_reclen = DIRENT64_RECLEN(2);
2556 		if (alloc_count < outcount + this_reclen) {
2557 			error = EINVAL;
2558 			goto done;
2559 		}
2560 
2561 		dp->d_reclen = (ushort_t)this_reclen;
2562 		dp->d_ino = (ino64_t)ddv->sdev_dotdot->sdev_ino;
2563 		dp->d_off = (off64_t)2;
2564 
2565 		(void) strncpy(dp->d_name, "..",
2566 		    DIRENT64_NAMELEN(this_reclen));
2567 		outcount += dp->d_reclen;
2568 
2569 		dp = nextdp(dp);
2570 	}
2571 
2572 
2573 	/* gets the cache */
2574 	diroff++;
2575 	for (dv = SDEV_FIRST_ENTRY(ddv); dv;
2576 	    dv = SDEV_NEXT_ENTRY(ddv, dv), diroff++) {
2577 		sdcmn_err3(("sdev_readdir: diroff %lld soff %lld for '%s' \n",
2578 		    diroff, soff, dv->sdev_name));
2579 
2580 		/* bypassing pre-matured nodes */
2581 		if (diroff < soff || (dv->sdev_state != SDEV_READY)) {
2582 			sdcmn_err3(("sdev_readdir: pre-mature node  "
2583 			    "%s %d\n", dv->sdev_name, dv->sdev_state));
2584 			continue;
2585 		}
2586 
2587 		/*
2588 		 * Check validity of node
2589 		 * Drop invalid and nodes to be skipped.
2590 		 * A node the validator indicates as stale needs
2591 		 * to be returned as presumably the node name itself
2592 		 * is valid and the node data itself will be refreshed
2593 		 * on lookup.  An application performing a readdir then
2594 		 * stat on each entry should thus always see consistent
2595 		 * data.  In any case, it is not possible to synchronize
2596 		 * with dynamic kernel state, and any view we return can
2597 		 * never be anything more than a snapshot at a point in time.
2598 		 */
2599 		if (vtor) {
2600 			switch (vtor(dv)) {
2601 			case SDEV_VTOR_VALID:
2602 				break;
2603 			case SDEV_VTOR_INVALID:
2604 			case SDEV_VTOR_SKIP:
2605 				continue;
2606 			case SDEV_VTOR_STALE:
2607 				sdcmn_err3(("sdev_readir: %s stale\n",
2608 				    dv->sdev_name));
2609 				break;
2610 			default:
2611 				cmn_err(CE_PANIC,
2612 				    "dev fs: validator failed: %s(%p)\n",
2613 				    dv->sdev_name, (void *)dv);
2614 				break;
2615 			/*NOTREACHED*/
2616 			}
2617 		}
2618 
2619 		namelen = strlen(dv->sdev_name);
2620 		reclen = DIRENT64_RECLEN(namelen);
2621 		if (outcount + reclen > alloc_count) {
2622 			goto full;
2623 		}
2624 		dp->d_reclen = (ushort_t)reclen;
2625 		dp->d_ino = (ino64_t)dv->sdev_ino;
2626 		dp->d_off = (off64_t)diroff + 1;
2627 		(void) strncpy(dp->d_name, dv->sdev_name,
2628 		    DIRENT64_NAMELEN(reclen));
2629 		outcount += reclen;
2630 		dp = nextdp(dp);
2631 	}
2632 
2633 full:
2634 	sdcmn_err4(("sdev_readdir: moving %lu bytes: "
2635 	    "diroff %lld, soff %lld, dv %p\n", outcount, diroff, soff,
2636 	    (void *)dv));
2637 
2638 	if (outcount)
2639 		error = uiomove(outbuf, outcount, UIO_READ, uiop);
2640 
2641 	if (!error) {
2642 		uiop->uio_loffset = diroff;
2643 		if (eofp)
2644 			*eofp = dv ? 0 : 1;
2645 	}
2646 
2647 
2648 	if (ddv->sdev_attrvp) {
2649 		gethrestime(&now);
2650 		attr.va_ctime = now;
2651 		attr.va_atime = now;
2652 		attr.va_mask = AT_CTIME|AT_ATIME;
2653 
2654 		(void) VOP_SETATTR(ddv->sdev_attrvp, &attr, 0, kcred, NULL);
2655 	}
2656 done:
2657 	kmem_free(outbuf, alloc_count);
2658 	return (error);
2659 }
2660 
2661 static int
2662 sdev_modctl_lookup(const char *path, vnode_t **r_vp)
2663 {
2664 	vnode_t *vp;
2665 	vnode_t *cvp;
2666 	struct sdev_node *svp;
2667 	char *nm;
2668 	struct pathname pn;
2669 	int error;
2670 	int persisted = 0;
2671 
2672 	ASSERT(INGLOBALZONE(curproc));
2673 
2674 	if (error = pn_get((char *)path, UIO_SYSSPACE, &pn))
2675 		return (error);
2676 	nm = kmem_alloc(MAXNAMELEN, KM_SLEEP);
2677 
2678 	vp = rootdir;
2679 	VN_HOLD(vp);
2680 
2681 	while (pn_pathleft(&pn)) {
2682 		ASSERT(vp->v_type == VDIR || vp->v_type == VLNK);
2683 		(void) pn_getcomponent(&pn, nm);
2684 
2685 		/*
2686 		 * Deal with the .. special case where we may be
2687 		 * traversing up across a mount point, to the
2688 		 * root of this filesystem or global root.
2689 		 */
2690 		if (nm[0] == '.' && nm[1] == '.' && nm[2] == 0) {
2691 checkforroot:
2692 			if (VN_CMP(vp, rootdir)) {
2693 				nm[1] = 0;
2694 			} else if (vp->v_flag & VROOT) {
2695 				vfs_t *vfsp;
2696 				cvp = vp;
2697 				vfsp = cvp->v_vfsp;
2698 				vfs_rlock_wait(vfsp);
2699 				vp = cvp->v_vfsp->vfs_vnodecovered;
2700 				if (vp == NULL ||
2701 				    (cvp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
2702 					vfs_unlock(vfsp);
2703 					VN_RELE(cvp);
2704 					error = EIO;
2705 					break;
2706 				}
2707 				VN_HOLD(vp);
2708 				vfs_unlock(vfsp);
2709 				VN_RELE(cvp);
2710 				cvp = NULL;
2711 				goto checkforroot;
2712 			}
2713 		}
2714 
2715 		error = VOP_LOOKUP(vp, nm, &cvp, NULL, 0, NULL, kcred, NULL,
2716 		    NULL, NULL);
2717 		if (error) {
2718 			VN_RELE(vp);
2719 			break;
2720 		}
2721 
2722 		/* traverse mount points encountered on our journey */
2723 		if (vn_ismntpt(cvp) && (error = traverse(&cvp)) != 0) {
2724 			VN_RELE(vp);
2725 			VN_RELE(cvp);
2726 			break;
2727 		}
2728 
2729 		/*
2730 		 * symbolic link, can be either relative and absolute
2731 		 */
2732 		if ((cvp->v_type == VLNK) && pn_pathleft(&pn)) {
2733 			struct pathname linkpath;
2734 			pn_alloc(&linkpath);
2735 			if (error = pn_getsymlink(cvp, &linkpath, kcred)) {
2736 				pn_free(&linkpath);
2737 				break;
2738 			}
2739 			if (pn_pathleft(&linkpath) == 0)
2740 				(void) pn_set(&linkpath, ".");
2741 			error = pn_insert(&pn, &linkpath, strlen(nm));
2742 			pn_free(&linkpath);
2743 			if (pn.pn_pathlen == 0) {
2744 				VN_RELE(vp);
2745 				return (ENOENT);
2746 			}
2747 			if (pn.pn_path[0] == '/') {
2748 				pn_skipslash(&pn);
2749 				VN_RELE(vp);
2750 				VN_RELE(cvp);
2751 				vp = rootdir;
2752 				VN_HOLD(vp);
2753 			} else {
2754 				VN_RELE(cvp);
2755 			}
2756 			continue;
2757 		}
2758 
2759 		VN_RELE(vp);
2760 
2761 		/*
2762 		 * Direct the operation to the persisting filesystem
2763 		 * underlying /dev.  Bail if we encounter a
2764 		 * non-persistent dev entity here.
2765 		 */
2766 		if (cvp->v_vfsp->vfs_fstype == devtype) {
2767 
2768 			if ((VTOSDEV(cvp)->sdev_flags & SDEV_PERSIST) == 0) {
2769 				error = ENOENT;
2770 				VN_RELE(cvp);
2771 				break;
2772 			}
2773 
2774 			if (VTOSDEV(cvp) == NULL) {
2775 				error = ENOENT;
2776 				VN_RELE(cvp);
2777 				break;
2778 			}
2779 			svp = VTOSDEV(cvp);
2780 			if ((vp = svp->sdev_attrvp) == NULL) {
2781 				error = ENOENT;
2782 				VN_RELE(cvp);
2783 				break;
2784 			}
2785 			persisted = 1;
2786 			VN_HOLD(vp);
2787 			VN_RELE(cvp);
2788 			cvp = vp;
2789 		}
2790 
2791 		vp = cvp;
2792 		pn_skipslash(&pn);
2793 	}
2794 
2795 	kmem_free(nm, MAXNAMELEN);
2796 	pn_free(&pn);
2797 
2798 	if (error)
2799 		return (error);
2800 
2801 	/*
2802 	 * Only return persisted nodes in the filesystem underlying /dev.
2803 	 */
2804 	if (!persisted) {
2805 		VN_RELE(vp);
2806 		return (ENOENT);
2807 	}
2808 
2809 	*r_vp = vp;
2810 	return (0);
2811 }
2812 
2813 int
2814 sdev_modctl_readdir(const char *dir, char ***dirlistp, int *npathsp,
2815     int *npathsp_alloc, int checking_empty)
2816 {
2817 	char	**pathlist = NULL;
2818 	char	**newlist = NULL;
2819 	int	npaths = 0;
2820 	int	npaths_alloc = 0;
2821 	dirent64_t *dbuf = NULL;
2822 	int	n;
2823 	char	*s;
2824 	int error;
2825 	vnode_t *vp;
2826 	int eof;
2827 	struct iovec iov;
2828 	struct uio uio;
2829 	struct dirent64 *dp;
2830 	size_t dlen;
2831 	size_t dbuflen;
2832 	int ndirents = 64;
2833 	char *nm;
2834 
2835 	error = sdev_modctl_lookup(dir, &vp);
2836 	sdcmn_err11(("modctl readdir: %s by %s: %s\n",
2837 	    dir, curproc->p_user.u_comm,
2838 	    (error == 0) ? "ok" : "failed"));
2839 	if (error)
2840 		return (error);
2841 
2842 	dlen = ndirents * (sizeof (*dbuf));
2843 	dbuf = kmem_alloc(dlen, KM_SLEEP);
2844 
2845 	uio.uio_iov = &iov;
2846 	uio.uio_iovcnt = 1;
2847 	uio.uio_segflg = UIO_SYSSPACE;
2848 	uio.uio_fmode = 0;
2849 	uio.uio_extflg = UIO_COPY_CACHED;
2850 	uio.uio_loffset = 0;
2851 	uio.uio_llimit = MAXOFFSET_T;
2852 
2853 	eof = 0;
2854 	error = 0;
2855 	while (!error && !eof) {
2856 		uio.uio_resid = dlen;
2857 		iov.iov_base = (char *)dbuf;
2858 		iov.iov_len = dlen;
2859 
2860 		(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2861 		error = VOP_READDIR(vp, &uio, kcred, &eof, NULL, 0);
2862 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2863 
2864 		dbuflen = dlen - uio.uio_resid;
2865 
2866 		if (error || dbuflen == 0)
2867 			break;
2868 
2869 		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
2870 		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
2871 
2872 			nm = dp->d_name;
2873 
2874 			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
2875 				continue;
2876 			if (npaths == npaths_alloc) {
2877 				npaths_alloc += 64;
2878 				newlist = (char **)
2879 				    kmem_zalloc((npaths_alloc + 1) *
2880 				    sizeof (char *), KM_SLEEP);
2881 				if (pathlist) {
2882 					bcopy(pathlist, newlist,
2883 					    npaths * sizeof (char *));
2884 					kmem_free(pathlist,
2885 					    (npaths + 1) * sizeof (char *));
2886 				}
2887 				pathlist = newlist;
2888 			}
2889 			n = strlen(nm) + 1;
2890 			s = kmem_alloc(n, KM_SLEEP);
2891 			bcopy(nm, s, n);
2892 			pathlist[npaths++] = s;
2893 			sdcmn_err11(("  %s/%s\n", dir, s));
2894 
2895 			/* if checking empty, one entry is as good as many */
2896 			if (checking_empty) {
2897 				eof = 1;
2898 				break;
2899 			}
2900 		}
2901 	}
2902 
2903 exit:
2904 	VN_RELE(vp);
2905 
2906 	if (dbuf)
2907 		kmem_free(dbuf, dlen);
2908 
2909 	if (error)
2910 		return (error);
2911 
2912 	*dirlistp = pathlist;
2913 	*npathsp = npaths;
2914 	*npathsp_alloc = npaths_alloc;
2915 
2916 	return (0);
2917 }
2918 
/*
 * Free a name list produced by sdev_modctl_readdir().
 *
 * pathlist holds npaths strings, each allocated at strlen + 1 bytes,
 * in an array of npaths_alloc + 1 pointers.  A NULL pathlist, which
 * sdev_modctl_readdir() produces when the directory has no entries,
 * is accepted and nothing is freed for it.
 */
void
sdev_modctl_readdir_free(char **pathlist, int npaths, int npaths_alloc)
{
	int	i, n;

	/* nothing was allocated for an empty listing */
	if (pathlist == NULL)
		return;

	for (i = 0; i < npaths; i++) {
		n = strlen(pathlist[i]) + 1;
		kmem_free(pathlist[i], n);
	}

	kmem_free(pathlist, (npaths_alloc + 1) * sizeof (char *));
}
2931 
2932 int
2933 sdev_modctl_devexists(const char *path)
2934 {
2935 	vnode_t *vp;
2936 	int error;
2937 
2938 	error = sdev_modctl_lookup(path, &vp);
2939 	sdcmn_err11(("modctl dev exists: %s by %s: %s\n",
2940 	    path, curproc->p_user.u_comm,
2941 	    (error == 0) ? "ok" : "failed"));
2942 	if (error == 0)
2943 		VN_RELE(vp);
2944 
2945 	return (error);
2946 }
2947 
2948 extern int sdev_vnodeops_tbl_size;
2949 
2950 /*
2951  * construct a new template with overrides from vtab
2952  */
2953 static fs_operation_def_t *
2954 sdev_merge_vtab(const fs_operation_def_t tab[])
2955 {
2956 	fs_operation_def_t *new;
2957 	const fs_operation_def_t *tab_entry;
2958 
2959 	/* make a copy of standard vnode ops table */
2960 	new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP);
2961 	bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size);
2962 
2963 	/* replace the overrides from tab */
2964 	for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) {
2965 		fs_operation_def_t *std_entry = new;
2966 		while (std_entry->name) {
2967 			if (strcmp(tab_entry->name, std_entry->name) == 0) {
2968 				std_entry->func = tab_entry->func;
2969 				break;
2970 			}
2971 			std_entry++;
2972 		}
2973 		if (std_entry->name == NULL)
2974 			cmn_err(CE_NOTE, "sdev_merge_vtab: entry %s unused.",
2975 			    tab_entry->name);
2976 	}
2977 
2978 	return (new);
2979 }
2980 
2981 /* free memory allocated by sdev_merge_vtab */
2982 static void
2983 sdev_free_vtab(fs_operation_def_t *new)
2984 {
2985 	kmem_free(new, sdev_vnodeops_tbl_size);
2986 }
2987 
2988 /*
2989  * a generic setattr() function
2990  *
2991  * note: flags only supports AT_UID and AT_GID.
2992  *	 Future enhancements can be done for other types, e.g. AT_MODE
2993  */
2994 int
2995 devname_setattr_func(struct vnode *vp, struct vattr *vap, int flags,
2996     struct cred *cred, int (*callback)(struct sdev_node *, struct vattr *,
2997     int), int protocol)
2998 {
2999 	struct sdev_node	*dv = VTOSDEV(vp);
3000 	struct sdev_node	*parent = dv->sdev_dotdot;
3001 	struct vattr		*get;
3002 	uint_t			mask = vap->va_mask;
3003 	int 			error;
3004 
3005 	/* some sanity checks */
3006 	if (vap->va_mask & AT_NOSET)
3007 		return (EINVAL);
3008 
3009 	if (vap->va_mask & AT_SIZE) {
3010 		if (vp->v_type == VDIR) {
3011 			return (EISDIR);
3012 		}
3013 	}
3014 
3015 	/* no need to set attribute, but do not fail either */
3016 	ASSERT(parent);
3017 	rw_enter(&parent->sdev_contents, RW_READER);
3018 	if (dv->sdev_state == SDEV_ZOMBIE) {
3019 		rw_exit(&parent->sdev_contents);
3020 		return (0);
3021 	}
3022 
3023 	/* If backing store exists, just set it. */
3024 	if (dv->sdev_attrvp) {
3025 		rw_exit(&parent->sdev_contents);
3026 		return (VOP_SETATTR(dv->sdev_attrvp, vap, flags, cred, NULL));
3027 	}
3028 
3029 	/*
3030 	 * Otherwise, for nodes with the persistence attribute, create it.
3031 	 */
3032 	ASSERT(dv->sdev_attr);
3033 	if (SDEV_IS_PERSIST(dv) ||
3034 	    ((vap->va_mask & ~AT_TIMES) != 0 && !SDEV_IS_DYNAMIC(dv))) {
3035 		sdev_vattr_merge(dv, vap);
3036 		rw_enter(&dv->sdev_contents, RW_WRITER);
3037 		error = sdev_shadow_node(dv, cred);
3038 		rw_exit(&dv->sdev_contents);
3039 		rw_exit(&parent->sdev_contents);
3040 
3041 		if (error)
3042 			return (error);
3043 		return (VOP_SETATTR(dv->sdev_attrvp, vap, flags, cred, NULL));
3044 	}
3045 
3046 
3047 	/*
3048 	 * sdev_attr was allocated in sdev_mknode
3049 	 */
3050 	rw_enter(&dv->sdev_contents, RW_WRITER);
3051 	error = secpolicy_vnode_setattr(cred, vp, vap,
3052 	    dv->sdev_attr, flags, sdev_unlocked_access, dv);
3053 	if (error) {
3054 		rw_exit(&dv->sdev_contents);
3055 		rw_exit(&parent->sdev_contents);
3056 		return (error);
3057 	}
3058 
3059 	get = dv->sdev_attr;
3060 	if (mask & AT_MODE) {
3061 		get->va_mode &= S_IFMT;
3062 		get->va_mode |= vap->va_mode & ~S_IFMT;
3063 	}
3064 
3065 	if ((mask & AT_UID) || (mask & AT_GID)) {
3066 		if (mask & AT_UID)
3067 			get->va_uid = vap->va_uid;
3068 		if (mask & AT_GID)
3069 			get->va_gid = vap->va_gid;
3070 		/*
3071 		 * a callback must be provided if the protocol is set
3072 		 */
3073 		if ((protocol & AT_UID) || (protocol & AT_GID)) {
3074 			ASSERT(callback);
3075 			error = callback(dv, get, protocol);
3076 			if (error) {
3077 				rw_exit(&dv->sdev_contents);
3078 				rw_exit(&parent->sdev_contents);
3079 				return (error);
3080 			}
3081 		}
3082 	}
3083 
3084 	if (mask & AT_ATIME)
3085 		get->va_atime = vap->va_atime;
3086 	if (mask & AT_MTIME)
3087 		get->va_mtime = vap->va_mtime;
3088 	if (mask & (AT_MODE | AT_UID | AT_GID | AT_CTIME)) {
3089 		gethrestime(&get->va_ctime);
3090 	}
3091 
3092 	sdev_vattr_merge(dv, get);
3093 	rw_exit(&dv->sdev_contents);
3094 	rw_exit(&parent->sdev_contents);
3095 	return (0);
3096 }
3097 
3098 /*
3099  * a generic inactive() function
3100  */
3101 /*ARGSUSED*/
3102 void
3103 devname_inactive_func(struct vnode *vp, struct cred *cred,
3104     void (*callback)(struct vnode *))
3105 {
3106 	int clean;
3107 	struct sdev_node *dv = VTOSDEV(vp);
3108 	int state;
3109 
3110 	mutex_enter(&vp->v_lock);
3111 	ASSERT(vp->v_count >= 1);
3112 
3113 
3114 	if (vp->v_count == 1 && callback != NULL)
3115 		callback(vp);
3116 
3117 	rw_enter(&dv->sdev_contents, RW_WRITER);
3118 	state = dv->sdev_state;
3119 
3120 	clean = (vp->v_count == 1) && (state == SDEV_ZOMBIE);
3121 
3122 	/*
3123 	 * sdev is a rather bad public citizen. It violates the general
3124 	 * agreement that in memory nodes should always have a valid reference
3125 	 * count on their vnode. But that's not the case here. This means that
3126 	 * we do actually have to distinguish between getting inactive callbacks
3127 	 * for zombies and otherwise. This should probably be fixed.
3128 	 */
3129 	if (clean) {
3130 		/* Remove the . entry to ourselves */
3131 		if (vp->v_type == VDIR) {
3132 			decr_link(dv);
3133 		}
3134 		VERIFY(dv->sdev_nlink == 1);
3135 		decr_link(dv);
3136 		VN_RELE_LOCKED(vp);
3137 		rw_exit(&dv->sdev_contents);
3138 		mutex_exit(&vp->v_lock);
3139 		sdev_nodedestroy(dv, 0);
3140 	} else {
3141 		VN_RELE_LOCKED(vp);
3142 		rw_exit(&dv->sdev_contents);
3143 		mutex_exit(&vp->v_lock);
3144 	}
3145 }
3146