xref: /titanic_50/usr/src/uts/common/fs/dev/sdev_subr.c (revision 7e0e2549bfaa531aff576083ab0c07f84fa8fb27)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
24  */
25 
26 /*
27  * utility routines for the /dev fs
28  */
29 
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/t_lock.h>
33 #include <sys/systm.h>
34 #include <sys/sysmacros.h>
35 #include <sys/user.h>
36 #include <sys/time.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/file.h>
40 #include <sys/fcntl.h>
41 #include <sys/flock.h>
42 #include <sys/kmem.h>
43 #include <sys/uio.h>
44 #include <sys/errno.h>
45 #include <sys/stat.h>
46 #include <sys/cred.h>
47 #include <sys/dirent.h>
48 #include <sys/pathname.h>
49 #include <sys/cmn_err.h>
50 #include <sys/debug.h>
51 #include <sys/mode.h>
52 #include <sys/policy.h>
53 #include <fs/fs_subr.h>
54 #include <sys/mount.h>
55 #include <sys/fs/snode.h>
56 #include <sys/fs/dv_node.h>
57 #include <sys/fs/sdev_impl.h>
58 #include <sys/sunndi.h>
59 #include <sys/sunmdi.h>
60 #include <sys/conf.h>
61 #include <sys/proc.h>
62 #include <sys/user.h>
63 #include <sys/modctl.h>
64 
#ifdef DEBUG
/*
 * Debug-only tunables; neither variable exists in a non-DEBUG build.
 */
/* kmem cache creation flags picked up by sdev_node_cache_init() */
int sdev_debug_cache_flags = 0;
/* presumably the message mask consulted by the sdcmn_err macros - confirm */
int sdev_debug = 0x00000001;
#endif
69 
70 /*
71  * globals
72  */
73 /* prototype memory vattrs */
74 vattr_t sdev_vattr_dir = {
75 	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
76 	VDIR,					/* va_type */
77 	SDEV_DIRMODE_DEFAULT,			/* va_mode */
78 	SDEV_UID_DEFAULT,			/* va_uid */
79 	SDEV_GID_DEFAULT,			/* va_gid */
80 	0,					/* va_fsid */
81 	0,					/* va_nodeid */
82 	0,					/* va_nlink */
83 	0,					/* va_size */
84 	0,					/* va_atime */
85 	0,					/* va_mtime */
86 	0,					/* va_ctime */
87 	0,					/* va_rdev */
88 	0,					/* va_blksize */
89 	0,					/* va_nblocks */
90 	0					/* va_vcode */
91 };
92 
93 vattr_t sdev_vattr_lnk = {
94 	AT_TYPE|AT_MODE,			/* va_mask */
95 	VLNK,					/* va_type */
96 	SDEV_LNKMODE_DEFAULT,			/* va_mode */
97 	SDEV_UID_DEFAULT,			/* va_uid */
98 	SDEV_GID_DEFAULT,			/* va_gid */
99 	0,					/* va_fsid */
100 	0,					/* va_nodeid */
101 	0,					/* va_nlink */
102 	0,					/* va_size */
103 	0,					/* va_atime */
104 	0,					/* va_mtime */
105 	0,					/* va_ctime */
106 	0,					/* va_rdev */
107 	0,					/* va_blksize */
108 	0,					/* va_nblocks */
109 	0					/* va_vcode */
110 };
111 
112 vattr_t sdev_vattr_blk = {
113 	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
114 	VBLK,					/* va_type */
115 	S_IFBLK | SDEV_DEVMODE_DEFAULT,		/* va_mode */
116 	SDEV_UID_DEFAULT,			/* va_uid */
117 	SDEV_GID_DEFAULT,			/* va_gid */
118 	0,					/* va_fsid */
119 	0,					/* va_nodeid */
120 	0,					/* va_nlink */
121 	0,					/* va_size */
122 	0,					/* va_atime */
123 	0,					/* va_mtime */
124 	0,					/* va_ctime */
125 	0,					/* va_rdev */
126 	0,					/* va_blksize */
127 	0,					/* va_nblocks */
128 	0					/* va_vcode */
129 };
130 
131 vattr_t sdev_vattr_chr = {
132 	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
133 	VCHR,					/* va_type */
134 	S_IFCHR | SDEV_DEVMODE_DEFAULT,		/* va_mode */
135 	SDEV_UID_DEFAULT,			/* va_uid */
136 	SDEV_GID_DEFAULT,			/* va_gid */
137 	0,					/* va_fsid */
138 	0,					/* va_nodeid */
139 	0,					/* va_nlink */
140 	0,					/* va_size */
141 	0,					/* va_atime */
142 	0,					/* va_mtime */
143 	0,					/* va_ctime */
144 	0,					/* va_rdev */
145 	0,					/* va_blksize */
146 	0,					/* va_nblocks */
147 	0					/* va_vcode */
148 };
149 
150 kmem_cache_t	*sdev_node_cache;	/* sdev_node cache */
151 int		devtype;		/* fstype */
152 
153 /* static */
154 static struct vnodeops *sdev_get_vop(struct sdev_node *);
155 static void sdev_set_no_negcache(struct sdev_node *);
156 static fs_operation_def_t *sdev_merge_vtab(const fs_operation_def_t []);
157 static void sdev_free_vtab(fs_operation_def_t *);
158 
159 static void
160 sdev_prof_free(struct sdev_node *dv)
161 {
162 	ASSERT(!SDEV_IS_GLOBAL(dv));
163 	if (dv->sdev_prof.dev_name)
164 		nvlist_free(dv->sdev_prof.dev_name);
165 	if (dv->sdev_prof.dev_map)
166 		nvlist_free(dv->sdev_prof.dev_map);
167 	if (dv->sdev_prof.dev_symlink)
168 		nvlist_free(dv->sdev_prof.dev_symlink);
169 	if (dv->sdev_prof.dev_glob_incdir)
170 		nvlist_free(dv->sdev_prof.dev_glob_incdir);
171 	if (dv->sdev_prof.dev_glob_excdir)
172 		nvlist_free(dv->sdev_prof.dev_glob_excdir);
173 	bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
174 }
175 
176 /* sdev_node cache constructor */
177 /*ARGSUSED1*/
178 static int
179 i_sdev_node_ctor(void *buf, void *cfarg, int flag)
180 {
181 	struct sdev_node *dv = (struct sdev_node *)buf;
182 	struct vnode *vp;
183 
184 	bzero(buf, sizeof (struct sdev_node));
185 	vp = dv->sdev_vnode = vn_alloc(flag);
186 	if (vp == NULL) {
187 		return (-1);
188 	}
189 	vp->v_data = dv;
190 	rw_init(&dv->sdev_contents, NULL, RW_DEFAULT, NULL);
191 	return (0);
192 }
193 
194 /* sdev_node cache destructor */
195 /*ARGSUSED1*/
196 static void
197 i_sdev_node_dtor(void *buf, void *arg)
198 {
199 	struct sdev_node *dv = (struct sdev_node *)buf;
200 	struct vnode *vp = SDEVTOV(dv);
201 
202 	rw_destroy(&dv->sdev_contents);
203 	vn_free(vp);
204 }
205 
206 /* initialize sdev_node cache */
207 void
208 sdev_node_cache_init()
209 {
210 	int flags = 0;
211 
212 #ifdef	DEBUG
213 	flags = sdev_debug_cache_flags;
214 	if (flags)
215 		sdcmn_err(("cache debug flags 0x%x\n", flags));
216 #endif	/* DEBUG */
217 
218 	ASSERT(sdev_node_cache == NULL);
219 	sdev_node_cache = kmem_cache_create("sdev_node_cache",
220 	    sizeof (struct sdev_node), 0, i_sdev_node_ctor, i_sdev_node_dtor,
221 	    NULL, NULL, NULL, flags);
222 }
223 
224 /* destroy sdev_node cache */
225 void
226 sdev_node_cache_fini()
227 {
228 	ASSERT(sdev_node_cache != NULL);
229 	kmem_cache_destroy(sdev_node_cache);
230 	sdev_node_cache = NULL;
231 }
232 
233 /*
234  * Compare two nodes lexographically to balance avl tree
235  */
236 static int
237 sdev_compare_nodes(const struct sdev_node *dv1, const struct sdev_node *dv2)
238 {
239 	int rv;
240 	if ((rv = strcmp(dv1->sdev_name, dv2->sdev_name)) == 0)
241 		return (0);
242 	return ((rv < 0) ? -1 : 1);
243 }
244 
245 void
246 sdev_set_nodestate(struct sdev_node *dv, sdev_node_state_t state)
247 {
248 	ASSERT(dv);
249 	ASSERT(RW_WRITE_HELD(&dv->sdev_contents));
250 	dv->sdev_state = state;
251 }
252 
253 static void
254 sdev_attr_update(struct sdev_node *dv, vattr_t *vap)
255 {
256 	timestruc_t	now;
257 	struct vattr	*attrp;
258 	uint_t		mask;
259 
260 	ASSERT(dv->sdev_attr);
261 	ASSERT(vap);
262 
263 	attrp = dv->sdev_attr;
264 	mask = vap->va_mask;
265 	if (mask & AT_TYPE)
266 		attrp->va_type = vap->va_type;
267 	if (mask & AT_MODE)
268 		attrp->va_mode = vap->va_mode;
269 	if (mask & AT_UID)
270 		attrp->va_uid = vap->va_uid;
271 	if (mask & AT_GID)
272 		attrp->va_gid = vap->va_gid;
273 	if (mask & AT_RDEV)
274 		attrp->va_rdev = vap->va_rdev;
275 
276 	gethrestime(&now);
277 	attrp->va_atime = (mask & AT_ATIME) ? vap->va_atime : now;
278 	attrp->va_mtime = (mask & AT_MTIME) ? vap->va_mtime : now;
279 	attrp->va_ctime = (mask & AT_CTIME) ? vap->va_ctime : now;
280 }
281 
282 static void
283 sdev_attr_alloc(struct sdev_node *dv, vattr_t *vap)
284 {
285 	ASSERT(dv->sdev_attr == NULL);
286 	ASSERT(vap->va_mask & AT_TYPE);
287 	ASSERT(vap->va_mask & AT_MODE);
288 
289 	dv->sdev_attr = kmem_zalloc(sizeof (struct vattr), KM_SLEEP);
290 	sdev_attr_update(dv, vap);
291 }
292 
293 /* alloc and initialize a sdev_node */
294 int
295 sdev_nodeinit(struct sdev_node *ddv, char *nm, struct sdev_node **newdv,
296     vattr_t *vap)
297 {
298 	struct sdev_node *dv = NULL;
299 	struct vnode *vp;
300 	size_t nmlen, len;
301 	devname_handle_t  *dhl;
302 
303 	nmlen = strlen(nm) + 1;
304 	if (nmlen > MAXNAMELEN) {
305 		sdcmn_err9(("sdev_nodeinit: node name %s"
306 		    " too long\n", nm));
307 		*newdv = NULL;
308 		return (ENAMETOOLONG);
309 	}
310 
311 	dv = kmem_cache_alloc(sdev_node_cache, KM_SLEEP);
312 
313 	dv->sdev_name = kmem_alloc(nmlen, KM_SLEEP);
314 	bcopy(nm, dv->sdev_name, nmlen);
315 	dv->sdev_namelen = nmlen - 1;	/* '\0' not included */
316 	len = strlen(ddv->sdev_path) + strlen(nm) + 2;
317 	dv->sdev_path = kmem_alloc(len, KM_SLEEP);
318 	(void) snprintf(dv->sdev_path, len, "%s/%s", ddv->sdev_path, nm);
319 	/* overwritten for VLNK nodes */
320 	dv->sdev_symlink = NULL;
321 
322 	vp = SDEVTOV(dv);
323 	vn_reinit(vp);
324 	vp->v_vfsp = SDEVTOV(ddv)->v_vfsp;
325 	if (vap)
326 		vp->v_type = vap->va_type;
327 
328 	/*
329 	 * initialized to the parent's vnodeops.
330 	 * maybe overwriten for a VDIR
331 	 */
332 	vn_setops(vp, vn_getops(SDEVTOV(ddv)));
333 	vn_exists(vp);
334 
335 	dv->sdev_dotdot = NULL;
336 	dv->sdev_attrvp = NULL;
337 	if (vap) {
338 		sdev_attr_alloc(dv, vap);
339 	} else {
340 		dv->sdev_attr = NULL;
341 	}
342 
343 	dv->sdev_ino = sdev_mkino(dv);
344 	dv->sdev_nlink = 0;		/* updated on insert */
345 	dv->sdev_flags = ddv->sdev_flags; /* inherit from the parent first */
346 	dv->sdev_flags |= SDEV_BUILD;
347 	mutex_init(&dv->sdev_lookup_lock, NULL, MUTEX_DEFAULT, NULL);
348 	cv_init(&dv->sdev_lookup_cv, NULL, CV_DEFAULT, NULL);
349 	if (SDEV_IS_GLOBAL(ddv)) {
350 		dv->sdev_flags |= SDEV_GLOBAL;
351 		dhl = &(dv->sdev_handle);
352 		dhl->dh_data = dv;
353 		dhl->dh_args = NULL;
354 		sdev_set_no_negcache(dv);
355 		dv->sdev_gdir_gen = 0;
356 	} else {
357 		dv->sdev_flags &= ~SDEV_GLOBAL;
358 		dv->sdev_origin = NULL; /* set later */
359 		bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
360 		dv->sdev_ldir_gen = 0;
361 		dv->sdev_devtree_gen = 0;
362 	}
363 
364 	rw_enter(&dv->sdev_contents, RW_WRITER);
365 	sdev_set_nodestate(dv, SDEV_INIT);
366 	rw_exit(&dv->sdev_contents);
367 	*newdv = dv;
368 
369 	return (0);
370 }
371 
372 /*
373  * Transition a sdev_node into SDEV_READY state. If this fails, it is up to the
374  * caller to transition the node to the SDEV_ZOMBIE state.
375  */
376 int
377 sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp,
378     void *args, struct cred *cred)
379 {
380 	int error = 0;
381 	struct vnode *vp = SDEVTOV(dv);
382 	vtype_t type;
383 
384 	ASSERT(dv && (dv->sdev_state != SDEV_READY) && vap);
385 
386 	type = vap->va_type;
387 	vp->v_type = type;
388 	vp->v_rdev = vap->va_rdev;
389 	rw_enter(&dv->sdev_contents, RW_WRITER);
390 	if (type == VDIR) {
391 		dv->sdev_nlink = 2;
392 		dv->sdev_flags &= ~SDEV_PERSIST;
393 		dv->sdev_flags &= ~SDEV_DYNAMIC;
394 		vn_setops(vp, sdev_get_vop(dv)); /* from internal vtab */
395 		ASSERT(dv->sdev_dotdot);
396 		ASSERT(SDEVTOV(dv->sdev_dotdot)->v_type == VDIR);
397 		vp->v_rdev = SDEVTOV(dv->sdev_dotdot)->v_rdev;
398 		avl_create(&dv->sdev_entries,
399 		    (int (*)(const void *, const void *))sdev_compare_nodes,
400 		    sizeof (struct sdev_node),
401 		    offsetof(struct sdev_node, sdev_avllink));
402 	} else if (type == VLNK) {
403 		ASSERT(args);
404 		dv->sdev_nlink = 1;
405 		dv->sdev_symlink = i_ddi_strdup((char *)args, KM_SLEEP);
406 	} else {
407 		dv->sdev_nlink = 1;
408 	}
409 
410 	if (!(SDEV_IS_GLOBAL(dv))) {
411 		dv->sdev_origin = (struct sdev_node *)args;
412 		dv->sdev_flags &= ~SDEV_PERSIST;
413 	}
414 
415 	/*
416 	 * shadow node is created here OR
417 	 * if failed (indicated by dv->sdev_attrvp == NULL),
418 	 * created later in sdev_setattr
419 	 */
420 	if (avp) {
421 		dv->sdev_attrvp = avp;
422 	} else {
423 		if (dv->sdev_attr == NULL) {
424 			sdev_attr_alloc(dv, vap);
425 		} else {
426 			sdev_attr_update(dv, vap);
427 		}
428 
429 		if ((dv->sdev_attrvp == NULL) && SDEV_IS_PERSIST(dv))
430 			error = sdev_shadow_node(dv, cred);
431 	}
432 
433 	if (error == 0) {
434 		/* transition to READY state */
435 		sdev_set_nodestate(dv, SDEV_READY);
436 		sdev_nc_node_exists(dv);
437 	}
438 	rw_exit(&dv->sdev_contents);
439 	return (error);
440 }
441 
442 /*
443  * Build the VROOT sdev_node.
444  */
445 /*ARGSUSED*/
446 struct sdev_node *
447 sdev_mkroot(struct vfs *vfsp, dev_t devdev, struct vnode *mvp,
448     struct vnode *avp, struct cred *cred)
449 {
450 	struct sdev_node *dv;
451 	struct vnode *vp;
452 	char devdir[] = "/dev";
453 
454 	ASSERT(sdev_node_cache != NULL);
455 	ASSERT(avp);
456 	dv = kmem_cache_alloc(sdev_node_cache, KM_SLEEP);
457 	vp = SDEVTOV(dv);
458 	vn_reinit(vp);
459 	vp->v_flag |= VROOT;
460 	vp->v_vfsp = vfsp;
461 	vp->v_type = VDIR;
462 	vp->v_rdev = devdev;
463 	vn_setops(vp, sdev_vnodeops); /* apply the default vnodeops at /dev */
464 	vn_exists(vp);
465 
466 	if (vfsp->vfs_mntpt)
467 		dv->sdev_name = i_ddi_strdup(
468 		    (char *)refstr_value(vfsp->vfs_mntpt), KM_SLEEP);
469 	else
470 		/* vfs_mountdev1 set mount point later */
471 		dv->sdev_name = i_ddi_strdup("/dev", KM_SLEEP);
472 	dv->sdev_namelen = strlen(dv->sdev_name); /* '\0' not included */
473 	dv->sdev_path = i_ddi_strdup(devdir, KM_SLEEP);
474 	dv->sdev_ino = SDEV_ROOTINO;
475 	dv->sdev_nlink = 2;		/* name + . (no sdev_insert) */
476 	dv->sdev_dotdot = dv;		/* .. == self */
477 	dv->sdev_attrvp = avp;
478 	dv->sdev_attr = NULL;
479 	mutex_init(&dv->sdev_lookup_lock, NULL, MUTEX_DEFAULT, NULL);
480 	cv_init(&dv->sdev_lookup_cv, NULL, CV_DEFAULT, NULL);
481 	if (strcmp(dv->sdev_name, "/dev") == 0) {
482 		dv->sdev_flags = SDEV_BUILD|SDEV_GLOBAL|SDEV_PERSIST;
483 		bzero(&dv->sdev_handle, sizeof (dv->sdev_handle));
484 		dv->sdev_gdir_gen = 0;
485 	} else {
486 		dv->sdev_flags = SDEV_BUILD;
487 		dv->sdev_flags &= ~SDEV_PERSIST;
488 		bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
489 		dv->sdev_ldir_gen = 0;
490 		dv->sdev_devtree_gen = 0;
491 	}
492 
493 	avl_create(&dv->sdev_entries,
494 	    (int (*)(const void *, const void *))sdev_compare_nodes,
495 	    sizeof (struct sdev_node),
496 	    offsetof(struct sdev_node, sdev_avllink));
497 
498 	rw_enter(&dv->sdev_contents, RW_WRITER);
499 	sdev_set_nodestate(dv, SDEV_READY);
500 	rw_exit(&dv->sdev_contents);
501 	sdev_nc_node_exists(dv);
502 	return (dv);
503 }
504 
505 /* directory dependent vop table */
506 struct sdev_vop_table {
507 	char *vt_name;				/* subdirectory name */
508 	const fs_operation_def_t *vt_service;	/* vnodeops table */
509 	struct vnodeops *vt_vops;		/* constructed vop */
510 	struct vnodeops **vt_global_vops;	/* global container for vop */
511 	int (*vt_vtor)(struct sdev_node *);	/* validate sdev_node */
512 	int vt_flags;
513 };
514 
515 /*
516  * A nice improvement would be to provide a plug-in mechanism
517  * for this table instead of a const table.
518  */
519 static struct sdev_vop_table vtab[] =
520 {
521 	{ "pts", devpts_vnodeops_tbl, NULL, &devpts_vnodeops, devpts_validate,
522 	SDEV_DYNAMIC | SDEV_VTOR },
523 
524 	{ "vt", devvt_vnodeops_tbl, NULL, &devvt_vnodeops, devvt_validate,
525 	SDEV_DYNAMIC | SDEV_VTOR },
526 
527 	{ "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops,
528 	devzvol_validate, SDEV_ZONED | SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },
529 
530 	{ "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE },
531 
532 	{ "net", devnet_vnodeops_tbl, NULL, &devnet_vnodeops, devnet_validate,
533 	SDEV_DYNAMIC | SDEV_VTOR },
534 
535 	{ "ipnet", devipnet_vnodeops_tbl, NULL, &devipnet_vnodeops,
536 	devipnet_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_NO_NCACHE },
537 
538 	/*
539 	 * SDEV_DYNAMIC: prevent calling out to devfsadm, since only the
540 	 * lofi driver controls child nodes.
541 	 *
542 	 * SDEV_PERSIST: ensure devfsadm knows to clean up any persisted
543 	 * stale nodes (e.g. from devfsadm -R).
544 	 *
545 	 * In addition, devfsadm knows not to attempt a rmdir: a zone
546 	 * may hold a reference, which would zombify the node,
547 	 * preventing a mkdir.
548 	 */
549 
550 	{ "lofi", NULL, NULL, NULL, NULL,
551 	    SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },
552 	{ "rlofi", NULL, NULL, NULL, NULL,
553 	    SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },
554 
555 	{ NULL, NULL, NULL, NULL, NULL, 0}
556 };
557 
558 /*
559  * We need to match off of the sdev_path, not the sdev_name. We are only allowed
560  * to exist directly under /dev.
561  */
562 struct sdev_vop_table *
563 sdev_match(struct sdev_node *dv)
564 {
565 	int vlen;
566 	int i;
567 	const char *path;
568 
569 	if (strlen(dv->sdev_path) <= 5)
570 		return (NULL);
571 
572 	if (strncmp(dv->sdev_path, "/dev/", 5) != 0)
573 		return (NULL);
574 	path = dv->sdev_path + 5;
575 
576 	for (i = 0; vtab[i].vt_name; i++) {
577 		if (strcmp(vtab[i].vt_name, path) == 0)
578 			return (&vtab[i]);
579 		if (vtab[i].vt_flags & SDEV_SUBDIR) {
580 			vlen = strlen(vtab[i].vt_name);
581 			if ((strncmp(vtab[i].vt_name, path,
582 			    vlen - 1) == 0) && path[vlen] == '/')
583 				return (&vtab[i]);
584 		}
585 
586 	}
587 	return (NULL);
588 }
589 
590 /*
591  *  sets a directory's vnodeops if the directory is in the vtab;
592  */
593 static struct vnodeops *
594 sdev_get_vop(struct sdev_node *dv)
595 {
596 	struct sdev_vop_table *vtp;
597 	char *path;
598 
599 	path = dv->sdev_path;
600 	ASSERT(path);
601 
602 	/* gets the relative path to /dev/ */
603 	path += 5;
604 
605 	/* gets the vtab entry it matches */
606 	if ((vtp = sdev_match(dv)) != NULL) {
607 		dv->sdev_flags |= vtp->vt_flags;
608 		if (SDEV_IS_PERSIST(dv->sdev_dotdot) &&
609 		    (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv)))
610 			dv->sdev_flags |= SDEV_PERSIST;
611 
612 		if (vtp->vt_vops) {
613 			if (vtp->vt_global_vops)
614 				*(vtp->vt_global_vops) = vtp->vt_vops;
615 
616 			return (vtp->vt_vops);
617 		}
618 
619 		if (vtp->vt_service) {
620 			fs_operation_def_t *templ;
621 			templ = sdev_merge_vtab(vtp->vt_service);
622 			if (vn_make_ops(vtp->vt_name,
623 			    (const fs_operation_def_t *)templ,
624 			    &vtp->vt_vops) != 0) {
625 				cmn_err(CE_PANIC, "%s: malformed vnode ops\n",
626 				    vtp->vt_name);
627 				/*NOTREACHED*/
628 			}
629 			if (vtp->vt_global_vops) {
630 				*(vtp->vt_global_vops) = vtp->vt_vops;
631 			}
632 			sdev_free_vtab(templ);
633 
634 			return (vtp->vt_vops);
635 		}
636 
637 		return (sdev_vnodeops);
638 	}
639 
640 	/* child inherits the persistence of the parent */
641 	if (SDEV_IS_PERSIST(dv->sdev_dotdot))
642 		dv->sdev_flags |= SDEV_PERSIST;
643 
644 	return (sdev_vnodeops);
645 }
646 
647 static void
648 sdev_set_no_negcache(struct sdev_node *dv)
649 {
650 	int i;
651 	char *path;
652 
653 	ASSERT(dv->sdev_path);
654 	path = dv->sdev_path + strlen("/dev/");
655 
656 	for (i = 0; vtab[i].vt_name; i++) {
657 		if (strcmp(vtab[i].vt_name, path) == 0) {
658 			if (vtab[i].vt_flags & SDEV_NO_NCACHE)
659 				dv->sdev_flags |= SDEV_NO_NCACHE;
660 			break;
661 		}
662 	}
663 }
664 
665 void *
666 sdev_get_vtor(struct sdev_node *dv)
667 {
668 	struct sdev_vop_table *vtp;
669 
670 	vtp = sdev_match(dv);
671 	if (vtp)
672 		return ((void *)vtp->vt_vtor);
673 	else
674 		return (NULL);
675 }
676 
677 /*
678  * Build the base root inode
679  */
680 ino_t
681 sdev_mkino(struct sdev_node *dv)
682 {
683 	ino_t	ino;
684 
685 	/*
686 	 * for now, follow the lead of tmpfs here
687 	 * need to someday understand the requirements here
688 	 */
689 	ino = (ino_t)(uint32_t)((uintptr_t)dv >> 3);
690 	ino += SDEV_ROOTINO + 1;
691 
692 	return (ino);
693 }
694 
695 int
696 sdev_getlink(struct vnode *linkvp, char **link)
697 {
698 	int err;
699 	char *buf;
700 	struct uio uio = {0};
701 	struct iovec iov = {0};
702 
703 	if (linkvp == NULL)
704 		return (ENOENT);
705 	ASSERT(linkvp->v_type == VLNK);
706 
707 	buf = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
708 	iov.iov_base = buf;
709 	iov.iov_len = MAXPATHLEN;
710 	uio.uio_iov = &iov;
711 	uio.uio_iovcnt = 1;
712 	uio.uio_resid = MAXPATHLEN;
713 	uio.uio_segflg = UIO_SYSSPACE;
714 	uio.uio_llimit = MAXOFFSET_T;
715 
716 	err = VOP_READLINK(linkvp, &uio, kcred, NULL);
717 	if (err) {
718 		cmn_err(CE_WARN, "readlink %s failed in dev\n", buf);
719 		kmem_free(buf, MAXPATHLEN);
720 		return (ENOENT);
721 	}
722 
723 	/* mission complete */
724 	*link = i_ddi_strdup(buf, KM_SLEEP);
725 	kmem_free(buf, MAXPATHLEN);
726 	return (0);
727 }
728 
729 /*
730  * A convenient wrapper to get the devfs node vnode for a device
731  * minor functionality: readlink() of a /dev symlink
732  * Place the link into dv->sdev_symlink
733  */
734 static int
735 sdev_follow_link(struct sdev_node *dv)
736 {
737 	int err;
738 	struct vnode *linkvp;
739 	char *link = NULL;
740 
741 	linkvp = SDEVTOV(dv);
742 	if (linkvp == NULL)
743 		return (ENOENT);
744 	ASSERT(linkvp->v_type == VLNK);
745 	err = sdev_getlink(linkvp, &link);
746 	if (err) {
747 		dv->sdev_symlink = NULL;
748 		return (ENOENT);
749 	}
750 
751 	ASSERT(link != NULL);
752 	dv->sdev_symlink = link;
753 	return (0);
754 }
755 
756 static int
757 sdev_node_check(struct sdev_node *dv, struct vattr *nvap, void *nargs)
758 {
759 	vtype_t otype = SDEVTOV(dv)->v_type;
760 
761 	/*
762 	 * existing sdev_node has a different type.
763 	 */
764 	if (otype != nvap->va_type) {
765 		sdcmn_err9(("sdev_node_check: existing node "
766 		    "  %s type %d does not match new node type %d\n",
767 		    dv->sdev_name, otype, nvap->va_type));
768 		return (EEXIST);
769 	}
770 
771 	/*
772 	 * For a symlink, the target should be the same.
773 	 */
774 	if (otype == VLNK) {
775 		ASSERT(nargs != NULL);
776 		ASSERT(dv->sdev_symlink != NULL);
777 		if (strcmp(dv->sdev_symlink, (char *)nargs) != 0) {
778 			sdcmn_err9(("sdev_node_check: existing node "
779 			    " %s has different symlink %s as new node "
780 			    " %s\n", dv->sdev_name, dv->sdev_symlink,
781 			    (char *)nargs));
782 			return (EEXIST);
783 		}
784 	}
785 
786 	return (0);
787 }
788 
789 /*
790  * sdev_mknode - a wrapper for sdev_nodeinit(), sdev_nodeready()
791  *
792  * arguments:
793  *	- ddv (parent)
794  *	- nm (child name)
795  *	- newdv (sdev_node for nm is returned here)
796  *	- vap (vattr for the node to be created, va_type should be set.
797  *	- avp (attribute vnode)
798  *	  the defaults should be used if unknown)
799  *	- cred
800  *	- args
801  *	    . tnm (for VLNK)
802  *	    . global sdev_node (for !SDEV_GLOBAL)
803  * 	- state: SDEV_INIT, SDEV_READY
804  *
805  * only ddv, nm, newddv, vap, cred are required for sdev_mknode(SDEV_INIT)
806  *
807  * NOTE:  directory contents writers lock needs to be held before
808  *	  calling this routine.
809  */
810 int
811 sdev_mknode(struct sdev_node *ddv, char *nm, struct sdev_node **newdv,
812     struct vattr *vap, struct vnode *avp, void *args, struct cred *cred,
813     sdev_node_state_t state)
814 {
815 	int error = 0;
816 	sdev_node_state_t node_state;
817 	struct sdev_node *dv = NULL;
818 
819 	ASSERT(state != SDEV_ZOMBIE);
820 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
821 
822 	if (*newdv) {
823 		dv = *newdv;
824 	} else {
825 		/* allocate and initialize a sdev_node */
826 		if (ddv->sdev_state == SDEV_ZOMBIE) {
827 			sdcmn_err9(("sdev_mknode: parent %s ZOMBIEd\n",
828 			    ddv->sdev_path));
829 			return (ENOENT);
830 		}
831 
832 		error = sdev_nodeinit(ddv, nm, &dv, vap);
833 		if (error != 0) {
834 			sdcmn_err9(("sdev_mknode: error %d,"
835 			    " name %s can not be initialized\n",
836 			    error, nm));
837 			return (error);
838 		}
839 		ASSERT(dv);
840 
841 		/* insert into the directory cache */
842 		sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_ADD);
843 	}
844 
845 	ASSERT(dv);
846 	node_state = dv->sdev_state;
847 	ASSERT(node_state != SDEV_ZOMBIE);
848 
849 	if (state == SDEV_READY) {
850 		switch (node_state) {
851 		case SDEV_INIT:
852 			error = sdev_nodeready(dv, vap, avp, args, cred);
853 			if (error) {
854 				sdcmn_err9(("sdev_mknode: node %s can NOT"
855 				    " be transitioned into READY state, "
856 				    "error %d\n", nm, error));
857 			}
858 			break;
859 		case SDEV_READY:
860 			/*
861 			 * Do some sanity checking to make sure
862 			 * the existing sdev_node is what has been
863 			 * asked for.
864 			 */
865 			error = sdev_node_check(dv, vap, args);
866 			break;
867 		default:
868 			break;
869 		}
870 	}
871 
872 	if (!error) {
873 		*newdv = dv;
874 		ASSERT((*newdv)->sdev_state != SDEV_ZOMBIE);
875 	} else {
876 		sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_DELETE);
877 		/*
878 		 * We created this node, it wasn't passed into us. Therefore it
879 		 * is up to us to delete it.
880 		 */
881 		if (*newdv == NULL)
882 			SDEV_SIMPLE_RELE(dv);
883 		*newdv = NULL;
884 	}
885 
886 	return (error);
887 }
888 
889 /*
890  * convenient wrapper to change vp's ATIME, CTIME and MTIME
891  */
892 void
893 sdev_update_timestamps(struct vnode *vp, cred_t *cred, uint_t mask)
894 {
895 	struct vattr attr;
896 	timestruc_t now;
897 	int err;
898 
899 	ASSERT(vp);
900 	gethrestime(&now);
901 	if (mask & AT_CTIME)
902 		attr.va_ctime = now;
903 	if (mask & AT_MTIME)
904 		attr.va_mtime = now;
905 	if (mask & AT_ATIME)
906 		attr.va_atime = now;
907 
908 	attr.va_mask = (mask & AT_TIMES);
909 	err = VOP_SETATTR(vp, &attr, 0, cred, NULL);
910 	if (err && (err != EROFS)) {
911 		sdcmn_err(("update timestamps error %d\n", err));
912 	}
913 }
914 
915 /*
916  * the backing store vnode is released here
917  */
918 /*ARGSUSED1*/
919 void
920 sdev_nodedestroy(struct sdev_node *dv, uint_t flags)
921 {
922 	/* no references */
923 	ASSERT(dv->sdev_nlink == 0);
924 
925 	if (dv->sdev_attrvp != NULLVP) {
926 		VN_RELE(dv->sdev_attrvp);
927 		/*
928 		 * reset the attrvp so that no more
929 		 * references can be made on this already
930 		 * vn_rele() vnode
931 		 */
932 		dv->sdev_attrvp = NULLVP;
933 	}
934 
935 	if (dv->sdev_attr != NULL) {
936 		kmem_free(dv->sdev_attr, sizeof (struct vattr));
937 		dv->sdev_attr = NULL;
938 	}
939 
940 	if (dv->sdev_name != NULL) {
941 		kmem_free(dv->sdev_name, dv->sdev_namelen + 1);
942 		dv->sdev_name = NULL;
943 	}
944 
945 	if (dv->sdev_symlink != NULL) {
946 		kmem_free(dv->sdev_symlink, strlen(dv->sdev_symlink) + 1);
947 		dv->sdev_symlink = NULL;
948 	}
949 
950 	if (dv->sdev_path) {
951 		kmem_free(dv->sdev_path, strlen(dv->sdev_path) + 1);
952 		dv->sdev_path = NULL;
953 	}
954 
955 	if (!SDEV_IS_GLOBAL(dv))
956 		sdev_prof_free(dv);
957 
958 	if (SDEVTOV(dv)->v_type == VDIR) {
959 		ASSERT(SDEV_FIRST_ENTRY(dv) == NULL);
960 		avl_destroy(&dv->sdev_entries);
961 	}
962 
963 	mutex_destroy(&dv->sdev_lookup_lock);
964 	cv_destroy(&dv->sdev_lookup_cv);
965 
966 	/* return node to initial state as per constructor */
967 	(void) memset((void *)&dv->sdev_instance_data, 0,
968 	    sizeof (dv->sdev_instance_data));
969 	vn_invalid(SDEVTOV(dv));
970 	kmem_cache_free(sdev_node_cache, dv);
971 }
972 
973 /*
974  * DIRECTORY CACHE lookup
975  */
976 struct sdev_node *
977 sdev_findbyname(struct sdev_node *ddv, char *nm)
978 {
979 	struct sdev_node *dv;
980 	struct sdev_node dvtmp;
981 	avl_index_t	where;
982 
983 	ASSERT(RW_LOCK_HELD(&ddv->sdev_contents));
984 
985 	dvtmp.sdev_name = nm;
986 	dv = avl_find(&ddv->sdev_entries, &dvtmp, &where);
987 	if (dv) {
988 		ASSERT(dv->sdev_dotdot == ddv);
989 		ASSERT(strcmp(dv->sdev_name, nm) == 0);
990 		ASSERT(dv->sdev_state != SDEV_ZOMBIE);
991 		SDEV_HOLD(dv);
992 		return (dv);
993 	}
994 	return (NULL);
995 }
996 
997 /*
998  * Inserts a new sdev_node in a parent directory
999  */
1000 void
1001 sdev_direnter(struct sdev_node *ddv, struct sdev_node *dv)
1002 {
1003 	avl_index_t where;
1004 
1005 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1006 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
1007 	ASSERT(ddv->sdev_nlink >= 2);
1008 	ASSERT(dv->sdev_nlink == 0);
1009 	ASSERT(dv->sdev_state != SDEV_ZOMBIE);
1010 
1011 	dv->sdev_dotdot = ddv;
1012 	VERIFY(avl_find(&ddv->sdev_entries, dv, &where) == NULL);
1013 	avl_insert(&ddv->sdev_entries, dv, where);
1014 	ddv->sdev_nlink++;
1015 }
1016 
1017 /*
1018  * The following check is needed because while sdev_nodes are linked
1019  * in SDEV_INIT state, they have their link counts incremented only
1020  * in SDEV_READY state.
1021  */
1022 static void
1023 decr_link(struct sdev_node *dv)
1024 {
1025 	VERIFY(RW_WRITE_HELD(&dv->sdev_contents));
1026 	if (dv->sdev_state != SDEV_INIT) {
1027 		VERIFY(dv->sdev_nlink >= 1);
1028 		dv->sdev_nlink--;
1029 	} else {
1030 		VERIFY(dv->sdev_nlink == 0);
1031 	}
1032 }
1033 
1034 /*
1035  * Delete an existing dv from directory cache
1036  *
1037  * In the case of a node is still held by non-zero reference count, the node is
1038  * put into ZOMBIE state. The node is always unlinked from its parent, but it is
1039  * not destroyed via sdev_inactive until its reference count reaches "0".
1040  */
1041 static void
1042 sdev_dirdelete(struct sdev_node *ddv, struct sdev_node *dv)
1043 {
1044 	struct vnode *vp;
1045 	sdev_node_state_t os;
1046 
1047 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1048 
1049 	vp = SDEVTOV(dv);
1050 	mutex_enter(&vp->v_lock);
1051 	rw_enter(&dv->sdev_contents, RW_WRITER);
1052 	os = dv->sdev_state;
1053 	ASSERT(os != SDEV_ZOMBIE);
1054 	dv->sdev_state = SDEV_ZOMBIE;
1055 
1056 	/*
1057 	 * unlink ourselves from the parent directory now to take care of the ..
1058 	 * link. However, if we're a directory, we don't remove our reference to
1059 	 * ourself eg. '.' until we are torn down in the inactive callback.
1060 	 */
1061 	decr_link(ddv);
1062 	avl_remove(&ddv->sdev_entries, dv);
1063 	/*
1064 	 * sdev_inactive expects nodes to have a link to themselves when we're
1065 	 * tearing them down. If we're transitioning from the initial state to
1066 	 * zombie and not via ready, then we're not going to have this link that
1067 	 * comes from the node being ready. As a result, we need to increment
1068 	 * our link count by one to account for this.
1069 	 */
1070 	if (os == SDEV_INIT && dv->sdev_nlink == 0)
1071 		dv->sdev_nlink++;
1072 	rw_exit(&dv->sdev_contents);
1073 	mutex_exit(&vp->v_lock);
1074 }
1075 
1076 /*
1077  * check if the source is in the path of the target
1078  *
1079  * source and target are different
1080  */
1081 /*ARGSUSED2*/
1082 static int
1083 sdev_checkpath(struct sdev_node *sdv, struct sdev_node *tdv, struct cred *cred)
1084 {
1085 	int error = 0;
1086 	struct sdev_node *dotdot, *dir;
1087 
1088 	dotdot = tdv->sdev_dotdot;
1089 	ASSERT(dotdot);
1090 
1091 	/* fs root */
1092 	if (dotdot == tdv) {
1093 		return (0);
1094 	}
1095 
1096 	for (;;) {
1097 		/*
1098 		 * avoid error cases like
1099 		 *	mv a a/b
1100 		 *	mv a a/b/c
1101 		 *	etc.
1102 		 */
1103 		if (dotdot == sdv) {
1104 			error = EINVAL;
1105 			break;
1106 		}
1107 
1108 		dir = dotdot;
1109 		dotdot = dir->sdev_dotdot;
1110 
1111 		/* done checking because root is reached */
1112 		if (dir == dotdot) {
1113 			break;
1114 		}
1115 	}
1116 	return (error);
1117 }
1118 
1119 int
1120 sdev_rnmnode(struct sdev_node *oddv, struct sdev_node *odv,
1121     struct sdev_node *nddv, struct sdev_node **ndvp, char *nnm,
1122     struct cred *cred)
1123 {
1124 	int error = 0;
1125 	struct vnode *ovp = SDEVTOV(odv);
1126 	struct vnode *nvp;
1127 	struct vattr vattr;
1128 	int doingdir = (ovp->v_type == VDIR);
1129 	char *link = NULL;
1130 	int samedir = (oddv == nddv) ? 1 : 0;
1131 	int bkstore = 0;
1132 	struct sdev_node *idv = NULL;
1133 	struct sdev_node *ndv = NULL;
1134 	timestruc_t now;
1135 
1136 	vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
1137 	error = VOP_GETATTR(ovp, &vattr, 0, cred, NULL);
1138 	if (error)
1139 		return (error);
1140 
1141 	if (!samedir)
1142 		rw_enter(&oddv->sdev_contents, RW_WRITER);
1143 	rw_enter(&nddv->sdev_contents, RW_WRITER);
1144 
1145 	/*
1146 	 * the source may have been deleted by another thread before
1147 	 * we gets here.
1148 	 */
1149 	if (odv->sdev_state != SDEV_READY) {
1150 		error = ENOENT;
1151 		goto err_out;
1152 	}
1153 
1154 	if (doingdir && (odv == nddv)) {
1155 		error = EINVAL;
1156 		goto err_out;
1157 	}
1158 
1159 	/*
1160 	 * If renaming a directory, and the parents are different (".." must be
1161 	 * changed) then the source dir must not be in the dir hierarchy above
1162 	 * the target since it would orphan everything below the source dir.
1163 	 */
1164 	if (doingdir && (oddv != nddv)) {
1165 		error = sdev_checkpath(odv, nddv, cred);
1166 		if (error)
1167 			goto err_out;
1168 	}
1169 
1170 	/* fix the source for a symlink */
1171 	if (vattr.va_type == VLNK) {
1172 		if (odv->sdev_symlink == NULL) {
1173 			error = sdev_follow_link(odv);
1174 			if (error) {
1175 				/*
1176 				 * The underlying symlink doesn't exist. This
1177 				 * node probably shouldn't even exist. While
1178 				 * it's a bit jarring to consumers, we're going
1179 				 * to remove the node from /dev.
1180 				 */
1181 				if (SDEV_IS_PERSIST((*ndvp)))
1182 					bkstore = 1;
1183 				sdev_dirdelete(oddv, odv);
1184 				if (bkstore) {
1185 					ASSERT(nddv->sdev_attrvp);
1186 					error = VOP_REMOVE(nddv->sdev_attrvp,
1187 					    nnm, cred, NULL, 0);
1188 					if (error)
1189 						goto err_out;
1190 				}
1191 				error = ENOENT;
1192 				goto err_out;
1193 			}
1194 		}
1195 		ASSERT(odv->sdev_symlink);
1196 		link = i_ddi_strdup(odv->sdev_symlink, KM_SLEEP);
1197 	}
1198 
1199 	/* destination existing */
1200 	if (*ndvp) {
1201 		nvp = SDEVTOV(*ndvp);
1202 		ASSERT(nvp);
1203 
1204 		/* handling renaming to itself */
1205 		if (odv == *ndvp) {
1206 			error = 0;
1207 			goto err_out;
1208 		}
1209 
1210 		if (nvp->v_type == VDIR) {
1211 			if (!doingdir) {
1212 				error = EISDIR;
1213 				goto err_out;
1214 			}
1215 
1216 			if (vn_vfswlock(nvp)) {
1217 				error = EBUSY;
1218 				goto err_out;
1219 			}
1220 
1221 			if (vn_mountedvfs(nvp) != NULL) {
1222 				vn_vfsunlock(nvp);
1223 				error = EBUSY;
1224 				goto err_out;
1225 			}
1226 
1227 			/* in case dir1 exists in dir2 and "mv dir1 dir2" */
1228 			if ((*ndvp)->sdev_nlink > 2) {
1229 				vn_vfsunlock(nvp);
1230 				error = EEXIST;
1231 				goto err_out;
1232 			}
1233 			vn_vfsunlock(nvp);
1234 
1235 			/*
1236 			 * We did not place the hold on *ndvp, so even though
1237 			 * we're deleting the node, we should not get rid of our
1238 			 * reference.
1239 			 */
1240 			sdev_dirdelete(nddv, *ndvp);
1241 			*ndvp = NULL;
1242 			ASSERT(nddv->sdev_attrvp);
1243 			error = VOP_RMDIR(nddv->sdev_attrvp, nnm,
1244 			    nddv->sdev_attrvp, cred, NULL, 0);
1245 			if (error)
1246 				goto err_out;
1247 		} else {
1248 			if (doingdir) {
1249 				error = ENOTDIR;
1250 				goto err_out;
1251 			}
1252 
1253 			if (SDEV_IS_PERSIST((*ndvp))) {
1254 				bkstore = 1;
1255 			}
1256 
1257 			/*
1258 			 * Get rid of the node from the directory cache note.
1259 			 * Don't forget that it's not up to us to remove the vn
1260 			 * ref on the sdev node, as we did not place it.
1261 			 */
1262 			sdev_dirdelete(nddv, *ndvp);
1263 			*ndvp = NULL;
1264 			if (bkstore) {
1265 				ASSERT(nddv->sdev_attrvp);
1266 				error = VOP_REMOVE(nddv->sdev_attrvp,
1267 				    nnm, cred, NULL, 0);
1268 				if (error)
1269 					goto err_out;
1270 			}
1271 		}
1272 	}
1273 
1274 	/*
1275 	 * make a fresh node from the source attrs
1276 	 */
1277 	ASSERT(RW_WRITE_HELD(&nddv->sdev_contents));
1278 	error = sdev_mknode(nddv, nnm, ndvp, &vattr,
1279 	    NULL, (void *)link, cred, SDEV_READY);
1280 
1281 	if (link != NULL) {
1282 		kmem_free(link, strlen(link) + 1);
1283 		link = NULL;
1284 	}
1285 
1286 	if (error)
1287 		goto err_out;
1288 	ASSERT(*ndvp);
1289 	ASSERT((*ndvp)->sdev_state == SDEV_READY);
1290 
1291 	/* move dir contents */
1292 	if (doingdir) {
1293 		for (idv = SDEV_FIRST_ENTRY(odv); idv;
1294 		    idv = SDEV_NEXT_ENTRY(odv, idv)) {
1295 			SDEV_HOLD(idv);
1296 			error = sdev_rnmnode(odv, idv,
1297 			    (struct sdev_node *)(*ndvp), &ndv,
1298 			    idv->sdev_name, cred);
1299 			SDEV_RELE(idv);
1300 			if (error)
1301 				goto err_out;
1302 			ndv = NULL;
1303 		}
1304 	}
1305 
1306 	if ((*ndvp)->sdev_attrvp) {
1307 		sdev_update_timestamps((*ndvp)->sdev_attrvp, kcred,
1308 		    AT_CTIME|AT_ATIME);
1309 	} else {
1310 		ASSERT((*ndvp)->sdev_attr);
1311 		gethrestime(&now);
1312 		(*ndvp)->sdev_attr->va_ctime = now;
1313 		(*ndvp)->sdev_attr->va_atime = now;
1314 	}
1315 
1316 	if (nddv->sdev_attrvp) {
1317 		sdev_update_timestamps(nddv->sdev_attrvp, kcred,
1318 		    AT_MTIME|AT_ATIME);
1319 	} else {
1320 		ASSERT(nddv->sdev_attr);
1321 		gethrestime(&now);
1322 		nddv->sdev_attr->va_mtime = now;
1323 		nddv->sdev_attr->va_atime = now;
1324 	}
1325 	rw_exit(&nddv->sdev_contents);
1326 	if (!samedir)
1327 		rw_exit(&oddv->sdev_contents);
1328 
1329 	SDEV_RELE(*ndvp);
1330 	return (error);
1331 
1332 err_out:
1333 	if (link != NULL) {
1334 		kmem_free(link, strlen(link) + 1);
1335 		link = NULL;
1336 	}
1337 
1338 	rw_exit(&nddv->sdev_contents);
1339 	if (!samedir)
1340 		rw_exit(&oddv->sdev_contents);
1341 	return (error);
1342 }
1343 
1344 /*
1345  * Merge sdev_node specific information into an attribute structure.
1346  *
1347  * note: sdev_node is not locked here
1348  */
1349 void
1350 sdev_vattr_merge(struct sdev_node *dv, struct vattr *vap)
1351 {
1352 	struct vnode *vp = SDEVTOV(dv);
1353 
1354 	vap->va_nlink = dv->sdev_nlink;
1355 	vap->va_nodeid = dv->sdev_ino;
1356 	vap->va_fsid = SDEVTOV(dv->sdev_dotdot)->v_rdev;
1357 	vap->va_type = vp->v_type;
1358 
1359 	if (vp->v_type == VDIR) {
1360 		vap->va_rdev = 0;
1361 		vap->va_fsid = vp->v_rdev;
1362 	} else if (vp->v_type == VLNK) {
1363 		vap->va_rdev = 0;
1364 		vap->va_mode  &= ~S_IFMT;
1365 		vap->va_mode |= S_IFLNK;
1366 	} else if ((vp->v_type == VCHR) || (vp->v_type == VBLK)) {
1367 		vap->va_rdev = vp->v_rdev;
1368 		vap->va_mode &= ~S_IFMT;
1369 		if (vap->va_type == VCHR)
1370 			vap->va_mode |= S_IFCHR;
1371 		else
1372 			vap->va_mode |= S_IFBLK;
1373 	} else {
1374 		vap->va_rdev = 0;
1375 	}
1376 }
1377 
1378 struct vattr *
1379 sdev_getdefault_attr(enum vtype type)
1380 {
1381 	if (type == VDIR)
1382 		return (&sdev_vattr_dir);
1383 	else if (type == VCHR)
1384 		return (&sdev_vattr_chr);
1385 	else if (type == VBLK)
1386 		return (&sdev_vattr_blk);
1387 	else if (type == VLNK)
1388 		return (&sdev_vattr_lnk);
1389 	else
1390 		return (NULL);
1391 }
1392 int
1393 sdev_to_vp(struct sdev_node *dv, struct vnode **vpp)
1394 {
1395 	int rv = 0;
1396 	struct vnode *vp = SDEVTOV(dv);
1397 
1398 	switch (vp->v_type) {
1399 	case VCHR:
1400 	case VBLK:
1401 		/*
1402 		 * If vnode is a device, return special vnode instead
1403 		 * (though it knows all about -us- via sp->s_realvp)
1404 		 */
1405 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, kcred);
1406 		VN_RELE(vp);
1407 		if (*vpp == NULLVP)
1408 			rv = ENOSYS;
1409 		break;
1410 	default:	/* most types are returned as is */
1411 		*vpp = vp;
1412 		break;
1413 	}
1414 	return (rv);
1415 }
1416 
1417 /*
1418  * junction between devname and root file system, e.g. ufs
1419  */
1420 int
1421 devname_backstore_lookup(struct sdev_node *ddv, char *nm, struct vnode **rvp)
1422 {
1423 	struct vnode *rdvp = ddv->sdev_attrvp;
1424 	int rval = 0;
1425 
1426 	ASSERT(rdvp);
1427 
1428 	rval = VOP_LOOKUP(rdvp, nm, rvp, NULL, 0, NULL, kcred, NULL, NULL,
1429 	    NULL);
1430 	return (rval);
1431 }
1432 
1433 static int
1434 sdev_filldir_from_store(struct sdev_node *ddv, int dlen, struct cred *cred)
1435 {
1436 	struct sdev_node *dv = NULL;
1437 	char	*nm;
1438 	struct vnode *dirvp;
1439 	int	error;
1440 	vnode_t	*vp;
1441 	int eof;
1442 	struct iovec iov;
1443 	struct uio uio;
1444 	struct dirent64 *dp;
1445 	dirent64_t *dbuf;
1446 	size_t dbuflen;
1447 	struct vattr vattr;
1448 	char *link = NULL;
1449 
1450 	if (ddv->sdev_attrvp == NULL)
1451 		return (0);
1452 	if (!(ddv->sdev_flags & SDEV_BUILD))
1453 		return (0);
1454 
1455 	dirvp = ddv->sdev_attrvp;
1456 	VN_HOLD(dirvp);
1457 	dbuf = kmem_zalloc(dlen, KM_SLEEP);
1458 
1459 	uio.uio_iov = &iov;
1460 	uio.uio_iovcnt = 1;
1461 	uio.uio_segflg = UIO_SYSSPACE;
1462 	uio.uio_fmode = 0;
1463 	uio.uio_extflg = UIO_COPY_CACHED;
1464 	uio.uio_loffset = 0;
1465 	uio.uio_llimit = MAXOFFSET_T;
1466 
1467 	eof = 0;
1468 	error = 0;
1469 	while (!error && !eof) {
1470 		uio.uio_resid = dlen;
1471 		iov.iov_base = (char *)dbuf;
1472 		iov.iov_len = dlen;
1473 		(void) VOP_RWLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1474 		error = VOP_READDIR(dirvp, &uio, kcred, &eof, NULL, 0);
1475 		VOP_RWUNLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1476 
1477 		dbuflen = dlen - uio.uio_resid;
1478 		if (error || dbuflen == 0)
1479 			break;
1480 
1481 		if (!(ddv->sdev_flags & SDEV_BUILD))
1482 			break;
1483 
1484 		for (dp = dbuf; ((intptr_t)dp <
1485 		    (intptr_t)dbuf + dbuflen);
1486 		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
1487 			nm = dp->d_name;
1488 
1489 			if (strcmp(nm, ".") == 0 ||
1490 			    strcmp(nm, "..") == 0)
1491 				continue;
1492 
1493 			vp = NULLVP;
1494 			dv = sdev_cache_lookup(ddv, nm);
1495 			if (dv) {
1496 				VERIFY(dv->sdev_state != SDEV_ZOMBIE);
1497 				SDEV_SIMPLE_RELE(dv);
1498 				continue;
1499 			}
1500 
1501 			/* refill the cache if not already */
1502 			error = devname_backstore_lookup(ddv, nm, &vp);
1503 			if (error)
1504 				continue;
1505 
1506 			vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
1507 			error = VOP_GETATTR(vp, &vattr, 0, cred, NULL);
1508 			if (error)
1509 				continue;
1510 
1511 			if (vattr.va_type == VLNK) {
1512 				error = sdev_getlink(vp, &link);
1513 				if (error) {
1514 					continue;
1515 				}
1516 				ASSERT(link != NULL);
1517 			}
1518 
1519 			if (!rw_tryupgrade(&ddv->sdev_contents)) {
1520 				rw_exit(&ddv->sdev_contents);
1521 				rw_enter(&ddv->sdev_contents, RW_WRITER);
1522 			}
1523 			error = sdev_mknode(ddv, nm, &dv, &vattr, vp, link,
1524 			    cred, SDEV_READY);
1525 			rw_downgrade(&ddv->sdev_contents);
1526 
1527 			if (link != NULL) {
1528 				kmem_free(link, strlen(link) + 1);
1529 				link = NULL;
1530 			}
1531 
1532 			if (!error) {
1533 				ASSERT(dv);
1534 				ASSERT(dv->sdev_state != SDEV_ZOMBIE);
1535 				SDEV_SIMPLE_RELE(dv);
1536 			}
1537 			vp = NULL;
1538 			dv = NULL;
1539 		}
1540 	}
1541 
1542 done:
1543 	VN_RELE(dirvp);
1544 	kmem_free(dbuf, dlen);
1545 
1546 	return (error);
1547 }
1548 
1549 void
1550 sdev_filldir_dynamic(struct sdev_node *ddv)
1551 {
1552 	int error;
1553 	int i;
1554 	struct vattr vattr;
1555 	struct vattr *vap = &vattr;
1556 	char *nm = NULL;
1557 	struct sdev_node *dv = NULL;
1558 
1559 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1560 	ASSERT((ddv->sdev_flags & SDEV_BUILD));
1561 
1562 	*vap = *sdev_getdefault_attr(VDIR);	/* note structure copy here */
1563 	gethrestime(&vap->va_atime);
1564 	vap->va_mtime = vap->va_atime;
1565 	vap->va_ctime = vap->va_atime;
1566 	for (i = 0; vtab[i].vt_name != NULL; i++) {
1567 		/*
1568 		 * This early, we may be in a read-only /dev environment: leave
1569 		 * the creation of any nodes we'd attempt to persist to
1570 		 * devfsadm. Because /dev itself is normally persistent, any
1571 		 * node which is not marked dynamic will end up being marked
1572 		 * persistent. However, some nodes are both dynamic and
1573 		 * persistent, mostly lofi and rlofi, so we need to be careful
1574 		 * in our check.
1575 		 */
1576 		if ((vtab[i].vt_flags & SDEV_PERSIST) ||
1577 		    !(vtab[i].vt_flags & SDEV_DYNAMIC))
1578 			continue;
1579 		nm = vtab[i].vt_name;
1580 		ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1581 		dv = NULL;
1582 		error = sdev_mknode(ddv, nm, &dv, vap, NULL,
1583 		    NULL, kcred, SDEV_READY);
1584 		if (error) {
1585 			cmn_err(CE_WARN, "%s/%s: error %d\n",
1586 			    ddv->sdev_name, nm, error);
1587 		} else {
1588 			ASSERT(dv);
1589 			ASSERT(dv->sdev_state != SDEV_ZOMBIE);
1590 			SDEV_SIMPLE_RELE(dv);
1591 		}
1592 	}
1593 }
1594 
1595 /*
1596  * Creating a backing store entry based on sdev_attr.
1597  * This is called either as part of node creation in a persistent directory
1598  * or from setattr/setsecattr to persist access attributes across reboot.
1599  */
1600 int
1601 sdev_shadow_node(struct sdev_node *dv, struct cred *cred)
1602 {
1603 	int error = 0;
1604 	struct vnode *dvp = SDEVTOV(dv->sdev_dotdot);
1605 	struct vnode *rdvp = VTOSDEV(dvp)->sdev_attrvp;
1606 	struct vattr *vap = dv->sdev_attr;
1607 	char *nm = dv->sdev_name;
1608 	struct vnode *tmpvp, **rvp = &tmpvp, *rrvp = NULL;
1609 
1610 	ASSERT(dv && dv->sdev_name && rdvp);
1611 	ASSERT(RW_WRITE_HELD(&dv->sdev_contents) && dv->sdev_attrvp == NULL);
1612 
1613 lookup:
1614 	/* try to find it in the backing store */
1615 	error = VOP_LOOKUP(rdvp, nm, rvp, NULL, 0, NULL, cred, NULL, NULL,
1616 	    NULL);
1617 	if (error == 0) {
1618 		if (VOP_REALVP(*rvp, &rrvp, NULL) == 0) {
1619 			VN_HOLD(rrvp);
1620 			VN_RELE(*rvp);
1621 			*rvp = rrvp;
1622 		}
1623 
1624 		kmem_free(dv->sdev_attr, sizeof (vattr_t));
1625 		dv->sdev_attr = NULL;
1626 		dv->sdev_attrvp = *rvp;
1627 		return (0);
1628 	}
1629 
1630 	/* let's try to persist the node */
1631 	gethrestime(&vap->va_atime);
1632 	vap->va_mtime = vap->va_atime;
1633 	vap->va_ctime = vap->va_atime;
1634 	vap->va_mask |= AT_TYPE|AT_MODE;
1635 	switch (vap->va_type) {
1636 	case VDIR:
1637 		error = VOP_MKDIR(rdvp, nm, vap, rvp, cred, NULL, 0, NULL);
1638 		sdcmn_err9(("sdev_shadow_node: mkdir vp %p error %d\n",
1639 		    (void *)(*rvp), error));
1640 		if (!error)
1641 			VN_RELE(*rvp);
1642 		break;
1643 	case VCHR:
1644 	case VBLK:
1645 	case VREG:
1646 	case VDOOR:
1647 		error = VOP_CREATE(rdvp, nm, vap, NONEXCL, VREAD|VWRITE,
1648 		    rvp, cred, 0, NULL, NULL);
1649 		sdcmn_err9(("sdev_shadow_node: create vp %p, error %d\n",
1650 		    (void *)(*rvp), error));
1651 		if (!error)
1652 			VN_RELE(*rvp);
1653 		break;
1654 	case VLNK:
1655 		ASSERT(dv->sdev_symlink);
1656 		error = VOP_SYMLINK(rdvp, nm, vap, dv->sdev_symlink, cred,
1657 		    NULL, 0);
1658 		sdcmn_err9(("sdev_shadow_node: create symlink error %d\n",
1659 		    error));
1660 		break;
1661 	default:
1662 		cmn_err(CE_PANIC, "dev: %s: sdev_shadow_node "
1663 		    "create\n", nm);
1664 		/*NOTREACHED*/
1665 	}
1666 
1667 	/* go back to lookup to factor out spec node and set attrvp */
1668 	if (error == 0)
1669 		goto lookup;
1670 
1671 	sdcmn_err(("cannot persist %s - error %d\n", dv->sdev_path, error));
1672 	return (error);
1673 }
1674 
1675 static void
1676 sdev_cache_add(struct sdev_node *ddv, struct sdev_node **dv, char *nm)
1677 {
1678 	struct sdev_node *dup = NULL;
1679 
1680 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1681 	if ((dup = sdev_findbyname(ddv, nm)) == NULL) {
1682 		sdev_direnter(ddv, *dv);
1683 	} else {
1684 		VERIFY(dup->sdev_state != SDEV_ZOMBIE);
1685 		SDEV_SIMPLE_RELE(*dv);
1686 		sdev_nodedestroy(*dv, 0);
1687 		*dv = dup;
1688 	}
1689 }
1690 
1691 static void
1692 sdev_cache_delete(struct sdev_node *ddv, struct sdev_node **dv)
1693 {
1694 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1695 	sdev_dirdelete(ddv, *dv);
1696 }
1697 
1698 /*
1699  * update the in-core directory cache
1700  */
1701 void
1702 sdev_cache_update(struct sdev_node *ddv, struct sdev_node **dv, char *nm,
1703     sdev_cache_ops_t ops)
1704 {
1705 	ASSERT((SDEV_HELD(*dv)));
1706 
1707 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1708 	switch (ops) {
1709 	case SDEV_CACHE_ADD:
1710 		sdev_cache_add(ddv, dv, nm);
1711 		break;
1712 	case SDEV_CACHE_DELETE:
1713 		sdev_cache_delete(ddv, dv);
1714 		break;
1715 	default:
1716 		break;
1717 	}
1718 }
1719 
1720 /*
1721  * retrieve the named entry from the directory cache
1722  */
1723 struct sdev_node *
1724 sdev_cache_lookup(struct sdev_node *ddv, char *nm)
1725 {
1726 	struct sdev_node *dv = NULL;
1727 
1728 	ASSERT(RW_LOCK_HELD(&ddv->sdev_contents));
1729 	dv = sdev_findbyname(ddv, nm);
1730 
1731 	return (dv);
1732 }
1733 
1734 /*
1735  * Implicit reconfig for nodes constructed by a link generator
1736  * Start devfsadm if needed, or if devfsadm is in progress,
1737  * prepare to block on devfsadm either completing or
1738  * constructing the desired node.  As devfsadmd is global
1739  * in scope, constructing all necessary nodes, we only
1740  * need to initiate it once.
1741  */
1742 static int
1743 sdev_call_devfsadmd(struct sdev_node *ddv, struct sdev_node *dv, char *nm)
1744 {
1745 	int error = 0;
1746 
1747 	if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state)) {
1748 		sdcmn_err6(("lookup: waiting for %s/%s, 0x%x\n",
1749 		    ddv->sdev_name, nm, devfsadm_state));
1750 		mutex_enter(&dv->sdev_lookup_lock);
1751 		SDEV_BLOCK_OTHERS(dv, (SDEV_LOOKUP | SDEV_LGWAITING));
1752 		mutex_exit(&dv->sdev_lookup_lock);
1753 		error = 0;
1754 	} else if (!DEVNAME_DEVFSADM_HAS_RUN(devfsadm_state)) {
1755 		sdcmn_err6(("lookup %s/%s starting devfsadm, 0x%x\n",
1756 		    ddv->sdev_name, nm, devfsadm_state));
1757 
1758 		sdev_devfsadmd_thread(ddv, dv, kcred);
1759 		mutex_enter(&dv->sdev_lookup_lock);
1760 		SDEV_BLOCK_OTHERS(dv,
1761 		    (SDEV_LOOKUP | SDEV_LGWAITING));
1762 		mutex_exit(&dv->sdev_lookup_lock);
1763 		error = 0;
1764 	} else {
1765 		error = -1;
1766 	}
1767 
1768 	return (error);
1769 }
1770 
1771 /*
1772  *  Support for specialized device naming construction mechanisms
1773  */
1774 static int
1775 sdev_call_dircallback(struct sdev_node *ddv, struct sdev_node **dvp, char *nm,
1776     int (*callback)(struct sdev_node *, char *, void **, struct cred *,
1777     void *, char *), int flags, struct cred *cred)
1778 {
1779 	int rv = 0;
1780 	char *physpath = NULL;
1781 	struct vattr vattr;
1782 	struct vattr *vap = &vattr;
1783 	struct sdev_node *dv = NULL;
1784 
1785 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1786 	if (flags & SDEV_VLINK) {
1787 		physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1788 		rv = callback(ddv, nm, (void *)&physpath, kcred, NULL,
1789 		    NULL);
1790 		if (rv) {
1791 			kmem_free(physpath, MAXPATHLEN);
1792 			return (-1);
1793 		}
1794 
1795 		*vap = *sdev_getdefault_attr(VLNK);	/* structure copy */
1796 		vap->va_size = strlen(physpath);
1797 		gethrestime(&vap->va_atime);
1798 		vap->va_mtime = vap->va_atime;
1799 		vap->va_ctime = vap->va_atime;
1800 
1801 		rv = sdev_mknode(ddv, nm, &dv, vap, NULL,
1802 		    (void *)physpath, cred, SDEV_READY);
1803 		kmem_free(physpath, MAXPATHLEN);
1804 		if (rv)
1805 			return (rv);
1806 	} else if (flags & SDEV_VATTR) {
1807 		/*
1808 		 * /dev/pts
1809 		 *
1810 		 * callback is responsible to set the basic attributes,
1811 		 * e.g. va_type/va_uid/va_gid/
1812 		 *    dev_t if VCHR or VBLK/
1813 		 */
1814 		ASSERT(callback);
1815 		rv = callback(ddv, nm, (void *)&vattr, kcred, NULL, NULL);
1816 		if (rv) {
1817 			sdcmn_err3(("devname_lookup_func: SDEV_NONE "
1818 			    "callback failed \n"));
1819 			return (-1);
1820 		}
1821 
1822 		rv = sdev_mknode(ddv, nm, &dv, &vattr, NULL, NULL,
1823 		    cred, SDEV_READY);
1824 
1825 		if (rv)
1826 			return (rv);
1827 
1828 	} else {
1829 		impossible(("lookup: %s/%s by %s not supported (%d)\n",
1830 		    SDEVTOV(ddv)->v_path, nm, curproc->p_user.u_comm,
1831 		    __LINE__));
1832 		rv = -1;
1833 	}
1834 
1835 	*dvp = dv;
1836 	return (rv);
1837 }
1838 
/*
 * Return 1 when exec_name is exactly "devfsadm", 0 otherwise.
 *
 * note: because devfsadmd -> /usr/sbin/devfsadm
 * it is safe to use "devfsadm" to capture the lookups
 * from devfsadm and its daemon version.
 */
static int
is_devfsadm_thread(char *exec_name)
{
	return ((strcmp(exec_name, "devfsadm") == 0) ? 1 : 0);
}
1851 
1852 /*
1853  * Lookup Order:
1854  *	sdev_node cache;
1855  *	backing store (SDEV_PERSIST);
1856  *	DBNR: a. dir_ops implemented in the loadable modules;
1857  *	      b. vnode ops in vtab.
1858  */
1859 int
1860 devname_lookup_func(struct sdev_node *ddv, char *nm, struct vnode **vpp,
1861     struct cred *cred, int (*callback)(struct sdev_node *, char *, void **,
1862     struct cred *, void *, char *), int flags)
1863 {
1864 	int rv = 0, nmlen;
1865 	struct vnode *rvp = NULL;
1866 	struct sdev_node *dv = NULL;
1867 	int	retried = 0;
1868 	int	error = 0;
1869 	struct vattr vattr;
1870 	char *lookup_thread = curproc->p_user.u_comm;
1871 	int failed_flags = 0;
1872 	int (*vtor)(struct sdev_node *) = NULL;
1873 	int state;
1874 	int parent_state;
1875 	char *link = NULL;
1876 
1877 	if (SDEVTOV(ddv)->v_type != VDIR)
1878 		return (ENOTDIR);
1879 
1880 	/*
1881 	 * Empty name or ., return node itself.
1882 	 */
1883 	nmlen = strlen(nm);
1884 	if ((nmlen == 0) || ((nmlen == 1) && (nm[0] == '.'))) {
1885 		*vpp = SDEVTOV(ddv);
1886 		VN_HOLD(*vpp);
1887 		return (0);
1888 	}
1889 
1890 	/*
1891 	 * .., return the parent directory
1892 	 */
1893 	if ((nmlen == 2) && (strcmp(nm, "..") == 0)) {
1894 		*vpp = SDEVTOV(ddv->sdev_dotdot);
1895 		VN_HOLD(*vpp);
1896 		return (0);
1897 	}
1898 
1899 	rw_enter(&ddv->sdev_contents, RW_READER);
1900 	if (ddv->sdev_flags & SDEV_VTOR) {
1901 		vtor = (int (*)(struct sdev_node *))sdev_get_vtor(ddv);
1902 		ASSERT(vtor);
1903 	}
1904 
1905 tryagain:
1906 	/*
1907 	 * (a) directory cache lookup:
1908 	 */
1909 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
1910 	parent_state = ddv->sdev_state;
1911 	dv = sdev_cache_lookup(ddv, nm);
1912 	if (dv) {
1913 		state = dv->sdev_state;
1914 		switch (state) {
1915 		case SDEV_INIT:
1916 			if (is_devfsadm_thread(lookup_thread))
1917 				break;
1918 
1919 			/* ZOMBIED parent won't allow node creation */
1920 			if (parent_state == SDEV_ZOMBIE) {
1921 				SD_TRACE_FAILED_LOOKUP(ddv, nm,
1922 				    retried);
1923 				goto nolock_notfound;
1924 			}
1925 
1926 			mutex_enter(&dv->sdev_lookup_lock);
1927 			/* compensate the threads started after devfsadm */
1928 			if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state) &&
1929 			    !(SDEV_IS_LOOKUP(dv)))
1930 				SDEV_BLOCK_OTHERS(dv,
1931 				    (SDEV_LOOKUP | SDEV_LGWAITING));
1932 
1933 			if (SDEV_IS_LOOKUP(dv)) {
1934 				failed_flags |= SLF_REBUILT;
1935 				rw_exit(&ddv->sdev_contents);
1936 				error = sdev_wait4lookup(dv, SDEV_LOOKUP);
1937 				mutex_exit(&dv->sdev_lookup_lock);
1938 				rw_enter(&ddv->sdev_contents, RW_READER);
1939 
1940 				if (error != 0) {
1941 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
1942 					    retried);
1943 					goto nolock_notfound;
1944 				}
1945 
1946 				state = dv->sdev_state;
1947 				if (state == SDEV_INIT) {
1948 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
1949 					    retried);
1950 					goto nolock_notfound;
1951 				} else if (state == SDEV_READY) {
1952 					goto found;
1953 				} else if (state == SDEV_ZOMBIE) {
1954 					rw_exit(&ddv->sdev_contents);
1955 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
1956 					    retried);
1957 					SDEV_RELE(dv);
1958 					goto lookup_failed;
1959 				}
1960 			} else {
1961 				mutex_exit(&dv->sdev_lookup_lock);
1962 			}
1963 			break;
1964 		case SDEV_READY:
1965 			goto found;
1966 		case SDEV_ZOMBIE:
1967 			rw_exit(&ddv->sdev_contents);
1968 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
1969 			SDEV_RELE(dv);
1970 			goto lookup_failed;
1971 		default:
1972 			rw_exit(&ddv->sdev_contents);
1973 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
1974 			sdev_lookup_failed(ddv, nm, failed_flags);
1975 			*vpp = NULLVP;
1976 			return (ENOENT);
1977 		}
1978 	}
1979 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
1980 
1981 	/*
1982 	 * ZOMBIED parent does not allow new node creation.
1983 	 * bail out early
1984 	 */
1985 	if (parent_state == SDEV_ZOMBIE) {
1986 		rw_exit(&ddv->sdev_contents);
1987 		*vpp = NULLVP;
1988 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
1989 		return (ENOENT);
1990 	}
1991 
1992 	/*
1993 	 * (b0): backing store lookup
1994 	 *	SDEV_PERSIST is default except:
1995 	 *		1) pts nodes
1996 	 *		2) non-chmod'ed local nodes
1997 	 *		3) zvol nodes
1998 	 */
1999 	if (SDEV_IS_PERSIST(ddv)) {
2000 		error = devname_backstore_lookup(ddv, nm, &rvp);
2001 
2002 		if (!error) {
2003 
2004 			vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
2005 			error = VOP_GETATTR(rvp, &vattr, 0, cred, NULL);
2006 			if (error) {
2007 				rw_exit(&ddv->sdev_contents);
2008 				if (dv)
2009 					SDEV_RELE(dv);
2010 				SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2011 				sdev_lookup_failed(ddv, nm, failed_flags);
2012 				*vpp = NULLVP;
2013 				return (ENOENT);
2014 			}
2015 
2016 			if (vattr.va_type == VLNK) {
2017 				error = sdev_getlink(rvp, &link);
2018 				if (error) {
2019 					rw_exit(&ddv->sdev_contents);
2020 					if (dv)
2021 						SDEV_RELE(dv);
2022 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
2023 					    retried);
2024 					sdev_lookup_failed(ddv, nm,
2025 					    failed_flags);
2026 					*vpp = NULLVP;
2027 					return (ENOENT);
2028 				}
2029 				ASSERT(link != NULL);
2030 			}
2031 
2032 			if (!rw_tryupgrade(&ddv->sdev_contents)) {
2033 				rw_exit(&ddv->sdev_contents);
2034 				rw_enter(&ddv->sdev_contents, RW_WRITER);
2035 			}
2036 			error = sdev_mknode(ddv, nm, &dv, &vattr,
2037 			    rvp, link, cred, SDEV_READY);
2038 			rw_downgrade(&ddv->sdev_contents);
2039 
2040 			if (link != NULL) {
2041 				kmem_free(link, strlen(link) + 1);
2042 				link = NULL;
2043 			}
2044 
2045 			if (error) {
2046 				SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2047 				rw_exit(&ddv->sdev_contents);
2048 				if (dv)
2049 					SDEV_RELE(dv);
2050 				goto lookup_failed;
2051 			} else {
2052 				goto found;
2053 			}
2054 		} else if (retried) {
2055 			rw_exit(&ddv->sdev_contents);
2056 			sdcmn_err3(("retry of lookup of %s/%s: failed\n",
2057 			    ddv->sdev_name, nm));
2058 			if (dv)
2059 				SDEV_RELE(dv);
2060 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2061 			sdev_lookup_failed(ddv, nm, failed_flags);
2062 			*vpp = NULLVP;
2063 			return (ENOENT);
2064 		}
2065 	}
2066 
2067 lookup_create_node:
2068 	/* first thread that is doing the lookup on this node */
2069 	if (callback) {
2070 		ASSERT(dv == NULL);
2071 		if (!rw_tryupgrade(&ddv->sdev_contents)) {
2072 			rw_exit(&ddv->sdev_contents);
2073 			rw_enter(&ddv->sdev_contents, RW_WRITER);
2074 		}
2075 		error = sdev_call_dircallback(ddv, &dv, nm, callback,
2076 		    flags, cred);
2077 		rw_downgrade(&ddv->sdev_contents);
2078 		if (error == 0) {
2079 			goto found;
2080 		} else {
2081 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2082 			rw_exit(&ddv->sdev_contents);
2083 			goto lookup_failed;
2084 		}
2085 	}
2086 	if (!dv) {
2087 		if (!rw_tryupgrade(&ddv->sdev_contents)) {
2088 			rw_exit(&ddv->sdev_contents);
2089 			rw_enter(&ddv->sdev_contents, RW_WRITER);
2090 		}
2091 		error = sdev_mknode(ddv, nm, &dv, NULL, NULL, NULL,
2092 		    cred, SDEV_INIT);
2093 		if (!dv) {
2094 			rw_exit(&ddv->sdev_contents);
2095 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2096 			sdev_lookup_failed(ddv, nm, failed_flags);
2097 			*vpp = NULLVP;
2098 			return (ENOENT);
2099 		}
2100 		rw_downgrade(&ddv->sdev_contents);
2101 	}
2102 
2103 	/*
2104 	 * (b1) invoking devfsadm once per life time for devfsadm nodes
2105 	 */
2106 	ASSERT(SDEV_HELD(dv));
2107 
2108 	if (SDEV_IS_NO_NCACHE(dv))
2109 		failed_flags |= SLF_NO_NCACHE;
2110 	if (sdev_reconfig_boot || !i_ddi_io_initialized() ||
2111 	    SDEV_IS_DYNAMIC(ddv) || SDEV_IS_NO_NCACHE(dv) ||
2112 	    ((moddebug & MODDEBUG_FINI_EBUSY) != 0)) {
2113 		ASSERT(SDEV_HELD(dv));
2114 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2115 		goto nolock_notfound;
2116 	}
2117 
2118 	/*
2119 	 * filter out known non-existent devices recorded
2120 	 * during initial reconfiguration boot for which
2121 	 * reconfig should not be done and lookup may
2122 	 * be short-circuited now.
2123 	 */
2124 	if (sdev_lookup_filter(ddv, nm)) {
2125 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2126 		goto nolock_notfound;
2127 	}
2128 
2129 	/* bypassing devfsadm internal nodes */
2130 	if (is_devfsadm_thread(lookup_thread)) {
2131 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2132 		goto nolock_notfound;
2133 	}
2134 
2135 	if (sdev_reconfig_disable) {
2136 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2137 		goto nolock_notfound;
2138 	}
2139 
2140 	error = sdev_call_devfsadmd(ddv, dv, nm);
2141 	if (error == 0) {
2142 		sdcmn_err8(("lookup of %s/%s by %s: reconfig\n",
2143 		    ddv->sdev_name, nm, curproc->p_user.u_comm));
2144 		if (sdev_reconfig_verbose) {
2145 			cmn_err(CE_CONT,
2146 			    "?lookup of %s/%s by %s: reconfig\n",
2147 			    ddv->sdev_name, nm, curproc->p_user.u_comm);
2148 		}
2149 		retried = 1;
2150 		failed_flags |= SLF_REBUILT;
2151 		ASSERT(dv->sdev_state != SDEV_ZOMBIE);
2152 		SDEV_SIMPLE_RELE(dv);
2153 		goto tryagain;
2154 	} else {
2155 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2156 		goto nolock_notfound;
2157 	}
2158 
2159 found:
2160 	ASSERT(dv->sdev_state == SDEV_READY);
2161 	if (vtor) {
2162 		/*
2163 		 * Check validity of returned node
2164 		 */
2165 		switch (vtor(dv)) {
2166 		case SDEV_VTOR_VALID:
2167 			break;
2168 		case SDEV_VTOR_STALE:
2169 			/*
2170 			 * The name exists, but the cache entry is
2171 			 * stale and needs to be re-created.
2172 			 */
2173 			ASSERT(RW_READ_HELD(&ddv->sdev_contents));
2174 			if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
2175 				rw_exit(&ddv->sdev_contents);
2176 				rw_enter(&ddv->sdev_contents, RW_WRITER);
2177 			}
2178 			sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_DELETE);
2179 			rw_downgrade(&ddv->sdev_contents);
2180 			SDEV_RELE(dv);
2181 			dv = NULL;
2182 			goto lookup_create_node;
2183 			/* FALLTHRU */
2184 		case SDEV_VTOR_INVALID:
2185 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2186 			sdcmn_err7(("lookup: destroy invalid "
2187 			    "node: %s(%p)\n", dv->sdev_name, (void *)dv));
2188 			goto nolock_notfound;
2189 		case SDEV_VTOR_SKIP:
2190 			sdcmn_err7(("lookup: node not applicable - "
2191 			    "skipping: %s(%p)\n", dv->sdev_name, (void *)dv));
2192 			rw_exit(&ddv->sdev_contents);
2193 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2194 			SDEV_RELE(dv);
2195 			goto lookup_failed;
2196 		default:
2197 			cmn_err(CE_PANIC,
2198 			    "dev fs: validator failed: %s(%p)\n",
2199 			    dv->sdev_name, (void *)dv);
2200 			break;
2201 		}
2202 	}
2203 
2204 	rw_exit(&ddv->sdev_contents);
2205 	rv = sdev_to_vp(dv, vpp);
2206 	sdcmn_err3(("devname_lookup_func: returning vp %p v_count %d state %d "
2207 	    "for nm %s, error %d\n", (void *)*vpp, (*vpp)->v_count,
2208 	    dv->sdev_state, nm, rv));
2209 	return (rv);
2210 
2211 nolock_notfound:
2212 	/*
2213 	 * Destroy the node that is created for synchronization purposes.
2214 	 */
2215 	sdcmn_err3(("devname_lookup_func: %s with state %d\n",
2216 	    nm, dv->sdev_state));
2217 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
2218 	if (dv->sdev_state == SDEV_INIT) {
2219 		if (!rw_tryupgrade(&ddv->sdev_contents)) {
2220 			rw_exit(&ddv->sdev_contents);
2221 			rw_enter(&ddv->sdev_contents, RW_WRITER);
2222 		}
2223 
2224 		/*
2225 		 * Node state may have changed during the lock
2226 		 * changes. Re-check.
2227 		 */
2228 		if (dv->sdev_state == SDEV_INIT) {
2229 			sdev_dirdelete(ddv, dv);
2230 			rw_exit(&ddv->sdev_contents);
2231 			sdev_lookup_failed(ddv, nm, failed_flags);
2232 			SDEV_RELE(dv);
2233 			*vpp = NULL;
2234 			return (ENOENT);
2235 		}
2236 	}
2237 
2238 	rw_exit(&ddv->sdev_contents);
2239 	SDEV_RELE(dv);
2240 
2241 lookup_failed:
2242 	sdev_lookup_failed(ddv, nm, failed_flags);
2243 	*vpp = NULL;
2244 	return (ENOENT);
2245 }
2246 
2247 /*
2248  * Given a directory node, mark all nodes beneath as
2249  * STALE, i.e. nodes that don't exist as far as new
2250  * consumers are concerned.  Remove them from the
2251  * list of directory entries so that no lookup or
2252  * directory traversal will find them.  The node
2253  * not deallocated so existing holds are not affected.
2254  */
2255 void
2256 sdev_stale(struct sdev_node *ddv)
2257 {
2258 	struct sdev_node *dv;
2259 	struct vnode *vp;
2260 
2261 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
2262 
2263 	rw_enter(&ddv->sdev_contents, RW_WRITER);
2264 	while ((dv = SDEV_FIRST_ENTRY(ddv)) != NULL) {
2265 		vp = SDEVTOV(dv);
2266 		SDEV_HOLD(dv);
2267 		if (vp->v_type == VDIR)
2268 			sdev_stale(dv);
2269 
2270 		sdev_dirdelete(ddv, dv);
2271 		SDEV_RELE(dv);
2272 	}
2273 	ddv->sdev_flags |= SDEV_BUILD;
2274 	rw_exit(&ddv->sdev_contents);
2275 }
2276 
2277 /*
2278  * Given a directory node, clean out all the nodes beneath.
2279  * If expr is specified, clean node with names matching expr.
2280  * If SDEV_ENFORCE is specified in flags, busy nodes are made stale,
2281  *	so they are excluded from future lookups.
2282  */
2283 int
2284 sdev_cleandir(struct sdev_node *ddv, char *expr, uint_t flags)
2285 {
2286 	int error = 0;
2287 	int busy = 0;
2288 	struct vnode *vp;
2289 	struct sdev_node *dv;
2290 	int bkstore = 0;
2291 	int len = 0;
2292 	char *bks_name = NULL;
2293 
2294 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
2295 
2296 	/*
2297 	 * We try our best to destroy all unused sdev_node's
2298 	 */
2299 	rw_enter(&ddv->sdev_contents, RW_WRITER);
2300 	while ((dv = SDEV_FIRST_ENTRY(ddv)) != NULL) {
2301 		vp = SDEVTOV(dv);
2302 
2303 		if (expr && gmatch(dv->sdev_name, expr) == 0)
2304 			continue;
2305 
2306 		if (vp->v_type == VDIR &&
2307 		    sdev_cleandir(dv, NULL, flags) != 0) {
2308 			sdcmn_err9(("sdev_cleandir: dir %s busy\n",
2309 			    dv->sdev_name));
2310 			busy++;
2311 			continue;
2312 		}
2313 
2314 		if (vp->v_count > 0 && (flags & SDEV_ENFORCE) == 0) {
2315 			sdcmn_err9(("sdev_cleandir: dir %s busy\n",
2316 			    dv->sdev_name));
2317 			busy++;
2318 			continue;
2319 		}
2320 
2321 		/*
2322 		 * at this point, either dv is not held or SDEV_ENFORCE
2323 		 * is specified. In either case, dv needs to be deleted
2324 		 */
2325 		SDEV_HOLD(dv);
2326 
2327 		bkstore = SDEV_IS_PERSIST(dv) ? 1 : 0;
2328 		if (bkstore && (vp->v_type == VDIR))
2329 			bkstore += 1;
2330 
2331 		if (bkstore) {
2332 			len = strlen(dv->sdev_name) + 1;
2333 			bks_name = kmem_alloc(len, KM_SLEEP);
2334 			bcopy(dv->sdev_name, bks_name, len);
2335 		}
2336 
2337 		sdev_dirdelete(ddv, dv);
2338 
2339 		/* take care the backing store clean up */
2340 		if (bkstore) {
2341 			ASSERT(bks_name);
2342 			ASSERT(ddv->sdev_attrvp);
2343 
2344 			if (bkstore == 1) {
2345 				error = VOP_REMOVE(ddv->sdev_attrvp,
2346 				    bks_name, kcred, NULL, 0);
2347 			} else if (bkstore == 2) {
2348 				error = VOP_RMDIR(ddv->sdev_attrvp,
2349 				    bks_name, ddv->sdev_attrvp, kcred, NULL, 0);
2350 			}
2351 
2352 			/* do not propagate the backing store errors */
2353 			if (error) {
2354 				sdcmn_err9(("sdev_cleandir: backing store"
2355 				    "not cleaned\n"));
2356 				error = 0;
2357 			}
2358 
2359 			bkstore = 0;
2360 			kmem_free(bks_name, len);
2361 			bks_name = NULL;
2362 			len = 0;
2363 		}
2364 
2365 		ddv->sdev_flags |= SDEV_BUILD;
2366 		SDEV_RELE(dv);
2367 	}
2368 
2369 	ddv->sdev_flags |= SDEV_BUILD;
2370 	rw_exit(&ddv->sdev_contents);
2371 
2372 	if (busy) {
2373 		error = EBUSY;
2374 	}
2375 
2376 	return (error);
2377 }
2378 
2379 /*
2380  * a convenient wrapper for readdir() funcs
2381  */
2382 size_t
2383 add_dir_entry(dirent64_t *de, char *nm, size_t size, ino_t ino, offset_t off)
2384 {
2385 	size_t reclen = DIRENT64_RECLEN(strlen(nm));
2386 	if (reclen > size)
2387 		return (0);
2388 
2389 	de->d_ino = (ino64_t)ino;
2390 	de->d_off = (off64_t)off + 1;
2391 	de->d_reclen = (ushort_t)reclen;
2392 	(void) strncpy(de->d_name, nm, DIRENT64_NAMELEN(reclen));
2393 	return (reclen);
2394 }
2395 
2396 /*
2397  * sdev_mount service routines
2398  */
2399 int
2400 sdev_copyin_mountargs(struct mounta *uap, struct sdev_mountargs *args)
2401 {
2402 	int	error;
2403 
2404 	if (uap->datalen != sizeof (*args))
2405 		return (EINVAL);
2406 
2407 	if (error = copyin(uap->dataptr, args, sizeof (*args))) {
2408 		cmn_err(CE_WARN, "sdev_copyin_mountargs: can not"
2409 		    "get user data. error %d\n", error);
2410 		return (EFAULT);
2411 	}
2412 
2413 	return (0);
2414 }
2415 
/*
 * Step from one dirent64 record to the record that follows it in a
 * packed buffer, using the current record's d_reclen as the stride.
 * Any earlier definition of nextdp is discarded first.
 */
#undef nextdp
#define	nextdp(dp)	\
	((struct dirent64 *)((intptr_t)(dp) + (intptr_t)(dp)->d_reclen))
2421 
/*
 * readdir helper func
 *
 * Builds dirent64 records for the directory vp into a temporary kernel
 * buffer sized by the caller's single iovec and copies them out with
 * uiomove().
 *
 * Directory offsets: 0 is ".", 1 is "..", and the cached entries of the
 * directory start at offset 2.  On success uio_loffset is set to the
 * offset reached by the walk, and *eofp (when supplied) says whether the
 * walk ran off the end of the entry list.
 *
 * Locking: the caller must hold ddv->sdev_contents as reader (asserted).
 * For global directories the lock is dropped and re-acquired as reader
 * inside this function, so the contents may change across the call.
 *
 * Returns 0 on success (including the "offset already past MAXOFF_T"
 * case), EINVAL if the uio does not have exactly one iovec or the buffer
 * cannot hold "." / "..", ENOTDIR if vp is not a directory, or the error
 * from uiomove().
 *
 * NOTE(review): an error from sdev_filldir_from_store() is only returned
 * when nothing is copied out; otherwise the uiomove() result replaces it.
 */
int
devname_readdir_func(vnode_t *vp, uio_t *uiop, cred_t *cred, int *eofp,
    int flags)
{
	struct sdev_node *ddv = VTOSDEV(vp);
	struct sdev_node *dv;
	dirent64_t	*dp;
	ulong_t		outcount = 0;
	size_t		namelen;
	ulong_t		alloc_count;
	void		*outbuf;
	struct iovec	*iovp;
	int		error = 0;
	size_t		reclen;
	offset_t	diroff;
	offset_t	soff;
	int		this_reclen;
	int (*vtor)(struct sdev_node *) = NULL;
	struct vattr attr;
	timestruc_t now;

	ASSERT(ddv->sdev_attr || ddv->sdev_attrvp);
	ASSERT(RW_READ_HELD(&ddv->sdev_contents));

	/* nothing can live at or beyond MAXOFF_T: report end of directory */
	if (uiop->uio_loffset >= MAXOFF_T) {
		if (eofp)
			*eofp = 1;
		return (0);
	}

	/* the output buffer is sized from a single iovec below */
	if (uiop->uio_iovcnt != 1)
		return (EINVAL);

	if (vp->v_type != VDIR)
		return (ENOTDIR);

	/* fetch the per-directory validator, if this directory uses one */
	if (ddv->sdev_flags & SDEV_VTOR) {
		vtor = (int (*)(struct sdev_node *))sdev_get_vtor(ddv);
		ASSERT(vtor);
	}

	if (eofp != NULL)
		*eofp = 0;

	/*
	 * soff is the offset the caller wants to resume from; outbuf is a
	 * staging buffer as large as the caller's iovec.
	 */
	soff = uiop->uio_loffset;
	iovp = uiop->uio_iov;
	alloc_count = iovp->iov_len;
	dp = outbuf = kmem_alloc(alloc_count, KM_SLEEP);
	outcount = 0;

	/* a zombie directory is served straight from what is cached */
	if (ddv->sdev_state == SDEV_ZOMBIE)
		goto get_cache;

	if (SDEV_IS_GLOBAL(ddv)) {

		if ((sdev_boot_state == SDEV_BOOT_STATE_COMPLETE) &&
		    !sdev_reconfig_boot && (flags & SDEV_BROWSE) &&
		    !SDEV_IS_DYNAMIC(ddv) && !SDEV_IS_NO_NCACHE(ddv) &&
		    ((moddebug & MODDEBUG_FINI_EBUSY) == 0) &&
		    !DEVNAME_DEVFSADM_HAS_RUN(devfsadm_state) &&
		    !DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state) &&
		    !sdev_reconfig_disable) {
			/*
			 * invoking "devfsadm" to do system device reconfig
			 */
			mutex_enter(&ddv->sdev_lookup_lock);
			SDEV_BLOCK_OTHERS(ddv,
			    (SDEV_READDIR|SDEV_LGWAITING));
			mutex_exit(&ddv->sdev_lookup_lock);

			sdcmn_err8(("readdir of %s by %s: reconfig\n",
			    ddv->sdev_path, curproc->p_user.u_comm));
			if (sdev_reconfig_verbose) {
				cmn_err(CE_CONT,
				    "?readdir of %s by %s: reconfig\n",
				    ddv->sdev_path, curproc->p_user.u_comm);
			}

			sdev_devfsadmd_thread(ddv, NULL, kcred);
		} else if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state)) {
			/*
			 * compensate the "ls" started later than "devfsadm"
			 */
			mutex_enter(&ddv->sdev_lookup_lock);
			SDEV_BLOCK_OTHERS(ddv, (SDEV_READDIR|SDEV_LGWAITING));
			mutex_exit(&ddv->sdev_lookup_lock);
		}

		/*
		 * release the contents lock so that
		 * the cache may be updated by devfsadmd
		 */
		rw_exit(&ddv->sdev_contents);
		mutex_enter(&ddv->sdev_lookup_lock);
		if (SDEV_IS_READDIR(ddv))
			(void) sdev_wait4lookup(ddv, SDEV_READDIR);
		mutex_exit(&ddv->sdev_lookup_lock);
		rw_enter(&ddv->sdev_contents, RW_READER);

		sdcmn_err4(("readdir of directory %s by %s\n",
		    ddv->sdev_name, curproc->p_user.u_comm));
		/*
		 * A directory flagged SDEV_BUILD is refilled from the
		 * backing store when it is persistent; the flag is cleared
		 * whether or not a refill was attempted or succeeded.
		 */
		if (ddv->sdev_flags & SDEV_BUILD) {
			if (SDEV_IS_PERSIST(ddv)) {
				error = sdev_filldir_from_store(ddv,
				    alloc_count, cred);
			}
			ddv->sdev_flags &= ~SDEV_BUILD;
		}
	}

get_cache:
	/* handle "." and ".." */
	diroff = 0;
	if (soff == 0) {
		/* first time */
		this_reclen = DIRENT64_RECLEN(1);
		if (alloc_count < this_reclen) {
			error = EINVAL;
			goto done;
		}

		dp->d_ino = (ino64_t)ddv->sdev_ino;
		dp->d_off = (off64_t)1;
		dp->d_reclen = (ushort_t)this_reclen;

		(void) strncpy(dp->d_name, ".",
		    DIRENT64_NAMELEN(this_reclen));
		outcount += dp->d_reclen;
		dp = nextdp(dp);
	}

	diroff++;
	if (soff <= 1) {
		this_reclen = DIRENT64_RECLEN(2);
		if (alloc_count < outcount + this_reclen) {
			error = EINVAL;
			goto done;
		}

		dp->d_reclen = (ushort_t)this_reclen;
		dp->d_ino = (ino64_t)ddv->sdev_dotdot->sdev_ino;
		dp->d_off = (off64_t)2;

		(void) strncpy(dp->d_name, "..",
		    DIRENT64_NAMELEN(this_reclen));
		outcount += dp->d_reclen;

		dp = nextdp(dp);
	}


	/* gets the cache */
	diroff++;
	/*
	 * diroff advances for every entry walked, including the ones that
	 * are skipped below, so an offset identifies a position in the
	 * entry list rather than a count of records returned.
	 */
	for (dv = SDEV_FIRST_ENTRY(ddv); dv;
	    dv = SDEV_NEXT_ENTRY(ddv, dv), diroff++) {
		sdcmn_err3(("sdev_readdir: diroff %lld soff %lld for '%s' \n",
		    diroff, soff, dv->sdev_name));

		/*
		 * bypassing pre-matured nodes, as well as entries that lie
		 * before the offset the caller asked to resume from
		 */
		if (diroff < soff || (dv->sdev_state != SDEV_READY)) {
			sdcmn_err3(("sdev_readdir: pre-mature node  "
			    "%s %d\n", dv->sdev_name, dv->sdev_state));
			continue;
		}

		/*
		 * Check validity of node
		 * Drop invalid and nodes to be skipped.
		 * A node the validator indicates as stale needs
		 * to be returned as presumably the node name itself
		 * is valid and the node data itself will be refreshed
		 * on lookup.  An application performing a readdir then
		 * stat on each entry should thus always see consistent
		 * data.  In any case, it is not possible to synchronize
		 * with dynamic kernel state, and any view we return can
		 * never be anything more than a snapshot at a point in time.
		 */
		if (vtor) {
			switch (vtor(dv)) {
			case SDEV_VTOR_VALID:
				break;
			case SDEV_VTOR_INVALID:
			case SDEV_VTOR_SKIP:
				continue;
			case SDEV_VTOR_STALE:
				sdcmn_err3(("sdev_readir: %s stale\n",
				    dv->sdev_name));
				break;
			default:
				cmn_err(CE_PANIC,
				    "dev fs: validator failed: %s(%p)\n",
				    dv->sdev_name, (void *)dv);
				break;
			/*NOTREACHED*/
			}
		}

		/* stop, leaving dv non-NULL, once the next record won't fit */
		namelen = strlen(dv->sdev_name);
		reclen = DIRENT64_RECLEN(namelen);
		if (outcount + reclen > alloc_count) {
			goto full;
		}
		dp->d_reclen = (ushort_t)reclen;
		dp->d_ino = (ino64_t)dv->sdev_ino;
		dp->d_off = (off64_t)diroff + 1;
		(void) strncpy(dp->d_name, dv->sdev_name,
		    DIRENT64_NAMELEN(reclen));
		outcount += reclen;
		dp = nextdp(dp);
	}

full:
	sdcmn_err4(("sdev_readdir: moving %lu bytes: "
	    "diroff %lld, soff %lld, dv %p\n", outcount, diroff, soff,
	    (void *)dv));

	/* copy out whatever was staged */
	if (outcount)
		error = uiomove(outbuf, outcount, UIO_READ, uiop);

	/*
	 * dv is NULL only when the walk above ran off the end of the entry
	 * list, which is what is reported as end of directory.
	 */
	if (!error) {
		uiop->uio_loffset = diroff;
		if (eofp)
			*eofp = dv ? 0 : 1;
	}


	/*
	 * Stamp ctime and atime on the backing store vnode, when there is
	 * one; the result of the update is deliberately ignored.
	 */
	if (ddv->sdev_attrvp) {
		gethrestime(&now);
		attr.va_ctime = now;
		attr.va_atime = now;
		attr.va_mask = AT_CTIME|AT_ATIME;

		(void) VOP_SETATTR(ddv->sdev_attrvp, &attr, 0, kcred, NULL);
	}
done:
	kmem_free(outbuf, alloc_count);
	return (error);
}
2663 
2664 static int
2665 sdev_modctl_lookup(const char *path, vnode_t **r_vp)
2666 {
2667 	vnode_t *vp;
2668 	vnode_t *cvp;
2669 	struct sdev_node *svp;
2670 	char *nm;
2671 	struct pathname pn;
2672 	int error;
2673 	int persisted = 0;
2674 
2675 	ASSERT(INGLOBALZONE(curproc));
2676 
2677 	if (error = pn_get((char *)path, UIO_SYSSPACE, &pn))
2678 		return (error);
2679 	nm = kmem_alloc(MAXNAMELEN, KM_SLEEP);
2680 
2681 	vp = rootdir;
2682 	VN_HOLD(vp);
2683 
2684 	while (pn_pathleft(&pn)) {
2685 		ASSERT(vp->v_type == VDIR || vp->v_type == VLNK);
2686 		(void) pn_getcomponent(&pn, nm);
2687 
2688 		/*
2689 		 * Deal with the .. special case where we may be
2690 		 * traversing up across a mount point, to the
2691 		 * root of this filesystem or global root.
2692 		 */
2693 		if (nm[0] == '.' && nm[1] == '.' && nm[2] == 0) {
2694 checkforroot:
2695 			if (VN_CMP(vp, rootdir)) {
2696 				nm[1] = 0;
2697 			} else if (vp->v_flag & VROOT) {
2698 				vfs_t *vfsp;
2699 				cvp = vp;
2700 				vfsp = cvp->v_vfsp;
2701 				vfs_rlock_wait(vfsp);
2702 				vp = cvp->v_vfsp->vfs_vnodecovered;
2703 				if (vp == NULL ||
2704 				    (cvp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
2705 					vfs_unlock(vfsp);
2706 					VN_RELE(cvp);
2707 					error = EIO;
2708 					break;
2709 				}
2710 				VN_HOLD(vp);
2711 				vfs_unlock(vfsp);
2712 				VN_RELE(cvp);
2713 				cvp = NULL;
2714 				goto checkforroot;
2715 			}
2716 		}
2717 
2718 		error = VOP_LOOKUP(vp, nm, &cvp, NULL, 0, NULL, kcred, NULL,
2719 		    NULL, NULL);
2720 		if (error) {
2721 			VN_RELE(vp);
2722 			break;
2723 		}
2724 
2725 		/* traverse mount points encountered on our journey */
2726 		if (vn_ismntpt(cvp) && (error = traverse(&cvp)) != 0) {
2727 			VN_RELE(vp);
2728 			VN_RELE(cvp);
2729 			break;
2730 		}
2731 
2732 		/*
2733 		 * symbolic link, can be either relative and absolute
2734 		 */
2735 		if ((cvp->v_type == VLNK) && pn_pathleft(&pn)) {
2736 			struct pathname linkpath;
2737 			pn_alloc(&linkpath);
2738 			if (error = pn_getsymlink(cvp, &linkpath, kcred)) {
2739 				pn_free(&linkpath);
2740 				break;
2741 			}
2742 			if (pn_pathleft(&linkpath) == 0)
2743 				(void) pn_set(&linkpath, ".");
2744 			error = pn_insert(&pn, &linkpath, strlen(nm));
2745 			pn_free(&linkpath);
2746 			if (pn.pn_pathlen == 0) {
2747 				VN_RELE(vp);
2748 				return (ENOENT);
2749 			}
2750 			if (pn.pn_path[0] == '/') {
2751 				pn_skipslash(&pn);
2752 				VN_RELE(vp);
2753 				VN_RELE(cvp);
2754 				vp = rootdir;
2755 				VN_HOLD(vp);
2756 			} else {
2757 				VN_RELE(cvp);
2758 			}
2759 			continue;
2760 		}
2761 
2762 		VN_RELE(vp);
2763 
2764 		/*
2765 		 * Direct the operation to the persisting filesystem
2766 		 * underlying /dev.  Bail if we encounter a
2767 		 * non-persistent dev entity here.
2768 		 */
2769 		if (cvp->v_vfsp->vfs_fstype == devtype) {
2770 
2771 			if ((VTOSDEV(cvp)->sdev_flags & SDEV_PERSIST) == 0) {
2772 				error = ENOENT;
2773 				VN_RELE(cvp);
2774 				break;
2775 			}
2776 
2777 			if (VTOSDEV(cvp) == NULL) {
2778 				error = ENOENT;
2779 				VN_RELE(cvp);
2780 				break;
2781 			}
2782 			svp = VTOSDEV(cvp);
2783 			if ((vp = svp->sdev_attrvp) == NULL) {
2784 				error = ENOENT;
2785 				VN_RELE(cvp);
2786 				break;
2787 			}
2788 			persisted = 1;
2789 			VN_HOLD(vp);
2790 			VN_RELE(cvp);
2791 			cvp = vp;
2792 		}
2793 
2794 		vp = cvp;
2795 		pn_skipslash(&pn);
2796 	}
2797 
2798 	kmem_free(nm, MAXNAMELEN);
2799 	pn_free(&pn);
2800 
2801 	if (error)
2802 		return (error);
2803 
2804 	/*
2805 	 * Only return persisted nodes in the filesystem underlying /dev.
2806 	 */
2807 	if (!persisted) {
2808 		VN_RELE(vp);
2809 		return (ENOENT);
2810 	}
2811 
2812 	*r_vp = vp;
2813 	return (0);
2814 }
2815 
2816 int
2817 sdev_modctl_readdir(const char *dir, char ***dirlistp,
2818 	int *npathsp, int *npathsp_alloc, int checking_empty)
2819 {
2820 	char	**pathlist = NULL;
2821 	char	**newlist = NULL;
2822 	int	npaths = 0;
2823 	int	npaths_alloc = 0;
2824 	dirent64_t *dbuf = NULL;
2825 	int	n;
2826 	char	*s;
2827 	int error;
2828 	vnode_t *vp;
2829 	int eof;
2830 	struct iovec iov;
2831 	struct uio uio;
2832 	struct dirent64 *dp;
2833 	size_t dlen;
2834 	size_t dbuflen;
2835 	int ndirents = 64;
2836 	char *nm;
2837 
2838 	error = sdev_modctl_lookup(dir, &vp);
2839 	sdcmn_err11(("modctl readdir: %s by %s: %s\n",
2840 	    dir, curproc->p_user.u_comm,
2841 	    (error == 0) ? "ok" : "failed"));
2842 	if (error)
2843 		return (error);
2844 
2845 	dlen = ndirents * (sizeof (*dbuf));
2846 	dbuf = kmem_alloc(dlen, KM_SLEEP);
2847 
2848 	uio.uio_iov = &iov;
2849 	uio.uio_iovcnt = 1;
2850 	uio.uio_segflg = UIO_SYSSPACE;
2851 	uio.uio_fmode = 0;
2852 	uio.uio_extflg = UIO_COPY_CACHED;
2853 	uio.uio_loffset = 0;
2854 	uio.uio_llimit = MAXOFFSET_T;
2855 
2856 	eof = 0;
2857 	error = 0;
2858 	while (!error && !eof) {
2859 		uio.uio_resid = dlen;
2860 		iov.iov_base = (char *)dbuf;
2861 		iov.iov_len = dlen;
2862 
2863 		(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2864 		error = VOP_READDIR(vp, &uio, kcred, &eof, NULL, 0);
2865 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2866 
2867 		dbuflen = dlen - uio.uio_resid;
2868 
2869 		if (error || dbuflen == 0)
2870 			break;
2871 
2872 		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
2873 		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
2874 
2875 			nm = dp->d_name;
2876 
2877 			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
2878 				continue;
2879 			if (npaths == npaths_alloc) {
2880 				npaths_alloc += 64;
2881 				newlist = (char **)
2882 				    kmem_zalloc((npaths_alloc + 1) *
2883 				    sizeof (char *), KM_SLEEP);
2884 				if (pathlist) {
2885 					bcopy(pathlist, newlist,
2886 					    npaths * sizeof (char *));
2887 					kmem_free(pathlist,
2888 					    (npaths + 1) * sizeof (char *));
2889 				}
2890 				pathlist = newlist;
2891 			}
2892 			n = strlen(nm) + 1;
2893 			s = kmem_alloc(n, KM_SLEEP);
2894 			bcopy(nm, s, n);
2895 			pathlist[npaths++] = s;
2896 			sdcmn_err11(("  %s/%s\n", dir, s));
2897 
2898 			/* if checking empty, one entry is as good as many */
2899 			if (checking_empty) {
2900 				eof = 1;
2901 				break;
2902 			}
2903 		}
2904 	}
2905 
2906 exit:
2907 	VN_RELE(vp);
2908 
2909 	if (dbuf)
2910 		kmem_free(dbuf, dlen);
2911 
2912 	if (error)
2913 		return (error);
2914 
2915 	*dirlistp = pathlist;
2916 	*npathsp = npaths;
2917 	*npathsp_alloc = npaths_alloc;
2918 
2919 	return (0);
2920 }
2921 
/*
 * Release a name list produced by sdev_modctl_readdir().
 *
 * pathlist     - array of name strings; may be NULL, which is what
 *                sdev_modctl_readdir() hands back when it found no names
 * npaths       - number of names stored in the array
 * npaths_alloc - number of slots the array was allocated with, not
 *                counting its one extra trailing slot
 */
void
sdev_modctl_readdir_free(char **pathlist, int npaths, int npaths_alloc)
{
	int	i, n;

	/* no list was ever allocated: there is nothing to free */
	if (pathlist == NULL)
		return;

	for (i = 0; i < npaths; i++) {
		n = strlen(pathlist[i]) + 1;
		kmem_free(pathlist[i], n);
	}

	kmem_free(pathlist, (npaths_alloc + 1) * sizeof (char *));
}
2934 
2935 int
2936 sdev_modctl_devexists(const char *path)
2937 {
2938 	vnode_t *vp;
2939 	int error;
2940 
2941 	error = sdev_modctl_lookup(path, &vp);
2942 	sdcmn_err11(("modctl dev exists: %s by %s: %s\n",
2943 	    path, curproc->p_user.u_comm,
2944 	    (error == 0) ? "ok" : "failed"));
2945 	if (error == 0)
2946 		VN_RELE(vp);
2947 
2948 	return (error);
2949 }
2950 
2951 extern int sdev_vnodeops_tbl_size;
2952 
2953 /*
2954  * construct a new template with overrides from vtab
2955  */
2956 static fs_operation_def_t *
2957 sdev_merge_vtab(const fs_operation_def_t tab[])
2958 {
2959 	fs_operation_def_t *new;
2960 	const fs_operation_def_t *tab_entry;
2961 
2962 	/* make a copy of standard vnode ops table */
2963 	new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP);
2964 	bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size);
2965 
2966 	/* replace the overrides from tab */
2967 	for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) {
2968 		fs_operation_def_t *std_entry = new;
2969 		while (std_entry->name) {
2970 			if (strcmp(tab_entry->name, std_entry->name) == 0) {
2971 				std_entry->func = tab_entry->func;
2972 				break;
2973 			}
2974 			std_entry++;
2975 		}
2976 		if (std_entry->name == NULL)
2977 			cmn_err(CE_NOTE, "sdev_merge_vtab: entry %s unused.",
2978 			    tab_entry->name);
2979 	}
2980 
2981 	return (new);
2982 }
2983 
2984 /* free memory allocated by sdev_merge_vtab */
2985 static void
2986 sdev_free_vtab(fs_operation_def_t *new)
2987 {
2988 	kmem_free(new, sdev_vnodeops_tbl_size);
2989 }
2990 
2991 /*
2992  * a generic setattr() function
2993  *
2994  * note: flags only supports AT_UID and AT_GID.
2995  *	 Future enhancements can be done for other types, e.g. AT_MODE
2996  */
2997 int
2998 devname_setattr_func(struct vnode *vp, struct vattr *vap, int flags,
2999     struct cred *cred, int (*callback)(struct sdev_node *, struct vattr *,
3000     int), int protocol)
3001 {
3002 	struct sdev_node	*dv = VTOSDEV(vp);
3003 	struct sdev_node	*parent = dv->sdev_dotdot;
3004 	struct vattr		*get;
3005 	uint_t			mask = vap->va_mask;
3006 	int 			error;
3007 
3008 	/* some sanity checks */
3009 	if (vap->va_mask & AT_NOSET)
3010 		return (EINVAL);
3011 
3012 	if (vap->va_mask & AT_SIZE) {
3013 		if (vp->v_type == VDIR) {
3014 			return (EISDIR);
3015 		}
3016 	}
3017 
3018 	/* no need to set attribute, but do not fail either */
3019 	ASSERT(parent);
3020 	rw_enter(&parent->sdev_contents, RW_READER);
3021 	if (dv->sdev_state == SDEV_ZOMBIE) {
3022 		rw_exit(&parent->sdev_contents);
3023 		return (0);
3024 	}
3025 
3026 	/* If backing store exists, just set it. */
3027 	if (dv->sdev_attrvp) {
3028 		rw_exit(&parent->sdev_contents);
3029 		return (VOP_SETATTR(dv->sdev_attrvp, vap, flags, cred, NULL));
3030 	}
3031 
3032 	/*
3033 	 * Otherwise, for nodes with the persistence attribute, create it.
3034 	 */
3035 	ASSERT(dv->sdev_attr);
3036 	if (SDEV_IS_PERSIST(dv) ||
3037 	    ((vap->va_mask & ~AT_TIMES) != 0 && !SDEV_IS_DYNAMIC(dv))) {
3038 		sdev_vattr_merge(dv, vap);
3039 		rw_enter(&dv->sdev_contents, RW_WRITER);
3040 		error = sdev_shadow_node(dv, cred);
3041 		rw_exit(&dv->sdev_contents);
3042 		rw_exit(&parent->sdev_contents);
3043 
3044 		if (error)
3045 			return (error);
3046 		return (VOP_SETATTR(dv->sdev_attrvp, vap, flags, cred, NULL));
3047 	}
3048 
3049 
3050 	/*
3051 	 * sdev_attr was allocated in sdev_mknode
3052 	 */
3053 	rw_enter(&dv->sdev_contents, RW_WRITER);
3054 	error = secpolicy_vnode_setattr(cred, vp, vap,
3055 	    dv->sdev_attr, flags, sdev_unlocked_access, dv);
3056 	if (error) {
3057 		rw_exit(&dv->sdev_contents);
3058 		rw_exit(&parent->sdev_contents);
3059 		return (error);
3060 	}
3061 
3062 	get = dv->sdev_attr;
3063 	if (mask & AT_MODE) {
3064 		get->va_mode &= S_IFMT;
3065 		get->va_mode |= vap->va_mode & ~S_IFMT;
3066 	}
3067 
3068 	if ((mask & AT_UID) || (mask & AT_GID)) {
3069 		if (mask & AT_UID)
3070 			get->va_uid = vap->va_uid;
3071 		if (mask & AT_GID)
3072 			get->va_gid = vap->va_gid;
3073 		/*
3074 		 * a callback must be provided if the protocol is set
3075 		 */
3076 		if ((protocol & AT_UID) || (protocol & AT_GID)) {
3077 			ASSERT(callback);
3078 			error = callback(dv, get, protocol);
3079 			if (error) {
3080 				rw_exit(&dv->sdev_contents);
3081 				rw_exit(&parent->sdev_contents);
3082 				return (error);
3083 			}
3084 		}
3085 	}
3086 
3087 	if (mask & AT_ATIME)
3088 		get->va_atime = vap->va_atime;
3089 	if (mask & AT_MTIME)
3090 		get->va_mtime = vap->va_mtime;
3091 	if (mask & (AT_MODE | AT_UID | AT_GID | AT_CTIME)) {
3092 		gethrestime(&get->va_ctime);
3093 	}
3094 
3095 	sdev_vattr_merge(dv, get);
3096 	rw_exit(&dv->sdev_contents);
3097 	rw_exit(&parent->sdev_contents);
3098 	return (0);
3099 }
3100 
3101 /*
3102  * a generic inactive() function
3103  */
3104 /*ARGSUSED*/
3105 void
3106 devname_inactive_func(struct vnode *vp, struct cred *cred,
3107     void (*callback)(struct vnode *))
3108 {
3109 	int clean;
3110 	struct sdev_node *dv = VTOSDEV(vp);
3111 	int state;
3112 
3113 	mutex_enter(&vp->v_lock);
3114 	ASSERT(vp->v_count >= 1);
3115 
3116 
3117 	if (vp->v_count == 1 && callback != NULL)
3118 		callback(vp);
3119 
3120 	rw_enter(&dv->sdev_contents, RW_WRITER);
3121 	state = dv->sdev_state;
3122 
3123 	clean = (vp->v_count == 1) && (state == SDEV_ZOMBIE);
3124 
3125 	/*
3126 	 * sdev is a rather bad public citizen. It violates the general
3127 	 * agreement that in memory nodes should always have a valid reference
3128 	 * count on their vnode. But that's not the case here. This means that
3129 	 * we do actually have to distinguish between getting inactive callbacks
3130 	 * for zombies and otherwise. This should probably be fixed.
3131 	 */
3132 	if (clean) {
3133 		/* Remove the . entry to ourselves */
3134 		if (vp->v_type == VDIR) {
3135 			decr_link(dv);
3136 		}
3137 		VERIFY(dv->sdev_nlink == 1);
3138 		decr_link(dv);
3139 		--vp->v_count;
3140 		rw_exit(&dv->sdev_contents);
3141 		mutex_exit(&vp->v_lock);
3142 		sdev_nodedestroy(dv, 0);
3143 	} else {
3144 		--vp->v_count;
3145 		rw_exit(&dv->sdev_contents);
3146 		mutex_exit(&vp->v_lock);
3147 	}
3148 }
3149