xref: /illumos-gate/usr/src/uts/common/fs/dev/sdev_subr.c (revision 3bf53144544c5875536acef3d1214ec06c34adad)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2013, 2016 Joyent, Inc. All rights reserved.
24  */
25 
26 /*
27  * utility routines for the /dev fs
28  */
29 
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/t_lock.h>
33 #include <sys/systm.h>
34 #include <sys/sysmacros.h>
35 #include <sys/user.h>
36 #include <sys/time.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/file.h>
40 #include <sys/fcntl.h>
41 #include <sys/flock.h>
42 #include <sys/kmem.h>
43 #include <sys/uio.h>
44 #include <sys/errno.h>
45 #include <sys/stat.h>
46 #include <sys/cred.h>
47 #include <sys/dirent.h>
48 #include <sys/pathname.h>
49 #include <sys/cmn_err.h>
50 #include <sys/debug.h>
51 #include <sys/mode.h>
52 #include <sys/policy.h>
53 #include <fs/fs_subr.h>
54 #include <sys/mount.h>
55 #include <sys/fs/snode.h>
56 #include <sys/fs/dv_node.h>
57 #include <sys/fs/sdev_impl.h>
58 #include <sys/sunndi.h>
59 #include <sys/sunmdi.h>
60 #include <sys/conf.h>
61 #include <sys/proc.h>
62 #include <sys/user.h>
63 #include <sys/modctl.h>
64 
#ifdef DEBUG
/*
 * Debug-only tunables; these exist only when the file is built with DEBUG.
 *
 * sdev_debug: debug message mask.
 *	NOTE(review): presumably consulted by the sdcmn_err*() macros --
 *	confirm against their definition.
 * sdev_debug_cache_flags: passed as the cache flags argument to
 *	kmem_cache_create() by sdev_node_cache_init().
 */
int sdev_debug = 0x00000001;
int sdev_debug_cache_flags = 0;
#endif
69 
70 /*
71  * globals
72  */
/*
 * prototype memory vattrs
 *
 * sdev_vattr_dir: template attributes for a directory (VDIR) node.
 * va_mask marks type, mode, uid and gid as valid; every other field is zero.
 */
vattr_t sdev_vattr_dir = {
	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
	VDIR,					/* va_type */
	SDEV_DIRMODE_DEFAULT,			/* va_mode */
	SDEV_UID_DEFAULT,			/* va_uid */
	SDEV_GID_DEFAULT,			/* va_gid */
	0,					/* va_fsid */
	0,					/* va_nodeid */
	0,					/* va_nlink */
	0,					/* va_size */
	0,					/* va_atime */
	0,					/* va_mtime */
	0,					/* va_ctime */
	0,					/* va_rdev */
	0,					/* va_blksize */
	0,					/* va_nblocks */
	0					/* va_vcode */
};
92 
/*
 * Template attributes for a symbolic link (VLNK) node.
 *
 * Only type and mode are flagged in va_mask.  va_uid and va_gid are filled
 * in below, but a consumer that honors va_mask (sdev_attr_update() does)
 * will not copy them.
 */
vattr_t sdev_vattr_lnk = {
	AT_TYPE|AT_MODE,			/* va_mask */
	VLNK,					/* va_type */
	SDEV_LNKMODE_DEFAULT,			/* va_mode */
	SDEV_UID_DEFAULT,			/* va_uid */
	SDEV_GID_DEFAULT,			/* va_gid */
	0,					/* va_fsid */
	0,					/* va_nodeid */
	0,					/* va_nlink */
	0,					/* va_size */
	0,					/* va_atime */
	0,					/* va_mtime */
	0,					/* va_ctime */
	0,					/* va_rdev */
	0,					/* va_blksize */
	0,					/* va_nblocks */
	0					/* va_vcode */
};
111 
/*
 * Template attributes for a block device (VBLK) node.
 * va_mask marks type, mode, uid and gid as valid; va_mode carries the
 * S_IFBLK file type bit in addition to the default device permissions.
 */
vattr_t sdev_vattr_blk = {
	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
	VBLK,					/* va_type */
	S_IFBLK | SDEV_DEVMODE_DEFAULT,		/* va_mode */
	SDEV_UID_DEFAULT,			/* va_uid */
	SDEV_GID_DEFAULT,			/* va_gid */
	0,					/* va_fsid */
	0,					/* va_nodeid */
	0,					/* va_nlink */
	0,					/* va_size */
	0,					/* va_atime */
	0,					/* va_mtime */
	0,					/* va_ctime */
	0,					/* va_rdev */
	0,					/* va_blksize */
	0,					/* va_nblocks */
	0					/* va_vcode */
};
130 
/*
 * Template attributes for a character device (VCHR) node.
 * va_mask marks type, mode, uid and gid as valid; va_mode carries the
 * S_IFCHR file type bit in addition to the default device permissions.
 */
vattr_t sdev_vattr_chr = {
	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
	VCHR,					/* va_type */
	S_IFCHR | SDEV_DEVMODE_DEFAULT,		/* va_mode */
	SDEV_UID_DEFAULT,			/* va_uid */
	SDEV_GID_DEFAULT,			/* va_gid */
	0,					/* va_fsid */
	0,					/* va_nodeid */
	0,					/* va_nlink */
	0,					/* va_size */
	0,					/* va_atime */
	0,					/* va_mtime */
	0,					/* va_ctime */
	0,					/* va_rdev */
	0,					/* va_blksize */
	0,					/* va_nblocks */
	0					/* va_vcode */
};
149 
/*
 * sdev_node_cache: kmem cache from which every sdev_node is allocated;
 *	created by sdev_node_cache_init(), torn down (and reset to NULL) by
 *	sdev_node_cache_fini().
 * devtype: fstype.
 *	NOTE(review): presumably the filesystem type index assigned when the
 *	dev fs is registered -- not set anywhere in this part of the file.
 */
kmem_cache_t	*sdev_node_cache;	/* sdev_node cache */
int		devtype;		/* fstype */
152 
/* forward declarations of file-local (static) helpers */
static struct vnodeops *sdev_get_vop(struct sdev_node *);
static void sdev_set_no_negcache(struct sdev_node *);
static fs_operation_def_t *sdev_merge_vtab(const fs_operation_def_t []);
static void sdev_free_vtab(fs_operation_def_t *);
158 
/*
 * Release the profile data hanging off a non-global sdev_node.
 *
 * Every nvlist in dv->sdev_prof is freed and the structure is then zeroed so
 * that no stale pointers are left behind.  Must only be called on nodes that
 * are not SDEV_GLOBAL (asserted).
 */
static void
sdev_prof_free(struct sdev_node *dv)
{
	ASSERT(!SDEV_IS_GLOBAL(dv));
	nvlist_free(dv->sdev_prof.dev_name);
	nvlist_free(dv->sdev_prof.dev_map);
	nvlist_free(dv->sdev_prof.dev_symlink);
	nvlist_free(dv->sdev_prof.dev_glob_incdir);
	nvlist_free(dv->sdev_prof.dev_glob_excdir);
	/* clear the now-dangling nvlist pointers */
	bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
}
170 
171 /* sdev_node cache constructor */
172 /*ARGSUSED1*/
173 static int
174 i_sdev_node_ctor(void *buf, void *cfarg, int flag)
175 {
176 	struct sdev_node *dv = (struct sdev_node *)buf;
177 	struct vnode *vp;
178 
179 	bzero(buf, sizeof (struct sdev_node));
180 	vp = dv->sdev_vnode = vn_alloc(flag);
181 	if (vp == NULL) {
182 		return (-1);
183 	}
184 	vp->v_data = dv;
185 	rw_init(&dv->sdev_contents, NULL, RW_DEFAULT, NULL);
186 	return (0);
187 }
188 
189 /* sdev_node cache destructor */
190 /*ARGSUSED1*/
191 static void
192 i_sdev_node_dtor(void *buf, void *arg)
193 {
194 	struct sdev_node *dv = (struct sdev_node *)buf;
195 	struct vnode *vp = SDEVTOV(dv);
196 
197 	rw_destroy(&dv->sdev_contents);
198 	vn_free(vp);
199 }
200 
201 /* initialize sdev_node cache */
202 void
203 sdev_node_cache_init()
204 {
205 	int flags = 0;
206 
207 #ifdef	DEBUG
208 	flags = sdev_debug_cache_flags;
209 	if (flags)
210 		sdcmn_err(("cache debug flags 0x%x\n", flags));
211 #endif	/* DEBUG */
212 
213 	ASSERT(sdev_node_cache == NULL);
214 	sdev_node_cache = kmem_cache_create("sdev_node_cache",
215 	    sizeof (struct sdev_node), 0, i_sdev_node_ctor, i_sdev_node_dtor,
216 	    NULL, NULL, NULL, flags);
217 }
218 
/*
 * Destroy the sdev_node kmem cache.
 *
 * The cache must exist (asserted).  The global pointer is reset to NULL
 * afterwards, which is the state sdev_node_cache_init() asserts on entry.
 */
void
sdev_node_cache_fini()
{
	ASSERT(sdev_node_cache != NULL);
	kmem_cache_destroy(sdev_node_cache);
	sdev_node_cache = NULL;
}
227 
228 /*
229  * Compare two nodes lexographically to balance avl tree
230  */
231 static int
232 sdev_compare_nodes(const struct sdev_node *dv1, const struct sdev_node *dv2)
233 {
234 	int rv;
235 	if ((rv = strcmp(dv1->sdev_name, dv2->sdev_name)) == 0)
236 		return (0);
237 	return ((rv < 0) ? -1 : 1);
238 }
239 
/*
 * Set the state of a sdev_node.
 *
 * The caller must hold dv->sdev_contents as writer (asserted); no validation
 * of the state transition is performed here.
 */
void
sdev_set_nodestate(struct sdev_node *dv, sdev_node_state_t state)
{
	ASSERT(dv);
	ASSERT(RW_WRITE_HELD(&dv->sdev_contents));
	dv->sdev_state = state;
}
247 
248 static void
249 sdev_attr_update(struct sdev_node *dv, vattr_t *vap)
250 {
251 	timestruc_t	now;
252 	struct vattr	*attrp;
253 	uint_t		mask;
254 
255 	ASSERT(dv->sdev_attr);
256 	ASSERT(vap);
257 
258 	attrp = dv->sdev_attr;
259 	mask = vap->va_mask;
260 	if (mask & AT_TYPE)
261 		attrp->va_type = vap->va_type;
262 	if (mask & AT_MODE)
263 		attrp->va_mode = vap->va_mode;
264 	if (mask & AT_UID)
265 		attrp->va_uid = vap->va_uid;
266 	if (mask & AT_GID)
267 		attrp->va_gid = vap->va_gid;
268 	if (mask & AT_RDEV)
269 		attrp->va_rdev = vap->va_rdev;
270 
271 	gethrestime(&now);
272 	attrp->va_atime = (mask & AT_ATIME) ? vap->va_atime : now;
273 	attrp->va_mtime = (mask & AT_MTIME) ? vap->va_mtime : now;
274 	attrp->va_ctime = (mask & AT_CTIME) ? vap->va_ctime : now;
275 }
276 
/*
 * Allocate the node's in-memory vattr and populate it from vap.
 *
 * The node must not already have a vattr, and vap must carry at least a
 * type and a mode (all asserted).  The allocation is zero-filled, so any
 * field sdev_attr_update() does not copy stays zero.
 */
static void
sdev_attr_alloc(struct sdev_node *dv, vattr_t *vap)
{
	ASSERT(dv->sdev_attr == NULL);
	ASSERT(vap->va_mask & AT_TYPE);
	ASSERT(vap->va_mask & AT_MODE);

	dv->sdev_attr = kmem_zalloc(sizeof (struct vattr), KM_SLEEP);
	sdev_attr_update(dv, vap);
}
287 
/*
 * Allocate and initialize a sdev_node named nm below the directory ddv.
 *
 * arguments:
 *	- ddv (parent; supplies the vfs, vnodeops, flags and path prefix)
 *	- nm (child name)
 *	- newdv (the new node is returned here, NULL on failure)
 *	- vap (optional; supplies the vnode type and initial in-memory attrs)
 *
 * Returns 0 with *newdv in SDEV_INIT state, or ENAMETOOLONG when nm,
 * including its terminating '\0', is longer than MAXNAMELEN.
 *
 * The node is not linked into ddv's directory cache here; sdev_nlink is
 * left at 0.
 */
int
sdev_nodeinit(struct sdev_node *ddv, char *nm, struct sdev_node **newdv,
    vattr_t *vap)
{
	struct sdev_node *dv = NULL;
	struct vnode *vp;
	size_t nmlen, len;
	devname_handle_t  *dhl;

	nmlen = strlen(nm) + 1;
	if (nmlen > MAXNAMELEN) {
		sdcmn_err9(("sdev_nodeinit: node name %s"
		    " too long\n", nm));
		*newdv = NULL;
		return (ENAMETOOLONG);
	}

	dv = kmem_cache_alloc(sdev_node_cache, KM_SLEEP);

	dv->sdev_name = kmem_alloc(nmlen, KM_SLEEP);
	bcopy(nm, dv->sdev_name, nmlen);
	dv->sdev_namelen = nmlen - 1;	/* '\0' not included */
	/* "<parent path>/<nm>": one byte for the '/', one for the '\0' */
	len = strlen(ddv->sdev_path) + strlen(nm) + 2;
	dv->sdev_path = kmem_alloc(len, KM_SLEEP);
	(void) snprintf(dv->sdev_path, len, "%s/%s", ddv->sdev_path, nm);
	/* overwritten for VLNK nodes */
	dv->sdev_symlink = NULL;

	vp = SDEVTOV(dv);
	vn_reinit(vp);
	vp->v_vfsp = SDEVTOV(ddv)->v_vfsp;
	if (vap)
		vp->v_type = vap->va_type;

	/*
	 * initialized to the parent's vnodeops.
	 * may be overwritten for a VDIR
	 */
	vn_setops(vp, vn_getops(SDEVTOV(ddv)));
	vn_exists(vp);

	dv->sdev_dotdot = NULL;
	dv->sdev_attrvp = NULL;
	if (vap) {
		sdev_attr_alloc(dv, vap);
	} else {
		dv->sdev_attr = NULL;
	}

	dv->sdev_ino = sdev_mkino(dv);
	dv->sdev_nlink = 0;		/* updated on insert */
	dv->sdev_flags = ddv->sdev_flags; /* inherit from the parent first */
	dv->sdev_flags |= SDEV_BUILD;
	mutex_init(&dv->sdev_lookup_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dv->sdev_lookup_cv, NULL, CV_DEFAULT, NULL);
	if (SDEV_IS_GLOBAL(ddv)) {
		/* global node: set up the handle and the negative-cache flag */
		dv->sdev_flags |= SDEV_GLOBAL;
		dhl = &(dv->sdev_handle);
		dhl->dh_data = dv;
		dhl->dh_args = NULL;
		sdev_set_no_negcache(dv);
		dv->sdev_gdir_gen = 0;
	} else {
		/* non-global node: start with an empty profile */
		dv->sdev_flags &= ~SDEV_GLOBAL;
		dv->sdev_origin = NULL; /* set later */
		bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
		dv->sdev_ldir_gen = 0;
		dv->sdev_devtree_gen = 0;
	}

	/* sdev_set_nodestate() requires the contents lock held as writer */
	rw_enter(&dv->sdev_contents, RW_WRITER);
	sdev_set_nodestate(dv, SDEV_INIT);
	rw_exit(&dv->sdev_contents);
	*newdv = dv;

	return (0);
}
366 
/*
 * Transition a sdev_node into SDEV_READY state. If this fails, it is up to the
 * caller to transition the node to the SDEV_ZOMBIE state.
 *
 * arguments:
 *	- dv (node to make ready; must not already be SDEV_READY)
 *	- vap (attributes; va_type and va_rdev are copied to the vnode)
 *	- avp (attribute vnode, or NULL to keep the attributes in memory)
 *	- args
 *	    . symlink target string (for VLNK)
 *	    . stored as sdev_origin (for !SDEV_GLOBAL)
 *	- cred (passed to sdev_shadow_node())
 *
 * Returns 0 on success, otherwise the error from sdev_shadow_node(); the
 * node state is left unchanged in that case.
 */
int
sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp,
    void *args, struct cred *cred)
{
	int error = 0;
	struct vnode *vp = SDEVTOV(dv);
	vtype_t type;

	ASSERT(dv && (dv->sdev_state != SDEV_READY) && vap);

	type = vap->va_type;
	vp->v_type = type;
	vp->v_rdev = vap->va_rdev;
	rw_enter(&dv->sdev_contents, RW_WRITER);
	if (type == VDIR) {
		dv->sdev_nlink = 2;
		/*
		 * Clear both flags before asking sdev_get_vop(), which may
		 * turn them back on from the vtab entry or the parent.
		 */
		dv->sdev_flags &= ~SDEV_PERSIST;
		dv->sdev_flags &= ~SDEV_DYNAMIC;
		vn_setops(vp, sdev_get_vop(dv)); /* from internal vtab */
		ASSERT(dv->sdev_dotdot);
		ASSERT(SDEVTOV(dv->sdev_dotdot)->v_type == VDIR);
		/* a directory takes its rdev from the parent directory */
		vp->v_rdev = SDEVTOV(dv->sdev_dotdot)->v_rdev;
		avl_create(&dv->sdev_entries,
		    (int (*)(const void *, const void *))sdev_compare_nodes,
		    sizeof (struct sdev_node),
		    offsetof(struct sdev_node, sdev_avllink));
	} else if (type == VLNK) {
		ASSERT(args);
		dv->sdev_nlink = 1;
		dv->sdev_symlink = i_ddi_strdup((char *)args, KM_SLEEP);
	} else {
		dv->sdev_nlink = 1;
	}

	if (!(SDEV_IS_GLOBAL(dv))) {
		/*
		 * NOTE(review): for a non-global VLNK node args was just
		 * used as the link target string above -- confirm callers
		 * never rely on sdev_origin in that case.
		 */
		dv->sdev_origin = (struct sdev_node *)args;
		dv->sdev_flags &= ~SDEV_PERSIST;
	}

	/*
	 * shadow node is created here OR
	 * if failed (indicated by dv->sdev_attrvp == NULL),
	 * created later in sdev_setattr
	 */
	if (avp) {
		dv->sdev_attrvp = avp;
	} else {
		if (dv->sdev_attr == NULL) {
			sdev_attr_alloc(dv, vap);
		} else {
			sdev_attr_update(dv, vap);
		}

		if ((dv->sdev_attrvp == NULL) && SDEV_IS_PERSIST(dv))
			error = sdev_shadow_node(dv, cred);
	}

	if (error == 0) {
		/* transition to READY state */
		sdev_set_nodestate(dv, SDEV_READY);
		sdev_nc_node_exists(dv);
	}
	rw_exit(&dv->sdev_contents);
	return (error);
}
436 
/*
 * Build the VROOT sdev_node.
 *
 * The node is allocated from sdev_node_cache, marked VROOT/VDIR and given
 * the default sdev vnodeops.  Its name is the vfs mount point when one is
 * set and "/dev" otherwise; its sdev_path is always "/dev".  A root whose
 * name is exactly "/dev" is flagged SDEV_GLOBAL|SDEV_PERSIST, any other
 * root only gets SDEV_BUILD and an empty profile.
 *
 * avp becomes the root's attribute vnode and must not be NULL (asserted).
 * mvp and cred are not referenced here.  The node is returned in
 * SDEV_READY state.
 */
/*ARGSUSED*/
struct sdev_node *
sdev_mkroot(struct vfs *vfsp, dev_t devdev, struct vnode *mvp,
    struct vnode *avp, struct cred *cred)
{
	struct sdev_node *dv;
	struct vnode *vp;
	char devdir[] = "/dev";

	ASSERT(sdev_node_cache != NULL);
	ASSERT(avp);
	dv = kmem_cache_alloc(sdev_node_cache, KM_SLEEP);
	vp = SDEVTOV(dv);
	vn_reinit(vp);
	vp->v_flag |= VROOT;
	vp->v_vfsp = vfsp;
	vp->v_type = VDIR;
	vp->v_rdev = devdev;
	vn_setops(vp, sdev_vnodeops); /* apply the default vnodeops at /dev */
	vn_exists(vp);

	if (vfsp->vfs_mntpt)
		dv->sdev_name = i_ddi_strdup(
		    (char *)refstr_value(vfsp->vfs_mntpt), KM_SLEEP);
	else
		/* vfs_mountdev1 set mount point later */
		dv->sdev_name = i_ddi_strdup("/dev", KM_SLEEP);
	dv->sdev_namelen = strlen(dv->sdev_name); /* '\0' not included */
	dv->sdev_path = i_ddi_strdup(devdir, KM_SLEEP);
	dv->sdev_ino = SDEV_ROOTINO;
	dv->sdev_nlink = 2;		/* name + . (no sdev_insert) */
	dv->sdev_dotdot = dv;		/* .. == self */
	dv->sdev_attrvp = avp;
	dv->sdev_attr = NULL;
	mutex_init(&dv->sdev_lookup_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dv->sdev_lookup_cv, NULL, CV_DEFAULT, NULL);
	if (strcmp(dv->sdev_name, "/dev") == 0) {
		/* mounted at /dev: the global, persistent instance */
		dv->sdev_flags = SDEV_BUILD|SDEV_GLOBAL|SDEV_PERSIST;
		bzero(&dv->sdev_handle, sizeof (dv->sdev_handle));
		dv->sdev_gdir_gen = 0;
	} else {
		/* any other mount point: non-global, not persisted */
		dv->sdev_flags = SDEV_BUILD;
		dv->sdev_flags &= ~SDEV_PERSIST;
		bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
		dv->sdev_ldir_gen = 0;
		dv->sdev_devtree_gen = 0;
	}

	avl_create(&dv->sdev_entries,
	    (int (*)(const void *, const void *))sdev_compare_nodes,
	    sizeof (struct sdev_node),
	    offsetof(struct sdev_node, sdev_avllink));

	/* sdev_set_nodestate() requires the contents lock held as writer */
	rw_enter(&dv->sdev_contents, RW_WRITER);
	sdev_set_nodestate(dv, SDEV_READY);
	rw_exit(&dv->sdev_contents);
	sdev_nc_node_exists(dv);
	return (dv);
}
499 
/*
 * directory dependent vop table
 *
 * One entry describes the special handling of a single subdirectory of
 * /dev: which vnode operations it uses, how its nodes are validated and
 * which SDEV_* flags its nodes pick up.
 */
struct sdev_vop_table {
	char *vt_name;				/* subdirectory name */
	const fs_operation_def_t *vt_service;	/* vnodeops table */
	struct vnodeops *vt_vops;		/* constructed vop */
	struct vnodeops **vt_global_vops;	/* global container for vop */
	int (*vt_vtor)(struct sdev_node *);	/* validate sdev_node */
	int vt_flags;				/* SDEV_* flags for the dir */
};
509 
/*
 * A nice improvement would be to provide a plug-in mechanism
 * for this table instead of a const table.
 *
 * Each initializer lists, in order: vt_name, vt_service, vt_vops,
 * vt_global_vops, vt_vtor, vt_flags.  vt_vops starts out NULL and is filled
 * in by sdev_get_vop() the first time the vnodeops for that directory are
 * constructed.  The table is terminated by an entry with a NULL vt_name.
 */
static struct sdev_vop_table vtab[] =
{
	{ "pts", devpts_vnodeops_tbl, NULL, &devpts_vnodeops, devpts_validate,
	SDEV_DYNAMIC | SDEV_VTOR },

	{ "vt", devvt_vnodeops_tbl, NULL, &devvt_vnodeops, devvt_validate,
	SDEV_DYNAMIC | SDEV_VTOR },

	/* SDEV_SUBDIR: sdev_match() also matches paths below /dev/zvol/ */
	{ "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops,
	devzvol_validate, SDEV_ZONED | SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },

	/* default vnodeops; the entry only carries SDEV_NO_NCACHE */
	{ "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE },

	{ "net", devnet_vnodeops_tbl, NULL, &devnet_vnodeops, devnet_validate,
	SDEV_DYNAMIC | SDEV_VTOR },

	{ "ipnet", devipnet_vnodeops_tbl, NULL, &devipnet_vnodeops,
	devipnet_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_NO_NCACHE },

	/*
	 * SDEV_DYNAMIC: prevent calling out to devfsadm, since only the
	 * lofi driver controls child nodes.
	 *
	 * SDEV_PERSIST: ensure devfsadm knows to clean up any persisted
	 * stale nodes (e.g. from devfsadm -R).
	 *
	 * In addition, devfsadm knows not to attempt a rmdir: a zone
	 * may hold a reference, which would zombify the node,
	 * preventing a mkdir.
	 */

	{ "lofi", NULL, NULL, NULL, NULL,
	    SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },
	{ "rlofi", NULL, NULL, NULL, NULL,
	    SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },

	/* terminator */
	{ NULL, NULL, NULL, NULL, NULL, 0}
};
552 
553 /*
554  * We need to match off of the sdev_path, not the sdev_name. We are only allowed
555  * to exist directly under /dev.
556  */
557 struct sdev_vop_table *
558 sdev_match(struct sdev_node *dv)
559 {
560 	int vlen;
561 	int i;
562 	const char *path;
563 
564 	if (strlen(dv->sdev_path) <= 5)
565 		return (NULL);
566 
567 	if (strncmp(dv->sdev_path, "/dev/", 5) != 0)
568 		return (NULL);
569 	path = dv->sdev_path + 5;
570 
571 	for (i = 0; vtab[i].vt_name; i++) {
572 		if (strcmp(vtab[i].vt_name, path) == 0)
573 			return (&vtab[i]);
574 		if (vtab[i].vt_flags & SDEV_SUBDIR) {
575 			vlen = strlen(vtab[i].vt_name);
576 			if ((strncmp(vtab[i].vt_name, path,
577 			    vlen - 1) == 0) && path[vlen] == '/')
578 				return (&vtab[i]);
579 		}
580 
581 	}
582 	return (NULL);
583 }
584 
585 /*
586  *  sets a directory's vnodeops if the directory is in the vtab;
587  */
588 static struct vnodeops *
589 sdev_get_vop(struct sdev_node *dv)
590 {
591 	struct sdev_vop_table *vtp;
592 	char *path;
593 
594 	path = dv->sdev_path;
595 	ASSERT(path);
596 
597 	/* gets the relative path to /dev/ */
598 	path += 5;
599 
600 	/* gets the vtab entry it matches */
601 	if ((vtp = sdev_match(dv)) != NULL) {
602 		dv->sdev_flags |= vtp->vt_flags;
603 		if (SDEV_IS_PERSIST(dv->sdev_dotdot) &&
604 		    (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv)))
605 			dv->sdev_flags |= SDEV_PERSIST;
606 
607 		if (vtp->vt_vops) {
608 			if (vtp->vt_global_vops)
609 				*(vtp->vt_global_vops) = vtp->vt_vops;
610 
611 			return (vtp->vt_vops);
612 		}
613 
614 		if (vtp->vt_service) {
615 			fs_operation_def_t *templ;
616 			templ = sdev_merge_vtab(vtp->vt_service);
617 			if (vn_make_ops(vtp->vt_name,
618 			    (const fs_operation_def_t *)templ,
619 			    &vtp->vt_vops) != 0) {
620 				cmn_err(CE_PANIC, "%s: malformed vnode ops\n",
621 				    vtp->vt_name);
622 				/*NOTREACHED*/
623 			}
624 			if (vtp->vt_global_vops) {
625 				*(vtp->vt_global_vops) = vtp->vt_vops;
626 			}
627 			sdev_free_vtab(templ);
628 
629 			return (vtp->vt_vops);
630 		}
631 
632 		return (sdev_vnodeops);
633 	}
634 
635 	/* child inherits the persistence of the parent */
636 	if (SDEV_IS_PERSIST(dv->sdev_dotdot))
637 		dv->sdev_flags |= SDEV_PERSIST;
638 
639 	return (sdev_vnodeops);
640 }
641 
642 static void
643 sdev_set_no_negcache(struct sdev_node *dv)
644 {
645 	int i;
646 	char *path;
647 
648 	ASSERT(dv->sdev_path);
649 	path = dv->sdev_path + strlen("/dev/");
650 
651 	for (i = 0; vtab[i].vt_name; i++) {
652 		if (strcmp(vtab[i].vt_name, path) == 0) {
653 			if (vtab[i].vt_flags & SDEV_NO_NCACHE)
654 				dv->sdev_flags |= SDEV_NO_NCACHE;
655 			break;
656 		}
657 	}
658 }
659 
660 void *
661 sdev_get_vtor(struct sdev_node *dv)
662 {
663 	struct sdev_vop_table *vtp;
664 
665 	vtp = sdev_match(dv);
666 	if (vtp)
667 		return ((void *)vtp->vt_vtor);
668 	else
669 		return (NULL);
670 }
671 
672 /*
673  * Build the base root inode
674  */
675 ino_t
676 sdev_mkino(struct sdev_node *dv)
677 {
678 	ino_t	ino;
679 
680 	/*
681 	 * for now, follow the lead of tmpfs here
682 	 * need to someday understand the requirements here
683 	 */
684 	ino = (ino_t)(uint32_t)((uintptr_t)dv >> 3);
685 	ino += SDEV_ROOTINO + 1;
686 
687 	return (ino);
688 }
689 
690 int
691 sdev_getlink(struct vnode *linkvp, char **link)
692 {
693 	int err;
694 	char *buf;
695 	struct uio uio = {0};
696 	struct iovec iov = {0};
697 
698 	if (linkvp == NULL)
699 		return (ENOENT);
700 	ASSERT(linkvp->v_type == VLNK);
701 
702 	buf = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
703 	iov.iov_base = buf;
704 	iov.iov_len = MAXPATHLEN;
705 	uio.uio_iov = &iov;
706 	uio.uio_iovcnt = 1;
707 	uio.uio_resid = MAXPATHLEN;
708 	uio.uio_segflg = UIO_SYSSPACE;
709 	uio.uio_llimit = MAXOFFSET_T;
710 
711 	err = VOP_READLINK(linkvp, &uio, kcred, NULL);
712 	if (err) {
713 		cmn_err(CE_WARN, "readlink %s failed in dev\n", buf);
714 		kmem_free(buf, MAXPATHLEN);
715 		return (ENOENT);
716 	}
717 
718 	/* mission complete */
719 	*link = i_ddi_strdup(buf, KM_SLEEP);
720 	kmem_free(buf, MAXPATHLEN);
721 	return (0);
722 }
723 
724 /*
725  * A convenient wrapper to get the devfs node vnode for a device
726  * minor functionality: readlink() of a /dev symlink
727  * Place the link into dv->sdev_symlink
728  */
729 static int
730 sdev_follow_link(struct sdev_node *dv)
731 {
732 	int err;
733 	struct vnode *linkvp;
734 	char *link = NULL;
735 
736 	linkvp = SDEVTOV(dv);
737 	if (linkvp == NULL)
738 		return (ENOENT);
739 	ASSERT(linkvp->v_type == VLNK);
740 	err = sdev_getlink(linkvp, &link);
741 	if (err) {
742 		dv->sdev_symlink = NULL;
743 		return (ENOENT);
744 	}
745 
746 	ASSERT(link != NULL);
747 	dv->sdev_symlink = link;
748 	return (0);
749 }
750 
751 static int
752 sdev_node_check(struct sdev_node *dv, struct vattr *nvap, void *nargs)
753 {
754 	vtype_t otype = SDEVTOV(dv)->v_type;
755 
756 	/*
757 	 * existing sdev_node has a different type.
758 	 */
759 	if (otype != nvap->va_type) {
760 		sdcmn_err9(("sdev_node_check: existing node "
761 		    "  %s type %d does not match new node type %d\n",
762 		    dv->sdev_name, otype, nvap->va_type));
763 		return (EEXIST);
764 	}
765 
766 	/*
767 	 * For a symlink, the target should be the same.
768 	 */
769 	if (otype == VLNK) {
770 		ASSERT(nargs != NULL);
771 		ASSERT(dv->sdev_symlink != NULL);
772 		if (strcmp(dv->sdev_symlink, (char *)nargs) != 0) {
773 			sdcmn_err9(("sdev_node_check: existing node "
774 			    " %s has different symlink %s as new node "
775 			    " %s\n", dv->sdev_name, dv->sdev_symlink,
776 			    (char *)nargs));
777 			return (EEXIST);
778 		}
779 	}
780 
781 	return (0);
782 }
783 
/*
 * sdev_mknode - a wrapper for sdev_nodeinit(), sdev_nodeready()
 *
 * arguments:
 *	- ddv (parent)
 *	- nm (child name)
 *	- newdv (sdev_node for nm is returned here; when *newdv is not NULL
 *	  on entry, that existing node is used instead of allocating one)
 *	- vap (vattr for the node to be created, va_type should be set;
 *	  the defaults should be used if unknown)
 *	- avp (attribute vnode)
 *	- args
 *	    . tnm (for VLNK)
 *	    . global sdev_node (for !SDEV_GLOBAL)
 *	- cred
 *	- state: SDEV_INIT, SDEV_READY
 *
 * only ddv, nm, newdv, vap, cred are required for sdev_mknode(SDEV_INIT)
 *
 * Returns 0 on success.  On failure the node is removed from the directory
 * cache, *newdv is set to NULL and the error is returned.
 *
 * NOTE:  directory contents writers lock needs to be held before
 *	  calling this routine.
 */
int
sdev_mknode(struct sdev_node *ddv, char *nm, struct sdev_node **newdv,
    struct vattr *vap, struct vnode *avp, void *args, struct cred *cred,
    sdev_node_state_t state)
{
	int error = 0;
	sdev_node_state_t node_state;
	struct sdev_node *dv = NULL;

	ASSERT(state != SDEV_ZOMBIE);
	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));

	if (*newdv) {
		/* the caller handed us an existing node */
		dv = *newdv;
	} else {
		/* allocate and initialize a sdev_node */
		if (ddv->sdev_state == SDEV_ZOMBIE) {
			sdcmn_err9(("sdev_mknode: parent %s ZOMBIEd\n",
			    ddv->sdev_path));
			return (ENOENT);
		}

		error = sdev_nodeinit(ddv, nm, &dv, vap);
		if (error != 0) {
			sdcmn_err9(("sdev_mknode: error %d,"
			    " name %s can not be initialized\n",
			    error, nm));
			return (error);
		}
		ASSERT(dv);

		/* insert into the directory cache */
		sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_ADD);
	}

	ASSERT(dv);
	node_state = dv->sdev_state;
	ASSERT(node_state != SDEV_ZOMBIE);

	if (state == SDEV_READY) {
		switch (node_state) {
		case SDEV_INIT:
			error = sdev_nodeready(dv, vap, avp, args, cred);
			if (error) {
				sdcmn_err9(("sdev_mknode: node %s can NOT"
				    " be transitioned into READY state, "
				    "error %d\n", nm, error));
			}
			break;
		case SDEV_READY:
			/*
			 * Do some sanity checking to make sure
			 * the existing sdev_node is what has been
			 * asked for.
			 */
			error = sdev_node_check(dv, vap, args);
			break;
		default:
			break;
		}
	}

	if (!error) {
		*newdv = dv;
		ASSERT((*newdv)->sdev_state != SDEV_ZOMBIE);
	} else {
		sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_DELETE);
		/*
		 * We created this node, it wasn't passed into us. Therefore it
		 * is up to us to delete it.
		 */
		if (*newdv == NULL)
			SDEV_SIMPLE_RELE(dv);
		*newdv = NULL;
	}

	return (error);
}
883 
884 /*
885  * convenient wrapper to change vp's ATIME, CTIME and MTIME
886  */
887 void
888 sdev_update_timestamps(struct vnode *vp, cred_t *cred, uint_t mask)
889 {
890 	struct vattr attr;
891 	timestruc_t now;
892 	int err;
893 
894 	ASSERT(vp);
895 	gethrestime(&now);
896 	if (mask & AT_CTIME)
897 		attr.va_ctime = now;
898 	if (mask & AT_MTIME)
899 		attr.va_mtime = now;
900 	if (mask & AT_ATIME)
901 		attr.va_atime = now;
902 
903 	attr.va_mask = (mask & AT_TIMES);
904 	err = VOP_SETATTR(vp, &attr, 0, cred, NULL);
905 	if (err && (err != EROFS)) {
906 		sdcmn_err(("update timestamps error %d\n", err));
907 	}
908 }
909 
/*
 * Tear down a sdev_node and return it to sdev_node_cache.
 *
 * the backing store vnode is released here
 *
 * The node must have no links left (asserted).  Everything allocated for
 * the node is freed: attribute vnode hold, in-memory vattr, name, symlink
 * target, path, profile data (non-global nodes) and the entry tree
 * (directories).  flags is not used.
 */
/*ARGSUSED1*/
void
sdev_nodedestroy(struct sdev_node *dv, uint_t flags)
{
	/* no references */
	ASSERT(dv->sdev_nlink == 0);

	if (dv->sdev_attrvp != NULLVP) {
		VN_RELE(dv->sdev_attrvp);
		/*
		 * reset the attrvp so that no more
		 * references can be made on this already
		 * vn_rele() vnode
		 */
		dv->sdev_attrvp = NULLVP;
	}

	if (dv->sdev_attr != NULL) {
		kmem_free(dv->sdev_attr, sizeof (struct vattr));
		dv->sdev_attr = NULL;
	}

	if (dv->sdev_name != NULL) {
		/* sdev_namelen excludes the terminating '\0' */
		kmem_free(dv->sdev_name, dv->sdev_namelen + 1);
		dv->sdev_name = NULL;
	}

	if (dv->sdev_symlink != NULL) {
		kmem_free(dv->sdev_symlink, strlen(dv->sdev_symlink) + 1);
		dv->sdev_symlink = NULL;
	}

	if (dv->sdev_path) {
		kmem_free(dv->sdev_path, strlen(dv->sdev_path) + 1);
		dv->sdev_path = NULL;
	}

	if (!SDEV_IS_GLOBAL(dv))
		sdev_prof_free(dv);

	if (SDEVTOV(dv)->v_type == VDIR) {
		/* a directory must be empty by now */
		ASSERT(SDEV_FIRST_ENTRY(dv) == NULL);
		avl_destroy(&dv->sdev_entries);
	}

	mutex_destroy(&dv->sdev_lookup_lock);
	cv_destroy(&dv->sdev_lookup_cv);

	/* return node to initial state as per constructor */
	(void) memset((void *)&dv->sdev_instance_data, 0,
	    sizeof (dv->sdev_instance_data));
	vn_invalid(SDEVTOV(dv));
	kmem_cache_free(sdev_node_cache, dv);
}
967 
968 /*
969  * DIRECTORY CACHE lookup
970  */
971 struct sdev_node *
972 sdev_findbyname(struct sdev_node *ddv, char *nm)
973 {
974 	struct sdev_node *dv;
975 	struct sdev_node dvtmp;
976 	avl_index_t	where;
977 
978 	ASSERT(RW_LOCK_HELD(&ddv->sdev_contents));
979 
980 	dvtmp.sdev_name = nm;
981 	dv = avl_find(&ddv->sdev_entries, &dvtmp, &where);
982 	if (dv) {
983 		ASSERT(dv->sdev_dotdot == ddv);
984 		ASSERT(strcmp(dv->sdev_name, nm) == 0);
985 		ASSERT(dv->sdev_state != SDEV_ZOMBIE);
986 		SDEV_HOLD(dv);
987 		return (dv);
988 	}
989 	return (NULL);
990 }
991 
/*
 * Inserts a new sdev_node in a parent directory
 *
 * dv is linked into ddv's entry tree and ddv becomes its "..".  The
 * parent's link count is incremented; dv's own link count is not touched
 * here.  The caller must hold ddv->sdev_contents as writer, and a node of
 * the same name must not already be present (VERIFY'd).
 */
void
sdev_direnter(struct sdev_node *ddv, struct sdev_node *dv)
{
	avl_index_t where;

	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
	ASSERT(ddv->sdev_nlink >= 2);
	ASSERT(dv->sdev_nlink == 0);
	ASSERT(dv->sdev_state != SDEV_ZOMBIE);

	dv->sdev_dotdot = ddv;
	/* the failed lookup yields the insertion point used below */
	VERIFY(avl_find(&ddv->sdev_entries, dv, &where) == NULL);
	avl_insert(&ddv->sdev_entries, dv, where);
	ddv->sdev_nlink++;
}
1011 
1012 /*
1013  * The following check is needed because while sdev_nodes are linked
1014  * in SDEV_INIT state, they have their link counts incremented only
1015  * in SDEV_READY state.
1016  */
1017 static void
1018 decr_link(struct sdev_node *dv)
1019 {
1020 	VERIFY(RW_WRITE_HELD(&dv->sdev_contents));
1021 	if (dv->sdev_state != SDEV_INIT) {
1022 		VERIFY(dv->sdev_nlink >= 1);
1023 		dv->sdev_nlink--;
1024 	} else {
1025 		VERIFY(dv->sdev_nlink == 0);
1026 	}
1027 }
1028 
/*
 * Delete an existing dv from directory cache
 *
 * In the case of a node is still held by non-zero reference count, the node is
 * put into ZOMBIE state. The node is always unlinked from its parent, but it is
 * not destroyed via sdev_inactive until its reference count reaches "0".
 *
 * The caller must hold ddv->sdev_contents as writer.  The vnode's v_lock
 * and then dv->sdev_contents (writer) are taken here for the duration of
 * the update.
 */
static void
sdev_dirdelete(struct sdev_node *ddv, struct sdev_node *dv)
{
	struct vnode *vp;
	sdev_node_state_t os;

	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));

	vp = SDEVTOV(dv);
	mutex_enter(&vp->v_lock);
	rw_enter(&dv->sdev_contents, RW_WRITER);
	/* remember the previous state; it decides the link fix-up below */
	os = dv->sdev_state;
	ASSERT(os != SDEV_ZOMBIE);
	dv->sdev_state = SDEV_ZOMBIE;

	/*
	 * unlink ourselves from the parent directory now to take care of the ..
	 * link. However, if we're a directory, we don't remove our reference to
	 * ourself eg. '.' until we are torn down in the inactive callback.
	 */
	decr_link(ddv);
	avl_remove(&ddv->sdev_entries, dv);
	/*
	 * sdev_inactive expects nodes to have a link to themselves when we're
	 * tearing them down. If we're transitioning from the initial state to
	 * zombie and not via ready, then we're not going to have this link that
	 * comes from the node being ready. As a result, we need to increment
	 * our link count by one to account for this.
	 */
	if (os == SDEV_INIT && dv->sdev_nlink == 0)
		dv->sdev_nlink++;
	rw_exit(&dv->sdev_contents);
	mutex_exit(&vp->v_lock);
}
1070 
1071 /*
1072  * check if the source is in the path of the target
1073  *
1074  * source and target are different
1075  */
1076 /*ARGSUSED2*/
1077 static int
1078 sdev_checkpath(struct sdev_node *sdv, struct sdev_node *tdv, struct cred *cred)
1079 {
1080 	int error = 0;
1081 	struct sdev_node *dotdot, *dir;
1082 
1083 	dotdot = tdv->sdev_dotdot;
1084 	ASSERT(dotdot);
1085 
1086 	/* fs root */
1087 	if (dotdot == tdv) {
1088 		return (0);
1089 	}
1090 
1091 	for (;;) {
1092 		/*
1093 		 * avoid error cases like
1094 		 *	mv a a/b
1095 		 *	mv a a/b/c
1096 		 *	etc.
1097 		 */
1098 		if (dotdot == sdv) {
1099 			error = EINVAL;
1100 			break;
1101 		}
1102 
1103 		dir = dotdot;
1104 		dotdot = dir->sdev_dotdot;
1105 
1106 		/* done checking because root is reached */
1107 		if (dir == dotdot) {
1108 			break;
1109 		}
1110 	}
1111 	return (error);
1112 }
1113 
1114 int
1115 sdev_rnmnode(struct sdev_node *oddv, struct sdev_node *odv,
1116     struct sdev_node *nddv, struct sdev_node **ndvp, char *nnm,
1117     struct cred *cred)
1118 {
1119 	int error = 0;
1120 	struct vnode *ovp = SDEVTOV(odv);
1121 	struct vnode *nvp;
1122 	struct vattr vattr;
1123 	int doingdir = (ovp->v_type == VDIR);
1124 	char *link = NULL;
1125 	int samedir = (oddv == nddv) ? 1 : 0;
1126 	int bkstore = 0;
1127 	struct sdev_node *idv = NULL;
1128 	struct sdev_node *ndv = NULL;
1129 	timestruc_t now;
1130 
1131 	vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
1132 	error = VOP_GETATTR(ovp, &vattr, 0, cred, NULL);
1133 	if (error)
1134 		return (error);
1135 
1136 	if (!samedir)
1137 		rw_enter(&oddv->sdev_contents, RW_WRITER);
1138 	rw_enter(&nddv->sdev_contents, RW_WRITER);
1139 
1140 	/*
1141 	 * the source may have been deleted by another thread before
1142 	 * we gets here.
1143 	 */
1144 	if (odv->sdev_state != SDEV_READY) {
1145 		error = ENOENT;
1146 		goto err_out;
1147 	}
1148 
1149 	if (doingdir && (odv == nddv)) {
1150 		error = EINVAL;
1151 		goto err_out;
1152 	}
1153 
1154 	/*
1155 	 * If renaming a directory, and the parents are different (".." must be
1156 	 * changed) then the source dir must not be in the dir hierarchy above
1157 	 * the target since it would orphan everything below the source dir.
1158 	 */
1159 	if (doingdir && (oddv != nddv)) {
1160 		error = sdev_checkpath(odv, nddv, cred);
1161 		if (error)
1162 			goto err_out;
1163 	}
1164 
1165 	/* fix the source for a symlink */
1166 	if (vattr.va_type == VLNK) {
1167 		if (odv->sdev_symlink == NULL) {
1168 			error = sdev_follow_link(odv);
1169 			if (error) {
1170 				/*
1171 				 * The underlying symlink doesn't exist. This
1172 				 * node probably shouldn't even exist. While
1173 				 * it's a bit jarring to consumers, we're going
1174 				 * to remove the node from /dev.
1175 				 */
1176 				if (SDEV_IS_PERSIST((*ndvp)))
1177 					bkstore = 1;
1178 				sdev_dirdelete(oddv, odv);
1179 				if (bkstore) {
1180 					ASSERT(nddv->sdev_attrvp);
1181 					error = VOP_REMOVE(nddv->sdev_attrvp,
1182 					    nnm, cred, NULL, 0);
1183 					if (error)
1184 						goto err_out;
1185 				}
1186 				error = ENOENT;
1187 				goto err_out;
1188 			}
1189 		}
1190 		ASSERT(odv->sdev_symlink);
1191 		link = i_ddi_strdup(odv->sdev_symlink, KM_SLEEP);
1192 	}
1193 
1194 	/* destination existing */
1195 	if (*ndvp) {
1196 		nvp = SDEVTOV(*ndvp);
1197 		ASSERT(nvp);
1198 
1199 		/* handling renaming to itself */
1200 		if (odv == *ndvp) {
1201 			error = 0;
1202 			goto err_out;
1203 		}
1204 
1205 		if (nvp->v_type == VDIR) {
1206 			if (!doingdir) {
1207 				error = EISDIR;
1208 				goto err_out;
1209 			}
1210 
1211 			if (vn_vfswlock(nvp)) {
1212 				error = EBUSY;
1213 				goto err_out;
1214 			}
1215 
1216 			if (vn_mountedvfs(nvp) != NULL) {
1217 				vn_vfsunlock(nvp);
1218 				error = EBUSY;
1219 				goto err_out;
1220 			}
1221 
1222 			/* in case dir1 exists in dir2 and "mv dir1 dir2" */
1223 			if ((*ndvp)->sdev_nlink > 2) {
1224 				vn_vfsunlock(nvp);
1225 				error = EEXIST;
1226 				goto err_out;
1227 			}
1228 			vn_vfsunlock(nvp);
1229 
1230 			/*
1231 			 * We did not place the hold on *ndvp, so even though
1232 			 * we're deleting the node, we should not get rid of our
1233 			 * reference.
1234 			 */
1235 			sdev_dirdelete(nddv, *ndvp);
1236 			*ndvp = NULL;
1237 			ASSERT(nddv->sdev_attrvp);
1238 			error = VOP_RMDIR(nddv->sdev_attrvp, nnm,
1239 			    nddv->sdev_attrvp, cred, NULL, 0);
1240 			if (error)
1241 				goto err_out;
1242 		} else {
1243 			if (doingdir) {
1244 				error = ENOTDIR;
1245 				goto err_out;
1246 			}
1247 
1248 			if (SDEV_IS_PERSIST((*ndvp))) {
1249 				bkstore = 1;
1250 			}
1251 
1252 			/*
1253 			 * Get rid of the node from the directory cache note.
1254 			 * Don't forget that it's not up to us to remove the vn
1255 			 * ref on the sdev node, as we did not place it.
1256 			 */
1257 			sdev_dirdelete(nddv, *ndvp);
1258 			*ndvp = NULL;
1259 			if (bkstore) {
1260 				ASSERT(nddv->sdev_attrvp);
1261 				error = VOP_REMOVE(nddv->sdev_attrvp,
1262 				    nnm, cred, NULL, 0);
1263 				if (error)
1264 					goto err_out;
1265 			}
1266 		}
1267 	}
1268 
1269 	/*
1270 	 * make a fresh node from the source attrs
1271 	 */
1272 	ASSERT(RW_WRITE_HELD(&nddv->sdev_contents));
1273 	error = sdev_mknode(nddv, nnm, ndvp, &vattr,
1274 	    NULL, (void *)link, cred, SDEV_READY);
1275 
1276 	if (link != NULL) {
1277 		kmem_free(link, strlen(link) + 1);
1278 		link = NULL;
1279 	}
1280 
1281 	if (error)
1282 		goto err_out;
1283 	ASSERT(*ndvp);
1284 	ASSERT((*ndvp)->sdev_state == SDEV_READY);
1285 
1286 	/* move dir contents */
1287 	if (doingdir) {
1288 		for (idv = SDEV_FIRST_ENTRY(odv); idv;
1289 		    idv = SDEV_NEXT_ENTRY(odv, idv)) {
1290 			SDEV_HOLD(idv);
1291 			error = sdev_rnmnode(odv, idv,
1292 			    (struct sdev_node *)(*ndvp), &ndv,
1293 			    idv->sdev_name, cred);
1294 			SDEV_RELE(idv);
1295 			if (error)
1296 				goto err_out;
1297 			ndv = NULL;
1298 		}
1299 	}
1300 
1301 	if ((*ndvp)->sdev_attrvp) {
1302 		sdev_update_timestamps((*ndvp)->sdev_attrvp, kcred,
1303 		    AT_CTIME|AT_ATIME);
1304 	} else {
1305 		ASSERT((*ndvp)->sdev_attr);
1306 		gethrestime(&now);
1307 		(*ndvp)->sdev_attr->va_ctime = now;
1308 		(*ndvp)->sdev_attr->va_atime = now;
1309 	}
1310 
1311 	if (nddv->sdev_attrvp) {
1312 		sdev_update_timestamps(nddv->sdev_attrvp, kcred,
1313 		    AT_MTIME|AT_ATIME);
1314 	} else {
1315 		ASSERT(nddv->sdev_attr);
1316 		gethrestime(&now);
1317 		nddv->sdev_attr->va_mtime = now;
1318 		nddv->sdev_attr->va_atime = now;
1319 	}
1320 	rw_exit(&nddv->sdev_contents);
1321 	if (!samedir)
1322 		rw_exit(&oddv->sdev_contents);
1323 
1324 	SDEV_RELE(*ndvp);
1325 	return (error);
1326 
1327 err_out:
1328 	if (link != NULL) {
1329 		kmem_free(link, strlen(link) + 1);
1330 		link = NULL;
1331 	}
1332 
1333 	rw_exit(&nddv->sdev_contents);
1334 	if (!samedir)
1335 		rw_exit(&oddv->sdev_contents);
1336 	return (error);
1337 }
1338 
1339 /*
1340  * Merge sdev_node specific information into an attribute structure.
1341  *
1342  * note: sdev_node is not locked here
1343  */
1344 void
1345 sdev_vattr_merge(struct sdev_node *dv, struct vattr *vap)
1346 {
1347 	struct vnode *vp = SDEVTOV(dv);
1348 
1349 	vap->va_nlink = dv->sdev_nlink;
1350 	vap->va_nodeid = dv->sdev_ino;
1351 	vap->va_fsid = SDEVTOV(dv->sdev_dotdot)->v_rdev;
1352 	vap->va_type = vp->v_type;
1353 
1354 	if (vp->v_type == VDIR) {
1355 		vap->va_rdev = 0;
1356 		vap->va_fsid = vp->v_rdev;
1357 	} else if (vp->v_type == VLNK) {
1358 		vap->va_rdev = 0;
1359 		vap->va_mode  &= ~S_IFMT;
1360 		vap->va_mode |= S_IFLNK;
1361 	} else if ((vp->v_type == VCHR) || (vp->v_type == VBLK)) {
1362 		vap->va_rdev = vp->v_rdev;
1363 		vap->va_mode &= ~S_IFMT;
1364 		if (vap->va_type == VCHR)
1365 			vap->va_mode |= S_IFCHR;
1366 		else
1367 			vap->va_mode |= S_IFBLK;
1368 	} else {
1369 		vap->va_rdev = 0;
1370 	}
1371 }
1372 
1373 struct vattr *
1374 sdev_getdefault_attr(enum vtype type)
1375 {
1376 	if (type == VDIR)
1377 		return (&sdev_vattr_dir);
1378 	else if (type == VCHR)
1379 		return (&sdev_vattr_chr);
1380 	else if (type == VBLK)
1381 		return (&sdev_vattr_blk);
1382 	else if (type == VLNK)
1383 		return (&sdev_vattr_lnk);
1384 	else
1385 		return (NULL);
1386 }
1387 int
1388 sdev_to_vp(struct sdev_node *dv, struct vnode **vpp)
1389 {
1390 	int rv = 0;
1391 	struct vnode *vp = SDEVTOV(dv);
1392 
1393 	switch (vp->v_type) {
1394 	case VCHR:
1395 	case VBLK:
1396 		/*
1397 		 * If vnode is a device, return special vnode instead
1398 		 * (though it knows all about -us- via sp->s_realvp)
1399 		 */
1400 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, kcred);
1401 		VN_RELE(vp);
1402 		if (*vpp == NULLVP)
1403 			rv = ENOSYS;
1404 		break;
1405 	default:	/* most types are returned as is */
1406 		*vpp = vp;
1407 		break;
1408 	}
1409 	return (rv);
1410 }
1411 
1412 /*
1413  * junction between devname and root file system, e.g. ufs
1414  */
1415 int
1416 devname_backstore_lookup(struct sdev_node *ddv, char *nm, struct vnode **rvp)
1417 {
1418 	struct vnode *rdvp = ddv->sdev_attrvp;
1419 	int rval = 0;
1420 
1421 	ASSERT(rdvp);
1422 
1423 	rval = VOP_LOOKUP(rdvp, nm, rvp, NULL, 0, NULL, kcred, NULL, NULL,
1424 	    NULL);
1425 	return (rval);
1426 }
1427 
1428 static int
1429 sdev_filldir_from_store(struct sdev_node *ddv, int dlen, struct cred *cred)
1430 {
1431 	struct sdev_node *dv = NULL;
1432 	char	*nm;
1433 	struct vnode *dirvp;
1434 	int	error;
1435 	vnode_t	*vp;
1436 	int eof;
1437 	struct iovec iov;
1438 	struct uio uio;
1439 	struct dirent64 *dp;
1440 	dirent64_t *dbuf;
1441 	size_t dbuflen;
1442 	struct vattr vattr;
1443 	char *link = NULL;
1444 
1445 	if (ddv->sdev_attrvp == NULL)
1446 		return (0);
1447 	if (!(ddv->sdev_flags & SDEV_BUILD))
1448 		return (0);
1449 
1450 	dirvp = ddv->sdev_attrvp;
1451 	VN_HOLD(dirvp);
1452 	dbuf = kmem_zalloc(dlen, KM_SLEEP);
1453 
1454 	uio.uio_iov = &iov;
1455 	uio.uio_iovcnt = 1;
1456 	uio.uio_segflg = UIO_SYSSPACE;
1457 	uio.uio_fmode = 0;
1458 	uio.uio_extflg = UIO_COPY_CACHED;
1459 	uio.uio_loffset = 0;
1460 	uio.uio_llimit = MAXOFFSET_T;
1461 
1462 	eof = 0;
1463 	error = 0;
1464 	while (!error && !eof) {
1465 		uio.uio_resid = dlen;
1466 		iov.iov_base = (char *)dbuf;
1467 		iov.iov_len = dlen;
1468 		(void) VOP_RWLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1469 		error = VOP_READDIR(dirvp, &uio, kcred, &eof, NULL, 0);
1470 		VOP_RWUNLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1471 
1472 		dbuflen = dlen - uio.uio_resid;
1473 		if (error || dbuflen == 0)
1474 			break;
1475 
1476 		if (!(ddv->sdev_flags & SDEV_BUILD))
1477 			break;
1478 
1479 		for (dp = dbuf; ((intptr_t)dp <
1480 		    (intptr_t)dbuf + dbuflen);
1481 		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
1482 			nm = dp->d_name;
1483 
1484 			if (strcmp(nm, ".") == 0 ||
1485 			    strcmp(nm, "..") == 0)
1486 				continue;
1487 
1488 			vp = NULLVP;
1489 			dv = sdev_cache_lookup(ddv, nm);
1490 			if (dv) {
1491 				VERIFY(dv->sdev_state != SDEV_ZOMBIE);
1492 				SDEV_SIMPLE_RELE(dv);
1493 				continue;
1494 			}
1495 
1496 			/* refill the cache if not already */
1497 			error = devname_backstore_lookup(ddv, nm, &vp);
1498 			if (error)
1499 				continue;
1500 
1501 			vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
1502 			error = VOP_GETATTR(vp, &vattr, 0, cred, NULL);
1503 			if (error)
1504 				continue;
1505 
1506 			if (vattr.va_type == VLNK) {
1507 				error = sdev_getlink(vp, &link);
1508 				if (error) {
1509 					continue;
1510 				}
1511 				ASSERT(link != NULL);
1512 			}
1513 
1514 			if (!rw_tryupgrade(&ddv->sdev_contents)) {
1515 				rw_exit(&ddv->sdev_contents);
1516 				rw_enter(&ddv->sdev_contents, RW_WRITER);
1517 			}
1518 			error = sdev_mknode(ddv, nm, &dv, &vattr, vp, link,
1519 			    cred, SDEV_READY);
1520 			rw_downgrade(&ddv->sdev_contents);
1521 
1522 			if (link != NULL) {
1523 				kmem_free(link, strlen(link) + 1);
1524 				link = NULL;
1525 			}
1526 
1527 			if (!error) {
1528 				ASSERT(dv);
1529 				ASSERT(dv->sdev_state != SDEV_ZOMBIE);
1530 				SDEV_SIMPLE_RELE(dv);
1531 			}
1532 			vp = NULL;
1533 			dv = NULL;
1534 		}
1535 	}
1536 
1537 done:
1538 	VN_RELE(dirvp);
1539 	kmem_free(dbuf, dlen);
1540 
1541 	return (error);
1542 }
1543 
1544 void
1545 sdev_filldir_dynamic(struct sdev_node *ddv)
1546 {
1547 	int error;
1548 	int i;
1549 	struct vattr vattr;
1550 	struct vattr *vap = &vattr;
1551 	char *nm = NULL;
1552 	struct sdev_node *dv = NULL;
1553 
1554 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1555 	ASSERT((ddv->sdev_flags & SDEV_BUILD));
1556 
1557 	*vap = *sdev_getdefault_attr(VDIR);	/* note structure copy here */
1558 	gethrestime(&vap->va_atime);
1559 	vap->va_mtime = vap->va_atime;
1560 	vap->va_ctime = vap->va_atime;
1561 	for (i = 0; vtab[i].vt_name != NULL; i++) {
1562 		/*
1563 		 * This early, we may be in a read-only /dev environment: leave
1564 		 * the creation of any nodes we'd attempt to persist to
1565 		 * devfsadm. Because /dev itself is normally persistent, any
1566 		 * node which is not marked dynamic will end up being marked
1567 		 * persistent. However, some nodes are both dynamic and
1568 		 * persistent, mostly lofi and rlofi, so we need to be careful
1569 		 * in our check.
1570 		 */
1571 		if ((vtab[i].vt_flags & SDEV_PERSIST) ||
1572 		    !(vtab[i].vt_flags & SDEV_DYNAMIC))
1573 			continue;
1574 		nm = vtab[i].vt_name;
1575 		ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1576 		dv = NULL;
1577 		error = sdev_mknode(ddv, nm, &dv, vap, NULL,
1578 		    NULL, kcred, SDEV_READY);
1579 		if (error) {
1580 			cmn_err(CE_WARN, "%s/%s: error %d\n",
1581 			    ddv->sdev_name, nm, error);
1582 		} else {
1583 			ASSERT(dv);
1584 			ASSERT(dv->sdev_state != SDEV_ZOMBIE);
1585 			SDEV_SIMPLE_RELE(dv);
1586 		}
1587 	}
1588 }
1589 
1590 /*
1591  * Creating a backing store entry based on sdev_attr.
1592  * This is called either as part of node creation in a persistent directory
1593  * or from setattr/setsecattr to persist access attributes across reboot.
1594  */
1595 int
1596 sdev_shadow_node(struct sdev_node *dv, struct cred *cred)
1597 {
1598 	int error = 0;
1599 	struct vnode *dvp = SDEVTOV(dv->sdev_dotdot);
1600 	struct vnode *rdvp = VTOSDEV(dvp)->sdev_attrvp;
1601 	struct vattr *vap = dv->sdev_attr;
1602 	char *nm = dv->sdev_name;
1603 	struct vnode *tmpvp, **rvp = &tmpvp, *rrvp = NULL;
1604 
1605 	ASSERT(dv && dv->sdev_name && rdvp);
1606 	ASSERT(RW_WRITE_HELD(&dv->sdev_contents) && dv->sdev_attrvp == NULL);
1607 
1608 lookup:
1609 	/* try to find it in the backing store */
1610 	error = VOP_LOOKUP(rdvp, nm, rvp, NULL, 0, NULL, cred, NULL, NULL,
1611 	    NULL);
1612 	if (error == 0) {
1613 		if (VOP_REALVP(*rvp, &rrvp, NULL) == 0) {
1614 			VN_HOLD(rrvp);
1615 			VN_RELE(*rvp);
1616 			*rvp = rrvp;
1617 		}
1618 
1619 		kmem_free(dv->sdev_attr, sizeof (vattr_t));
1620 		dv->sdev_attr = NULL;
1621 		dv->sdev_attrvp = *rvp;
1622 		return (0);
1623 	}
1624 
1625 	/* let's try to persist the node */
1626 	gethrestime(&vap->va_atime);
1627 	vap->va_mtime = vap->va_atime;
1628 	vap->va_ctime = vap->va_atime;
1629 	vap->va_mask |= AT_TYPE|AT_MODE;
1630 	switch (vap->va_type) {
1631 	case VDIR:
1632 		error = VOP_MKDIR(rdvp, nm, vap, rvp, cred, NULL, 0, NULL);
1633 		sdcmn_err9(("sdev_shadow_node: mkdir vp %p error %d\n",
1634 		    (void *)(*rvp), error));
1635 		if (!error)
1636 			VN_RELE(*rvp);
1637 		break;
1638 	case VCHR:
1639 	case VBLK:
1640 	case VREG:
1641 	case VDOOR:
1642 		error = VOP_CREATE(rdvp, nm, vap, NONEXCL, VREAD|VWRITE,
1643 		    rvp, cred, 0, NULL, NULL);
1644 		sdcmn_err9(("sdev_shadow_node: create vp %p, error %d\n",
1645 		    (void *)(*rvp), error));
1646 		if (!error)
1647 			VN_RELE(*rvp);
1648 		break;
1649 	case VLNK:
1650 		ASSERT(dv->sdev_symlink);
1651 		error = VOP_SYMLINK(rdvp, nm, vap, dv->sdev_symlink, cred,
1652 		    NULL, 0);
1653 		sdcmn_err9(("sdev_shadow_node: create symlink error %d\n",
1654 		    error));
1655 		break;
1656 	default:
1657 		cmn_err(CE_PANIC, "dev: %s: sdev_shadow_node "
1658 		    "create\n", nm);
1659 		/*NOTREACHED*/
1660 	}
1661 
1662 	/* go back to lookup to factor out spec node and set attrvp */
1663 	if (error == 0)
1664 		goto lookup;
1665 
1666 	sdcmn_err(("cannot persist %s - error %d\n", dv->sdev_path, error));
1667 	return (error);
1668 }
1669 
1670 static void
1671 sdev_cache_add(struct sdev_node *ddv, struct sdev_node **dv, char *nm)
1672 {
1673 	struct sdev_node *dup = NULL;
1674 
1675 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1676 	if ((dup = sdev_findbyname(ddv, nm)) == NULL) {
1677 		sdev_direnter(ddv, *dv);
1678 	} else {
1679 		VERIFY(dup->sdev_state != SDEV_ZOMBIE);
1680 		SDEV_SIMPLE_RELE(*dv);
1681 		sdev_nodedestroy(*dv, 0);
1682 		*dv = dup;
1683 	}
1684 }
1685 
1686 static void
1687 sdev_cache_delete(struct sdev_node *ddv, struct sdev_node **dv)
1688 {
1689 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1690 	sdev_dirdelete(ddv, *dv);
1691 }
1692 
1693 /*
1694  * update the in-core directory cache
1695  */
1696 void
1697 sdev_cache_update(struct sdev_node *ddv, struct sdev_node **dv, char *nm,
1698     sdev_cache_ops_t ops)
1699 {
1700 	ASSERT((SDEV_HELD(*dv)));
1701 
1702 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1703 	switch (ops) {
1704 	case SDEV_CACHE_ADD:
1705 		sdev_cache_add(ddv, dv, nm);
1706 		break;
1707 	case SDEV_CACHE_DELETE:
1708 		sdev_cache_delete(ddv, dv);
1709 		break;
1710 	default:
1711 		break;
1712 	}
1713 }
1714 
1715 /*
1716  * retrieve the named entry from the directory cache
1717  */
1718 struct sdev_node *
1719 sdev_cache_lookup(struct sdev_node *ddv, char *nm)
1720 {
1721 	struct sdev_node *dv = NULL;
1722 
1723 	ASSERT(RW_LOCK_HELD(&ddv->sdev_contents));
1724 	dv = sdev_findbyname(ddv, nm);
1725 
1726 	return (dv);
1727 }
1728 
1729 /*
1730  * Implicit reconfig for nodes constructed by a link generator
1731  * Start devfsadm if needed, or if devfsadm is in progress,
1732  * prepare to block on devfsadm either completing or
1733  * constructing the desired node.  As devfsadmd is global
1734  * in scope, constructing all necessary nodes, we only
1735  * need to initiate it once.
1736  */
1737 static int
1738 sdev_call_devfsadmd(struct sdev_node *ddv, struct sdev_node *dv, char *nm)
1739 {
1740 	int error = 0;
1741 
1742 	if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state)) {
1743 		sdcmn_err6(("lookup: waiting for %s/%s, 0x%x\n",
1744 		    ddv->sdev_name, nm, devfsadm_state));
1745 		mutex_enter(&dv->sdev_lookup_lock);
1746 		SDEV_BLOCK_OTHERS(dv, (SDEV_LOOKUP | SDEV_LGWAITING));
1747 		mutex_exit(&dv->sdev_lookup_lock);
1748 		error = 0;
1749 	} else if (!DEVNAME_DEVFSADM_HAS_RUN(devfsadm_state)) {
1750 		sdcmn_err6(("lookup %s/%s starting devfsadm, 0x%x\n",
1751 		    ddv->sdev_name, nm, devfsadm_state));
1752 
1753 		sdev_devfsadmd_thread(ddv, dv, kcred);
1754 		mutex_enter(&dv->sdev_lookup_lock);
1755 		SDEV_BLOCK_OTHERS(dv,
1756 		    (SDEV_LOOKUP | SDEV_LGWAITING));
1757 		mutex_exit(&dv->sdev_lookup_lock);
1758 		error = 0;
1759 	} else {
1760 		error = -1;
1761 	}
1762 
1763 	return (error);
1764 }
1765 
1766 /*
1767  *  Support for specialized device naming construction mechanisms
1768  */
1769 static int
1770 sdev_call_dircallback(struct sdev_node *ddv, struct sdev_node **dvp, char *nm,
1771     int (*callback)(struct sdev_node *, char *, void **, struct cred *,
1772     void *, char *), int flags, struct cred *cred)
1773 {
1774 	int rv = 0;
1775 	char *physpath = NULL;
1776 	struct vattr vattr;
1777 	struct vattr *vap = &vattr;
1778 	struct sdev_node *dv = NULL;
1779 
1780 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1781 	if (flags & SDEV_VLINK) {
1782 		physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1783 		rv = callback(ddv, nm, (void *)&physpath, kcred, NULL,
1784 		    NULL);
1785 		if (rv) {
1786 			kmem_free(physpath, MAXPATHLEN);
1787 			return (-1);
1788 		}
1789 
1790 		*vap = *sdev_getdefault_attr(VLNK);	/* structure copy */
1791 		vap->va_size = strlen(physpath);
1792 		gethrestime(&vap->va_atime);
1793 		vap->va_mtime = vap->va_atime;
1794 		vap->va_ctime = vap->va_atime;
1795 
1796 		rv = sdev_mknode(ddv, nm, &dv, vap, NULL,
1797 		    (void *)physpath, cred, SDEV_READY);
1798 		kmem_free(physpath, MAXPATHLEN);
1799 		if (rv)
1800 			return (rv);
1801 	} else if (flags & SDEV_VATTR) {
1802 		/*
1803 		 * /dev/pts
1804 		 *
1805 		 * callback is responsible to set the basic attributes,
1806 		 * e.g. va_type/va_uid/va_gid/
1807 		 *    dev_t if VCHR or VBLK/
1808 		 */
1809 		ASSERT(callback);
1810 		rv = callback(ddv, nm, (void *)&vattr, kcred, NULL, NULL);
1811 		if (rv) {
1812 			sdcmn_err3(("devname_lookup_func: SDEV_NONE "
1813 			    "callback failed \n"));
1814 			return (-1);
1815 		}
1816 
1817 		rv = sdev_mknode(ddv, nm, &dv, &vattr, NULL, NULL,
1818 		    cred, SDEV_READY);
1819 
1820 		if (rv)
1821 			return (rv);
1822 
1823 	} else {
1824 		impossible(("lookup: %s/%s by %s not supported (%d)\n",
1825 		    SDEVTOV(ddv)->v_path, nm, curproc->p_user.u_comm,
1826 		    __LINE__));
1827 		rv = -1;
1828 	}
1829 
1830 	*dvp = dv;
1831 	return (rv);
1832 }
1833 
/*
 * Return 1 if the given executable name is exactly "devfsadm", else 0.
 *
 * note: because devfsadmd -> /usr/sbin/devfsadm
 * it is safe to use "devfsadm" to capture the lookups
 * from devfsadm and its daemon version.
 */
static int
is_devfsadm_thread(char *exec_name)
{
	return (strcmp(exec_name, "devfsadm") == 0);
}
1846 
1847 /*
1848  * Lookup Order:
1849  *	sdev_node cache;
1850  *	backing store (SDEV_PERSIST);
1851  *	DBNR: a. dir_ops implemented in the loadable modules;
1852  *	      b. vnode ops in vtab.
1853  */
1854 int
1855 devname_lookup_func(struct sdev_node *ddv, char *nm, struct vnode **vpp,
1856     struct cred *cred, int (*callback)(struct sdev_node *, char *, void **,
1857     struct cred *, void *, char *), int flags)
1858 {
1859 	int rv = 0, nmlen;
1860 	struct vnode *rvp = NULL;
1861 	struct sdev_node *dv = NULL;
1862 	int	retried = 0;
1863 	int	error = 0;
1864 	struct vattr vattr;
1865 	char *lookup_thread = curproc->p_user.u_comm;
1866 	int failed_flags = 0;
1867 	int (*vtor)(struct sdev_node *) = NULL;
1868 	int state;
1869 	int parent_state;
1870 	char *link = NULL;
1871 
1872 	if (SDEVTOV(ddv)->v_type != VDIR)
1873 		return (ENOTDIR);
1874 
1875 	/*
1876 	 * Empty name or ., return node itself.
1877 	 */
1878 	nmlen = strlen(nm);
1879 	if ((nmlen == 0) || ((nmlen == 1) && (nm[0] == '.'))) {
1880 		*vpp = SDEVTOV(ddv);
1881 		VN_HOLD(*vpp);
1882 		return (0);
1883 	}
1884 
1885 	/*
1886 	 * .., return the parent directory
1887 	 */
1888 	if ((nmlen == 2) && (strcmp(nm, "..") == 0)) {
1889 		*vpp = SDEVTOV(ddv->sdev_dotdot);
1890 		VN_HOLD(*vpp);
1891 		return (0);
1892 	}
1893 
1894 	rw_enter(&ddv->sdev_contents, RW_READER);
1895 	if (ddv->sdev_flags & SDEV_VTOR) {
1896 		vtor = (int (*)(struct sdev_node *))sdev_get_vtor(ddv);
1897 		ASSERT(vtor);
1898 	}
1899 
1900 tryagain:
1901 	/*
1902 	 * (a) directory cache lookup:
1903 	 */
1904 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
1905 	parent_state = ddv->sdev_state;
1906 	dv = sdev_cache_lookup(ddv, nm);
1907 	if (dv) {
1908 		state = dv->sdev_state;
1909 		switch (state) {
1910 		case SDEV_INIT:
1911 			if (is_devfsadm_thread(lookup_thread))
1912 				break;
1913 
1914 			/* ZOMBIED parent won't allow node creation */
1915 			if (parent_state == SDEV_ZOMBIE) {
1916 				SD_TRACE_FAILED_LOOKUP(ddv, nm,
1917 				    retried);
1918 				goto nolock_notfound;
1919 			}
1920 
1921 			mutex_enter(&dv->sdev_lookup_lock);
1922 			/* compensate the threads started after devfsadm */
1923 			if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state) &&
1924 			    !(SDEV_IS_LOOKUP(dv)))
1925 				SDEV_BLOCK_OTHERS(dv,
1926 				    (SDEV_LOOKUP | SDEV_LGWAITING));
1927 
1928 			if (SDEV_IS_LOOKUP(dv)) {
1929 				failed_flags |= SLF_REBUILT;
1930 				rw_exit(&ddv->sdev_contents);
1931 				error = sdev_wait4lookup(dv, SDEV_LOOKUP);
1932 				mutex_exit(&dv->sdev_lookup_lock);
1933 				rw_enter(&ddv->sdev_contents, RW_READER);
1934 
1935 				if (error != 0) {
1936 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
1937 					    retried);
1938 					goto nolock_notfound;
1939 				}
1940 
1941 				state = dv->sdev_state;
1942 				if (state == SDEV_INIT) {
1943 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
1944 					    retried);
1945 					goto nolock_notfound;
1946 				} else if (state == SDEV_READY) {
1947 					goto found;
1948 				} else if (state == SDEV_ZOMBIE) {
1949 					rw_exit(&ddv->sdev_contents);
1950 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
1951 					    retried);
1952 					SDEV_RELE(dv);
1953 					goto lookup_failed;
1954 				}
1955 			} else {
1956 				mutex_exit(&dv->sdev_lookup_lock);
1957 			}
1958 			break;
1959 		case SDEV_READY:
1960 			goto found;
1961 		case SDEV_ZOMBIE:
1962 			rw_exit(&ddv->sdev_contents);
1963 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
1964 			SDEV_RELE(dv);
1965 			goto lookup_failed;
1966 		default:
1967 			rw_exit(&ddv->sdev_contents);
1968 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
1969 			sdev_lookup_failed(ddv, nm, failed_flags);
1970 			*vpp = NULLVP;
1971 			return (ENOENT);
1972 		}
1973 	}
1974 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
1975 
1976 	/*
1977 	 * ZOMBIED parent does not allow new node creation.
1978 	 * bail out early
1979 	 */
1980 	if (parent_state == SDEV_ZOMBIE) {
1981 		rw_exit(&ddv->sdev_contents);
1982 		*vpp = NULLVP;
1983 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
1984 		return (ENOENT);
1985 	}
1986 
1987 	/*
1988 	 * (b0): backing store lookup
1989 	 *	SDEV_PERSIST is default except:
1990 	 *		1) pts nodes
1991 	 *		2) non-chmod'ed local nodes
1992 	 *		3) zvol nodes
1993 	 */
1994 	if (SDEV_IS_PERSIST(ddv)) {
1995 		error = devname_backstore_lookup(ddv, nm, &rvp);
1996 
1997 		if (!error) {
1998 
1999 			vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
2000 			error = VOP_GETATTR(rvp, &vattr, 0, cred, NULL);
2001 			if (error) {
2002 				rw_exit(&ddv->sdev_contents);
2003 				if (dv)
2004 					SDEV_RELE(dv);
2005 				SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2006 				sdev_lookup_failed(ddv, nm, failed_flags);
2007 				*vpp = NULLVP;
2008 				return (ENOENT);
2009 			}
2010 
2011 			if (vattr.va_type == VLNK) {
2012 				error = sdev_getlink(rvp, &link);
2013 				if (error) {
2014 					rw_exit(&ddv->sdev_contents);
2015 					if (dv)
2016 						SDEV_RELE(dv);
2017 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
2018 					    retried);
2019 					sdev_lookup_failed(ddv, nm,
2020 					    failed_flags);
2021 					*vpp = NULLVP;
2022 					return (ENOENT);
2023 				}
2024 				ASSERT(link != NULL);
2025 			}
2026 
2027 			if (!rw_tryupgrade(&ddv->sdev_contents)) {
2028 				rw_exit(&ddv->sdev_contents);
2029 				rw_enter(&ddv->sdev_contents, RW_WRITER);
2030 			}
2031 			error = sdev_mknode(ddv, nm, &dv, &vattr,
2032 			    rvp, link, cred, SDEV_READY);
2033 			rw_downgrade(&ddv->sdev_contents);
2034 
2035 			if (link != NULL) {
2036 				kmem_free(link, strlen(link) + 1);
2037 				link = NULL;
2038 			}
2039 
2040 			if (error) {
2041 				SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2042 				rw_exit(&ddv->sdev_contents);
2043 				if (dv)
2044 					SDEV_RELE(dv);
2045 				goto lookup_failed;
2046 			} else {
2047 				goto found;
2048 			}
2049 		} else if (retried) {
2050 			rw_exit(&ddv->sdev_contents);
2051 			sdcmn_err3(("retry of lookup of %s/%s: failed\n",
2052 			    ddv->sdev_name, nm));
2053 			if (dv)
2054 				SDEV_RELE(dv);
2055 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2056 			sdev_lookup_failed(ddv, nm, failed_flags);
2057 			*vpp = NULLVP;
2058 			return (ENOENT);
2059 		}
2060 	}
2061 
2062 lookup_create_node:
2063 	/* first thread that is doing the lookup on this node */
2064 	if (callback) {
2065 		ASSERT(dv == NULL);
2066 		if (!rw_tryupgrade(&ddv->sdev_contents)) {
2067 			rw_exit(&ddv->sdev_contents);
2068 			rw_enter(&ddv->sdev_contents, RW_WRITER);
2069 		}
2070 		error = sdev_call_dircallback(ddv, &dv, nm, callback,
2071 		    flags, cred);
2072 		rw_downgrade(&ddv->sdev_contents);
2073 		if (error == 0) {
2074 			goto found;
2075 		} else {
2076 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2077 			rw_exit(&ddv->sdev_contents);
2078 			goto lookup_failed;
2079 		}
2080 	}
2081 	if (!dv) {
2082 		if (!rw_tryupgrade(&ddv->sdev_contents)) {
2083 			rw_exit(&ddv->sdev_contents);
2084 			rw_enter(&ddv->sdev_contents, RW_WRITER);
2085 		}
2086 		error = sdev_mknode(ddv, nm, &dv, NULL, NULL, NULL,
2087 		    cred, SDEV_INIT);
2088 		if (!dv) {
2089 			rw_exit(&ddv->sdev_contents);
2090 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2091 			sdev_lookup_failed(ddv, nm, failed_flags);
2092 			*vpp = NULLVP;
2093 			return (ENOENT);
2094 		}
2095 		rw_downgrade(&ddv->sdev_contents);
2096 	}
2097 
2098 	/*
2099 	 * (b1) invoking devfsadm once per life time for devfsadm nodes
2100 	 */
2101 	ASSERT(SDEV_HELD(dv));
2102 
2103 	if (SDEV_IS_NO_NCACHE(dv))
2104 		failed_flags |= SLF_NO_NCACHE;
2105 	if (sdev_reconfig_boot || !i_ddi_io_initialized() ||
2106 	    SDEV_IS_DYNAMIC(ddv) || SDEV_IS_NO_NCACHE(dv) ||
2107 	    ((moddebug & MODDEBUG_FINI_EBUSY) != 0)) {
2108 		ASSERT(SDEV_HELD(dv));
2109 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2110 		goto nolock_notfound;
2111 	}
2112 
2113 	/*
2114 	 * filter out known non-existent devices recorded
2115 	 * during initial reconfiguration boot for which
2116 	 * reconfig should not be done and lookup may
2117 	 * be short-circuited now.
2118 	 */
2119 	if (sdev_lookup_filter(ddv, nm)) {
2120 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2121 		goto nolock_notfound;
2122 	}
2123 
2124 	/* bypassing devfsadm internal nodes */
2125 	if (is_devfsadm_thread(lookup_thread)) {
2126 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2127 		goto nolock_notfound;
2128 	}
2129 
2130 	if (sdev_reconfig_disable) {
2131 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2132 		goto nolock_notfound;
2133 	}
2134 
2135 	error = sdev_call_devfsadmd(ddv, dv, nm);
2136 	if (error == 0) {
2137 		sdcmn_err8(("lookup of %s/%s by %s: reconfig\n",
2138 		    ddv->sdev_name, nm, curproc->p_user.u_comm));
2139 		if (sdev_reconfig_verbose) {
2140 			cmn_err(CE_CONT,
2141 			    "?lookup of %s/%s by %s: reconfig\n",
2142 			    ddv->sdev_name, nm, curproc->p_user.u_comm);
2143 		}
2144 		retried = 1;
2145 		failed_flags |= SLF_REBUILT;
2146 		ASSERT(dv->sdev_state != SDEV_ZOMBIE);
2147 		SDEV_SIMPLE_RELE(dv);
2148 		goto tryagain;
2149 	} else {
2150 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2151 		goto nolock_notfound;
2152 	}
2153 
2154 found:
2155 	ASSERT(dv->sdev_state == SDEV_READY);
2156 	if (vtor) {
2157 		/*
2158 		 * Check validity of returned node
2159 		 */
2160 		switch (vtor(dv)) {
2161 		case SDEV_VTOR_VALID:
2162 			break;
2163 		case SDEV_VTOR_STALE:
2164 			/*
2165 			 * The name exists, but the cache entry is
2166 			 * stale and needs to be re-created.
2167 			 */
2168 			ASSERT(RW_READ_HELD(&ddv->sdev_contents));
2169 			if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
2170 				rw_exit(&ddv->sdev_contents);
2171 				rw_enter(&ddv->sdev_contents, RW_WRITER);
2172 			}
2173 			sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_DELETE);
2174 			rw_downgrade(&ddv->sdev_contents);
2175 			SDEV_RELE(dv);
2176 			dv = NULL;
2177 			goto lookup_create_node;
2178 			/* FALLTHRU */
2179 		case SDEV_VTOR_INVALID:
2180 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2181 			sdcmn_err7(("lookup: destroy invalid "
2182 			    "node: %s(%p)\n", dv->sdev_name, (void *)dv));
2183 			goto nolock_notfound;
2184 		case SDEV_VTOR_SKIP:
2185 			sdcmn_err7(("lookup: node not applicable - "
2186 			    "skipping: %s(%p)\n", dv->sdev_name, (void *)dv));
2187 			rw_exit(&ddv->sdev_contents);
2188 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2189 			SDEV_RELE(dv);
2190 			goto lookup_failed;
2191 		default:
2192 			cmn_err(CE_PANIC,
2193 			    "dev fs: validator failed: %s(%p)\n",
2194 			    dv->sdev_name, (void *)dv);
2195 			break;
2196 		}
2197 	}
2198 
2199 	rw_exit(&ddv->sdev_contents);
2200 	rv = sdev_to_vp(dv, vpp);
2201 	sdcmn_err3(("devname_lookup_func: returning vp %p v_count %d state %d "
2202 	    "for nm %s, error %d\n", (void *)*vpp, (*vpp)->v_count,
2203 	    dv->sdev_state, nm, rv));
2204 	return (rv);
2205 
2206 nolock_notfound:
2207 	/*
2208 	 * Destroy the node that is created for synchronization purposes.
2209 	 */
2210 	sdcmn_err3(("devname_lookup_func: %s with state %d\n",
2211 	    nm, dv->sdev_state));
2212 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
2213 	if (dv->sdev_state == SDEV_INIT) {
2214 		if (!rw_tryupgrade(&ddv->sdev_contents)) {
2215 			rw_exit(&ddv->sdev_contents);
2216 			rw_enter(&ddv->sdev_contents, RW_WRITER);
2217 		}
2218 
2219 		/*
2220 		 * Node state may have changed during the lock
2221 		 * changes. Re-check.
2222 		 */
2223 		if (dv->sdev_state == SDEV_INIT) {
2224 			sdev_dirdelete(ddv, dv);
2225 			rw_exit(&ddv->sdev_contents);
2226 			sdev_lookup_failed(ddv, nm, failed_flags);
2227 			SDEV_RELE(dv);
2228 			*vpp = NULL;
2229 			return (ENOENT);
2230 		}
2231 	}
2232 
2233 	rw_exit(&ddv->sdev_contents);
2234 	SDEV_RELE(dv);
2235 
2236 lookup_failed:
2237 	sdev_lookup_failed(ddv, nm, failed_flags);
2238 	*vpp = NULL;
2239 	return (ENOENT);
2240 }
2241 
2242 /*
2243  * Given a directory node, mark all nodes beneath as
2244  * STALE, i.e. nodes that don't exist as far as new
2245  * consumers are concerned.  Remove them from the
2246  * list of directory entries so that no lookup or
2247  * directory traversal will find them.  The node
2248  * not deallocated so existing holds are not affected.
2249  */
2250 void
2251 sdev_stale(struct sdev_node *ddv)
2252 {
2253 	struct sdev_node *dv;
2254 	struct vnode *vp;
2255 
2256 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
2257 
2258 	rw_enter(&ddv->sdev_contents, RW_WRITER);
2259 	while ((dv = SDEV_FIRST_ENTRY(ddv)) != NULL) {
2260 		vp = SDEVTOV(dv);
2261 		SDEV_HOLD(dv);
2262 		if (vp->v_type == VDIR)
2263 			sdev_stale(dv);
2264 
2265 		sdev_dirdelete(ddv, dv);
2266 		SDEV_RELE(dv);
2267 	}
2268 	ddv->sdev_flags |= SDEV_BUILD;
2269 	rw_exit(&ddv->sdev_contents);
2270 }
2271 
2272 /*
2273  * Given a directory node, clean out all the nodes beneath.
2274  * If expr is specified, clean node with names matching expr.
2275  * If SDEV_ENFORCE is specified in flags, busy nodes are made stale,
2276  *	so they are excluded from future lookups.
2277  */
2278 int
2279 sdev_cleandir(struct sdev_node *ddv, char *expr, uint_t flags)
2280 {
2281 	int error = 0;
2282 	int busy = 0;
2283 	struct vnode *vp;
2284 	struct sdev_node *dv, *next;
2285 	int bkstore = 0;
2286 	int len = 0;
2287 	char *bks_name = NULL;
2288 
2289 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
2290 
2291 	/*
2292 	 * We try our best to destroy all unused sdev_node's
2293 	 */
2294 	rw_enter(&ddv->sdev_contents, RW_WRITER);
2295 	for (dv = SDEV_FIRST_ENTRY(ddv); dv != NULL; dv = next) {
2296 		next = SDEV_NEXT_ENTRY(ddv, dv);
2297 		vp = SDEVTOV(dv);
2298 
2299 		if (expr && gmatch(dv->sdev_name, expr) == 0)
2300 			continue;
2301 
2302 		if (vp->v_type == VDIR &&
2303 		    sdev_cleandir(dv, NULL, flags) != 0) {
2304 			sdcmn_err9(("sdev_cleandir: dir %s busy\n",
2305 			    dv->sdev_name));
2306 			busy++;
2307 			continue;
2308 		}
2309 
2310 		if (vp->v_count > 0 && (flags & SDEV_ENFORCE) == 0) {
2311 			sdcmn_err9(("sdev_cleandir: dir %s busy\n",
2312 			    dv->sdev_name));
2313 			busy++;
2314 			continue;
2315 		}
2316 
2317 		/*
2318 		 * at this point, either dv is not held or SDEV_ENFORCE
2319 		 * is specified. In either case, dv needs to be deleted
2320 		 */
2321 		SDEV_HOLD(dv);
2322 
2323 		bkstore = SDEV_IS_PERSIST(dv) ? 1 : 0;
2324 		if (bkstore && (vp->v_type == VDIR))
2325 			bkstore += 1;
2326 
2327 		if (bkstore) {
2328 			len = strlen(dv->sdev_name) + 1;
2329 			bks_name = kmem_alloc(len, KM_SLEEP);
2330 			bcopy(dv->sdev_name, bks_name, len);
2331 		}
2332 
2333 		sdev_dirdelete(ddv, dv);
2334 
2335 		/* take care the backing store clean up */
2336 		if (bkstore) {
2337 			ASSERT(bks_name);
2338 			ASSERT(ddv->sdev_attrvp);
2339 
2340 			if (bkstore == 1) {
2341 				error = VOP_REMOVE(ddv->sdev_attrvp,
2342 				    bks_name, kcred, NULL, 0);
2343 			} else if (bkstore == 2) {
2344 				error = VOP_RMDIR(ddv->sdev_attrvp,
2345 				    bks_name, ddv->sdev_attrvp, kcred, NULL, 0);
2346 			}
2347 
2348 			/* do not propagate the backing store errors */
2349 			if (error) {
2350 				sdcmn_err9(("sdev_cleandir: backing store"
2351 				    "not cleaned\n"));
2352 				error = 0;
2353 			}
2354 
2355 			bkstore = 0;
2356 			kmem_free(bks_name, len);
2357 			bks_name = NULL;
2358 			len = 0;
2359 		}
2360 
2361 		ddv->sdev_flags |= SDEV_BUILD;
2362 		SDEV_RELE(dv);
2363 	}
2364 
2365 	ddv->sdev_flags |= SDEV_BUILD;
2366 	rw_exit(&ddv->sdev_contents);
2367 
2368 	if (busy) {
2369 		error = EBUSY;
2370 	}
2371 
2372 	return (error);
2373 }
2374 
2375 /*
2376  * a convenient wrapper for readdir() funcs
2377  */
2378 size_t
2379 add_dir_entry(dirent64_t *de, char *nm, size_t size, ino_t ino, offset_t off)
2380 {
2381 	size_t reclen = DIRENT64_RECLEN(strlen(nm));
2382 	if (reclen > size)
2383 		return (0);
2384 
2385 	de->d_ino = (ino64_t)ino;
2386 	de->d_off = (off64_t)off + 1;
2387 	de->d_reclen = (ushort_t)reclen;
2388 	(void) strncpy(de->d_name, nm, DIRENT64_NAMELEN(reclen));
2389 	return (reclen);
2390 }
2391 
2392 /*
2393  * sdev_mount service routines
2394  */
2395 int
2396 sdev_copyin_mountargs(struct mounta *uap, struct sdev_mountargs *args)
2397 {
2398 	int	error;
2399 
2400 	if (uap->datalen != sizeof (*args))
2401 		return (EINVAL);
2402 
2403 	if (error = copyin(uap->dataptr, args, sizeof (*args))) {
2404 		cmn_err(CE_WARN, "sdev_copyin_mountargs: can not"
2405 		    "get user data. error %d\n", error);
2406 		return (EFAULT);
2407 	}
2408 
2409 	return (0);
2410 }
2411 
/*
 * nextdp(dp) yields a pointer to the dirent64 record that starts
 * d_reclen bytes after the start of the record 'dp'.  Any definition
 * of nextdp that is already in effect is discarded first.
 * Note that the argument is evaluated twice.
 */
#ifdef nextdp
#undef nextdp
#endif
#define	nextdp(dp)	((struct dirent64 *) \
			    (intptr_t)((char *)(dp) + (dp)->d_reclen))
2417 
/*
 * readdir helper func
 *
 * Produces dirent64 records for the directory 'vp' into a kernel
 * staging buffer and moves them to the caller through 'uiop'.
 *
 * The directory offset is an entry index: 0 is ".", 1 is "..", and
 * 2 onwards are the cached entries of the directory in list order.
 * On success uiop->uio_loffset is set to the index of the first entry
 * that has not been returned, and *eofp (if eofp is not NULL) tells
 * whether the end of the entry list was reached.
 *
 * The caller must hold ddv->sdev_contents as reader.  For a global
 * directory the lock is dropped and re-acquired (again as reader)
 * inside this function.
 *
 * Returns 0 on success, EINVAL if the uio does not describe exactly
 * one iovec or the buffer cannot hold "." or "..", ENOTDIR if 'vp' is
 * not a directory, or an error from uiomove().
 */
int
devname_readdir_func(vnode_t *vp, uio_t *uiop, cred_t *cred, int *eofp,
    int flags)
{
	struct sdev_node *ddv = VTOSDEV(vp);
	struct sdev_node *dv;
	dirent64_t	*dp;
	ulong_t		outcount = 0;
	size_t		namelen;
	ulong_t		alloc_count;
	void		*outbuf;
	struct iovec	*iovp;
	int		error = 0;
	size_t		reclen;
	offset_t	diroff;
	offset_t	soff;
	int		this_reclen;
	int (*vtor)(struct sdev_node *) = NULL;
	struct vattr attr;
	timestruc_t now;

	ASSERT(ddv->sdev_attr || ddv->sdev_attrvp);
	ASSERT(RW_READ_HELD(&ddv->sdev_contents));

	/* an offset at or beyond MAXOFF_T is reported as end of directory */
	if (uiop->uio_loffset >= MAXOFF_T) {
		if (eofp)
			*eofp = 1;
		return (0);
	}

	if (uiop->uio_iovcnt != 1)
		return (EINVAL);

	if (vp->v_type != VDIR)
		return (ENOTDIR);

	/* pick up the directory's validator, if it has one */
	if (ddv->sdev_flags & SDEV_VTOR) {
		vtor = (int (*)(struct sdev_node *))sdev_get_vtor(ddv);
		ASSERT(vtor);
	}

	if (eofp != NULL)
		*eofp = 0;

	/*
	 * Records are assembled in a kernel buffer that is as large as
	 * the caller's buffer; it is freed at 'done'.
	 */
	soff = uiop->uio_loffset;
	iovp = uiop->uio_iov;
	alloc_count = iovp->iov_len;
	dp = outbuf = kmem_alloc(alloc_count, KM_SLEEP);
	outcount = 0;

	/* a zombie directory is served from what is cached only */
	if (ddv->sdev_state == SDEV_ZOMBIE)
		goto get_cache;

	if (SDEV_IS_GLOBAL(ddv)) {

		if ((sdev_boot_state == SDEV_BOOT_STATE_COMPLETE) &&
		    !sdev_reconfig_boot && (flags & SDEV_BROWSE) &&
		    !SDEV_IS_DYNAMIC(ddv) && !SDEV_IS_NO_NCACHE(ddv) &&
		    ((moddebug & MODDEBUG_FINI_EBUSY) == 0) &&
		    !DEVNAME_DEVFSADM_HAS_RUN(devfsadm_state) &&
		    !DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state) &&
		    !sdev_reconfig_disable) {
			/*
			 * invoking "devfsadm" to do system device reconfig
			 */
			mutex_enter(&ddv->sdev_lookup_lock);
			SDEV_BLOCK_OTHERS(ddv,
			    (SDEV_READDIR|SDEV_LGWAITING));
			mutex_exit(&ddv->sdev_lookup_lock);

			sdcmn_err8(("readdir of %s by %s: reconfig\n",
			    ddv->sdev_path, curproc->p_user.u_comm));
			if (sdev_reconfig_verbose) {
				cmn_err(CE_CONT,
				    "?readdir of %s by %s: reconfig\n",
				    ddv->sdev_path, curproc->p_user.u_comm);
			}

			sdev_devfsadmd_thread(ddv, NULL, kcred);
		} else if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state)) {
			/*
			 * compensate the "ls" started later than "devfsadm"
			 */
			mutex_enter(&ddv->sdev_lookup_lock);
			SDEV_BLOCK_OTHERS(ddv, (SDEV_READDIR|SDEV_LGWAITING));
			mutex_exit(&ddv->sdev_lookup_lock);
		}

		/*
		 * release the contents lock so that
		 * the cache may be updated by devfsadmd
		 */
		rw_exit(&ddv->sdev_contents);
		mutex_enter(&ddv->sdev_lookup_lock);
		if (SDEV_IS_READDIR(ddv))
			(void) sdev_wait4lookup(ddv, SDEV_READDIR);
		mutex_exit(&ddv->sdev_lookup_lock);
		rw_enter(&ddv->sdev_contents, RW_READER);

		sdcmn_err4(("readdir of directory %s by %s\n",
		    ddv->sdev_name, curproc->p_user.u_comm));
		/*
		 * NOTE(review): an error from sdev_filldir_from_store() is
		 * only returned if nothing is copied out below; otherwise
		 * the result of uiomove() replaces it.
		 */
		if (ddv->sdev_flags & SDEV_BUILD) {
			if (SDEV_IS_PERSIST(ddv)) {
				error = sdev_filldir_from_store(ddv,
				    alloc_count, cred);
			}
			ddv->sdev_flags &= ~SDEV_BUILD;
		}
	}

get_cache:
	/* handle "." and ".." */
	diroff = 0;
	if (soff == 0) {
		/* first time */
		this_reclen = DIRENT64_RECLEN(1);
		if (alloc_count < this_reclen) {
			error = EINVAL;
			goto done;
		}

		dp->d_ino = (ino64_t)ddv->sdev_ino;
		dp->d_off = (off64_t)1;
		dp->d_reclen = (ushort_t)this_reclen;

		(void) strncpy(dp->d_name, ".",
		    DIRENT64_NAMELEN(this_reclen));
		outcount += dp->d_reclen;
		dp = nextdp(dp);
	}

	diroff++;
	if (soff <= 1) {
		this_reclen = DIRENT64_RECLEN(2);
		if (alloc_count < outcount + this_reclen) {
			error = EINVAL;
			goto done;
		}

		dp->d_reclen = (ushort_t)this_reclen;
		dp->d_ino = (ino64_t)ddv->sdev_dotdot->sdev_ino;
		dp->d_off = (off64_t)2;

		(void) strncpy(dp->d_name, "..",
		    DIRENT64_NAMELEN(this_reclen));
		outcount += dp->d_reclen;

		dp = nextdp(dp);
	}


	/* gets the cache */
	diroff++;
	/*
	 * diroff advances for every entry on the list, including those
	 * that are skipped, so an entry's offset is its list position + 2.
	 */
	for (dv = SDEV_FIRST_ENTRY(ddv); dv;
	    dv = SDEV_NEXT_ENTRY(ddv, dv), diroff++) {
		sdcmn_err3(("sdev_readdir: diroff %lld soff %lld for '%s' \n",
		    diroff, soff, dv->sdev_name));

		/* bypassing pre-matured nodes */
		if (diroff < soff || (dv->sdev_state != SDEV_READY)) {
			sdcmn_err3(("sdev_readdir: pre-mature node  "
			    "%s %d\n", dv->sdev_name, dv->sdev_state));
			continue;
		}

		/*
		 * Check validity of node
		 * Drop invalid and nodes to be skipped.
		 * A node the validator indicates as stale needs
		 * to be returned as presumably the node name itself
		 * is valid and the node data itself will be refreshed
		 * on lookup.  An application performing a readdir then
		 * stat on each entry should thus always see consistent
		 * data.  In any case, it is not possible to synchronize
		 * with dynamic kernel state, and any view we return can
		 * never be anything more than a snapshot at a point in time.
		 */
		if (vtor) {
			switch (vtor(dv)) {
			case SDEV_VTOR_VALID:
				break;
			case SDEV_VTOR_INVALID:
			case SDEV_VTOR_SKIP:
				continue;
			case SDEV_VTOR_STALE:
				sdcmn_err3(("sdev_readir: %s stale\n",
				    dv->sdev_name));
				break;
			default:
				cmn_err(CE_PANIC,
				    "dev fs: validator failed: %s(%p)\n",
				    dv->sdev_name, (void *)dv);
				break;
			/*NOTREACHED*/
			}
		}

		/*
		 * Stop at the first entry that no longer fits; diroff then
		 * still refers to that entry, so the next call resumes there.
		 */
		namelen = strlen(dv->sdev_name);
		reclen = DIRENT64_RECLEN(namelen);
		if (outcount + reclen > alloc_count) {
			goto full;
		}
		dp->d_reclen = (ushort_t)reclen;
		dp->d_ino = (ino64_t)dv->sdev_ino;
		dp->d_off = (off64_t)diroff + 1;
		(void) strncpy(dp->d_name, dv->sdev_name,
		    DIRENT64_NAMELEN(reclen));
		outcount += reclen;
		dp = nextdp(dp);
	}

full:
	sdcmn_err4(("sdev_readdir: moving %lu bytes: "
	    "diroff %lld, soff %lld, dv %p\n", outcount, diroff, soff,
	    (void *)dv));

	if (outcount)
		error = uiomove(outbuf, outcount, UIO_READ, uiop);

	/* dv is NULL only when the whole entry list has been walked */
	if (!error) {
		uiop->uio_loffset = diroff;
		if (eofp)
			*eofp = dv ? 0 : 1;
	}


	/*
	 * Stamp the change and access times on the backing store
	 * directory, if there is one; the result is ignored.
	 */
	if (ddv->sdev_attrvp) {
		gethrestime(&now);
		attr.va_ctime = now;
		attr.va_atime = now;
		attr.va_mask = AT_CTIME|AT_ATIME;

		(void) VOP_SETATTR(ddv->sdev_attrvp, &attr, 0, kcred, NULL);
	}
done:
	kmem_free(outbuf, alloc_count);
	return (error);
}
2659 
2660 static int
2661 sdev_modctl_lookup(const char *path, vnode_t **r_vp)
2662 {
2663 	vnode_t *vp;
2664 	vnode_t *cvp;
2665 	struct sdev_node *svp;
2666 	char *nm;
2667 	struct pathname pn;
2668 	int error;
2669 	int persisted = 0;
2670 
2671 	ASSERT(INGLOBALZONE(curproc));
2672 
2673 	if (error = pn_get((char *)path, UIO_SYSSPACE, &pn))
2674 		return (error);
2675 	nm = kmem_alloc(MAXNAMELEN, KM_SLEEP);
2676 
2677 	vp = rootdir;
2678 	VN_HOLD(vp);
2679 
2680 	while (pn_pathleft(&pn)) {
2681 		ASSERT(vp->v_type == VDIR || vp->v_type == VLNK);
2682 		(void) pn_getcomponent(&pn, nm);
2683 
2684 		/*
2685 		 * Deal with the .. special case where we may be
2686 		 * traversing up across a mount point, to the
2687 		 * root of this filesystem or global root.
2688 		 */
2689 		if (nm[0] == '.' && nm[1] == '.' && nm[2] == 0) {
2690 checkforroot:
2691 			if (VN_CMP(vp, rootdir)) {
2692 				nm[1] = 0;
2693 			} else if (vp->v_flag & VROOT) {
2694 				vfs_t *vfsp;
2695 				cvp = vp;
2696 				vfsp = cvp->v_vfsp;
2697 				vfs_rlock_wait(vfsp);
2698 				vp = cvp->v_vfsp->vfs_vnodecovered;
2699 				if (vp == NULL ||
2700 				    (cvp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
2701 					vfs_unlock(vfsp);
2702 					VN_RELE(cvp);
2703 					error = EIO;
2704 					break;
2705 				}
2706 				VN_HOLD(vp);
2707 				vfs_unlock(vfsp);
2708 				VN_RELE(cvp);
2709 				cvp = NULL;
2710 				goto checkforroot;
2711 			}
2712 		}
2713 
2714 		error = VOP_LOOKUP(vp, nm, &cvp, NULL, 0, NULL, kcred, NULL,
2715 		    NULL, NULL);
2716 		if (error) {
2717 			VN_RELE(vp);
2718 			break;
2719 		}
2720 
2721 		/* traverse mount points encountered on our journey */
2722 		if (vn_ismntpt(cvp) && (error = traverse(&cvp)) != 0) {
2723 			VN_RELE(vp);
2724 			VN_RELE(cvp);
2725 			break;
2726 		}
2727 
2728 		/*
2729 		 * symbolic link, can be either relative and absolute
2730 		 */
2731 		if ((cvp->v_type == VLNK) && pn_pathleft(&pn)) {
2732 			struct pathname linkpath;
2733 			pn_alloc(&linkpath);
2734 			if (error = pn_getsymlink(cvp, &linkpath, kcred)) {
2735 				pn_free(&linkpath);
2736 				break;
2737 			}
2738 			if (pn_pathleft(&linkpath) == 0)
2739 				(void) pn_set(&linkpath, ".");
2740 			error = pn_insert(&pn, &linkpath, strlen(nm));
2741 			pn_free(&linkpath);
2742 			if (pn.pn_pathlen == 0) {
2743 				VN_RELE(vp);
2744 				return (ENOENT);
2745 			}
2746 			if (pn.pn_path[0] == '/') {
2747 				pn_skipslash(&pn);
2748 				VN_RELE(vp);
2749 				VN_RELE(cvp);
2750 				vp = rootdir;
2751 				VN_HOLD(vp);
2752 			} else {
2753 				VN_RELE(cvp);
2754 			}
2755 			continue;
2756 		}
2757 
2758 		VN_RELE(vp);
2759 
2760 		/*
2761 		 * Direct the operation to the persisting filesystem
2762 		 * underlying /dev.  Bail if we encounter a
2763 		 * non-persistent dev entity here.
2764 		 */
2765 		if (cvp->v_vfsp->vfs_fstype == devtype) {
2766 
2767 			if ((VTOSDEV(cvp)->sdev_flags & SDEV_PERSIST) == 0) {
2768 				error = ENOENT;
2769 				VN_RELE(cvp);
2770 				break;
2771 			}
2772 
2773 			if (VTOSDEV(cvp) == NULL) {
2774 				error = ENOENT;
2775 				VN_RELE(cvp);
2776 				break;
2777 			}
2778 			svp = VTOSDEV(cvp);
2779 			if ((vp = svp->sdev_attrvp) == NULL) {
2780 				error = ENOENT;
2781 				VN_RELE(cvp);
2782 				break;
2783 			}
2784 			persisted = 1;
2785 			VN_HOLD(vp);
2786 			VN_RELE(cvp);
2787 			cvp = vp;
2788 		}
2789 
2790 		vp = cvp;
2791 		pn_skipslash(&pn);
2792 	}
2793 
2794 	kmem_free(nm, MAXNAMELEN);
2795 	pn_free(&pn);
2796 
2797 	if (error)
2798 		return (error);
2799 
2800 	/*
2801 	 * Only return persisted nodes in the filesystem underlying /dev.
2802 	 */
2803 	if (!persisted) {
2804 		VN_RELE(vp);
2805 		return (ENOENT);
2806 	}
2807 
2808 	*r_vp = vp;
2809 	return (0);
2810 }
2811 
2812 int
2813 sdev_modctl_readdir(const char *dir, char ***dirlistp,
2814 	int *npathsp, int *npathsp_alloc, int checking_empty)
2815 {
2816 	char	**pathlist = NULL;
2817 	char	**newlist = NULL;
2818 	int	npaths = 0;
2819 	int	npaths_alloc = 0;
2820 	dirent64_t *dbuf = NULL;
2821 	int	n;
2822 	char	*s;
2823 	int error;
2824 	vnode_t *vp;
2825 	int eof;
2826 	struct iovec iov;
2827 	struct uio uio;
2828 	struct dirent64 *dp;
2829 	size_t dlen;
2830 	size_t dbuflen;
2831 	int ndirents = 64;
2832 	char *nm;
2833 
2834 	error = sdev_modctl_lookup(dir, &vp);
2835 	sdcmn_err11(("modctl readdir: %s by %s: %s\n",
2836 	    dir, curproc->p_user.u_comm,
2837 	    (error == 0) ? "ok" : "failed"));
2838 	if (error)
2839 		return (error);
2840 
2841 	dlen = ndirents * (sizeof (*dbuf));
2842 	dbuf = kmem_alloc(dlen, KM_SLEEP);
2843 
2844 	uio.uio_iov = &iov;
2845 	uio.uio_iovcnt = 1;
2846 	uio.uio_segflg = UIO_SYSSPACE;
2847 	uio.uio_fmode = 0;
2848 	uio.uio_extflg = UIO_COPY_CACHED;
2849 	uio.uio_loffset = 0;
2850 	uio.uio_llimit = MAXOFFSET_T;
2851 
2852 	eof = 0;
2853 	error = 0;
2854 	while (!error && !eof) {
2855 		uio.uio_resid = dlen;
2856 		iov.iov_base = (char *)dbuf;
2857 		iov.iov_len = dlen;
2858 
2859 		(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2860 		error = VOP_READDIR(vp, &uio, kcred, &eof, NULL, 0);
2861 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2862 
2863 		dbuflen = dlen - uio.uio_resid;
2864 
2865 		if (error || dbuflen == 0)
2866 			break;
2867 
2868 		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
2869 		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
2870 
2871 			nm = dp->d_name;
2872 
2873 			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
2874 				continue;
2875 			if (npaths == npaths_alloc) {
2876 				npaths_alloc += 64;
2877 				newlist = (char **)
2878 				    kmem_zalloc((npaths_alloc + 1) *
2879 				    sizeof (char *), KM_SLEEP);
2880 				if (pathlist) {
2881 					bcopy(pathlist, newlist,
2882 					    npaths * sizeof (char *));
2883 					kmem_free(pathlist,
2884 					    (npaths + 1) * sizeof (char *));
2885 				}
2886 				pathlist = newlist;
2887 			}
2888 			n = strlen(nm) + 1;
2889 			s = kmem_alloc(n, KM_SLEEP);
2890 			bcopy(nm, s, n);
2891 			pathlist[npaths++] = s;
2892 			sdcmn_err11(("  %s/%s\n", dir, s));
2893 
2894 			/* if checking empty, one entry is as good as many */
2895 			if (checking_empty) {
2896 				eof = 1;
2897 				break;
2898 			}
2899 		}
2900 	}
2901 
2902 exit:
2903 	VN_RELE(vp);
2904 
2905 	if (dbuf)
2906 		kmem_free(dbuf, dlen);
2907 
2908 	if (error)
2909 		return (error);
2910 
2911 	*dirlistp = pathlist;
2912 	*npathsp = npaths;
2913 	*npathsp_alloc = npaths_alloc;
2914 
2915 	return (0);
2916 }
2917 
/*
 * Free a name list in the form produced by sdev_modctl_readdir():
 * 'npaths' strings, each allocated with the length of the string plus
 * its terminating NUL, and an array of 'npaths_alloc' + 1 pointers.
 *
 * A NULL 'pathlist' (what sdev_modctl_readdir() returns for a
 * directory without entries) is accepted and nothing is freed.
 */
void
sdev_modctl_readdir_free(char **pathlist, int npaths, int npaths_alloc)
{
	int	i, n;

	if (pathlist == NULL)
		return;

	for (i = 0; i < npaths; i++) {
		n = strlen(pathlist[i]) + 1;
		kmem_free(pathlist[i], n);
	}

	kmem_free(pathlist, (npaths_alloc + 1) * sizeof (char *));
}
2930 
2931 int
2932 sdev_modctl_devexists(const char *path)
2933 {
2934 	vnode_t *vp;
2935 	int error;
2936 
2937 	error = sdev_modctl_lookup(path, &vp);
2938 	sdcmn_err11(("modctl dev exists: %s by %s: %s\n",
2939 	    path, curproc->p_user.u_comm,
2940 	    (error == 0) ? "ok" : "failed"));
2941 	if (error == 0)
2942 		VN_RELE(vp);
2943 
2944 	return (error);
2945 }
2946 
2947 extern int sdev_vnodeops_tbl_size;
2948 
2949 /*
2950  * construct a new template with overrides from vtab
2951  */
2952 static fs_operation_def_t *
2953 sdev_merge_vtab(const fs_operation_def_t tab[])
2954 {
2955 	fs_operation_def_t *new;
2956 	const fs_operation_def_t *tab_entry;
2957 
2958 	/* make a copy of standard vnode ops table */
2959 	new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP);
2960 	bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size);
2961 
2962 	/* replace the overrides from tab */
2963 	for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) {
2964 		fs_operation_def_t *std_entry = new;
2965 		while (std_entry->name) {
2966 			if (strcmp(tab_entry->name, std_entry->name) == 0) {
2967 				std_entry->func = tab_entry->func;
2968 				break;
2969 			}
2970 			std_entry++;
2971 		}
2972 		if (std_entry->name == NULL)
2973 			cmn_err(CE_NOTE, "sdev_merge_vtab: entry %s unused.",
2974 			    tab_entry->name);
2975 	}
2976 
2977 	return (new);
2978 }
2979 
2980 /* free memory allocated by sdev_merge_vtab */
2981 static void
2982 sdev_free_vtab(fs_operation_def_t *new)
2983 {
2984 	kmem_free(new, sdev_vnodeops_tbl_size);
2985 }
2986 
/*
 * a generic setattr() function
 *
 * Applies the attributes selected by vap->va_mask to the node behind
 * 'vp'.  Depending on the node this is done in one of three ways:
 *  - the node has a backing store vnode: the request is passed on to
 *    VOP_SETATTR() of that vnode;
 *  - the node is persistent, or attributes other than times are set
 *    on a non-dynamic node: the backing store entry is created with
 *    sdev_shadow_node() first, then the request is passed on to it;
 *  - otherwise the in-memory attributes (sdev_attr) are updated after
 *    a secpolicy_vnode_setattr() check.
 * A zombie node is left alone and 0 is returned.
 *
 * note: 'protocol' only supports AT_UID and AT_GID; when either is set
 *	 and the uid or gid is being changed in memory, 'callback' is
 *	 invoked with the updated attributes.
 *	 Future enhancements can be done for other types, e.g. AT_MODE
 *
 * Returns EINVAL if va_mask contains AT_NOSET bits, EISDIR if the size
 * of a directory is to be set, or the error of whichever of the above
 * steps failed; 0 on success.
 */
int
devname_setattr_func(struct vnode *vp, struct vattr *vap, int flags,
    struct cred *cred, int (*callback)(struct sdev_node *, struct vattr *,
    int), int protocol)
{
	struct sdev_node	*dv = VTOSDEV(vp);
	struct sdev_node	*parent = dv->sdev_dotdot;
	struct vattr		*get;
	uint_t			mask = vap->va_mask;
	int 			error;

	/* some sanity checks */
	if (vap->va_mask & AT_NOSET)
		return (EINVAL);

	if (vap->va_mask & AT_SIZE) {
		if (vp->v_type == VDIR) {
			return (EISDIR);
		}
	}

	/* no need to set attribute, but do not fail either */
	ASSERT(parent);
	/*
	 * The parent's contents lock is held as reader from here until
	 * just before each return or hand-off to the backing store.
	 */
	rw_enter(&parent->sdev_contents, RW_READER);
	if (dv->sdev_state == SDEV_ZOMBIE) {
		rw_exit(&parent->sdev_contents);
		return (0);
	}

	/* If backing store exists, just set it. */
	if (dv->sdev_attrvp) {
		rw_exit(&parent->sdev_contents);
		return (VOP_SETATTR(dv->sdev_attrvp, vap, flags, cred, NULL));
	}

	/*
	 * Otherwise, for nodes with the persistence attribute, create it.
	 */
	ASSERT(dv->sdev_attr);
	if (SDEV_IS_PERSIST(dv) ||
	    ((vap->va_mask & ~AT_TIMES) != 0 && !SDEV_IS_DYNAMIC(dv))) {
		sdev_vattr_merge(dv, vap);
		/* the node's own lock is taken inside the parent's lock */
		rw_enter(&dv->sdev_contents, RW_WRITER);
		error = sdev_shadow_node(dv, cred);
		rw_exit(&dv->sdev_contents);
		rw_exit(&parent->sdev_contents);

		if (error)
			return (error);
		return (VOP_SETATTR(dv->sdev_attrvp, vap, flags, cred, NULL));
	}


	/*
	 * sdev_attr was allocated in sdev_mknode
	 */
	rw_enter(&dv->sdev_contents, RW_WRITER);
	error = secpolicy_vnode_setattr(cred, vp, vap,
	    dv->sdev_attr, flags, sdev_unlocked_access, dv);
	if (error) {
		rw_exit(&dv->sdev_contents);
		rw_exit(&parent->sdev_contents);
		return (error);
	}

	get = dv->sdev_attr;
	/* replace the permission bits, keep the file type bits */
	if (mask & AT_MODE) {
		get->va_mode &= S_IFMT;
		get->va_mode |= vap->va_mode & ~S_IFMT;
	}

	if ((mask & AT_UID) || (mask & AT_GID)) {
		if (mask & AT_UID)
			get->va_uid = vap->va_uid;
		if (mask & AT_GID)
			get->va_gid = vap->va_gid;
		/*
		 * a callback must be provided if the protocol is set
		 */
		if ((protocol & AT_UID) || (protocol & AT_GID)) {
			ASSERT(callback);
			/*
			 * NOTE(review): if the callback fails, the uid/gid
			 * stored in sdev_attr above are not restored.
			 */
			error = callback(dv, get, protocol);
			if (error) {
				rw_exit(&dv->sdev_contents);
				rw_exit(&parent->sdev_contents);
				return (error);
			}
		}
	}

	if (mask & AT_ATIME)
		get->va_atime = vap->va_atime;
	if (mask & AT_MTIME)
		get->va_mtime = vap->va_mtime;
	/* the change time is always taken from the clock, not from vap */
	if (mask & (AT_MODE | AT_UID | AT_GID | AT_CTIME)) {
		gethrestime(&get->va_ctime);
	}

	sdev_vattr_merge(dv, get);
	rw_exit(&dv->sdev_contents);
	rw_exit(&parent->sdev_contents);
	return (0);
}
3096 
3097 /*
3098  * a generic inactive() function
3099  */
3100 /*ARGSUSED*/
3101 void
3102 devname_inactive_func(struct vnode *vp, struct cred *cred,
3103     void (*callback)(struct vnode *))
3104 {
3105 	int clean;
3106 	struct sdev_node *dv = VTOSDEV(vp);
3107 	int state;
3108 
3109 	mutex_enter(&vp->v_lock);
3110 	ASSERT(vp->v_count >= 1);
3111 
3112 
3113 	if (vp->v_count == 1 && callback != NULL)
3114 		callback(vp);
3115 
3116 	rw_enter(&dv->sdev_contents, RW_WRITER);
3117 	state = dv->sdev_state;
3118 
3119 	clean = (vp->v_count == 1) && (state == SDEV_ZOMBIE);
3120 
3121 	/*
3122 	 * sdev is a rather bad public citizen. It violates the general
3123 	 * agreement that in memory nodes should always have a valid reference
3124 	 * count on their vnode. But that's not the case here. This means that
3125 	 * we do actually have to distinguish between getting inactive callbacks
3126 	 * for zombies and otherwise. This should probably be fixed.
3127 	 */
3128 	if (clean) {
3129 		/* Remove the . entry to ourselves */
3130 		if (vp->v_type == VDIR) {
3131 			decr_link(dv);
3132 		}
3133 		VERIFY(dv->sdev_nlink == 1);
3134 		decr_link(dv);
3135 		--vp->v_count;
3136 		rw_exit(&dv->sdev_contents);
3137 		mutex_exit(&vp->v_lock);
3138 		sdev_nodedestroy(dv, 0);
3139 	} else {
3140 		--vp->v_count;
3141 		rw_exit(&dv->sdev_contents);
3142 		mutex_exit(&vp->v_lock);
3143 	}
3144 }
3145