xref: /titanic_51/usr/src/uts/common/fs/dev/sdev_subr.c (revision 6fb87a99fb662e7d25003aeb3817ceb2a41eb2fa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * utility routines for the /dev fs
27  */
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/t_lock.h>
32 #include <sys/systm.h>
33 #include <sys/sysmacros.h>
34 #include <sys/user.h>
35 #include <sys/time.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/file.h>
39 #include <sys/fcntl.h>
40 #include <sys/flock.h>
41 #include <sys/kmem.h>
42 #include <sys/uio.h>
43 #include <sys/errno.h>
44 #include <sys/stat.h>
45 #include <sys/cred.h>
46 #include <sys/dirent.h>
47 #include <sys/pathname.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/mode.h>
51 #include <sys/policy.h>
52 #include <fs/fs_subr.h>
53 #include <sys/mount.h>
54 #include <sys/fs/snode.h>
55 #include <sys/fs/dv_node.h>
56 #include <sys/fs/sdev_impl.h>
57 #include <sys/sunndi.h>
58 #include <sys/sunmdi.h>
59 #include <sys/conf.h>
60 #include <sys/proc.h>
61 #include <sys/user.h>
62 #include <sys/modctl.h>
63 
#ifdef DEBUG
/*
 * Debug-build tunables.
 *
 * sdev_debug_cache_flags is handed to kmem_cache_create() as the cache
 * flags when the sdev_node cache is set up in sdev_node_cache_init().
 *
 * NOTE(review): sdev_debug is presumably the bitmask consulted by the
 * sdcmn_err*() debug-print macros -- confirm against sdev_impl.h.
 */
int sdev_debug = 0x00000001;
int sdev_debug_cache_flags = 0;
#endif
68 
69 /*
70  * globals
71  */
/*
 * prototype memory vattrs: default attribute templates, one per vnode type
 *
 * sdev_vattr_dir is the template for directories.  Its va_mask marks the
 * type, mode, uid and gid as valid; every remaining field is zero.
 */
vattr_t sdev_vattr_dir = {
	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
	VDIR,					/* va_type */
	SDEV_DIRMODE_DEFAULT,			/* va_mode */
	SDEV_UID_DEFAULT,			/* va_uid */
	SDEV_GID_DEFAULT,			/* va_gid */
	0,					/* va_fsid */
	0,					/* va_nodeid */
	0,					/* va_nlink */
	0,					/* va_size */
	0,					/* va_atime */
	0,					/* va_mtime */
	0,					/* va_ctime */
	0,					/* va_rdev */
	0,					/* va_blksize */
	0,					/* va_nblocks */
	0					/* va_vcode */
};
91 
/*
 * Template for symbolic links.  Unlike the other templates, va_mask only
 * marks the type and mode as valid: the uid/gid defaults are filled in but
 * are skipped by code that honors va_mask, such as sdev_attr_update().
 */
vattr_t sdev_vattr_lnk = {
	AT_TYPE|AT_MODE,			/* va_mask */
	VLNK,					/* va_type */
	SDEV_LNKMODE_DEFAULT,			/* va_mode */
	SDEV_UID_DEFAULT,			/* va_uid */
	SDEV_GID_DEFAULT,			/* va_gid */
	0,					/* va_fsid */
	0,					/* va_nodeid */
	0,					/* va_nlink */
	0,					/* va_size */
	0,					/* va_atime */
	0,					/* va_mtime */
	0,					/* va_ctime */
	0,					/* va_rdev */
	0,					/* va_blksize */
	0,					/* va_nblocks */
	0					/* va_vcode */
};
110 
/*
 * Template for block device nodes.  The mode carries the S_IFBLK file type
 * bits in addition to the default device permissions; va_rdev is left zero
 * in the template.
 */
vattr_t sdev_vattr_blk = {
	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
	VBLK,					/* va_type */
	S_IFBLK | SDEV_DEVMODE_DEFAULT,		/* va_mode */
	SDEV_UID_DEFAULT,			/* va_uid */
	SDEV_GID_DEFAULT,			/* va_gid */
	0,					/* va_fsid */
	0,					/* va_nodeid */
	0,					/* va_nlink */
	0,					/* va_size */
	0,					/* va_atime */
	0,					/* va_mtime */
	0,					/* va_ctime */
	0,					/* va_rdev */
	0,					/* va_blksize */
	0,					/* va_nblocks */
	0					/* va_vcode */
};
129 
/*
 * Template for character device nodes.  The mode carries the S_IFCHR file
 * type bits in addition to the default device permissions; va_rdev is left
 * zero in the template.
 */
vattr_t sdev_vattr_chr = {
	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
	VCHR,					/* va_type */
	S_IFCHR | SDEV_DEVMODE_DEFAULT,		/* va_mode */
	SDEV_UID_DEFAULT,			/* va_uid */
	SDEV_GID_DEFAULT,			/* va_gid */
	0,					/* va_fsid */
	0,					/* va_nodeid */
	0,					/* va_nlink */
	0,					/* va_size */
	0,					/* va_atime */
	0,					/* va_mtime */
	0,					/* va_ctime */
	0,					/* va_rdev */
	0,					/* va_blksize */
	0,					/* va_nblocks */
	0					/* va_vcode */
};
148 
/*
 * sdev_node_cache is created by sdev_node_cache_init() and destroyed (and
 * reset to NULL) by sdev_node_cache_fini().
 */
kmem_cache_t	*sdev_node_cache;	/* sdev_node cache */
int		devtype;		/* fstype */
151 
/* forward declarations of file-local (static) helpers */
static struct vnodeops *sdev_get_vop(struct sdev_node *);
static void sdev_set_no_negcache(struct sdev_node *);
static fs_operation_def_t *sdev_merge_vtab(const fs_operation_def_t []);
static void sdev_free_vtab(fs_operation_def_t *);
157 
158 static void
159 sdev_prof_free(struct sdev_node *dv)
160 {
161 	ASSERT(!SDEV_IS_GLOBAL(dv));
162 	if (dv->sdev_prof.dev_name)
163 		nvlist_free(dv->sdev_prof.dev_name);
164 	if (dv->sdev_prof.dev_map)
165 		nvlist_free(dv->sdev_prof.dev_map);
166 	if (dv->sdev_prof.dev_symlink)
167 		nvlist_free(dv->sdev_prof.dev_symlink);
168 	if (dv->sdev_prof.dev_glob_incdir)
169 		nvlist_free(dv->sdev_prof.dev_glob_incdir);
170 	if (dv->sdev_prof.dev_glob_excdir)
171 		nvlist_free(dv->sdev_prof.dev_glob_excdir);
172 	bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
173 }
174 
175 /* sdev_node cache constructor */
176 /*ARGSUSED1*/
177 static int
178 i_sdev_node_ctor(void *buf, void *cfarg, int flag)
179 {
180 	struct sdev_node *dv = (struct sdev_node *)buf;
181 	struct vnode *vp;
182 
183 	bzero(buf, sizeof (struct sdev_node));
184 	vp = dv->sdev_vnode = vn_alloc(flag);
185 	if (vp == NULL) {
186 		return (-1);
187 	}
188 	vp->v_data = dv;
189 	rw_init(&dv->sdev_contents, NULL, RW_DEFAULT, NULL);
190 	return (0);
191 }
192 
193 /* sdev_node cache destructor */
194 /*ARGSUSED1*/
195 static void
196 i_sdev_node_dtor(void *buf, void *arg)
197 {
198 	struct sdev_node *dv = (struct sdev_node *)buf;
199 	struct vnode *vp = SDEVTOV(dv);
200 
201 	rw_destroy(&dv->sdev_contents);
202 	vn_free(vp);
203 }
204 
205 /* initialize sdev_node cache */
206 void
207 sdev_node_cache_init()
208 {
209 	int flags = 0;
210 
211 #ifdef	DEBUG
212 	flags = sdev_debug_cache_flags;
213 	if (flags)
214 		sdcmn_err(("cache debug flags 0x%x\n", flags));
215 #endif	/* DEBUG */
216 
217 	ASSERT(sdev_node_cache == NULL);
218 	sdev_node_cache = kmem_cache_create("sdev_node_cache",
219 	    sizeof (struct sdev_node), 0, i_sdev_node_ctor, i_sdev_node_dtor,
220 	    NULL, NULL, NULL, flags);
221 }
222 
223 /* destroy sdev_node cache */
224 void
225 sdev_node_cache_fini()
226 {
227 	ASSERT(sdev_node_cache != NULL);
228 	kmem_cache_destroy(sdev_node_cache);
229 	sdev_node_cache = NULL;
230 }
231 
232 /*
233  * Compare two nodes lexographically to balance avl tree
234  */
235 static int
236 sdev_compare_nodes(const struct sdev_node *dv1, const struct sdev_node *dv2)
237 {
238 	int rv;
239 	if ((rv = strcmp(dv1->sdev_name, dv2->sdev_name)) == 0)
240 		return (0);
241 	return ((rv < 0) ? -1 : 1);
242 }
243 
244 void
245 sdev_set_nodestate(struct sdev_node *dv, sdev_node_state_t state)
246 {
247 	ASSERT(dv);
248 	ASSERT(RW_WRITE_HELD(&dv->sdev_contents));
249 	dv->sdev_state = state;
250 }
251 
252 static void
253 sdev_attr_update(struct sdev_node *dv, vattr_t *vap)
254 {
255 	timestruc_t	now;
256 	struct vattr	*attrp;
257 	uint_t		mask;
258 
259 	ASSERT(dv->sdev_attr);
260 	ASSERT(vap);
261 
262 	attrp = dv->sdev_attr;
263 	mask = vap->va_mask;
264 	if (mask & AT_TYPE)
265 		attrp->va_type = vap->va_type;
266 	if (mask & AT_MODE)
267 		attrp->va_mode = vap->va_mode;
268 	if (mask & AT_UID)
269 		attrp->va_uid = vap->va_uid;
270 	if (mask & AT_GID)
271 		attrp->va_gid = vap->va_gid;
272 	if (mask & AT_RDEV)
273 		attrp->va_rdev = vap->va_rdev;
274 
275 	gethrestime(&now);
276 	attrp->va_atime = (mask & AT_ATIME) ? vap->va_atime : now;
277 	attrp->va_mtime = (mask & AT_MTIME) ? vap->va_mtime : now;
278 	attrp->va_ctime = (mask & AT_CTIME) ? vap->va_ctime : now;
279 }
280 
281 static void
282 sdev_attr_alloc(struct sdev_node *dv, vattr_t *vap)
283 {
284 	ASSERT(dv->sdev_attr == NULL);
285 	ASSERT(vap->va_mask & AT_TYPE);
286 	ASSERT(vap->va_mask & AT_MODE);
287 
288 	dv->sdev_attr = kmem_zalloc(sizeof (struct vattr), KM_SLEEP);
289 	sdev_attr_update(dv, vap);
290 }
291 
292 /* alloc and initialize a sdev_node */
293 int
294 sdev_nodeinit(struct sdev_node *ddv, char *nm, struct sdev_node **newdv,
295     vattr_t *vap)
296 {
297 	struct sdev_node *dv = NULL;
298 	struct vnode *vp;
299 	size_t nmlen, len;
300 	devname_handle_t  *dhl;
301 
302 	nmlen = strlen(nm) + 1;
303 	if (nmlen > MAXNAMELEN) {
304 		sdcmn_err9(("sdev_nodeinit: node name %s"
305 		    " too long\n", nm));
306 		*newdv = NULL;
307 		return (ENAMETOOLONG);
308 	}
309 
310 	dv = kmem_cache_alloc(sdev_node_cache, KM_SLEEP);
311 
312 	dv->sdev_name = kmem_alloc(nmlen, KM_SLEEP);
313 	bcopy(nm, dv->sdev_name, nmlen);
314 	dv->sdev_namelen = nmlen - 1;	/* '\0' not included */
315 	len = strlen(ddv->sdev_path) + strlen(nm) + 2;
316 	dv->sdev_path = kmem_alloc(len, KM_SLEEP);
317 	(void) snprintf(dv->sdev_path, len, "%s/%s", ddv->sdev_path, nm);
318 	/* overwritten for VLNK nodes */
319 	dv->sdev_symlink = NULL;
320 
321 	vp = SDEVTOV(dv);
322 	vn_reinit(vp);
323 	vp->v_vfsp = SDEVTOV(ddv)->v_vfsp;
324 	if (vap)
325 		vp->v_type = vap->va_type;
326 
327 	/*
328 	 * initialized to the parent's vnodeops.
329 	 * maybe overwriten for a VDIR
330 	 */
331 	vn_setops(vp, vn_getops(SDEVTOV(ddv)));
332 	vn_exists(vp);
333 
334 	dv->sdev_dotdot = NULL;
335 	dv->sdev_attrvp = NULL;
336 	if (vap) {
337 		sdev_attr_alloc(dv, vap);
338 	} else {
339 		dv->sdev_attr = NULL;
340 	}
341 
342 	dv->sdev_ino = sdev_mkino(dv);
343 	dv->sdev_nlink = 0;		/* updated on insert */
344 	dv->sdev_flags = ddv->sdev_flags; /* inherit from the parent first */
345 	dv->sdev_flags |= SDEV_BUILD;
346 	mutex_init(&dv->sdev_lookup_lock, NULL, MUTEX_DEFAULT, NULL);
347 	cv_init(&dv->sdev_lookup_cv, NULL, CV_DEFAULT, NULL);
348 	if (SDEV_IS_GLOBAL(ddv)) {
349 		dv->sdev_flags |= SDEV_GLOBAL;
350 		dhl = &(dv->sdev_handle);
351 		dhl->dh_data = dv;
352 		dhl->dh_args = NULL;
353 		sdev_set_no_negcache(dv);
354 		dv->sdev_gdir_gen = 0;
355 	} else {
356 		dv->sdev_flags &= ~SDEV_GLOBAL;
357 		dv->sdev_origin = NULL; /* set later */
358 		bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
359 		dv->sdev_ldir_gen = 0;
360 		dv->sdev_devtree_gen = 0;
361 	}
362 
363 	rw_enter(&dv->sdev_contents, RW_WRITER);
364 	sdev_set_nodestate(dv, SDEV_INIT);
365 	rw_exit(&dv->sdev_contents);
366 	*newdv = dv;
367 
368 	return (0);
369 }
370 
371 /*
372  * transition a sdev_node into SDEV_READY state
373  */
374 int
375 sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp,
376     void *args, struct cred *cred)
377 {
378 	int error = 0;
379 	struct vnode *vp = SDEVTOV(dv);
380 	vtype_t type;
381 
382 	ASSERT(dv && (dv->sdev_state != SDEV_READY) && vap);
383 
384 	type = vap->va_type;
385 	vp->v_type = type;
386 	vp->v_rdev = vap->va_rdev;
387 	rw_enter(&dv->sdev_contents, RW_WRITER);
388 	if (type == VDIR) {
389 		dv->sdev_nlink = 2;
390 		dv->sdev_flags &= ~SDEV_PERSIST;
391 		dv->sdev_flags &= ~SDEV_DYNAMIC;
392 		vn_setops(vp, sdev_get_vop(dv)); /* from internal vtab */
393 		ASSERT(dv->sdev_dotdot);
394 		ASSERT(SDEVTOV(dv->sdev_dotdot)->v_type == VDIR);
395 		vp->v_rdev = SDEVTOV(dv->sdev_dotdot)->v_rdev;
396 		avl_create(&dv->sdev_entries,
397 		    (int (*)(const void *, const void *))sdev_compare_nodes,
398 		    sizeof (struct sdev_node),
399 		    offsetof(struct sdev_node, sdev_avllink));
400 	} else if (type == VLNK) {
401 		ASSERT(args);
402 		dv->sdev_nlink = 1;
403 		dv->sdev_symlink = i_ddi_strdup((char *)args, KM_SLEEP);
404 	} else {
405 		dv->sdev_nlink = 1;
406 	}
407 
408 	if (!(SDEV_IS_GLOBAL(dv))) {
409 		dv->sdev_origin = (struct sdev_node *)args;
410 		dv->sdev_flags &= ~SDEV_PERSIST;
411 	}
412 
413 	/*
414 	 * shadow node is created here OR
415 	 * if failed (indicated by dv->sdev_attrvp == NULL),
416 	 * created later in sdev_setattr
417 	 */
418 	if (avp) {
419 		dv->sdev_attrvp = avp;
420 	} else {
421 		if (dv->sdev_attr == NULL) {
422 			sdev_attr_alloc(dv, vap);
423 		} else {
424 			sdev_attr_update(dv, vap);
425 		}
426 
427 		if ((dv->sdev_attrvp == NULL) && SDEV_IS_PERSIST(dv))
428 			error = sdev_shadow_node(dv, cred);
429 	}
430 
431 	if (error == 0) {
432 		/* transition to READY state */
433 		sdev_set_nodestate(dv, SDEV_READY);
434 		sdev_nc_node_exists(dv);
435 	} else {
436 		sdev_set_nodestate(dv, SDEV_ZOMBIE);
437 	}
438 	rw_exit(&dv->sdev_contents);
439 	return (error);
440 }
441 
442 /*
443  * setting ZOMBIE state
444  */
445 static int
446 sdev_nodezombied(struct sdev_node *dv)
447 {
448 	rw_enter(&dv->sdev_contents, RW_WRITER);
449 	sdev_set_nodestate(dv, SDEV_ZOMBIE);
450 	rw_exit(&dv->sdev_contents);
451 	return (0);
452 }
453 
454 /*
455  * Build the VROOT sdev_node.
456  */
457 /*ARGSUSED*/
458 struct sdev_node *
459 sdev_mkroot(struct vfs *vfsp, dev_t devdev, struct vnode *mvp,
460     struct vnode *avp, struct cred *cred)
461 {
462 	struct sdev_node *dv;
463 	struct vnode *vp;
464 	char devdir[] = "/dev";
465 
466 	ASSERT(sdev_node_cache != NULL);
467 	ASSERT(avp);
468 	dv = kmem_cache_alloc(sdev_node_cache, KM_SLEEP);
469 	vp = SDEVTOV(dv);
470 	vn_reinit(vp);
471 	vp->v_flag |= VROOT;
472 	vp->v_vfsp = vfsp;
473 	vp->v_type = VDIR;
474 	vp->v_rdev = devdev;
475 	vn_setops(vp, sdev_vnodeops); /* apply the default vnodeops at /dev */
476 	vn_exists(vp);
477 
478 	if (vfsp->vfs_mntpt)
479 		dv->sdev_name = i_ddi_strdup(
480 		    (char *)refstr_value(vfsp->vfs_mntpt), KM_SLEEP);
481 	else
482 		/* vfs_mountdev1 set mount point later */
483 		dv->sdev_name = i_ddi_strdup("/dev", KM_SLEEP);
484 	dv->sdev_namelen = strlen(dv->sdev_name); /* '\0' not included */
485 	dv->sdev_path = i_ddi_strdup(devdir, KM_SLEEP);
486 	dv->sdev_ino = SDEV_ROOTINO;
487 	dv->sdev_nlink = 2;		/* name + . (no sdev_insert) */
488 	dv->sdev_dotdot = dv;		/* .. == self */
489 	dv->sdev_attrvp = avp;
490 	dv->sdev_attr = NULL;
491 	mutex_init(&dv->sdev_lookup_lock, NULL, MUTEX_DEFAULT, NULL);
492 	cv_init(&dv->sdev_lookup_cv, NULL, CV_DEFAULT, NULL);
493 	if (strcmp(dv->sdev_name, "/dev") == 0) {
494 		dv->sdev_flags = SDEV_BUILD|SDEV_GLOBAL|SDEV_PERSIST;
495 		bzero(&dv->sdev_handle, sizeof (dv->sdev_handle));
496 		dv->sdev_gdir_gen = 0;
497 	} else {
498 		dv->sdev_flags = SDEV_BUILD;
499 		dv->sdev_flags &= ~SDEV_PERSIST;
500 		bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
501 		dv->sdev_ldir_gen = 0;
502 		dv->sdev_devtree_gen = 0;
503 	}
504 
505 	avl_create(&dv->sdev_entries,
506 	    (int (*)(const void *, const void *))sdev_compare_nodes,
507 	    sizeof (struct sdev_node),
508 	    offsetof(struct sdev_node, sdev_avllink));
509 
510 	rw_enter(&dv->sdev_contents, RW_WRITER);
511 	sdev_set_nodestate(dv, SDEV_READY);
512 	rw_exit(&dv->sdev_contents);
513 	sdev_nc_node_exists(dv);
514 	return (dv);
515 }
516 
/*
 * directory dependent vop table
 *
 * One entry describes the special handling of a named /dev subdirectory.
 * Entries are looked up with sdev_match(); sdev_get_vop() builds vt_vops
 * from vt_service on first use, ORs vt_flags into the matching node's
 * sdev_flags and, when vt_global_vops is set, stores the constructed ops
 * through it.
 */
struct sdev_vop_table {
	char *vt_name;				/* subdirectory name */
	const fs_operation_def_t *vt_service;	/* vnodeops table */
	struct vnodeops *vt_vops;		/* constructed vop */
	struct vnodeops **vt_global_vops;	/* global container for vop */
	int (*vt_vtor)(struct sdev_node *);	/* validate sdev_node */
	int vt_flags;				/* SDEV_* flags for the dir */
};
526 
/*
 * Table of /dev subdirectories that need special treatment.
 *
 * Each entry names a subdirectory and supplies, in order: the vnodeops
 * template, the slot for the constructed vnodeops (NULL until built by
 * sdev_get_vop()), the global pointer through which those ops are stored,
 * the validator callback and the SDEV_* flags for the directory.  Entries
 * with no template use the default sdev_vnodeops and only contribute
 * flags.  The table ends with an entry whose vt_name is NULL.
 *
 * A nice improvement would be to provide a plug-in mechanism
 * for this table instead of a const table.
 */
static struct sdev_vop_table vtab[] =
{
	{ "pts", devpts_vnodeops_tbl, NULL, &devpts_vnodeops, devpts_validate,
	SDEV_DYNAMIC | SDEV_VTOR },

	{ "vt", devvt_vnodeops_tbl, NULL, &devvt_vnodeops, devvt_validate,
	SDEV_DYNAMIC | SDEV_VTOR },

	/* SDEV_SUBDIR: sdev_match() also matches nodes below /dev/zvol/ */
	{ "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops,
	devzvol_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },

	{ "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE },

	{ "net", devnet_vnodeops_tbl, NULL, &devnet_vnodeops, devnet_validate,
	SDEV_DYNAMIC | SDEV_VTOR },

	{ "ipnet", devipnet_vnodeops_tbl, NULL, &devipnet_vnodeops,
	devipnet_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_NO_NCACHE },

	{ "lofi", NULL, NULL, NULL, NULL, SDEV_ZONED },
	{ "rlofi", NULL, NULL, NULL, NULL, SDEV_ZONED },

	/* terminator */
	{ NULL, NULL, NULL, NULL, NULL, 0}
};
555 
556 struct sdev_vop_table *
557 sdev_match(struct sdev_node *dv)
558 {
559 	int vlen;
560 	int i;
561 
562 	for (i = 0; vtab[i].vt_name; i++) {
563 		if (strcmp(vtab[i].vt_name, dv->sdev_name) == 0)
564 			return (&vtab[i]);
565 		if (vtab[i].vt_flags & SDEV_SUBDIR) {
566 			char *ptr;
567 
568 			ASSERT(strlen(dv->sdev_path) > 5);
569 			ptr = dv->sdev_path + 5;
570 			vlen = strlen(vtab[i].vt_name);
571 			if ((strncmp(vtab[i].vt_name, ptr,
572 			    vlen - 1) == 0) && ptr[vlen] == '/')
573 				return (&vtab[i]);
574 		}
575 
576 	}
577 	return (NULL);
578 }
579 
580 /*
581  *  sets a directory's vnodeops if the directory is in the vtab;
582  */
583 static struct vnodeops *
584 sdev_get_vop(struct sdev_node *dv)
585 {
586 	struct sdev_vop_table *vtp;
587 	char *path;
588 
589 	path = dv->sdev_path;
590 	ASSERT(path);
591 
592 	/* gets the relative path to /dev/ */
593 	path += 5;
594 
595 	/* gets the vtab entry it matches */
596 	if ((vtp = sdev_match(dv)) != NULL) {
597 		dv->sdev_flags |= vtp->vt_flags;
598 
599 		if (vtp->vt_vops) {
600 			if (vtp->vt_global_vops)
601 				*(vtp->vt_global_vops) = vtp->vt_vops;
602 			return (vtp->vt_vops);
603 		}
604 
605 		if (vtp->vt_service) {
606 			fs_operation_def_t *templ;
607 			templ = sdev_merge_vtab(vtp->vt_service);
608 			if (vn_make_ops(vtp->vt_name,
609 			    (const fs_operation_def_t *)templ,
610 			    &vtp->vt_vops) != 0) {
611 				cmn_err(CE_PANIC, "%s: malformed vnode ops\n",
612 				    vtp->vt_name);
613 				/*NOTREACHED*/
614 			}
615 			if (vtp->vt_global_vops) {
616 				*(vtp->vt_global_vops) = vtp->vt_vops;
617 			}
618 			sdev_free_vtab(templ);
619 			return (vtp->vt_vops);
620 		}
621 		return (sdev_vnodeops);
622 	}
623 
624 	/* child inherits the persistence of the parent */
625 	if (SDEV_IS_PERSIST(dv->sdev_dotdot))
626 		dv->sdev_flags |= SDEV_PERSIST;
627 
628 	return (sdev_vnodeops);
629 }
630 
631 static void
632 sdev_set_no_negcache(struct sdev_node *dv)
633 {
634 	int i;
635 	char *path;
636 
637 	ASSERT(dv->sdev_path);
638 	path = dv->sdev_path + strlen("/dev/");
639 
640 	for (i = 0; vtab[i].vt_name; i++) {
641 		if (strcmp(vtab[i].vt_name, path) == 0) {
642 			if (vtab[i].vt_flags & SDEV_NO_NCACHE)
643 				dv->sdev_flags |= SDEV_NO_NCACHE;
644 			break;
645 		}
646 	}
647 }
648 
649 void *
650 sdev_get_vtor(struct sdev_node *dv)
651 {
652 	struct sdev_vop_table *vtp;
653 
654 	vtp = sdev_match(dv);
655 	if (vtp)
656 		return ((void *)vtp->vt_vtor);
657 	else
658 		return (NULL);
659 }
660 
661 /*
662  * Build the base root inode
663  */
664 ino_t
665 sdev_mkino(struct sdev_node *dv)
666 {
667 	ino_t	ino;
668 
669 	/*
670 	 * for now, follow the lead of tmpfs here
671 	 * need to someday understand the requirements here
672 	 */
673 	ino = (ino_t)(uint32_t)((uintptr_t)dv >> 3);
674 	ino += SDEV_ROOTINO + 1;
675 
676 	return (ino);
677 }
678 
679 int
680 sdev_getlink(struct vnode *linkvp, char **link)
681 {
682 	int err;
683 	char *buf;
684 	struct uio uio = {0};
685 	struct iovec iov = {0};
686 
687 	if (linkvp == NULL)
688 		return (ENOENT);
689 	ASSERT(linkvp->v_type == VLNK);
690 
691 	buf = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
692 	iov.iov_base = buf;
693 	iov.iov_len = MAXPATHLEN;
694 	uio.uio_iov = &iov;
695 	uio.uio_iovcnt = 1;
696 	uio.uio_resid = MAXPATHLEN;
697 	uio.uio_segflg = UIO_SYSSPACE;
698 	uio.uio_llimit = MAXOFFSET_T;
699 
700 	err = VOP_READLINK(linkvp, &uio, kcred, NULL);
701 	if (err) {
702 		cmn_err(CE_WARN, "readlink %s failed in dev\n", buf);
703 		kmem_free(buf, MAXPATHLEN);
704 		return (ENOENT);
705 	}
706 
707 	/* mission complete */
708 	*link = i_ddi_strdup(buf, KM_SLEEP);
709 	kmem_free(buf, MAXPATHLEN);
710 	return (0);
711 }
712 
713 /*
714  * A convenient wrapper to get the devfs node vnode for a device
715  * minor functionality: readlink() of a /dev symlink
716  * Place the link into dv->sdev_symlink
717  */
718 static int
719 sdev_follow_link(struct sdev_node *dv)
720 {
721 	int err;
722 	struct vnode *linkvp;
723 	char *link = NULL;
724 
725 	linkvp = SDEVTOV(dv);
726 	if (linkvp == NULL)
727 		return (ENOENT);
728 	ASSERT(linkvp->v_type == VLNK);
729 	err = sdev_getlink(linkvp, &link);
730 	if (err) {
731 		(void) sdev_nodezombied(dv);
732 		dv->sdev_symlink = NULL;
733 		return (ENOENT);
734 	}
735 
736 	ASSERT(link != NULL);
737 	dv->sdev_symlink = link;
738 	return (0);
739 }
740 
741 static int
742 sdev_node_check(struct sdev_node *dv, struct vattr *nvap, void *nargs)
743 {
744 	vtype_t otype = SDEVTOV(dv)->v_type;
745 
746 	/*
747 	 * existing sdev_node has a different type.
748 	 */
749 	if (otype != nvap->va_type) {
750 		sdcmn_err9(("sdev_node_check: existing node "
751 		    "  %s type %d does not match new node type %d\n",
752 		    dv->sdev_name, otype, nvap->va_type));
753 		return (EEXIST);
754 	}
755 
756 	/*
757 	 * For a symlink, the target should be the same.
758 	 */
759 	if (otype == VLNK) {
760 		ASSERT(nargs != NULL);
761 		ASSERT(dv->sdev_symlink != NULL);
762 		if (strcmp(dv->sdev_symlink, (char *)nargs) != 0) {
763 			sdcmn_err9(("sdev_node_check: existing node "
764 			    " %s has different symlink %s as new node "
765 			    " %s\n", dv->sdev_name, dv->sdev_symlink,
766 			    (char *)nargs));
767 			return (EEXIST);
768 		}
769 	}
770 
771 	return (0);
772 }
773 
774 /*
775  * sdev_mknode - a wrapper for sdev_nodeinit(), sdev_nodeready()
776  *
777  * arguments:
778  *	- ddv (parent)
779  *	- nm (child name)
780  *	- newdv (sdev_node for nm is returned here)
781  *	- vap (vattr for the node to be created, va_type should be set.
782  *	- avp (attribute vnode)
783  *	  the defaults should be used if unknown)
784  *	- cred
785  *	- args
786  *	    . tnm (for VLNK)
787  *	    . global sdev_node (for !SDEV_GLOBAL)
788  * 	- state: SDEV_INIT, SDEV_READY
789  *
790  * only ddv, nm, newddv, vap, cred are required for sdev_mknode(SDEV_INIT)
791  *
792  * NOTE:  directory contents writers lock needs to be held before
793  *	  calling this routine.
794  */
795 int
796 sdev_mknode(struct sdev_node *ddv, char *nm, struct sdev_node **newdv,
797     struct vattr *vap, struct vnode *avp, void *args, struct cred *cred,
798     sdev_node_state_t state)
799 {
800 	int error = 0;
801 	sdev_node_state_t node_state;
802 	struct sdev_node *dv = NULL;
803 
804 	ASSERT(state != SDEV_ZOMBIE);
805 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
806 
807 	if (*newdv) {
808 		dv = *newdv;
809 	} else {
810 		/* allocate and initialize a sdev_node */
811 		if (ddv->sdev_state == SDEV_ZOMBIE) {
812 			sdcmn_err9(("sdev_mknode: parent %s ZOMBIEd\n",
813 			    ddv->sdev_path));
814 			return (ENOENT);
815 		}
816 
817 		error = sdev_nodeinit(ddv, nm, &dv, vap);
818 		if (error != 0) {
819 			sdcmn_err9(("sdev_mknode: error %d,"
820 			    " name %s can not be initialized\n",
821 			    error, nm));
822 			return (error);
823 		}
824 		ASSERT(dv);
825 
826 		/* insert into the directory cache */
827 		error = sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_ADD);
828 		if (error) {
829 			sdcmn_err9(("sdev_mknode: node %s can not"
830 			    " be added into directory cache\n", nm));
831 			return (ENOENT);
832 		}
833 	}
834 
835 	ASSERT(dv);
836 	node_state = dv->sdev_state;
837 	ASSERT(node_state != SDEV_ZOMBIE);
838 
839 	if (state == SDEV_READY) {
840 		switch (node_state) {
841 		case SDEV_INIT:
842 			error = sdev_nodeready(dv, vap, avp, args, cred);
843 			if (error) {
844 				sdcmn_err9(("sdev_mknode: node %s can NOT"
845 				    " be transitioned into READY state, "
846 				    "error %d\n", nm, error));
847 			}
848 			break;
849 		case SDEV_READY:
850 			/*
851 			 * Do some sanity checking to make sure
852 			 * the existing sdev_node is what has been
853 			 * asked for.
854 			 */
855 			error = sdev_node_check(dv, vap, args);
856 			break;
857 		default:
858 			break;
859 		}
860 	}
861 
862 	if (!error) {
863 		*newdv = dv;
864 		ASSERT((*newdv)->sdev_state != SDEV_ZOMBIE);
865 	} else {
866 		SDEV_SIMPLE_RELE(dv);
867 		*newdv = NULL;
868 	}
869 
870 	return (error);
871 }
872 
873 /*
874  * convenient wrapper to change vp's ATIME, CTIME and MTIME
875  */
876 void
877 sdev_update_timestamps(struct vnode *vp, cred_t *cred, uint_t mask)
878 {
879 	struct vattr attr;
880 	timestruc_t now;
881 	int err;
882 
883 	ASSERT(vp);
884 	gethrestime(&now);
885 	if (mask & AT_CTIME)
886 		attr.va_ctime = now;
887 	if (mask & AT_MTIME)
888 		attr.va_mtime = now;
889 	if (mask & AT_ATIME)
890 		attr.va_atime = now;
891 
892 	attr.va_mask = (mask & AT_TIMES);
893 	err = VOP_SETATTR(vp, &attr, 0, cred, NULL);
894 	if (err && (err != EROFS)) {
895 		sdcmn_err(("update timestamps error %d\n", err));
896 	}
897 }
898 
899 /*
900  * the backing store vnode is released here
901  */
902 /*ARGSUSED1*/
903 void
904 sdev_nodedestroy(struct sdev_node *dv, uint_t flags)
905 {
906 	/* no references */
907 	ASSERT(dv->sdev_nlink == 0);
908 
909 	if (dv->sdev_attrvp != NULLVP) {
910 		VN_RELE(dv->sdev_attrvp);
911 		/*
912 		 * reset the attrvp so that no more
913 		 * references can be made on this already
914 		 * vn_rele() vnode
915 		 */
916 		dv->sdev_attrvp = NULLVP;
917 	}
918 
919 	if (dv->sdev_attr != NULL) {
920 		kmem_free(dv->sdev_attr, sizeof (struct vattr));
921 		dv->sdev_attr = NULL;
922 	}
923 
924 	if (dv->sdev_name != NULL) {
925 		kmem_free(dv->sdev_name, dv->sdev_namelen + 1);
926 		dv->sdev_name = NULL;
927 	}
928 
929 	if (dv->sdev_symlink != NULL) {
930 		kmem_free(dv->sdev_symlink, strlen(dv->sdev_symlink) + 1);
931 		dv->sdev_symlink = NULL;
932 	}
933 
934 	if (dv->sdev_path) {
935 		kmem_free(dv->sdev_path, strlen(dv->sdev_path) + 1);
936 		dv->sdev_path = NULL;
937 	}
938 
939 	if (!SDEV_IS_GLOBAL(dv))
940 		sdev_prof_free(dv);
941 
942 	if (SDEVTOV(dv)->v_type == VDIR) {
943 		ASSERT(SDEV_FIRST_ENTRY(dv) == NULL);
944 		avl_destroy(&dv->sdev_entries);
945 	}
946 
947 	mutex_destroy(&dv->sdev_lookup_lock);
948 	cv_destroy(&dv->sdev_lookup_cv);
949 
950 	/* return node to initial state as per constructor */
951 	(void) memset((void *)&dv->sdev_instance_data, 0,
952 	    sizeof (dv->sdev_instance_data));
953 	vn_invalid(SDEVTOV(dv));
954 	kmem_cache_free(sdev_node_cache, dv);
955 }
956 
957 /*
958  * DIRECTORY CACHE lookup
959  */
960 struct sdev_node *
961 sdev_findbyname(struct sdev_node *ddv, char *nm)
962 {
963 	struct sdev_node *dv;
964 	struct sdev_node dvtmp;
965 	avl_index_t	where;
966 
967 	ASSERT(RW_LOCK_HELD(&ddv->sdev_contents));
968 
969 	dvtmp.sdev_name = nm;
970 	dv = avl_find(&ddv->sdev_entries, &dvtmp, &where);
971 	if (dv) {
972 		ASSERT(dv->sdev_dotdot == ddv);
973 		ASSERT(strcmp(dv->sdev_name, nm) == 0);
974 		SDEV_HOLD(dv);
975 		return (dv);
976 	}
977 	return (NULL);
978 }
979 
980 /*
981  * Inserts a new sdev_node in a parent directory
982  */
983 void
984 sdev_direnter(struct sdev_node *ddv, struct sdev_node *dv)
985 {
986 	avl_index_t where;
987 
988 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
989 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
990 	ASSERT(ddv->sdev_nlink >= 2);
991 	ASSERT(dv->sdev_nlink == 0);
992 
993 	dv->sdev_dotdot = ddv;
994 	VERIFY(avl_find(&ddv->sdev_entries, dv, &where) == NULL);
995 	avl_insert(&ddv->sdev_entries, dv, where);
996 	ddv->sdev_nlink++;
997 }
998 
999 /*
1000  * The following check is needed because while sdev_nodes are linked
1001  * in SDEV_INIT state, they have their link counts incremented only
1002  * in SDEV_READY state.
1003  */
1004 static void
1005 decr_link(struct sdev_node *dv)
1006 {
1007 	if (dv->sdev_state != SDEV_INIT)
1008 		dv->sdev_nlink--;
1009 	else
1010 		ASSERT(dv->sdev_nlink == 0);
1011 }
1012 
/*
 * Delete an existing dv from the directory cache.
 *
 * The caller must hold ddv's contents lock as writer.
 *
 * If the node's vnode has more than one reference, the node is not
 * removed: a READY node is put into ZOMBIE state, one reference is given
 * up and EBUSY is returned.  Once the reference count of such a node
 * reaches "0", the node is unlinked and destroyed in sdev_inactive().
 *
 * Otherwise the node is removed from ddv's entries, the link counts of
 * both parent and node are dropped, the last reference is given up, the
 * node is destroyed and 0 is returned.
 */
static int
sdev_dirdelete(struct sdev_node *ddv, struct sdev_node *dv)
{
	struct vnode *vp;

	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));

	vp = SDEVTOV(dv);
	/* v_count is examined and modified under the vnode's v_lock */
	mutex_enter(&vp->v_lock);

	/* dv is held still */
	if (vp->v_count > 1) {
		rw_enter(&dv->sdev_contents, RW_WRITER);
		if (dv->sdev_state == SDEV_READY) {
			sdcmn_err9((
			    "sdev_dirdelete: node %s busy with count %d\n",
			    dv->sdev_name, vp->v_count));
			dv->sdev_state = SDEV_ZOMBIE;
		}
		rw_exit(&dv->sdev_contents);
		/*
		 * give up one reference; presumably the hold the caller
		 * took to get here -- confirm against callers
		 */
		--vp->v_count;
		mutex_exit(&vp->v_lock);
		return (EBUSY);
	}
	ASSERT(vp->v_count == 1);

	/* unlink from the memory cache */
	ddv->sdev_nlink--;	/* .. to above */
	if (vp->v_type == VDIR) {
		decr_link(dv);		/* . to self */
	}

	avl_remove(&ddv->sdev_entries, dv);
	decr_link(dv);	/* name, back to zero */
	/* drop the final reference before the node is torn down */
	vp->v_count--;
	mutex_exit(&vp->v_lock);

	/* destroy the node */
	sdev_nodedestroy(dv, 0);
	return (0);
}
1062 
1063 /*
1064  * check if the source is in the path of the target
1065  *
1066  * source and target are different
1067  */
1068 /*ARGSUSED2*/
1069 static int
1070 sdev_checkpath(struct sdev_node *sdv, struct sdev_node *tdv, struct cred *cred)
1071 {
1072 	int error = 0;
1073 	struct sdev_node *dotdot, *dir;
1074 
1075 	dotdot = tdv->sdev_dotdot;
1076 	ASSERT(dotdot);
1077 
1078 	/* fs root */
1079 	if (dotdot == tdv) {
1080 		return (0);
1081 	}
1082 
1083 	for (;;) {
1084 		/*
1085 		 * avoid error cases like
1086 		 *	mv a a/b
1087 		 *	mv a a/b/c
1088 		 *	etc.
1089 		 */
1090 		if (dotdot == sdv) {
1091 			error = EINVAL;
1092 			break;
1093 		}
1094 
1095 		dir = dotdot;
1096 		dotdot = dir->sdev_dotdot;
1097 
1098 		/* done checking because root is reached */
1099 		if (dir == dotdot) {
1100 			break;
1101 		}
1102 	}
1103 	return (error);
1104 }
1105 
1106 int
1107 sdev_rnmnode(struct sdev_node *oddv, struct sdev_node *odv,
1108     struct sdev_node *nddv, struct sdev_node **ndvp, char *nnm,
1109     struct cred *cred)
1110 {
1111 	int error = 0;
1112 	struct vnode *ovp = SDEVTOV(odv);
1113 	struct vnode *nvp;
1114 	struct vattr vattr;
1115 	int doingdir = (ovp->v_type == VDIR);
1116 	char *link = NULL;
1117 	int samedir = (oddv == nddv) ? 1 : 0;
1118 	int bkstore = 0;
1119 	struct sdev_node *idv = NULL;
1120 	struct sdev_node *ndv = NULL;
1121 	timestruc_t now;
1122 
1123 	vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
1124 	error = VOP_GETATTR(ovp, &vattr, 0, cred, NULL);
1125 	if (error)
1126 		return (error);
1127 
1128 	if (!samedir)
1129 		rw_enter(&oddv->sdev_contents, RW_WRITER);
1130 	rw_enter(&nddv->sdev_contents, RW_WRITER);
1131 
1132 	/*
1133 	 * the source may have been deleted by another thread before
1134 	 * we gets here.
1135 	 */
1136 	if (odv->sdev_state != SDEV_READY) {
1137 		error = ENOENT;
1138 		goto err_out;
1139 	}
1140 
1141 	if (doingdir && (odv == nddv)) {
1142 		error = EINVAL;
1143 		goto err_out;
1144 	}
1145 
1146 	/*
1147 	 * If renaming a directory, and the parents are different (".." must be
1148 	 * changed) then the source dir must not be in the dir hierarchy above
1149 	 * the target since it would orphan everything below the source dir.
1150 	 */
1151 	if (doingdir && (oddv != nddv)) {
1152 		error = sdev_checkpath(odv, nddv, cred);
1153 		if (error)
1154 			goto err_out;
1155 	}
1156 
1157 	/* destination existing */
1158 	if (*ndvp) {
1159 		nvp = SDEVTOV(*ndvp);
1160 		ASSERT(nvp);
1161 
1162 		/* handling renaming to itself */
1163 		if (odv == *ndvp) {
1164 			error = 0;
1165 			goto err_out;
1166 		}
1167 
1168 		if (nvp->v_type == VDIR) {
1169 			if (!doingdir) {
1170 				error = EISDIR;
1171 				goto err_out;
1172 			}
1173 
1174 			if (vn_vfswlock(nvp)) {
1175 				error = EBUSY;
1176 				goto err_out;
1177 			}
1178 
1179 			if (vn_mountedvfs(nvp) != NULL) {
1180 				vn_vfsunlock(nvp);
1181 				error = EBUSY;
1182 				goto err_out;
1183 			}
1184 
1185 			/* in case dir1 exists in dir2 and "mv dir1 dir2" */
1186 			if ((*ndvp)->sdev_nlink > 2) {
1187 				vn_vfsunlock(nvp);
1188 				error = EEXIST;
1189 				goto err_out;
1190 			}
1191 			vn_vfsunlock(nvp);
1192 
1193 			(void) sdev_dirdelete(nddv, *ndvp);
1194 			*ndvp = NULL;
1195 			ASSERT(nddv->sdev_attrvp);
1196 			error = VOP_RMDIR(nddv->sdev_attrvp, nnm,
1197 			    nddv->sdev_attrvp, cred, NULL, 0);
1198 			if (error)
1199 				goto err_out;
1200 		} else {
1201 			if (doingdir) {
1202 				error = ENOTDIR;
1203 				goto err_out;
1204 			}
1205 
1206 			if (SDEV_IS_PERSIST((*ndvp))) {
1207 				bkstore = 1;
1208 			}
1209 
1210 			/*
1211 			 * get rid of the node from the directory cache
1212 			 * note, in case EBUSY is returned, the ZOMBIE
1213 			 * node is taken care in sdev_mknode.
1214 			 */
1215 			(void) sdev_dirdelete(nddv, *ndvp);
1216 			*ndvp = NULL;
1217 			if (bkstore) {
1218 				ASSERT(nddv->sdev_attrvp);
1219 				error = VOP_REMOVE(nddv->sdev_attrvp,
1220 				    nnm, cred, NULL, 0);
1221 				if (error)
1222 					goto err_out;
1223 			}
1224 		}
1225 	}
1226 
1227 	/* fix the source for a symlink */
1228 	if (vattr.va_type == VLNK) {
1229 		if (odv->sdev_symlink == NULL) {
1230 			error = sdev_follow_link(odv);
1231 			if (error) {
1232 				error = ENOENT;
1233 				goto err_out;
1234 			}
1235 		}
1236 		ASSERT(odv->sdev_symlink);
1237 		link = i_ddi_strdup(odv->sdev_symlink, KM_SLEEP);
1238 	}
1239 
1240 	/*
1241 	 * make a fresh node from the source attrs
1242 	 */
1243 	ASSERT(RW_WRITE_HELD(&nddv->sdev_contents));
1244 	error = sdev_mknode(nddv, nnm, ndvp, &vattr,
1245 	    NULL, (void *)link, cred, SDEV_READY);
1246 
1247 	if (link)
1248 		kmem_free(link, strlen(link) + 1);
1249 
1250 	if (error)
1251 		goto err_out;
1252 	ASSERT(*ndvp);
1253 	ASSERT((*ndvp)->sdev_state == SDEV_READY);
1254 
1255 	/* move dir contents */
1256 	if (doingdir) {
1257 		for (idv = SDEV_FIRST_ENTRY(odv); idv;
1258 		    idv = SDEV_NEXT_ENTRY(odv, idv)) {
1259 			error = sdev_rnmnode(odv, idv,
1260 			    (struct sdev_node *)(*ndvp), &ndv,
1261 			    idv->sdev_name, cred);
1262 			if (error)
1263 				goto err_out;
1264 			ndv = NULL;
1265 		}
1266 	}
1267 
1268 	if ((*ndvp)->sdev_attrvp) {
1269 		sdev_update_timestamps((*ndvp)->sdev_attrvp, kcred,
1270 		    AT_CTIME|AT_ATIME);
1271 	} else {
1272 		ASSERT((*ndvp)->sdev_attr);
1273 		gethrestime(&now);
1274 		(*ndvp)->sdev_attr->va_ctime = now;
1275 		(*ndvp)->sdev_attr->va_atime = now;
1276 	}
1277 
1278 	if (nddv->sdev_attrvp) {
1279 		sdev_update_timestamps(nddv->sdev_attrvp, kcred,
1280 		    AT_MTIME|AT_ATIME);
1281 	} else {
1282 		ASSERT(nddv->sdev_attr);
1283 		gethrestime(&now);
1284 		nddv->sdev_attr->va_mtime = now;
1285 		nddv->sdev_attr->va_atime = now;
1286 	}
1287 	rw_exit(&nddv->sdev_contents);
1288 	if (!samedir)
1289 		rw_exit(&oddv->sdev_contents);
1290 
1291 	SDEV_RELE(*ndvp);
1292 	return (error);
1293 
1294 err_out:
1295 	rw_exit(&nddv->sdev_contents);
1296 	if (!samedir)
1297 		rw_exit(&oddv->sdev_contents);
1298 	return (error);
1299 }
1300 
1301 /*
1302  * Merge sdev_node specific information into an attribute structure.
1303  *
1304  * note: sdev_node is not locked here
1305  */
1306 void
1307 sdev_vattr_merge(struct sdev_node *dv, struct vattr *vap)
1308 {
1309 	struct vnode *vp = SDEVTOV(dv);
1310 
1311 	vap->va_nlink = dv->sdev_nlink;
1312 	vap->va_nodeid = dv->sdev_ino;
1313 	vap->va_fsid = SDEVTOV(dv->sdev_dotdot)->v_rdev;
1314 	vap->va_type = vp->v_type;
1315 
1316 	if (vp->v_type == VDIR) {
1317 		vap->va_rdev = 0;
1318 		vap->va_fsid = vp->v_rdev;
1319 	} else if (vp->v_type == VLNK) {
1320 		vap->va_rdev = 0;
1321 		vap->va_mode  &= ~S_IFMT;
1322 		vap->va_mode |= S_IFLNK;
1323 	} else if ((vp->v_type == VCHR) || (vp->v_type == VBLK)) {
1324 		vap->va_rdev = vp->v_rdev;
1325 		vap->va_mode &= ~S_IFMT;
1326 		if (vap->va_type == VCHR)
1327 			vap->va_mode |= S_IFCHR;
1328 		else
1329 			vap->va_mode |= S_IFBLK;
1330 	} else {
1331 		vap->va_rdev = 0;
1332 	}
1333 }
1334 
1335 struct vattr *
1336 sdev_getdefault_attr(enum vtype type)
1337 {
1338 	if (type == VDIR)
1339 		return (&sdev_vattr_dir);
1340 	else if (type == VCHR)
1341 		return (&sdev_vattr_chr);
1342 	else if (type == VBLK)
1343 		return (&sdev_vattr_blk);
1344 	else if (type == VLNK)
1345 		return (&sdev_vattr_lnk);
1346 	else
1347 		return (NULL);
1348 }
1349 int
1350 sdev_to_vp(struct sdev_node *dv, struct vnode **vpp)
1351 {
1352 	int rv = 0;
1353 	struct vnode *vp = SDEVTOV(dv);
1354 
1355 	switch (vp->v_type) {
1356 	case VCHR:
1357 	case VBLK:
1358 		/*
1359 		 * If vnode is a device, return special vnode instead
1360 		 * (though it knows all about -us- via sp->s_realvp)
1361 		 */
1362 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, kcred);
1363 		VN_RELE(vp);
1364 		if (*vpp == NULLVP)
1365 			rv = ENOSYS;
1366 		break;
1367 	default:	/* most types are returned as is */
1368 		*vpp = vp;
1369 		break;
1370 	}
1371 	return (rv);
1372 }
1373 
1374 /*
1375  * junction between devname and root file system, e.g. ufs
1376  */
1377 int
1378 devname_backstore_lookup(struct sdev_node *ddv, char *nm, struct vnode **rvp)
1379 {
1380 	struct vnode *rdvp = ddv->sdev_attrvp;
1381 	int rval = 0;
1382 
1383 	ASSERT(rdvp);
1384 
1385 	rval = VOP_LOOKUP(rdvp, nm, rvp, NULL, 0, NULL, kcred, NULL, NULL,
1386 	    NULL);
1387 	return (rval);
1388 }
1389 
1390 static int
1391 sdev_filldir_from_store(struct sdev_node *ddv, int dlen, struct cred *cred)
1392 {
1393 	struct sdev_node *dv = NULL;
1394 	char	*nm;
1395 	struct vnode *dirvp;
1396 	int	error;
1397 	vnode_t	*vp;
1398 	int eof;
1399 	struct iovec iov;
1400 	struct uio uio;
1401 	struct dirent64 *dp;
1402 	dirent64_t *dbuf;
1403 	size_t dbuflen;
1404 	struct vattr vattr;
1405 	char *link = NULL;
1406 
1407 	if (ddv->sdev_attrvp == NULL)
1408 		return (0);
1409 	if (!(ddv->sdev_flags & SDEV_BUILD))
1410 		return (0);
1411 
1412 	dirvp = ddv->sdev_attrvp;
1413 	VN_HOLD(dirvp);
1414 	dbuf = kmem_zalloc(dlen, KM_SLEEP);
1415 
1416 	uio.uio_iov = &iov;
1417 	uio.uio_iovcnt = 1;
1418 	uio.uio_segflg = UIO_SYSSPACE;
1419 	uio.uio_fmode = 0;
1420 	uio.uio_extflg = UIO_COPY_CACHED;
1421 	uio.uio_loffset = 0;
1422 	uio.uio_llimit = MAXOFFSET_T;
1423 
1424 	eof = 0;
1425 	error = 0;
1426 	while (!error && !eof) {
1427 		uio.uio_resid = dlen;
1428 		iov.iov_base = (char *)dbuf;
1429 		iov.iov_len = dlen;
1430 		(void) VOP_RWLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1431 		error = VOP_READDIR(dirvp, &uio, kcred, &eof, NULL, 0);
1432 		VOP_RWUNLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1433 
1434 		dbuflen = dlen - uio.uio_resid;
1435 		if (error || dbuflen == 0)
1436 			break;
1437 
1438 		if (!(ddv->sdev_flags & SDEV_BUILD))
1439 			break;
1440 
1441 		for (dp = dbuf; ((intptr_t)dp <
1442 		    (intptr_t)dbuf + dbuflen);
1443 		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
1444 			nm = dp->d_name;
1445 
1446 			if (strcmp(nm, ".") == 0 ||
1447 			    strcmp(nm, "..") == 0)
1448 				continue;
1449 
1450 			vp = NULLVP;
1451 			dv = sdev_cache_lookup(ddv, nm);
1452 			if (dv) {
1453 				if (dv->sdev_state != SDEV_ZOMBIE) {
1454 					SDEV_SIMPLE_RELE(dv);
1455 				} else {
1456 					/*
1457 					 * A ZOMBIE node may not have been
1458 					 * cleaned up from the backing store,
1459 					 * bypass this entry in this case,
1460 					 * and clean it up from the directory
1461 					 * cache if this is the last call.
1462 					 */
1463 					(void) sdev_dirdelete(ddv, dv);
1464 				}
1465 				continue;
1466 			}
1467 
1468 			/* refill the cache if not already */
1469 			error = devname_backstore_lookup(ddv, nm, &vp);
1470 			if (error)
1471 				continue;
1472 
1473 			vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
1474 			error = VOP_GETATTR(vp, &vattr, 0, cred, NULL);
1475 			if (error)
1476 				continue;
1477 
1478 			if (vattr.va_type == VLNK) {
1479 				error = sdev_getlink(vp, &link);
1480 				if (error) {
1481 					continue;
1482 				}
1483 				ASSERT(link != NULL);
1484 			}
1485 
1486 			if (!rw_tryupgrade(&ddv->sdev_contents)) {
1487 				rw_exit(&ddv->sdev_contents);
1488 				rw_enter(&ddv->sdev_contents, RW_WRITER);
1489 			}
1490 			error = sdev_mknode(ddv, nm, &dv, &vattr, vp, link,
1491 			    cred, SDEV_READY);
1492 			rw_downgrade(&ddv->sdev_contents);
1493 
1494 			if (link != NULL) {
1495 				kmem_free(link, strlen(link) + 1);
1496 				link = NULL;
1497 			}
1498 
1499 			if (!error) {
1500 				ASSERT(dv);
1501 				ASSERT(dv->sdev_state != SDEV_ZOMBIE);
1502 				SDEV_SIMPLE_RELE(dv);
1503 			}
1504 			vp = NULL;
1505 			dv = NULL;
1506 		}
1507 	}
1508 
1509 done:
1510 	VN_RELE(dirvp);
1511 	kmem_free(dbuf, dlen);
1512 
1513 	return (error);
1514 }
1515 
1516 void
1517 sdev_filldir_dynamic(struct sdev_node *ddv)
1518 {
1519 	int error;
1520 	int i;
1521 	struct vattr vattr;
1522 	struct vattr *vap = &vattr;
1523 	char *nm = NULL;
1524 	struct sdev_node *dv = NULL;
1525 
1526 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1527 	ASSERT((ddv->sdev_flags & SDEV_BUILD));
1528 
1529 	*vap = *sdev_getdefault_attr(VDIR);	/* note structure copy here */
1530 	gethrestime(&vap->va_atime);
1531 	vap->va_mtime = vap->va_atime;
1532 	vap->va_ctime = vap->va_atime;
1533 	for (i = 0; vtab[i].vt_name != NULL; i++) {
1534 		nm = vtab[i].vt_name;
1535 		ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1536 		dv = NULL;
1537 		error = sdev_mknode(ddv, nm, &dv, vap, NULL,
1538 		    NULL, kcred, SDEV_READY);
1539 		if (error) {
1540 			cmn_err(CE_WARN, "%s/%s: error %d\n",
1541 			    ddv->sdev_name, nm, error);
1542 		} else {
1543 			ASSERT(dv);
1544 			ASSERT(dv->sdev_state != SDEV_ZOMBIE);
1545 			SDEV_SIMPLE_RELE(dv);
1546 		}
1547 	}
1548 }
1549 
1550 /*
1551  * Creating a backing store entry based on sdev_attr.
1552  * This is called either as part of node creation in a persistent directory
1553  * or from setattr/setsecattr to persist access attributes across reboot.
1554  */
1555 int
1556 sdev_shadow_node(struct sdev_node *dv, struct cred *cred)
1557 {
1558 	int error = 0;
1559 	struct vnode *dvp = SDEVTOV(dv->sdev_dotdot);
1560 	struct vnode *rdvp = VTOSDEV(dvp)->sdev_attrvp;
1561 	struct vattr *vap = dv->sdev_attr;
1562 	char *nm = dv->sdev_name;
1563 	struct vnode *tmpvp, **rvp = &tmpvp, *rrvp = NULL;
1564 
1565 	ASSERT(dv && dv->sdev_name && rdvp);
1566 	ASSERT(RW_WRITE_HELD(&dv->sdev_contents) && dv->sdev_attrvp == NULL);
1567 
1568 lookup:
1569 	/* try to find it in the backing store */
1570 	error = VOP_LOOKUP(rdvp, nm, rvp, NULL, 0, NULL, cred, NULL, NULL,
1571 	    NULL);
1572 	if (error == 0) {
1573 		if (VOP_REALVP(*rvp, &rrvp, NULL) == 0) {
1574 			VN_HOLD(rrvp);
1575 			VN_RELE(*rvp);
1576 			*rvp = rrvp;
1577 		}
1578 
1579 		kmem_free(dv->sdev_attr, sizeof (vattr_t));
1580 		dv->sdev_attr = NULL;
1581 		dv->sdev_attrvp = *rvp;
1582 		return (0);
1583 	}
1584 
1585 	/* let's try to persist the node */
1586 	gethrestime(&vap->va_atime);
1587 	vap->va_mtime = vap->va_atime;
1588 	vap->va_ctime = vap->va_atime;
1589 	vap->va_mask |= AT_TYPE|AT_MODE;
1590 	switch (vap->va_type) {
1591 	case VDIR:
1592 		error = VOP_MKDIR(rdvp, nm, vap, rvp, cred, NULL, 0, NULL);
1593 		sdcmn_err9(("sdev_shadow_node: mkdir vp %p error %d\n",
1594 		    (void *)(*rvp), error));
1595 		break;
1596 	case VCHR:
1597 	case VBLK:
1598 	case VREG:
1599 	case VDOOR:
1600 		error = VOP_CREATE(rdvp, nm, vap, NONEXCL, VREAD|VWRITE,
1601 		    rvp, cred, 0, NULL, NULL);
1602 		sdcmn_err9(("sdev_shadow_node: create vp %p, error %d\n",
1603 		    (void *)(*rvp), error));
1604 		if (!error)
1605 			VN_RELE(*rvp);
1606 		break;
1607 	case VLNK:
1608 		ASSERT(dv->sdev_symlink);
1609 		error = VOP_SYMLINK(rdvp, nm, vap, dv->sdev_symlink, cred,
1610 		    NULL, 0);
1611 		sdcmn_err9(("sdev_shadow_node: create symlink error %d\n",
1612 		    error));
1613 		break;
1614 	default:
1615 		cmn_err(CE_PANIC, "dev: %s: sdev_shadow_node "
1616 		    "create\n", nm);
1617 		/*NOTREACHED*/
1618 	}
1619 
1620 	/* go back to lookup to factor out spec node and set attrvp */
1621 	if (error == 0)
1622 		goto lookup;
1623 
1624 	sdcmn_err(("cannot persist %s - error %d\n", dv->sdev_path, error));
1625 	return (error);
1626 }
1627 
1628 static int
1629 sdev_cache_add(struct sdev_node *ddv, struct sdev_node **dv, char *nm)
1630 {
1631 	int error = 0;
1632 	struct sdev_node *dup = NULL;
1633 
1634 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1635 	if ((dup = sdev_findbyname(ddv, nm)) == NULL) {
1636 		sdev_direnter(ddv, *dv);
1637 	} else {
1638 		if (dup->sdev_state == SDEV_ZOMBIE) {
1639 			error = sdev_dirdelete(ddv, dup);
1640 			/*
1641 			 * The ZOMBIE node is still hanging
1642 			 * around with more than one reference counts.
1643 			 * Fail the new node creation so that
1644 			 * the directory cache won't have
1645 			 * duplicate entries for the same named node
1646 			 */
1647 			if (error == EBUSY) {
1648 				SDEV_SIMPLE_RELE(*dv);
1649 				sdev_nodedestroy(*dv, 0);
1650 				*dv = NULL;
1651 				return (error);
1652 			}
1653 			sdev_direnter(ddv, *dv);
1654 		} else {
1655 			ASSERT((*dv)->sdev_state != SDEV_ZOMBIE);
1656 			SDEV_SIMPLE_RELE(*dv);
1657 			sdev_nodedestroy(*dv, 0);
1658 			*dv = dup;
1659 		}
1660 	}
1661 
1662 	return (0);
1663 }
1664 
1665 static int
1666 sdev_cache_delete(struct sdev_node *ddv, struct sdev_node **dv)
1667 {
1668 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1669 	return (sdev_dirdelete(ddv, *dv));
1670 }
1671 
1672 /*
1673  * update the in-core directory cache
1674  */
1675 int
1676 sdev_cache_update(struct sdev_node *ddv, struct sdev_node **dv, char *nm,
1677     sdev_cache_ops_t ops)
1678 {
1679 	int error = 0;
1680 
1681 	ASSERT((SDEV_HELD(*dv)));
1682 
1683 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1684 	switch (ops) {
1685 	case SDEV_CACHE_ADD:
1686 		error = sdev_cache_add(ddv, dv, nm);
1687 		break;
1688 	case SDEV_CACHE_DELETE:
1689 		error = sdev_cache_delete(ddv, dv);
1690 		break;
1691 	default:
1692 		break;
1693 	}
1694 
1695 	return (error);
1696 }
1697 
1698 /*
1699  * retrieve the named entry from the directory cache
1700  */
1701 struct sdev_node *
1702 sdev_cache_lookup(struct sdev_node *ddv, char *nm)
1703 {
1704 	struct sdev_node *dv = NULL;
1705 
1706 	ASSERT(RW_LOCK_HELD(&ddv->sdev_contents));
1707 	dv = sdev_findbyname(ddv, nm);
1708 
1709 	return (dv);
1710 }
1711 
1712 /*
1713  * Implicit reconfig for nodes constructed by a link generator
1714  * Start devfsadm if needed, or if devfsadm is in progress,
1715  * prepare to block on devfsadm either completing or
1716  * constructing the desired node.  As devfsadmd is global
1717  * in scope, constructing all necessary nodes, we only
1718  * need to initiate it once.
1719  */
1720 static int
1721 sdev_call_devfsadmd(struct sdev_node *ddv, struct sdev_node *dv, char *nm)
1722 {
1723 	int error = 0;
1724 
1725 	if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state)) {
1726 		sdcmn_err6(("lookup: waiting for %s/%s, 0x%x\n",
1727 		    ddv->sdev_name, nm, devfsadm_state));
1728 		mutex_enter(&dv->sdev_lookup_lock);
1729 		SDEV_BLOCK_OTHERS(dv, (SDEV_LOOKUP | SDEV_LGWAITING));
1730 		mutex_exit(&dv->sdev_lookup_lock);
1731 		error = 0;
1732 	} else if (!DEVNAME_DEVFSADM_HAS_RUN(devfsadm_state)) {
1733 		sdcmn_err6(("lookup %s/%s starting devfsadm, 0x%x\n",
1734 		    ddv->sdev_name, nm, devfsadm_state));
1735 
1736 		sdev_devfsadmd_thread(ddv, dv, kcred);
1737 		mutex_enter(&dv->sdev_lookup_lock);
1738 		SDEV_BLOCK_OTHERS(dv,
1739 		    (SDEV_LOOKUP | SDEV_LGWAITING));
1740 		mutex_exit(&dv->sdev_lookup_lock);
1741 		error = 0;
1742 	} else {
1743 		error = -1;
1744 	}
1745 
1746 	return (error);
1747 }
1748 
1749 /*
1750  *  Support for specialized device naming construction mechanisms
1751  */
1752 static int
1753 sdev_call_dircallback(struct sdev_node *ddv, struct sdev_node **dvp, char *nm,
1754     int (*callback)(struct sdev_node *, char *, void **, struct cred *,
1755     void *, char *), int flags, struct cred *cred)
1756 {
1757 	int rv = 0;
1758 	char *physpath = NULL;
1759 	struct vattr vattr;
1760 	struct vattr *vap = &vattr;
1761 	struct sdev_node *dv = NULL;
1762 
1763 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1764 	if (flags & SDEV_VLINK) {
1765 		physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1766 		rv = callback(ddv, nm, (void *)&physpath, kcred, NULL,
1767 		    NULL);
1768 		if (rv) {
1769 			kmem_free(physpath, MAXPATHLEN);
1770 			return (-1);
1771 		}
1772 
1773 		*vap = *sdev_getdefault_attr(VLNK);	/* structure copy */
1774 		vap->va_size = strlen(physpath);
1775 		gethrestime(&vap->va_atime);
1776 		vap->va_mtime = vap->va_atime;
1777 		vap->va_ctime = vap->va_atime;
1778 
1779 		rv = sdev_mknode(ddv, nm, &dv, vap, NULL,
1780 		    (void *)physpath, cred, SDEV_READY);
1781 		kmem_free(physpath, MAXPATHLEN);
1782 		if (rv)
1783 			return (rv);
1784 	} else if (flags & SDEV_VATTR) {
1785 		/*
1786 		 * /dev/pts
1787 		 *
1788 		 * callback is responsible to set the basic attributes,
1789 		 * e.g. va_type/va_uid/va_gid/
1790 		 *    dev_t if VCHR or VBLK/
1791 		 */
1792 		ASSERT(callback);
1793 		rv = callback(ddv, nm, (void *)&vattr, kcred, NULL, NULL);
1794 		if (rv) {
1795 			sdcmn_err3(("devname_lookup_func: SDEV_NONE "
1796 			    "callback failed \n"));
1797 			return (-1);
1798 		}
1799 
1800 		rv = sdev_mknode(ddv, nm, &dv, &vattr, NULL, NULL,
1801 		    cred, SDEV_READY);
1802 
1803 		if (rv)
1804 			return (rv);
1805 
1806 	} else {
1807 		impossible(("lookup: %s/%s by %s not supported (%d)\n",
1808 		    SDEVTOV(ddv)->v_path, nm, curproc->p_user.u_comm,
1809 		    __LINE__));
1810 		rv = -1;
1811 	}
1812 
1813 	*dvp = dv;
1814 	return (rv);
1815 }
1816 
/*
 * Return 1 when exec_name is exactly "devfsadm", 0 otherwise.
 */
static int
is_devfsadm_thread(char *exec_name)
{
	/*
	 * note: because devfsadmd -> /usr/sbin/devfsadm
	 * it is safe to use "devfsadm" to capture the lookups
	 * from devfsadm and its daemon version.
	 */
	return ((strcmp(exec_name, "devfsadm") == 0) ? 1 : 0);
}
1829 
1830 /*
1831  * Lookup Order:
1832  *	sdev_node cache;
1833  *	backing store (SDEV_PERSIST);
1834  *	DBNR: a. dir_ops implemented in the loadable modules;
1835  *	      b. vnode ops in vtab.
1836  */
1837 int
1838 devname_lookup_func(struct sdev_node *ddv, char *nm, struct vnode **vpp,
1839     struct cred *cred, int (*callback)(struct sdev_node *, char *, void **,
1840     struct cred *, void *, char *), int flags)
1841 {
1842 	int rv = 0, nmlen;
1843 	struct vnode *rvp = NULL;
1844 	struct sdev_node *dv = NULL;
1845 	int	retried = 0;
1846 	int	error = 0;
1847 	struct vattr vattr;
1848 	char *lookup_thread = curproc->p_user.u_comm;
1849 	int failed_flags = 0;
1850 	int (*vtor)(struct sdev_node *) = NULL;
1851 	int state;
1852 	int parent_state;
1853 	char *link = NULL;
1854 
1855 	if (SDEVTOV(ddv)->v_type != VDIR)
1856 		return (ENOTDIR);
1857 
1858 	/*
1859 	 * Empty name or ., return node itself.
1860 	 */
1861 	nmlen = strlen(nm);
1862 	if ((nmlen == 0) || ((nmlen == 1) && (nm[0] == '.'))) {
1863 		*vpp = SDEVTOV(ddv);
1864 		VN_HOLD(*vpp);
1865 		return (0);
1866 	}
1867 
1868 	/*
1869 	 * .., return the parent directory
1870 	 */
1871 	if ((nmlen == 2) && (strcmp(nm, "..") == 0)) {
1872 		*vpp = SDEVTOV(ddv->sdev_dotdot);
1873 		VN_HOLD(*vpp);
1874 		return (0);
1875 	}
1876 
1877 	rw_enter(&ddv->sdev_contents, RW_READER);
1878 	if (ddv->sdev_flags & SDEV_VTOR) {
1879 		vtor = (int (*)(struct sdev_node *))sdev_get_vtor(ddv);
1880 		ASSERT(vtor);
1881 	}
1882 
1883 tryagain:
1884 	/*
1885 	 * (a) directory cache lookup:
1886 	 */
1887 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
1888 	parent_state = ddv->sdev_state;
1889 	dv = sdev_cache_lookup(ddv, nm);
1890 	if (dv) {
1891 		state = dv->sdev_state;
1892 		switch (state) {
1893 		case SDEV_INIT:
1894 			if (is_devfsadm_thread(lookup_thread))
1895 				break;
1896 
1897 			/* ZOMBIED parent won't allow node creation */
1898 			if (parent_state == SDEV_ZOMBIE) {
1899 				SD_TRACE_FAILED_LOOKUP(ddv, nm,
1900 				    retried);
1901 				goto nolock_notfound;
1902 			}
1903 
1904 			mutex_enter(&dv->sdev_lookup_lock);
1905 			/* compensate the threads started after devfsadm */
1906 			if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state) &&
1907 			    !(SDEV_IS_LOOKUP(dv)))
1908 				SDEV_BLOCK_OTHERS(dv,
1909 				    (SDEV_LOOKUP | SDEV_LGWAITING));
1910 
1911 			if (SDEV_IS_LOOKUP(dv)) {
1912 				failed_flags |= SLF_REBUILT;
1913 				rw_exit(&ddv->sdev_contents);
1914 				error = sdev_wait4lookup(dv, SDEV_LOOKUP);
1915 				mutex_exit(&dv->sdev_lookup_lock);
1916 				rw_enter(&ddv->sdev_contents, RW_READER);
1917 
1918 				if (error != 0) {
1919 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
1920 					    retried);
1921 					goto nolock_notfound;
1922 				}
1923 
1924 				state = dv->sdev_state;
1925 				if (state == SDEV_INIT) {
1926 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
1927 					    retried);
1928 					goto nolock_notfound;
1929 				} else if (state == SDEV_READY) {
1930 					goto found;
1931 				} else if (state == SDEV_ZOMBIE) {
1932 					rw_exit(&ddv->sdev_contents);
1933 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
1934 					    retried);
1935 					SDEV_RELE(dv);
1936 					goto lookup_failed;
1937 				}
1938 			} else {
1939 				mutex_exit(&dv->sdev_lookup_lock);
1940 			}
1941 			break;
1942 		case SDEV_READY:
1943 			goto found;
1944 		case SDEV_ZOMBIE:
1945 			rw_exit(&ddv->sdev_contents);
1946 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
1947 			SDEV_RELE(dv);
1948 			goto lookup_failed;
1949 		default:
1950 			rw_exit(&ddv->sdev_contents);
1951 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
1952 			sdev_lookup_failed(ddv, nm, failed_flags);
1953 			*vpp = NULLVP;
1954 			return (ENOENT);
1955 		}
1956 	}
1957 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
1958 
1959 	/*
1960 	 * ZOMBIED parent does not allow new node creation.
1961 	 * bail out early
1962 	 */
1963 	if (parent_state == SDEV_ZOMBIE) {
1964 		rw_exit(&ddv->sdev_contents);
1965 		*vpp = NULLVP;
1966 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
1967 		return (ENOENT);
1968 	}
1969 
1970 	/*
1971 	 * (b0): backing store lookup
1972 	 *	SDEV_PERSIST is default except:
1973 	 *		1) pts nodes
1974 	 *		2) non-chmod'ed local nodes
1975 	 *		3) zvol nodes
1976 	 */
1977 	if (SDEV_IS_PERSIST(ddv)) {
1978 		error = devname_backstore_lookup(ddv, nm, &rvp);
1979 
1980 		if (!error) {
1981 
1982 			vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
1983 			error = VOP_GETATTR(rvp, &vattr, 0, cred, NULL);
1984 			if (error) {
1985 				rw_exit(&ddv->sdev_contents);
1986 				if (dv)
1987 					SDEV_RELE(dv);
1988 				SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
1989 				sdev_lookup_failed(ddv, nm, failed_flags);
1990 				*vpp = NULLVP;
1991 				return (ENOENT);
1992 			}
1993 
1994 			if (vattr.va_type == VLNK) {
1995 				error = sdev_getlink(rvp, &link);
1996 				if (error) {
1997 					rw_exit(&ddv->sdev_contents);
1998 					if (dv)
1999 						SDEV_RELE(dv);
2000 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
2001 					    retried);
2002 					sdev_lookup_failed(ddv, nm,
2003 					    failed_flags);
2004 					*vpp = NULLVP;
2005 					return (ENOENT);
2006 				}
2007 				ASSERT(link != NULL);
2008 			}
2009 
2010 			if (!rw_tryupgrade(&ddv->sdev_contents)) {
2011 				rw_exit(&ddv->sdev_contents);
2012 				rw_enter(&ddv->sdev_contents, RW_WRITER);
2013 			}
2014 			error = sdev_mknode(ddv, nm, &dv, &vattr,
2015 			    rvp, link, cred, SDEV_READY);
2016 			rw_downgrade(&ddv->sdev_contents);
2017 
2018 			if (link != NULL) {
2019 				kmem_free(link, strlen(link) + 1);
2020 				link = NULL;
2021 			}
2022 
2023 			if (error) {
2024 				SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2025 				rw_exit(&ddv->sdev_contents);
2026 				if (dv)
2027 					SDEV_RELE(dv);
2028 				goto lookup_failed;
2029 			} else {
2030 				goto found;
2031 			}
2032 		} else if (retried) {
2033 			rw_exit(&ddv->sdev_contents);
2034 			sdcmn_err3(("retry of lookup of %s/%s: failed\n",
2035 			    ddv->sdev_name, nm));
2036 			if (dv)
2037 				SDEV_RELE(dv);
2038 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2039 			sdev_lookup_failed(ddv, nm, failed_flags);
2040 			*vpp = NULLVP;
2041 			return (ENOENT);
2042 		}
2043 	}
2044 
2045 lookup_create_node:
2046 	/* first thread that is doing the lookup on this node */
2047 	if (callback) {
2048 		ASSERT(dv == NULL);
2049 		if (!rw_tryupgrade(&ddv->sdev_contents)) {
2050 			rw_exit(&ddv->sdev_contents);
2051 			rw_enter(&ddv->sdev_contents, RW_WRITER);
2052 		}
2053 		error = sdev_call_dircallback(ddv, &dv, nm, callback,
2054 		    flags, cred);
2055 		rw_downgrade(&ddv->sdev_contents);
2056 		if (error == 0) {
2057 			goto found;
2058 		} else {
2059 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2060 			rw_exit(&ddv->sdev_contents);
2061 			goto lookup_failed;
2062 		}
2063 	}
2064 	if (!dv) {
2065 		if (!rw_tryupgrade(&ddv->sdev_contents)) {
2066 			rw_exit(&ddv->sdev_contents);
2067 			rw_enter(&ddv->sdev_contents, RW_WRITER);
2068 		}
2069 		error = sdev_mknode(ddv, nm, &dv, NULL, NULL, NULL,
2070 		    cred, SDEV_INIT);
2071 		if (!dv) {
2072 			rw_exit(&ddv->sdev_contents);
2073 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2074 			sdev_lookup_failed(ddv, nm, failed_flags);
2075 			*vpp = NULLVP;
2076 			return (ENOENT);
2077 		}
2078 		rw_downgrade(&ddv->sdev_contents);
2079 	}
2080 
2081 	/*
2082 	 * (b1) invoking devfsadm once per life time for devfsadm nodes
2083 	 */
2084 	ASSERT(SDEV_HELD(dv));
2085 
2086 	if (SDEV_IS_NO_NCACHE(dv))
2087 		failed_flags |= SLF_NO_NCACHE;
2088 	if (sdev_reconfig_boot || !i_ddi_io_initialized() ||
2089 	    SDEV_IS_DYNAMIC(ddv) || SDEV_IS_NO_NCACHE(dv) ||
2090 	    ((moddebug & MODDEBUG_FINI_EBUSY) != 0)) {
2091 		ASSERT(SDEV_HELD(dv));
2092 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2093 		goto nolock_notfound;
2094 	}
2095 
2096 	/*
2097 	 * filter out known non-existent devices recorded
2098 	 * during initial reconfiguration boot for which
2099 	 * reconfig should not be done and lookup may
2100 	 * be short-circuited now.
2101 	 */
2102 	if (sdev_lookup_filter(ddv, nm)) {
2103 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2104 		goto nolock_notfound;
2105 	}
2106 
2107 	/* bypassing devfsadm internal nodes */
2108 	if (is_devfsadm_thread(lookup_thread)) {
2109 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2110 		goto nolock_notfound;
2111 	}
2112 
2113 	if (sdev_reconfig_disable) {
2114 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2115 		goto nolock_notfound;
2116 	}
2117 
2118 	error = sdev_call_devfsadmd(ddv, dv, nm);
2119 	if (error == 0) {
2120 		sdcmn_err8(("lookup of %s/%s by %s: reconfig\n",
2121 		    ddv->sdev_name, nm, curproc->p_user.u_comm));
2122 		if (sdev_reconfig_verbose) {
2123 			cmn_err(CE_CONT,
2124 			    "?lookup of %s/%s by %s: reconfig\n",
2125 			    ddv->sdev_name, nm, curproc->p_user.u_comm);
2126 		}
2127 		retried = 1;
2128 		failed_flags |= SLF_REBUILT;
2129 		ASSERT(dv->sdev_state != SDEV_ZOMBIE);
2130 		SDEV_SIMPLE_RELE(dv);
2131 		goto tryagain;
2132 	} else {
2133 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2134 		goto nolock_notfound;
2135 	}
2136 
2137 found:
2138 	ASSERT(!(dv->sdev_flags & SDEV_STALE));
2139 	ASSERT(dv->sdev_state == SDEV_READY);
2140 	if (vtor) {
2141 		/*
2142 		 * Check validity of returned node
2143 		 */
2144 		switch (vtor(dv)) {
2145 		case SDEV_VTOR_VALID:
2146 			break;
2147 		case SDEV_VTOR_STALE:
2148 			/*
2149 			 * The name exists, but the cache entry is
2150 			 * stale and needs to be re-created.
2151 			 */
2152 			ASSERT(RW_READ_HELD(&ddv->sdev_contents));
2153 			if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
2154 				rw_exit(&ddv->sdev_contents);
2155 				rw_enter(&ddv->sdev_contents, RW_WRITER);
2156 			}
2157 			error = sdev_cache_update(ddv, &dv, nm,
2158 			    SDEV_CACHE_DELETE);
2159 			rw_downgrade(&ddv->sdev_contents);
2160 			if (error == 0) {
2161 				dv = NULL;
2162 				goto lookup_create_node;
2163 			}
2164 			/* FALLTHRU */
2165 		case SDEV_VTOR_INVALID:
2166 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2167 			sdcmn_err7(("lookup: destroy invalid "
2168 			    "node: %s(%p)\n", dv->sdev_name, (void *)dv));
2169 			goto nolock_notfound;
2170 		case SDEV_VTOR_SKIP:
2171 			sdcmn_err7(("lookup: node not applicable - "
2172 			    "skipping: %s(%p)\n", dv->sdev_name, (void *)dv));
2173 			rw_exit(&ddv->sdev_contents);
2174 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2175 			SDEV_RELE(dv);
2176 			goto lookup_failed;
2177 		default:
2178 			cmn_err(CE_PANIC,
2179 			    "dev fs: validator failed: %s(%p)\n",
2180 			    dv->sdev_name, (void *)dv);
2181 			break;
2182 		}
2183 	}
2184 
2185 	rw_exit(&ddv->sdev_contents);
2186 	rv = sdev_to_vp(dv, vpp);
2187 	sdcmn_err3(("devname_lookup_func: returning vp %p v_count %d state %d "
2188 	    "for nm %s, error %d\n", (void *)*vpp, (*vpp)->v_count,
2189 	    dv->sdev_state, nm, rv));
2190 	return (rv);
2191 
2192 nolock_notfound:
2193 	/*
2194 	 * Destroy the node that is created for synchronization purposes.
2195 	 */
2196 	sdcmn_err3(("devname_lookup_func: %s with state %d\n",
2197 	    nm, dv->sdev_state));
2198 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
2199 	if (dv->sdev_state == SDEV_INIT) {
2200 		if (!rw_tryupgrade(&ddv->sdev_contents)) {
2201 			rw_exit(&ddv->sdev_contents);
2202 			rw_enter(&ddv->sdev_contents, RW_WRITER);
2203 		}
2204 
2205 		/*
2206 		 * Node state may have changed during the lock
2207 		 * changes. Re-check.
2208 		 */
2209 		if (dv->sdev_state == SDEV_INIT) {
2210 			(void) sdev_dirdelete(ddv, dv);
2211 			rw_exit(&ddv->sdev_contents);
2212 			sdev_lookup_failed(ddv, nm, failed_flags);
2213 			*vpp = NULL;
2214 			return (ENOENT);
2215 		}
2216 	}
2217 
2218 	rw_exit(&ddv->sdev_contents);
2219 	SDEV_RELE(dv);
2220 
2221 lookup_failed:
2222 	sdev_lookup_failed(ddv, nm, failed_flags);
2223 	*vpp = NULL;
2224 	return (ENOENT);
2225 }
2226 
2227 /*
2228  * Given a directory node, mark all nodes beneath as
2229  * STALE, i.e. nodes that don't exist as far as new
2230  * consumers are concerned.  Remove them from the
2231  * list of directory entries so that no lookup or
2232  * directory traversal will find them.  The node
2233  * not deallocated so existing holds are not affected.
2234  */
2235 void
2236 sdev_stale(struct sdev_node *ddv)
2237 {
2238 	struct sdev_node *dv;
2239 	struct vnode *vp;
2240 
2241 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
2242 
2243 	rw_enter(&ddv->sdev_contents, RW_WRITER);
2244 	for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = SDEV_NEXT_ENTRY(ddv, dv)) {
2245 		vp = SDEVTOV(dv);
2246 		if (vp->v_type == VDIR)
2247 			sdev_stale(dv);
2248 
2249 		sdcmn_err9(("sdev_stale: setting stale %s\n",
2250 		    dv->sdev_path));
2251 		dv->sdev_flags |= SDEV_STALE;
2252 		avl_remove(&ddv->sdev_entries, dv);
2253 	}
2254 	ddv->sdev_flags |= SDEV_BUILD;
2255 	rw_exit(&ddv->sdev_contents);
2256 }
2257 
2258 /*
2259  * Given a directory node, clean out all the nodes beneath.
2260  * If expr is specified, clean node with names matching expr.
2261  * If SDEV_ENFORCE is specified in flags, busy nodes are made stale,
2262  *	so they are excluded from future lookups.
2263  */
2264 int
2265 sdev_cleandir(struct sdev_node *ddv, char *expr, uint_t flags)
2266 {
2267 	int error = 0;
2268 	int busy = 0;
2269 	struct vnode *vp;
2270 	struct sdev_node *dv, *next = NULL;
2271 	int bkstore = 0;
2272 	int len = 0;
2273 	char *bks_name = NULL;
2274 
2275 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
2276 
2277 	/*
2278 	 * We try our best to destroy all unused sdev_node's
2279 	 */
2280 	rw_enter(&ddv->sdev_contents, RW_WRITER);
2281 	for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = next) {
2282 		next = SDEV_NEXT_ENTRY(ddv, dv);
2283 		vp = SDEVTOV(dv);
2284 
2285 		if (expr && gmatch(dv->sdev_name, expr) == 0)
2286 			continue;
2287 
2288 		if (vp->v_type == VDIR &&
2289 		    sdev_cleandir(dv, NULL, flags) != 0) {
2290 			sdcmn_err9(("sdev_cleandir: dir %s busy\n",
2291 			    dv->sdev_name));
2292 			busy++;
2293 			continue;
2294 		}
2295 
2296 		if (vp->v_count > 0 && (flags & SDEV_ENFORCE) == 0) {
2297 			sdcmn_err9(("sdev_cleandir: dir %s busy\n",
2298 			    dv->sdev_name));
2299 			busy++;
2300 			continue;
2301 		}
2302 
2303 		/*
2304 		 * at this point, either dv is not held or SDEV_ENFORCE
2305 		 * is specified. In either case, dv needs to be deleted
2306 		 */
2307 		SDEV_HOLD(dv);
2308 
2309 		bkstore = SDEV_IS_PERSIST(dv) ? 1 : 0;
2310 		if (bkstore && (vp->v_type == VDIR))
2311 			bkstore += 1;
2312 
2313 		if (bkstore) {
2314 			len = strlen(dv->sdev_name) + 1;
2315 			bks_name = kmem_alloc(len, KM_SLEEP);
2316 			bcopy(dv->sdev_name, bks_name, len);
2317 		}
2318 
2319 		error = sdev_dirdelete(ddv, dv);
2320 
2321 		if (error == EBUSY) {
2322 			sdcmn_err9(("sdev_cleandir: dir busy\n"));
2323 			busy++;
2324 		}
2325 
2326 		/* take care the backing store clean up */
2327 		if (bkstore && (error == 0)) {
2328 			ASSERT(bks_name);
2329 			ASSERT(ddv->sdev_attrvp);
2330 
2331 			if (bkstore == 1) {
2332 				error = VOP_REMOVE(ddv->sdev_attrvp,
2333 				    bks_name, kcred, NULL, 0);
2334 			} else if (bkstore == 2) {
2335 				error = VOP_RMDIR(ddv->sdev_attrvp,
2336 				    bks_name, ddv->sdev_attrvp, kcred, NULL, 0);
2337 			}
2338 
2339 			/* do not propagate the backing store errors */
2340 			if (error) {
2341 				sdcmn_err9(("sdev_cleandir: backing store"
2342 				    "not cleaned\n"));
2343 				error = 0;
2344 			}
2345 
2346 			bkstore = 0;
2347 			kmem_free(bks_name, len);
2348 			bks_name = NULL;
2349 			len = 0;
2350 		}
2351 	}
2352 
2353 	ddv->sdev_flags |= SDEV_BUILD;
2354 	rw_exit(&ddv->sdev_contents);
2355 
2356 	if (busy) {
2357 		error = EBUSY;
2358 	}
2359 
2360 	return (error);
2361 }
2362 
2363 /*
2364  * a convenient wrapper for readdir() funcs
2365  */
2366 size_t
2367 add_dir_entry(dirent64_t *de, char *nm, size_t size, ino_t ino, offset_t off)
2368 {
2369 	size_t reclen = DIRENT64_RECLEN(strlen(nm));
2370 	if (reclen > size)
2371 		return (0);
2372 
2373 	de->d_ino = (ino64_t)ino;
2374 	de->d_off = (off64_t)off + 1;
2375 	de->d_reclen = (ushort_t)reclen;
2376 	(void) strncpy(de->d_name, nm, DIRENT64_NAMELEN(reclen));
2377 	return (reclen);
2378 }
2379 
2380 /*
2381  * sdev_mount service routines
2382  */
2383 int
2384 sdev_copyin_mountargs(struct mounta *uap, struct sdev_mountargs *args)
2385 {
2386 	int	error;
2387 
2388 	if (uap->datalen != sizeof (*args))
2389 		return (EINVAL);
2390 
2391 	if (error = copyin(uap->dataptr, args, sizeof (*args))) {
2392 		cmn_err(CE_WARN, "sdev_copyin_mountargs: can not"
2393 		    "get user data. error %d\n", error);
2394 		return (EFAULT);
2395 	}
2396 
2397 	return (0);
2398 }
2399 
/*
 * Advance from one directory record to the record that follows it in a
 * packed buffer, using the current record's d_reclen as the stride.
 * Any definition of nextdp picked up earlier is discarded first.
 */
#undef	nextdp
#define	nextdp(dp)	\
	((struct dirent64 *)(intptr_t)((char *)(dp) + (dp)->d_reclen))
/*
 * readdir helper func
 *
 * Produces dirent64 records for the directory vp into the caller's uio,
 * which must consist of exactly one iovec.  The uio offset is treated as
 * an entry index: 0 is ".", 1 is "..", and 2 onward are the directory's
 * cached entries in the order SDEV_FIRST_ENTRY/SDEV_NEXT_ENTRY yield them.
 * On success uio_loffset is set to the index at which the scan stopped.
 *
 * The caller must hold ddv->sdev_contents as reader (asserted).  For a
 * global /dev directory the lock is dropped and re-acquired as reader
 * while waiting for a pending readdir/reconfiguration to complete.
 *
 * Returns:
 *	0	with *eofp set to 1 when uio_loffset is already >= MAXOFF_T;
 *	EINVAL	when the uio has more than one iovec, or the buffer is too
 *		small for the "." / ".." records being requested;
 *	ENOTDIR	when vp is not a directory;
 *	otherwise the result of uiomove() if any records were produced, or
 *	whatever error sdev_filldir_from_store() left behind if none were.
 *
 * eofp may be NULL.
 */
int
devname_readdir_func(vnode_t *vp, uio_t *uiop, cred_t *cred, int *eofp,
    int flags)
{
	struct sdev_node *ddv = VTOSDEV(vp);
	struct sdev_node *dv;
	dirent64_t	*dp;
	ulong_t		outcount = 0;
	size_t		namelen;
	ulong_t		alloc_count;
	void		*outbuf;
	struct iovec	*iovp;
	int		error = 0;
	size_t		reclen;
	offset_t	diroff;
	offset_t	soff;
	int		this_reclen;
	int (*vtor)(struct sdev_node *) = NULL;
	struct vattr attr;
	timestruc_t now;

	ASSERT(ddv->sdev_attr || ddv->sdev_attrvp);
	ASSERT(RW_READ_HELD(&ddv->sdev_contents));

	/* nothing can lie beyond the maximum offset: report end of dir */
	if (uiop->uio_loffset >= MAXOFF_T) {
		if (eofp)
			*eofp = 1;
		return (0);
	}

	if (uiop->uio_iovcnt != 1)
		return (EINVAL);

	if (vp->v_type != VDIR)
		return (ENOTDIR);

	/* pick up the directory's validator, if it has one */
	if (ddv->sdev_flags & SDEV_VTOR) {
		vtor = (int (*)(struct sdev_node *))sdev_get_vtor(ddv);
		ASSERT(vtor);
	}

	if (eofp != NULL)
		*eofp = 0;

	/*
	 * Records are assembled in a kernel bounce buffer the same size
	 * as the caller's iovec and copied out in one uiomove() at the end.
	 */
	soff = uiop->uio_loffset;
	iovp = uiop->uio_iov;
	alloc_count = iovp->iov_len;
	dp = outbuf = kmem_alloc(alloc_count, KM_SLEEP);
	outcount = 0;

	/* a zombie directory is served straight from what is cached */
	if (ddv->sdev_state == SDEV_ZOMBIE)
		goto get_cache;

	if (SDEV_IS_GLOBAL(ddv)) {

		if ((sdev_boot_state == SDEV_BOOT_STATE_COMPLETE) &&
		    !sdev_reconfig_boot && (flags & SDEV_BROWSE) &&
		    !SDEV_IS_DYNAMIC(ddv) && !SDEV_IS_NO_NCACHE(ddv) &&
		    ((moddebug & MODDEBUG_FINI_EBUSY) == 0) &&
		    !DEVNAME_DEVFSADM_HAS_RUN(devfsadm_state) &&
		    !DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state) &&
		    !sdev_reconfig_disable) {
			/*
			 * invoking "devfsadm" to do system device reconfig
			 */
			mutex_enter(&ddv->sdev_lookup_lock);
			SDEV_BLOCK_OTHERS(ddv,
			    (SDEV_READDIR|SDEV_LGWAITING));
			mutex_exit(&ddv->sdev_lookup_lock);

			sdcmn_err8(("readdir of %s by %s: reconfig\n",
			    ddv->sdev_path, curproc->p_user.u_comm));
			if (sdev_reconfig_verbose) {
				cmn_err(CE_CONT,
				    "?readdir of %s by %s: reconfig\n",
				    ddv->sdev_path, curproc->p_user.u_comm);
			}

			sdev_devfsadmd_thread(ddv, NULL, kcred);
		} else if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state)) {
			/*
			 * compensate the "ls" started later than "devfsadm"
			 */
			mutex_enter(&ddv->sdev_lookup_lock);
			SDEV_BLOCK_OTHERS(ddv, (SDEV_READDIR|SDEV_LGWAITING));
			mutex_exit(&ddv->sdev_lookup_lock);
		}

		/*
		 * release the contents lock so that
		 * the cache may be updated by devfsadmd
		 */
		rw_exit(&ddv->sdev_contents);
		mutex_enter(&ddv->sdev_lookup_lock);
		if (SDEV_IS_READDIR(ddv))
			(void) sdev_wait4lookup(ddv, SDEV_READDIR);
		mutex_exit(&ddv->sdev_lookup_lock);
		rw_enter(&ddv->sdev_contents, RW_READER);

		sdcmn_err4(("readdir of directory %s by %s\n",
		    ddv->sdev_name, curproc->p_user.u_comm));
		/*
		 * SDEV_BUILD requests a (re)population of the directory;
		 * only persistent directories are refilled from the
		 * backing store.  The flag is cleared either way.
		 */
		if (ddv->sdev_flags & SDEV_BUILD) {
			if (SDEV_IS_PERSIST(ddv)) {
				error = sdev_filldir_from_store(ddv,
				    alloc_count, cred);
			}
			ddv->sdev_flags &= ~SDEV_BUILD;
		}
	}

get_cache:
	/* handle "." and ".." */
	diroff = 0;
	if (soff == 0) {
		/* first time */
		this_reclen = DIRENT64_RECLEN(1);
		if (alloc_count < this_reclen) {
			error = EINVAL;
			goto done;
		}

		dp->d_ino = (ino64_t)ddv->sdev_ino;
		dp->d_off = (off64_t)1;
		dp->d_reclen = (ushort_t)this_reclen;

		(void) strncpy(dp->d_name, ".",
		    DIRENT64_NAMELEN(this_reclen));
		outcount += dp->d_reclen;
		dp = nextdp(dp);
	}

	diroff++;
	if (soff <= 1) {
		this_reclen = DIRENT64_RECLEN(2);
		if (alloc_count < outcount + this_reclen) {
			error = EINVAL;
			goto done;
		}

		dp->d_reclen = (ushort_t)this_reclen;
		dp->d_ino = (ino64_t)ddv->sdev_dotdot->sdev_ino;
		dp->d_off = (off64_t)2;

		(void) strncpy(dp->d_name, "..",
		    DIRENT64_NAMELEN(this_reclen));
		outcount += dp->d_reclen;

		dp = nextdp(dp);
	}


	/* gets the cache */
	diroff++;
	/*
	 * diroff advances for every entry walked, including ones that
	 * are skipped, so an entry's offset is its position in the walk.
	 */
	for (dv = SDEV_FIRST_ENTRY(ddv); dv;
	    dv = SDEV_NEXT_ENTRY(ddv, dv), diroff++) {
		sdcmn_err3(("sdev_readdir: diroff %lld soff %lld for '%s' \n",
		    diroff, soff, dv->sdev_name));

		/* bypassing pre-matured nodes */
		/* (also skips entries before the caller's start offset) */
		if (diroff < soff || (dv->sdev_state != SDEV_READY)) {
			sdcmn_err3(("sdev_readdir: pre-mature node  "
			    "%s %d\n", dv->sdev_name, dv->sdev_state));
			continue;
		}

		/*
		 * Check validity of node
		 * Drop invalid and nodes to be skipped.
		 * A node the validator indicates as stale needs
		 * to be returned as presumably the node name itself
		 * is valid and the node data itself will be refreshed
		 * on lookup.  An application performing a readdir then
		 * stat on each entry should thus always see consistent
		 * data.  In any case, it is not possible to synchronize
		 * with dynamic kernel state, and any view we return can
		 * never be anything more than a snapshot at a point in time.
		 */
		if (vtor) {
			switch (vtor(dv)) {
			case SDEV_VTOR_VALID:
				break;
			case SDEV_VTOR_INVALID:
			case SDEV_VTOR_SKIP:
				continue;
			case SDEV_VTOR_STALE:
				sdcmn_err3(("sdev_readir: %s stale\n",
				    dv->sdev_name));
				break;
			default:
				cmn_err(CE_PANIC,
				    "dev fs: validator failed: %s(%p)\n",
				    dv->sdev_name, (void *)dv);
				break;
			/*NOTREACHED*/
			}
		}

		namelen = strlen(dv->sdev_name);
		reclen = DIRENT64_RECLEN(namelen);
		/* out of room: stop here with dv still non-NULL */
		if (outcount + reclen > alloc_count) {
			goto full;
		}
		dp->d_reclen = (ushort_t)reclen;
		dp->d_ino = (ino64_t)dv->sdev_ino;
		dp->d_off = (off64_t)diroff + 1;
		(void) strncpy(dp->d_name, dv->sdev_name,
		    DIRENT64_NAMELEN(reclen));
		outcount += reclen;
		dp = nextdp(dp);
	}

full:
	sdcmn_err4(("sdev_readdir: moving %lu bytes: "
	    "diroff %lld, soff %lld, dv %p\n", outcount, diroff, soff,
	    (void *)dv));

	if (outcount)
		error = uiomove(outbuf, outcount, UIO_READ, uiop);

	if (!error) {
		uiop->uio_loffset = diroff;
		/* dv is NULL only if the walk ran off the end of the dir */
		if (eofp)
			*eofp = dv ? 0 : 1;
	}


	/* best effort: stamp ctime/atime on the backing store directory */
	if (ddv->sdev_attrvp) {
		gethrestime(&now);
		attr.va_ctime = now;
		attr.va_atime = now;
		attr.va_mask = AT_CTIME|AT_ATIME;

		(void) VOP_SETATTR(ddv->sdev_attrvp, &attr, 0, kcred, NULL);
	}
done:
	kmem_free(outbuf, alloc_count);
	return (error);
}
2647 
2648 static int
2649 sdev_modctl_lookup(const char *path, vnode_t **r_vp)
2650 {
2651 	vnode_t *vp;
2652 	vnode_t *cvp;
2653 	struct sdev_node *svp;
2654 	char *nm;
2655 	struct pathname pn;
2656 	int error;
2657 	int persisted = 0;
2658 
2659 	ASSERT(INGLOBALZONE(curproc));
2660 
2661 	if (error = pn_get((char *)path, UIO_SYSSPACE, &pn))
2662 		return (error);
2663 	nm = kmem_alloc(MAXNAMELEN, KM_SLEEP);
2664 
2665 	vp = rootdir;
2666 	VN_HOLD(vp);
2667 
2668 	while (pn_pathleft(&pn)) {
2669 		ASSERT(vp->v_type == VDIR || vp->v_type == VLNK);
2670 		(void) pn_getcomponent(&pn, nm);
2671 
2672 		/*
2673 		 * Deal with the .. special case where we may be
2674 		 * traversing up across a mount point, to the
2675 		 * root of this filesystem or global root.
2676 		 */
2677 		if (nm[0] == '.' && nm[1] == '.' && nm[2] == 0) {
2678 checkforroot:
2679 			if (VN_CMP(vp, rootdir)) {
2680 				nm[1] = 0;
2681 			} else if (vp->v_flag & VROOT) {
2682 				vfs_t *vfsp;
2683 				cvp = vp;
2684 				vfsp = cvp->v_vfsp;
2685 				vfs_rlock_wait(vfsp);
2686 				vp = cvp->v_vfsp->vfs_vnodecovered;
2687 				if (vp == NULL ||
2688 				    (cvp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
2689 					vfs_unlock(vfsp);
2690 					VN_RELE(cvp);
2691 					error = EIO;
2692 					break;
2693 				}
2694 				VN_HOLD(vp);
2695 				vfs_unlock(vfsp);
2696 				VN_RELE(cvp);
2697 				cvp = NULL;
2698 				goto checkforroot;
2699 			}
2700 		}
2701 
2702 		error = VOP_LOOKUP(vp, nm, &cvp, NULL, 0, NULL, kcred, NULL,
2703 		    NULL, NULL);
2704 		if (error) {
2705 			VN_RELE(vp);
2706 			break;
2707 		}
2708 
2709 		/* traverse mount points encountered on our journey */
2710 		if (vn_ismntpt(cvp) && (error = traverse(&cvp)) != 0) {
2711 			VN_RELE(vp);
2712 			VN_RELE(cvp);
2713 			break;
2714 		}
2715 
2716 		/*
2717 		 * symbolic link, can be either relative and absolute
2718 		 */
2719 		if ((cvp->v_type == VLNK) && pn_pathleft(&pn)) {
2720 			struct pathname linkpath;
2721 			pn_alloc(&linkpath);
2722 			if (error = pn_getsymlink(cvp, &linkpath, kcred)) {
2723 				pn_free(&linkpath);
2724 				break;
2725 			}
2726 			if (pn_pathleft(&linkpath) == 0)
2727 				(void) pn_set(&linkpath, ".");
2728 			error = pn_insert(&pn, &linkpath, strlen(nm));
2729 			pn_free(&linkpath);
2730 			if (pn.pn_pathlen == 0) {
2731 				VN_RELE(vp);
2732 				return (ENOENT);
2733 			}
2734 			if (pn.pn_path[0] == '/') {
2735 				pn_skipslash(&pn);
2736 				VN_RELE(vp);
2737 				VN_RELE(cvp);
2738 				vp = rootdir;
2739 				VN_HOLD(vp);
2740 			} else {
2741 				VN_RELE(cvp);
2742 			}
2743 			continue;
2744 		}
2745 
2746 		VN_RELE(vp);
2747 
2748 		/*
2749 		 * Direct the operation to the persisting filesystem
2750 		 * underlying /dev.  Bail if we encounter a
2751 		 * non-persistent dev entity here.
2752 		 */
2753 		if (cvp->v_vfsp->vfs_fstype == devtype) {
2754 
2755 			if ((VTOSDEV(cvp)->sdev_flags & SDEV_PERSIST) == 0) {
2756 				error = ENOENT;
2757 				VN_RELE(cvp);
2758 				break;
2759 			}
2760 
2761 			if (VTOSDEV(cvp) == NULL) {
2762 				error = ENOENT;
2763 				VN_RELE(cvp);
2764 				break;
2765 			}
2766 			svp = VTOSDEV(cvp);
2767 			if ((vp = svp->sdev_attrvp) == NULL) {
2768 				error = ENOENT;
2769 				VN_RELE(cvp);
2770 				break;
2771 			}
2772 			persisted = 1;
2773 			VN_HOLD(vp);
2774 			VN_RELE(cvp);
2775 			cvp = vp;
2776 		}
2777 
2778 		vp = cvp;
2779 		pn_skipslash(&pn);
2780 	}
2781 
2782 	kmem_free(nm, MAXNAMELEN);
2783 	pn_free(&pn);
2784 
2785 	if (error)
2786 		return (error);
2787 
2788 	/*
2789 	 * Only return persisted nodes in the filesystem underlying /dev.
2790 	 */
2791 	if (!persisted) {
2792 		VN_RELE(vp);
2793 		return (ENOENT);
2794 	}
2795 
2796 	*r_vp = vp;
2797 	return (0);
2798 }
2799 
2800 int
2801 sdev_modctl_readdir(const char *dir, char ***dirlistp,
2802 	int *npathsp, int *npathsp_alloc, int checking_empty)
2803 {
2804 	char	**pathlist = NULL;
2805 	char	**newlist = NULL;
2806 	int	npaths = 0;
2807 	int	npaths_alloc = 0;
2808 	dirent64_t *dbuf = NULL;
2809 	int	n;
2810 	char	*s;
2811 	int error;
2812 	vnode_t *vp;
2813 	int eof;
2814 	struct iovec iov;
2815 	struct uio uio;
2816 	struct dirent64 *dp;
2817 	size_t dlen;
2818 	size_t dbuflen;
2819 	int ndirents = 64;
2820 	char *nm;
2821 
2822 	error = sdev_modctl_lookup(dir, &vp);
2823 	sdcmn_err11(("modctl readdir: %s by %s: %s\n",
2824 	    dir, curproc->p_user.u_comm,
2825 	    (error == 0) ? "ok" : "failed"));
2826 	if (error)
2827 		return (error);
2828 
2829 	dlen = ndirents * (sizeof (*dbuf));
2830 	dbuf = kmem_alloc(dlen, KM_SLEEP);
2831 
2832 	uio.uio_iov = &iov;
2833 	uio.uio_iovcnt = 1;
2834 	uio.uio_segflg = UIO_SYSSPACE;
2835 	uio.uio_fmode = 0;
2836 	uio.uio_extflg = UIO_COPY_CACHED;
2837 	uio.uio_loffset = 0;
2838 	uio.uio_llimit = MAXOFFSET_T;
2839 
2840 	eof = 0;
2841 	error = 0;
2842 	while (!error && !eof) {
2843 		uio.uio_resid = dlen;
2844 		iov.iov_base = (char *)dbuf;
2845 		iov.iov_len = dlen;
2846 
2847 		(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2848 		error = VOP_READDIR(vp, &uio, kcred, &eof, NULL, 0);
2849 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2850 
2851 		dbuflen = dlen - uio.uio_resid;
2852 
2853 		if (error || dbuflen == 0)
2854 			break;
2855 
2856 		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
2857 		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
2858 
2859 			nm = dp->d_name;
2860 
2861 			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
2862 				continue;
2863 			if (npaths == npaths_alloc) {
2864 				npaths_alloc += 64;
2865 				newlist = (char **)
2866 				    kmem_zalloc((npaths_alloc + 1) *
2867 				    sizeof (char *), KM_SLEEP);
2868 				if (pathlist) {
2869 					bcopy(pathlist, newlist,
2870 					    npaths * sizeof (char *));
2871 					kmem_free(pathlist,
2872 					    (npaths + 1) * sizeof (char *));
2873 				}
2874 				pathlist = newlist;
2875 			}
2876 			n = strlen(nm) + 1;
2877 			s = kmem_alloc(n, KM_SLEEP);
2878 			bcopy(nm, s, n);
2879 			pathlist[npaths++] = s;
2880 			sdcmn_err11(("  %s/%s\n", dir, s));
2881 
2882 			/* if checking empty, one entry is as good as many */
2883 			if (checking_empty) {
2884 				eof = 1;
2885 				break;
2886 			}
2887 		}
2888 	}
2889 
2890 exit:
2891 	VN_RELE(vp);
2892 
2893 	if (dbuf)
2894 		kmem_free(dbuf, dlen);
2895 
2896 	if (error)
2897 		return (error);
2898 
2899 	*dirlistp = pathlist;
2900 	*npathsp = npaths;
2901 	*npathsp_alloc = npaths_alloc;
2902 
2903 	return (0);
2904 }
2905 
/*
 * Release a name list produced by sdev_modctl_readdir().
 *
 * npaths is the number of names in the list and npaths_alloc the slot
 * count it was sized for; both must be the values handed back together
 * with the list.  A NULL list (which sdev_modctl_readdir() returns for
 * a directory with no names) is accepted and ignored.
 */
void
sdev_modctl_readdir_free(char **pathlist, int npaths, int npaths_alloc)
{
	int	i, n;

	/* nothing was allocated for an empty result */
	if (pathlist == NULL)
		return;

	for (i = 0; i < npaths; i++) {
		n = strlen(pathlist[i]) + 1;
		kmem_free(pathlist[i], n);
	}

	/* the array carries one slot beyond npaths_alloc */
	kmem_free(pathlist, (npaths_alloc + 1) * sizeof (char *));
}
2918 
2919 int
2920 sdev_modctl_devexists(const char *path)
2921 {
2922 	vnode_t *vp;
2923 	int error;
2924 
2925 	error = sdev_modctl_lookup(path, &vp);
2926 	sdcmn_err11(("modctl dev exists: %s by %s: %s\n",
2927 	    path, curproc->p_user.u_comm,
2928 	    (error == 0) ? "ok" : "failed"));
2929 	if (error == 0)
2930 		VN_RELE(vp);
2931 
2932 	return (error);
2933 }
2934 
2935 extern int sdev_vnodeops_tbl_size;
2936 
2937 /*
2938  * construct a new template with overrides from vtab
2939  */
2940 static fs_operation_def_t *
2941 sdev_merge_vtab(const fs_operation_def_t tab[])
2942 {
2943 	fs_operation_def_t *new;
2944 	const fs_operation_def_t *tab_entry;
2945 
2946 	/* make a copy of standard vnode ops table */
2947 	new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP);
2948 	bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size);
2949 
2950 	/* replace the overrides from tab */
2951 	for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) {
2952 		fs_operation_def_t *std_entry = new;
2953 		while (std_entry->name) {
2954 			if (strcmp(tab_entry->name, std_entry->name) == 0) {
2955 				std_entry->func = tab_entry->func;
2956 				break;
2957 			}
2958 			std_entry++;
2959 		}
2960 		if (std_entry->name == NULL)
2961 			cmn_err(CE_NOTE, "sdev_merge_vtab: entry %s unused.",
2962 			    tab_entry->name);
2963 	}
2964 
2965 	return (new);
2966 }
2967 
2968 /* free memory allocated by sdev_merge_vtab */
2969 static void
2970 sdev_free_vtab(fs_operation_def_t *new)
2971 {
2972 	kmem_free(new, sdev_vnodeops_tbl_size);
2973 }
2974 
2975 /*
2976  * a generic setattr() function
2977  *
2978  * note: flags only supports AT_UID and AT_GID.
2979  *	 Future enhancements can be done for other types, e.g. AT_MODE
2980  */
2981 int
2982 devname_setattr_func(struct vnode *vp, struct vattr *vap, int flags,
2983     struct cred *cred, int (*callback)(struct sdev_node *, struct vattr *,
2984     int), int protocol)
2985 {
2986 	struct sdev_node	*dv = VTOSDEV(vp);
2987 	struct sdev_node	*parent = dv->sdev_dotdot;
2988 	struct vattr		*get;
2989 	uint_t			mask = vap->va_mask;
2990 	int 			error;
2991 
2992 	/* some sanity checks */
2993 	if (vap->va_mask & AT_NOSET)
2994 		return (EINVAL);
2995 
2996 	if (vap->va_mask & AT_SIZE) {
2997 		if (vp->v_type == VDIR) {
2998 			return (EISDIR);
2999 		}
3000 	}
3001 
3002 	/* no need to set attribute, but do not fail either */
3003 	ASSERT(parent);
3004 	rw_enter(&parent->sdev_contents, RW_READER);
3005 	if (dv->sdev_state == SDEV_ZOMBIE) {
3006 		rw_exit(&parent->sdev_contents);
3007 		return (0);
3008 	}
3009 
3010 	/* If backing store exists, just set it. */
3011 	if (dv->sdev_attrvp) {
3012 		rw_exit(&parent->sdev_contents);
3013 		return (VOP_SETATTR(dv->sdev_attrvp, vap, flags, cred, NULL));
3014 	}
3015 
3016 	/*
3017 	 * Otherwise, for nodes with the persistence attribute, create it.
3018 	 */
3019 	ASSERT(dv->sdev_attr);
3020 	if (SDEV_IS_PERSIST(dv) ||
3021 	    ((vap->va_mask & ~AT_TIMES) != 0 && !SDEV_IS_DYNAMIC(dv))) {
3022 		sdev_vattr_merge(dv, vap);
3023 		rw_enter(&dv->sdev_contents, RW_WRITER);
3024 		error = sdev_shadow_node(dv, cred);
3025 		rw_exit(&dv->sdev_contents);
3026 		rw_exit(&parent->sdev_contents);
3027 
3028 		if (error)
3029 			return (error);
3030 		return (VOP_SETATTR(dv->sdev_attrvp, vap, flags, cred, NULL));
3031 	}
3032 
3033 
3034 	/*
3035 	 * sdev_attr was allocated in sdev_mknode
3036 	 */
3037 	rw_enter(&dv->sdev_contents, RW_WRITER);
3038 	error = secpolicy_vnode_setattr(cred, vp, vap,
3039 	    dv->sdev_attr, flags, sdev_unlocked_access, dv);
3040 	if (error) {
3041 		rw_exit(&dv->sdev_contents);
3042 		rw_exit(&parent->sdev_contents);
3043 		return (error);
3044 	}
3045 
3046 	get = dv->sdev_attr;
3047 	if (mask & AT_MODE) {
3048 		get->va_mode &= S_IFMT;
3049 		get->va_mode |= vap->va_mode & ~S_IFMT;
3050 	}
3051 
3052 	if ((mask & AT_UID) || (mask & AT_GID)) {
3053 		if (mask & AT_UID)
3054 			get->va_uid = vap->va_uid;
3055 		if (mask & AT_GID)
3056 			get->va_gid = vap->va_gid;
3057 		/*
3058 		 * a callback must be provided if the protocol is set
3059 		 */
3060 		if ((protocol & AT_UID) || (protocol & AT_GID)) {
3061 			ASSERT(callback);
3062 			error = callback(dv, get, protocol);
3063 			if (error) {
3064 				rw_exit(&dv->sdev_contents);
3065 				rw_exit(&parent->sdev_contents);
3066 				return (error);
3067 			}
3068 		}
3069 	}
3070 
3071 	if (mask & AT_ATIME)
3072 		get->va_atime = vap->va_atime;
3073 	if (mask & AT_MTIME)
3074 		get->va_mtime = vap->va_mtime;
3075 	if (mask & (AT_MODE | AT_UID | AT_GID | AT_CTIME)) {
3076 		gethrestime(&get->va_ctime);
3077 	}
3078 
3079 	sdev_vattr_merge(dv, get);
3080 	rw_exit(&dv->sdev_contents);
3081 	rw_exit(&parent->sdev_contents);
3082 	return (0);
3083 }
3084 
3085 /*
3086  * a generic inactive() function
3087  */
3088 /*ARGSUSED*/
3089 void
3090 devname_inactive_func(struct vnode *vp, struct cred *cred,
3091     void (*callback)(struct vnode *))
3092 {
3093 	int clean;
3094 	struct sdev_node *dv = VTOSDEV(vp);
3095 	struct sdev_node *ddv = dv->sdev_dotdot;
3096 	int state;
3097 
3098 	rw_enter(&ddv->sdev_contents, RW_WRITER);
3099 	state = dv->sdev_state;
3100 
3101 	mutex_enter(&vp->v_lock);
3102 	ASSERT(vp->v_count >= 1);
3103 
3104 	if (vp->v_count == 1 && callback != NULL)
3105 		callback(vp);
3106 
3107 	clean = (vp->v_count == 1) && (state == SDEV_ZOMBIE);
3108 
3109 	/*
3110 	 * last ref count on the ZOMBIE node is released.
3111 	 * clean up the sdev_node, and
3112 	 * release the hold on the backing store node so that
3113 	 * the ZOMBIE backing stores also cleaned out.
3114 	 */
3115 	if (clean) {
3116 		ASSERT(ddv);
3117 
3118 		ddv->sdev_nlink--;
3119 		if (vp->v_type == VDIR) {
3120 			dv->sdev_nlink--;
3121 		}
3122 		if ((dv->sdev_flags & SDEV_STALE) == 0)
3123 			avl_remove(&ddv->sdev_entries, dv);
3124 		dv->sdev_nlink--;
3125 		--vp->v_count;
3126 		mutex_exit(&vp->v_lock);
3127 		sdev_nodedestroy(dv, 0);
3128 	} else {
3129 		--vp->v_count;
3130 		mutex_exit(&vp->v_lock);
3131 	}
3132 	rw_exit(&ddv->sdev_contents);
3133 }
3134