xref: /titanic_50/usr/src/uts/common/fs/devfs/devfs_subr.c (revision 8461248208fabd3a8230615f8615e5bf1b4dcdcb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * miscellaneous routines for the devfs
31  */
32 
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/t_lock.h>
36 #include <sys/systm.h>
37 #include <sys/sysmacros.h>
38 #include <sys/user.h>
39 #include <sys/time.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/file.h>
43 #include <sys/fcntl.h>
44 #include <sys/flock.h>
45 #include <sys/kmem.h>
46 #include <sys/uio.h>
47 #include <sys/errno.h>
48 #include <sys/stat.h>
49 #include <sys/cred.h>
50 #include <sys/dirent.h>
51 #include <sys/pathname.h>
52 #include <sys/cmn_err.h>
53 #include <sys/debug.h>
54 #include <sys/modctl.h>
55 #include <fs/fs_subr.h>
56 #include <sys/fs/dv_node.h>
57 #include <sys/fs/snode.h>
58 #include <sys/sunndi.h>
59 #include <sys/sunmdi.h>
60 #include <sys/conf.h>
61 
62 #ifdef DEBUG
63 int devfs_debug = 0x0;
64 #endif
65 
/* file system name used in devfs diagnostic messages */
66 const char	dvnm[] = "devfs";
67 kmem_cache_t	*dv_node_cache;	/* dv_node cache */
/* thread-specific-data key; dv_find() returns ENOENT early when it is set */
68 uint_t		devfs_clean_key;
/* root dv_node, recorded by dv_mkroot() */
69 struct dv_node *dvroot;
70 
71 /* prototype memory vattrs */
72 vattr_t dv_vattr_dir = {
73 	AT_TYPE|AT_MODE|AT_UID|AT_GID, 		/* va_mask */
74 	VDIR,					/* va_type */
75 	DV_DIRMODE_DEFAULT,			/* va_mode */
76 	DV_UID_DEFAULT,				/* va_uid */
77 	DV_GID_DEFAULT,				/* va_gid */
78 	0,					/* va_fsid; */
79 	0,					/* va_nodeid; */
80 	0,					/* va_nlink; */
81 	0,					/* va_size; */
82 	0,					/* va_atime; */
83 	0,					/* va_mtime; */
84 	0,					/* va_ctime; */
85 	0,					/* va_rdev; */
86 	0,					/* va_blksize; */
87 	0,					/* va_nblocks; */
88 	0,					/* va_seq; */
89 };
90 
91 vattr_t dv_vattr_file = {
92 	AT_TYPE|AT_MODE|AT_SIZE|AT_UID|AT_GID|AT_RDEV,	/* va_mask */
93 	0,					/* va_type */
94 	DV_DEVMODE_DEFAULT,			/* va_mode */
95 	DV_UID_DEFAULT,				/* va_uid */
96 	DV_GID_DEFAULT,				/* va_gid */
97 	0,					/* va_fsid; */
98 	0,					/* va_nodeid; */
99 	0,					/* va_nlink; */
100 	0,					/* va_size; */
101 	0,					/* va_atime; */
102 	0,					/* va_mtime; */
103 	0,					/* va_ctime; */
104 	0,					/* va_rdev; */
105 	0,					/* va_blksize; */
106 	0,					/* va_nblocks; */
107 	0,					/* va_seq; */
108 };
109 
110 vattr_t dv_vattr_priv = {
111 	AT_TYPE|AT_MODE|AT_SIZE|AT_UID|AT_GID|AT_RDEV,	/* va_mask */
112 	0,					/* va_type */
113 	DV_DEVMODE_PRIV,			/* va_mode */
114 	DV_UID_DEFAULT,				/* va_uid */
115 	DV_GID_DEFAULT,				/* va_gid */
116 	0,					/* va_fsid; */
117 	0,					/* va_nodeid; */
118 	0,					/* va_nlink; */
119 	0,					/* va_size; */
120 	0,					/* va_atime; */
121 	0,					/* va_mtime; */
122 	0,					/* va_ctime; */
123 	0,					/* va_rdev; */
124 	0,					/* va_blksize; */
125 	0,					/* va_nblocks; */
126 	0,					/* va_seq; */
127 };
128 
129 extern dev_info_t	*clone_dip;
130 extern major_t		clone_major;
131 extern struct dev_ops	*ddi_hold_driver(major_t);
132 
133 /*
134  * dv_node cache constructor, destructor, can cache creation
135  */
136 /*ARGSUSED1*/
137 static int
138 i_dv_node_ctor(void *buf, void *cfarg, int flag)
139 {
140 	struct dv_node	*dv = (struct dv_node *)buf;
141 	struct vnode	*vp;
142 
143 	bzero(buf, sizeof (struct dv_node));
144 
145 	/* initialize persistent parts of dv_node */
146 	rw_init(&dv->dv_contents, NULL, RW_DEFAULT, NULL);
147 
148 	/* allocate vnode and initialize link back to dv_node */
149 	dv->dv_vnode = vn_alloc(KM_SLEEP);
150 	vp = DVTOV(dv);
151 	vp->v_data = (caddr_t)dv;
152 	return (0);
153 }
154 
155 /* dev_info node destructor for kmem cache */
156 /*ARGSUSED1*/
157 static void
158 i_dv_node_dtor(void *buf, void *arg)
159 {
160 	struct dv_node	*dv = (struct dv_node *)buf;
161 	struct vnode	*vp = DVTOV(dv);
162 
163 	rw_destroy(&dv->dv_contents);
164 	vn_invalid(vp);
165 	vn_free(vp);
166 }
167 
168 
169 /* initialize dev_info node cache */
170 void
171 dv_node_cache_init()
172 {
173 	ASSERT(dv_node_cache == NULL);
174 	dv_node_cache = kmem_cache_create("dv_node_cache",
175 	    sizeof (struct dv_node), 0, i_dv_node_ctor, i_dv_node_dtor,
176 	    NULL, NULL, NULL, 0);
177 
178 	tsd_create(&devfs_clean_key, NULL);
179 }
180 
181 /* initialize dev_info node cache */
182 void
183 dv_node_cache_fini()
184 {
185 	ASSERT(dv_node_cache != NULL);
186 	kmem_cache_destroy(dv_node_cache);
187 	dv_node_cache = NULL;
188 
189 	tsd_destroy(&devfs_clean_key);
190 }
191 
192 /*
193  * dv_mkino - Generate a unique inode number for devfs nodes.
194  *
195  * Although ino_t is 64 bits, the inode number is truncated to 32 bits for 32
196  * bit non-LARGEFILE applications. This means that there is a requirement to
197  * maintain the inode number as a 32 bit value or applications will have
198  * stat(2) calls fail with EOVERFLOW.  We form a 32 bit inode number from the
199  * dev_t; minor bits above L_MAXMIN32 are masked off (see folding note below).
200  *
201  * To generate inode numbers for directories, we assume that we will never use
202  * more than half the major space - this allows for ~8190 drivers. We use this
203  * upper major number space to allocate inode numbers for directories by
204  * encoding the major and instance into this space.
205  *
206  * We also skew the result so that inode 2 is reserved for the root of the file
207  * system.
208  *
209  * As part of the future support for 64-bit dev_t APIs, the upper minor bits
210  * should be folded into the high inode bits by adding the following code
211  * after "ino |= 1":
212  *
213  * #if (L_BITSMINOR32 != L_BITSMINOR)
214  *		|* fold overflow minor bits into high bits of inode number *|
215  *		ino |= ((ino_t)(minor >> L_BITSMINOR32)) << L_BITSMINOR;
216  * #endif |* (L_BITSMINOR32 != L_BITSMINOR) *|
217  *
218  * This way only applications that use devices that overflow their minor
219  * space will have an application level impact.
220  */
221 static ino_t
222 dv_mkino(dev_info_t *devi, vtype_t typ, dev_t dev)
223 {
224 	major_t		major;
225 	minor_t		minor;
226 	ino_t		ino;
227 	static int	warn;
228 
229 	if (typ == VDIR) {
230 		major = ((L_MAXMAJ32 + 1) >> 1) + DEVI(devi)->devi_major;
231 		minor = ddi_get_instance(devi);
232 
233 		/* makedevice32 in high half of major number space */
234 		ino = (ino_t)((major << L_BITSMINOR32) | (minor & L_MAXMIN32));
235 
236 		major = DEVI(devi)->devi_major;
237 	} else {
238 		major = getmajor(dev);
239 		minor = getminor(dev);
240 
241 		/* makedevice32 */
242 		ino = (ino_t)((major << L_BITSMINOR32) | (minor & L_MAXMIN32));
243 
244 		/* make ino for VCHR different than VBLK */
245 		ino <<= 1;
246 		if (typ == VCHR)
247 			ino |= 1;
248 	}
249 
250 	ino += DV_ROOTINO + 1;		/* skew */
251 
252 	/*
253 	 * diagnose things a little early because adding the skew to a large
254 	 * minor number could roll over the major.
255 	 */
256 	if ((major >= (L_MAXMAJ32 >> 1)) && (warn == 0)) {
257 		warn = 1;
258 		cmn_err(CE_WARN, "%s: inode numbers are not unique", dvnm);
259 	}
260 
261 	return (ino);
262 }
263 
264 /*
265  * dv_mkroot
266  *
267  * Build the first VDIR dv_node.
268  */
269 struct dv_node *
270 dv_mkroot(struct vfs *vfsp, dev_t devfsdev)
271 {
272 	struct dv_node *dv;
273 	struct vnode *vp;
274 
275 	ASSERT(ddi_root_node() != NULL);
276 	ASSERT(dv_node_cache != NULL);
277 
278 	dcmn_err3(("dv_mkroot\n"));
279 	dv = kmem_cache_alloc(dv_node_cache, KM_SLEEP);
280 	vp = DVTOV(dv);
281 	vn_reinit(vp);
282 	vp->v_flag = VROOT;
283 	vp->v_vfsp = vfsp;
284 	vp->v_type = VDIR;
285 	vp->v_rdev = devfsdev;
286 	vn_setops(vp, dv_vnodeops);
287 	vn_exists(vp);
288 
289 	dvroot = dv;
290 
291 	dv->dv_name = NULL;		/* not needed */
292 	dv->dv_namelen = 0;
293 
294 	dv->dv_devi = ddi_root_node();
295 
296 	dv->dv_ino = DV_ROOTINO;
297 	dv->dv_nlink = 2;		/* name + . (no dv_insert) */
298 	dv->dv_dotdot = dv;		/* .. == self */
299 	dv->dv_attrvp = NULLVP;
300 	dv->dv_attr = NULL;
301 	dv->dv_flags = DV_BUILD;
302 	dv->dv_priv = NULL;
303 	dv->dv_busy = 0;
304 	dv->dv_dflt_mode = 0;
305 
306 	return (dv);
307 }
308 
309 /*
310  * dv_mkdir
311  *
312  * Given an probed or attached nexus node, create a VDIR dv_node.
313  * No dv_attrvp is created at this point.
314  */
315 struct dv_node *
316 dv_mkdir(struct dv_node *ddv, dev_info_t *devi, char *nm)
317 {
318 	struct dv_node *dv;
319 	struct vnode *vp;
320 	size_t nmlen;
321 
322 	ASSERT((devi));
323 	dcmn_err4(("dv_mkdir: %s\n", nm));
324 
325 	dv = kmem_cache_alloc(dv_node_cache, KM_SLEEP);
326 	nmlen = strlen(nm) + 1;
327 	dv->dv_name = kmem_alloc(nmlen, KM_SLEEP);
328 	bcopy(nm, dv->dv_name, nmlen);
329 	dv->dv_namelen = nmlen - 1;	/* '\0' not included */
330 	vp = DVTOV(dv);
331 	vn_reinit(vp);
332 	vp->v_flag = 0;
333 	vp->v_vfsp = DVTOV(ddv)->v_vfsp;
334 	vp->v_type = VDIR;
335 	vp->v_rdev = DVTOV(ddv)->v_rdev;
336 	vn_setops(vp, vn_getops(DVTOV(ddv)));
337 	vn_exists(vp);
338 
339 	dv->dv_devi = devi;
340 	ndi_hold_devi(devi);
341 
342 	dv->dv_ino = dv_mkino(devi, VDIR, NODEV);
343 	dv->dv_nlink = 0;		/* updated on insert */
344 	dv->dv_dotdot = ddv;
345 	dv->dv_attrvp = NULLVP;
346 	dv->dv_attr = NULL;
347 	dv->dv_flags = DV_BUILD;
348 	dv->dv_priv = NULL;
349 	dv->dv_busy = 0;
350 	dv->dv_dflt_mode = 0;
351 
352 	return (dv);
353 }
354 
355 /*
356  * dv_mknod
357  *
358  * Given a minor node, create a VCHR or VBLK dv_node.
359  * No dv_attrvp is created at this point.
360  */
361 static struct dv_node *
362 dv_mknod(struct dv_node *ddv, dev_info_t *devi, char *nm,
363 	struct ddi_minor_data *dmd)
364 {
365 	struct dv_node *dv;
366 	struct vnode *vp;
367 	size_t nmlen;
368 
369 	dcmn_err4(("dv_mknod: %s\n", nm));
370 
371 	dv = kmem_cache_alloc(dv_node_cache, KM_SLEEP);
372 	nmlen = strlen(nm) + 1;
373 	dv->dv_name = kmem_alloc(nmlen, KM_SLEEP);
374 	bcopy(nm, dv->dv_name, nmlen);
375 	dv->dv_namelen = nmlen - 1;	/* no '\0' */
376 	vp = DVTOV(dv);
377 	vn_reinit(vp);
378 	vp->v_flag = 0;
379 	vp->v_vfsp = DVTOV(ddv)->v_vfsp;
380 	vp->v_type = dmd->ddm_spec_type == S_IFCHR ? VCHR : VBLK;
381 	vp->v_rdev = dmd->ddm_dev;
382 	vn_setops(vp, vn_getops(DVTOV(ddv)));
383 	vn_exists(vp);
384 
385 	ASSERT(MUTEX_HELD(&DEVI(devi)->devi_lock));
386 	dv->dv_devi = devi;
387 	DEVI(devi)->devi_ref++;
388 
389 	dv->dv_ino = dv_mkino(devi, vp->v_type, vp->v_rdev);
390 	dv->dv_nlink = 0;		/* updated on insert */
391 	dv->dv_dotdot = ddv;
392 	dv->dv_attrvp = NULLVP;
393 	dv->dv_attr = NULL;
394 	dv->dv_flags = 0;
395 
396 	if (dmd->type == DDM_INTERNAL_PATH)
397 		dv->dv_flags |= DV_INTERNAL;
398 	if (dmd->ddm_flags & DM_NO_FSPERM)
399 		dv->dv_flags |= DV_NO_FSPERM;
400 
401 	dv->dv_priv = dmd->ddm_node_priv;
402 	if (dv->dv_priv)
403 		dphold(dv->dv_priv);
404 
405 	/*
406 	 * Minors created with ddi_create_priv_minor_node can specify
407 	 * a default mode permission other than the devfs default.
408 	 */
409 	if (dv->dv_priv || dv->dv_flags & DV_NO_FSPERM) {
410 		dcmn_err5(("%s: dv_mknod default priv mode 0%o\n",
411 		    dv->dv_name, dmd->ddm_priv_mode));
412 		dv->dv_flags |= DV_DFLT_MODE;
413 		dv->dv_dflt_mode = dmd->ddm_priv_mode & S_IAMB;
414 	}
415 
416 	return (dv);
417 }
418 
419 /*
420  * dv_destroy
421  *
422  * Destroy what we created in dv_mkdir or dv_mknod.
423  * In the case of a *referenced* directory, do nothing.
424  */
425 /*ARGSUSED1*/
426 void
427 dv_destroy(struct dv_node *dv, uint_t flags)
428 {
429 	vnode_t *vp = DVTOV(dv);
430 	ASSERT(dv->dv_nlink == 0);		/* no references */
431 	ASSERT(dv->dv_next == NULL);		/* unlinked from directory */
432 
433 	dcmn_err4(("dv_destroy: %s\n", dv->dv_name));
434 
435 	/*
436 	 * We may be asked to unlink referenced directories.
437 	 * In this case, there is nothing to be done.
438 	 * The eventual memory free will be done in
439 	 * devfs_inactive.
440 	 */
441 	if (vp->v_count != 0) {
442 		ASSERT(vp->v_type == VDIR);
443 		ASSERT(flags & DV_CLEAN_FORCE);
444 		ASSERT(DV_STALE(dv));
445 		return;
446 	}
447 
448 	if (dv->dv_attrvp != NULLVP)
449 		VN_RELE(dv->dv_attrvp);
450 	if (dv->dv_attr != NULL)
451 		kmem_free(dv->dv_attr, sizeof (struct vattr));
452 	if (dv->dv_name != NULL)
453 		kmem_free(dv->dv_name, dv->dv_namelen + 1);
454 	if (dv->dv_devi != NULL) {
455 		ndi_rele_devi(dv->dv_devi);
456 	}
457 	if (dv->dv_priv != NULL) {
458 		dpfree(dv->dv_priv);
459 	}
460 
461 	kmem_cache_free(dv_node_cache, dv);
462 }
463 
464 /*
465  * Find and hold dv_node by name
466  */
467 struct dv_node *
468 dv_findbyname(struct dv_node *ddv, char *nm)
469 {
470 	struct dv_node	*dv;
471 	size_t		nmlen = strlen(nm);
472 
473 	ASSERT(RW_LOCK_HELD(&ddv->dv_contents));
474 	dcmn_err3(("dv_findbyname: %s\n", nm));
475 	for (dv = ddv->dv_dot; dv; dv = dv->dv_next) {
476 		if (dv->dv_namelen != nmlen)
477 			continue;
478 		if (strcmp(dv->dv_name, nm) == 0) {
479 			VN_HOLD(DVTOV(dv));
480 			return (dv);
481 		}
482 	}
483 	return (NULL);
484 }
485 
486 /*
487  * Inserts a new dv_node in a parent directory
488  */
489 void
490 dv_insert(struct dv_node *ddv, struct dv_node *dv)
491 {
492 	ASSERT(RW_WRITE_HELD(&ddv->dv_contents));
493 	ASSERT(DVTOV(ddv)->v_type == VDIR);
494 	ASSERT(ddv->dv_nlink >= 2);
495 	ASSERT(dv->dv_nlink == 0);
496 
497 	dcmn_err3(("dv_insert: %s\n", dv->dv_name));
498 
499 	dv->dv_dotdot = ddv;
500 	dv->dv_next = ddv->dv_dot;
501 	ddv->dv_dot = dv;
502 	if (DVTOV(dv)->v_type == VDIR) {
503 		ddv->dv_nlink++;	/* .. to containing directory */
504 		dv->dv_nlink = 2;	/* name + . */
505 	} else {
506 		dv->dv_nlink = 1;	/* name */
507 	}
508 }
509 
510 /*
511  * Merge devfs node specific information into an attribute structure.
512  *
513  * NOTE: specfs provides ATIME,MTIME,CTIME,SIZE,BLKSIZE,NBLOCKS on leaf node.
514  */
515 void
516 dv_vattr_merge(struct dv_node *dv, struct vattr *vap)
517 {
518 	struct vnode *vp = DVTOV(dv);
519 
520 	vap->va_nodeid = dv->dv_ino;
521 	vap->va_nlink = dv->dv_nlink;
522 
523 	if (vp->v_type == VDIR) {
524 		vap->va_rdev = 0;
525 		vap->va_fsid = vp->v_rdev;
526 	} else {
527 		vap->va_rdev = vp->v_rdev;
528 		vap->va_fsid = DVTOV(dv->dv_dotdot)->v_rdev;
529 		vap->va_type = vp->v_type;
530 		/* don't trust the shadow file type */
531 		vap->va_mode &= ~S_IFMT;
532 		if (vap->va_type == VCHR)
533 			vap->va_mode |= S_IFCHR;
534 		else
535 			vap->va_mode |= S_IFBLK;
536 	}
537 }
538 
539 /*
540  * Free a vsecattr
541  */
542 static void
543 dv_free_vsa(struct vsecattr *vsap)
544 {
545 	if (vsap->vsa_aclcnt > 0 && vsap->vsa_aclentp)
546 		kmem_free(vsap->vsa_aclentp,
547 		    vsap->vsa_aclcnt * sizeof (aclent_t));
548 	if (vsap->vsa_dfaclcnt > 0 && vsap->vsa_dfaclentp)
549 		kmem_free(vsap->vsa_dfaclentp,
550 		    vsap->vsa_dfaclcnt * sizeof (aclent_t));
551 }
552 
553 /*
554  * dv_shadow_node
555  *
556  * Given a VDIR dv_node, find/create the associated VDIR
557  * node in the shadow attribute filesystem.
558  *
559  * Given a VCHR/VBLK dv_node, find the associated VREG
560  * node in the shadow attribute filesystem.  These nodes
561  * are only created to persist non-default attributes.
562  * Lack of such a node implies the default permissions
563  * are sufficient.
564  *
565  * Managing the attribute file entries is slightly tricky (mostly
566  * because we can't intercept VN_HOLD and VN_RELE except on the last
567  * release).
568  *
569  * We assert that if the dv_attrvp pointer is non-NULL, it points
570  * to a singly-held (by us) vnode that represents the shadow entry
571  * in the underlying filesystem.  To avoid store-ordering issues,
572  * we assert that the pointer can only be tested under the dv_contents
573  * READERS lock.
574  */
575 
576 void
577 dv_shadow_node(
578 	struct vnode *dvp,	/* devfs parent directory vnode */
579 	char *nm,		/* name component */
580 	struct vnode *vp,	/* devfs vnode */
581 	struct pathname *pnp,	/* the path .. */
582 	struct vnode *rdir,	/* the root .. */
583 	struct cred *cred,	/* who's asking? */
584 	int flags)		/* optionally create shadow node */
585 {
586 	struct dv_node	*dv;	/* dv_node of named directory */
587 	struct vnode	*rdvp;	/* shadow parent directory vnode */
588 	struct vnode	*rvp;	/* shadow vnode */
589 	struct vnode	*rrvp;	/* realvp of shadow vnode */
590 	struct vattr	vattr;
591 	int		create_tried;
592 	int		error;
593 	mperm_t		mp;
594 	struct vsecattr	vsa;
595 
596 	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
597 	dv = VTODV(vp);
598 	dcmn_err3(("dv_shadow_node: name %s attr %p\n",
599 	    nm, (void *)dv->dv_attrvp));
600 
601 	if ((flags & DV_SHADOW_WRITE_HELD) == 0) {
602 		ASSERT(RW_READ_HELD(&dv->dv_contents));
603 		if (dv->dv_attrvp != NULLVP)
604 			return;
605 		if (!rw_tryupgrade(&dv->dv_contents)) {
606 			rw_exit(&dv->dv_contents);
607 			rw_enter(&dv->dv_contents, RW_WRITER);
608 			if (dv->dv_attrvp != NULLVP) {
609 				rw_downgrade(&dv->dv_contents);
610 				return;
611 			}
612 		}
613 	} else {
614 		ASSERT(RW_WRITE_HELD(&dv->dv_contents));
615 		if (dv->dv_attrvp != NULLVP)
616 			return;
617 	}
618 
619 	ASSERT(RW_WRITE_HELD(&dv->dv_contents) && dv->dv_attrvp == NULL);
620 
621 	rdvp = VTODV(dvp)->dv_attrvp;
622 	create_tried = 0;
623 lookup:
624 	if (rdvp && (dv->dv_flags & DV_NO_FSPERM) == 0) {
625 		error = VOP_LOOKUP(rdvp, nm, &rvp, pnp, LOOKUP_DIR, rdir, cred);
626 
627 		/* factor out the snode since we only want the attribute node */
628 		if ((error == 0) && (VOP_REALVP(rvp, &rrvp) == 0)) {
629 			VN_HOLD(rrvp);
630 			VN_RELE(rvp);
631 			rvp = rrvp;
632 		}
633 	} else
634 		error = EROFS;		/* no parent, no entry */
635 
636 	/*
637 	 * All we want is the permissions (and maybe ACLs and
638 	 * extended attributes), and we want to perform lookups
639 	 * by name.  Drivers occasionally change their minor
640 	 * number space.  If something changes, there's no
641 	 * much we can do about it here.
642 	 */
643 
644 	/* The shadow node checks out. We are done */
645 	if (error == 0) {
646 		dv->dv_attrvp = rvp;	/* with one hold */
647 
648 		/*
649 		 * Determine if we have (non-trivial) ACLs on this node.
650 		 * NB: This should be changed call fs_acl_nontrivial for
651 		 * new ACE flavor ACLs.
652 		 */
653 		vsa.vsa_mask = VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT;
654 		error = VOP_GETSECATTR(rvp, &vsa, 0, cred);
655 		dv->dv_flags &= ~DV_ACL;
656 		if (error == 0) {
657 			if (vsa.vsa_aclcnt > MIN_ACL_ENTRIES) {
658 				dv->dv_flags |= DV_ACL;	/* non-trivial ACL */
659 			}
660 			dv_free_vsa(&vsa);
661 		}
662 
663 		/*
664 		 * If we have synced out the memory attributes, free
665 		 * them and switch back to using the persistent store.
666 		 */
667 		if (rvp && dv->dv_attr) {
668 			kmem_free(dv->dv_attr, sizeof (struct vattr));
669 			dv->dv_attr = NULL;
670 		}
671 		if ((flags & DV_SHADOW_WRITE_HELD) == 0)
672 			rw_downgrade(&dv->dv_contents);
673 		ASSERT(RW_LOCK_HELD(&dv->dv_contents));
674 		return;
675 	}
676 
677 	/*
678 	 * Failed to find attribute in persistent backing store,
679 	 * get default permission bits.  For minors not created by
680 	 * ddi_create_priv_minor_node(), use devfs defaults.
681 	 */
682 	if (vp->v_type == VDIR) {
683 		vattr = dv_vattr_dir;
684 	} else if (dv->dv_flags & DV_NO_FSPERM) {
685 		vattr = dv_vattr_priv;
686 	} else {
687 		/*
688 		 * look up perm bits from minor_perm
689 		 */
690 		vattr = dv_vattr_file;
691 		if (dev_minorperm(dv->dv_devi, dv->dv_name, &mp) == 0) {
692 			VATTR_MP_MERGE(vattr, mp);
693 			dcmn_err5(("%s: minor perm mode 0%o\n",
694 			    dv->dv_name, vattr.va_mode));
695 		} else if (dv->dv_flags & DV_DFLT_MODE) {
696 			ASSERT((dv->dv_dflt_mode & ~S_IAMB) == 0);
697 			vattr.va_mode &= ~S_IAMB;
698 			vattr.va_mode |= dv->dv_dflt_mode;
699 			dcmn_err5(("%s: priv mode 0%o\n",
700 			    dv->dv_name, vattr.va_mode));
701 		}
702 	}
703 
704 	dv_vattr_merge(dv, &vattr);
705 	gethrestime(&vattr.va_atime);
706 	vattr.va_mtime = vattr.va_atime;
707 	vattr.va_ctime = vattr.va_atime;
708 
709 	/*
710 	 * Try to create shadow dir. This is necessary in case
711 	 * we need to create a shadow leaf node later, when user
712 	 * executes chmod.
713 	 */
714 	if ((error == ENOENT) && !create_tried) {
715 		switch (vp->v_type) {
716 		case VDIR:
717 			error = VOP_MKDIR(rdvp, nm, &vattr, &rvp, kcred);
718 			dsysdebug(error, ("vop_mkdir %s %s %d\n",
719 			    VTODV(dvp)->dv_name, nm, error));
720 			create_tried = 1;
721 			break;
722 
723 		case VCHR:
724 		case VBLK:
725 			/*
726 			 * Shadow nodes are only created on demand
727 			 */
728 			if (flags & DV_SHADOW_CREATE) {
729 				error = VOP_CREATE(rdvp, nm, &vattr, NONEXCL,
730 				    VREAD|VWRITE, &rvp, kcred, 0);
731 				dsysdebug(error, ("vop_create %s %s %d\n",
732 				    VTODV(dvp)->dv_name, nm, error));
733 				create_tried = 1;
734 			}
735 			break;
736 
737 		default:
738 			cmn_err(CE_PANIC, "devfs: %s: create", dvnm);
739 			/*NOTREACHED*/
740 		}
741 
742 		if (create_tried &&
743 		    (error == 0) || (error == EEXIST)) {
744 			VN_RELE(rvp);
745 			goto lookup;
746 		}
747 	}
748 
749 	/* Store attribute in memory */
750 	if (dv->dv_attr == NULL) {
751 		dv->dv_attr = kmem_alloc(sizeof (struct vattr), KM_SLEEP);
752 		*(dv->dv_attr) = vattr;
753 	}
754 
755 	if ((flags & DV_SHADOW_WRITE_HELD) == 0)
756 		rw_downgrade(&dv->dv_contents);
757 	ASSERT(RW_LOCK_HELD(&dv->dv_contents));
758 }
759 
760 /*
761  * Given a devinfo node, and a name, returns the appropriate
762  * minor information for that named node, if it exists.
763  */
764 static int
765 dv_find_leafnode(dev_info_t *devi, char *minor_nm, struct ddi_minor_data *r_mi)
766 {
767 	struct ddi_minor_data *dmd;
768 
769 	ASSERT(i_ddi_node_state(devi) >= DS_ATTACHED);
770 	ASSERT(MUTEX_HELD(&DEVI(devi)->devi_lock));
771 
772 	dcmn_err3(("dv_find_leafnode: %s\n", minor_nm));
773 	for (dmd = DEVI(devi)->devi_minor; dmd; dmd = dmd->next) {
774 
775 		/*
776 		 * Skip alias nodes and nodes without a name.
777 		 */
778 		if ((dmd->type == DDM_ALIAS) || (dmd->ddm_name == NULL))
779 			    continue;
780 
781 		dcmn_err4(("dv_find_leafnode: (%s,%s)\n",
782 			minor_nm, dmd->ddm_name));
783 		if (strcmp(minor_nm, dmd->ddm_name) == 0) {
784 			r_mi->ddm_dev = dmd->ddm_dev;
785 			r_mi->ddm_spec_type = dmd->ddm_spec_type;
786 			r_mi->type = dmd->type;
787 			r_mi->ddm_flags = dmd->ddm_flags;
788 			r_mi->ddm_node_priv = dmd->ddm_node_priv;
789 			r_mi->ddm_priv_mode = dmd->ddm_priv_mode;
790 			if (r_mi->ddm_node_priv)
791 				dphold(r_mi->ddm_node_priv);
792 			return (0);
793 		}
794 	}
795 
796 	dcmn_err3(("dv_find_leafnode: %s: ENOENT\n", minor_nm));
797 	return (ENOENT);
798 }
799 
800 /*
801  * Special handling for clone node:
802  *	Clone minor name is a driver name, the minor number will
803  *	be the major number of the driver. There is no minor
804  *	node under the clone driver, so we'll manufacture the
805  *	dev_t.
806  */
807 static struct dv_node *
808 dv_clone_mknod(struct dv_node *ddv, char *drvname)
809 {
810 	major_t	major;
811 	struct dv_node *dvp;
812 	char *devnm;
813 	struct ddi_minor_data *dmd;
814 
815 	/*
816 	 * Make sure drvname is a STREAMS driver. We load the driver,
817 	 * but don't attach to any instances. This makes stat(2)
818 	 * relatively cheap.
819 	 */
820 	major = ddi_name_to_major(drvname);
821 	if (major == (major_t)-1)
822 		return (NULL);
823 
824 	if (ddi_hold_driver(major) == NULL)
825 		return (NULL);
826 
827 	if (STREAMSTAB(major) == NULL) {
828 		ddi_rele_driver(major);
829 		return (NULL);
830 	}
831 
832 	ddi_rele_driver(major);
833 	devnm = kmem_alloc(MAXNAMELEN, KM_SLEEP);
834 	(void) snprintf(devnm, MAXNAMELEN, "clone@0:%s", drvname);
835 	dmd = kmem_zalloc(sizeof (*dmd), KM_SLEEP);
836 	dmd->ddm_dev = makedevice(clone_major, (minor_t)major);
837 	dmd->ddm_spec_type = S_IFCHR;
838 	dvp = dv_mknod(ddv, clone_dip, devnm, dmd);
839 	kmem_free(dmd, sizeof (*dmd));
840 	kmem_free(devnm, MAXNAMELEN);
841 	return (dvp);
842 }
843 
844 /*
845  * Given the parent directory node, and a name in it, returns the
846  * named dv_node to the caller (as a vnode).
847  *
848  * (We need pnp and rdir for doing shadow lookups; they can be NULL)
 *
 * Return values, as implemented below:
 *	0	*vpp is set: to ddv's own vnode for "" or ".", to the
 *		parent's vnode for ".." (both with VN_HOLD applied), to
 *		the node's vnode for a directory, or to the vnode returned
 *		by specvp_devfs() for a VCHR/VBLK node.
 *	ESTALE	ddv is marked stale.
 *	ENOENT	the name starts with '@' or ':'; the device node could
 *		not be configured or no minor node matched; the node is
 *		not a direct child of ddv's devinfo node (outside the
 *		miniroot); the node is internal and cred is not kcred; or
 *		the name is not already in the directory and the calling
 *		thread has devfs_clean_key set.
 *	ENOSYS	specvp_devfs() returned NULLVP.
 *
 * nm is modified temporarily: its ':' is overwritten with '\0' around
 * the ndi_devi_config_one() call and then restored.
 *
 * ddv's dv_contents lock is taken and released internally; it is not
 * held on return.
849  */
850 int
851 dv_find(struct dv_node *ddv, char *nm, struct vnode **vpp, struct pathname *pnp,
852 	struct vnode *rdir, struct cred *cred, uint_t ndi_flags)
853 {
854 	extern int isminiroot;	/* see modctl.c */
855 
856 	int rv = 0, was_busy = 0, nmlen;
857 	struct vnode *vp;
858 	struct dv_node *dv, *dup;
859 	dev_info_t *pdevi, *devi = NULL;
860 	char *mnm;
861 	struct ddi_minor_data *dmd;
862 
863 	dcmn_err3(("dv_find %s\n", nm));
864 
865 	rw_enter(&ddv->dv_contents, RW_READER);
866 start:
867 	if (DV_STALE(ddv)) {
868 		rw_exit(&ddv->dv_contents);
869 		return (ESTALE);
870 	}
871 
872 	/*
873 	 * Empty name or ., return node itself.
874 	 */
875 	nmlen = strlen(nm);
876 	if ((nmlen == 0) || ((nmlen == 1) && (nm[0] == '.'))) {
877 		*vpp = DVTOV(ddv);
878 		rw_exit(&ddv->dv_contents);
879 		VN_HOLD(*vpp);
880 		return (0);
881 	}
882 
883 	/*
884 	 * .., return the parent directory
885 	 */
886 	if ((nmlen == 2) && (strcmp(nm, "..") == 0)) {
887 		*vpp = DVTOV(ddv->dv_dotdot);
888 		rw_exit(&ddv->dv_contents);
889 		VN_HOLD(*vpp);
890 		return (0);
891 	}
892 
893 	/*
894 	 * Fail anything without a valid device name component
895 	 */
896 	if (nm[0] == '@' || nm[0] == ':') {
897 		dcmn_err3(("devfs: no driver '%s'\n", nm));
898 		rw_exit(&ddv->dv_contents);
899 		return (ENOENT);
900 	}
901 
902 	/*
903 	 * So, now we have to deal with the trickier stuff.
904 	 *
905 	 * (a) search the existing list of dv_nodes on this directory
906 	 */
907 	if ((dv = dv_findbyname(ddv, nm)) != NULL) {
908 founddv:
		/*
		 * Also entered by goto from the insert/duplicate path
		 * below, in which case ddv's lock is held as writer.
		 */
909 		ASSERT(RW_LOCK_HELD(&ddv->dv_contents));
910 		rw_enter(&dv->dv_contents, RW_READER);
911 		vp = DVTOV(dv);
912 		if ((dv->dv_attrvp != NULLVP) ||
913 		    (vp->v_type != VDIR && dv->dv_attr != NULL)) {
914 			/*
915 			 * Common case - we already have attributes
916 			 */
917 			rw_exit(&dv->dv_contents);
918 			rw_exit(&ddv->dv_contents);
919 			goto found;
920 		}
921 
922 		/*
923 		 * No attribute vp, try and build one.
924 		 */
925 		dv_shadow_node(DVTOV(ddv), nm, vp, pnp, rdir, cred, 0);
926 		rw_exit(&dv->dv_contents);
927 		rw_exit(&ddv->dv_contents);
928 		goto found;
929 	}
930 
931 	/*
932 	 * (b) Search the child devinfo nodes of our parent directory,
933 	 * looking for the named node.  If we find it, build a new
934 	 * node, then grab the writers lock, search the directory
935 	 * if it's still not there, then insert it.
936 	 *
937 	 * We drop the devfs locks before accessing the device tree.
938 	 * Take care to mark the node BUSY so that a forced devfs_clean
939 	 * doesn't mark the directory node stale.
940 	 *
941 	 * Also, check if we are called as part of devfs_clean or
942 	 * reset_perm. If so, simply return not found because there
943 	 * is nothing to clean.
944 	 */
945 	if (tsd_get(devfs_clean_key)) {
946 		rw_exit(&ddv->dv_contents);
947 		return (ENOENT);
948 	}
949 
950 	/*
951 	 * We could be either READ or WRITE locked at
952 	 * this point. Upgrade if we are read locked.
953 	 */
954 	ASSERT(RW_LOCK_HELD(&ddv->dv_contents));
955 	if (rw_read_locked(&ddv->dv_contents) &&
956 	    !rw_tryupgrade(&ddv->dv_contents)) {
957 		rw_exit(&ddv->dv_contents);
958 		rw_enter(&ddv->dv_contents, RW_WRITER);
959 		/*
960 		 * Things may have changed when we dropped
961 		 * the contents lock, so start from top again
962 		 */
963 		goto start;
964 	}
965 	ddv->dv_busy++;		/* mark busy before dropping lock */
966 	was_busy++;
967 	rw_exit(&ddv->dv_contents);
968 
969 	pdevi = ddv->dv_devi;
970 	ASSERT(pdevi != NULL);
971 
	/*
	 * Cut nm at the ':' so that only the part before the minor name
	 * is handed to ndi_devi_config_one(); the ':' is put back right
	 * after the call.
	 */
972 	mnm = strchr(nm, ':');
973 	if (mnm)
974 		*mnm = (char)0;
975 
976 	/*
977 	 * Configure one nexus child, will call nexus's bus_ops
978 	 * If successful, devi is held upon returning.
979 	 * Note: devfs lookup should not be configuring grandchildren.
980 	 */
981 	ASSERT((ndi_flags & NDI_CONFIG) == 0);
982 
983 	rv = ndi_devi_config_one(pdevi, nm, &devi, ndi_flags | NDI_NO_EVENT);
984 	if (mnm)
985 		*mnm = ':';
986 	if (rv != NDI_SUCCESS) {
987 		rv = ENOENT;
988 		goto notfound;
989 	}
990 
991 	/*
992 	 * Don't make vhci clients visible under phci, unless we
993 	 * are in miniroot.
994 	 */
995 	if (isminiroot == 0 && ddi_get_parent(devi) != pdevi) {
996 		ndi_rele_devi(devi);
997 		rv = ENOENT;
998 		goto notfound;
999 	}
1000 
1001 	ASSERT(devi && (i_ddi_node_state(devi) >= DS_ATTACHED));
1002 
1003 	/*
1004 	 * Invalidate cache to notice newly created minor nodes.
1005 	 */
1006 	rw_enter(&ddv->dv_contents, RW_WRITER);
1007 	ddv->dv_flags |= DV_BUILD;
1008 	rw_exit(&ddv->dv_contents);
1009 
1010 	/*
1011 	 * mkdir for nexus drivers and leaf nodes as well.  If we are racing
1012 	 * and create a duplicate, the duplicate will be destroyed below.
1013 	 */
1014 	if (mnm == NULL) {
1015 		dv = dv_mkdir(ddv, devi, nm);
1016 	} else {
1017 		/*
1018 		 * For clone minors, load the driver indicated by minor name.
1019 		 */
1020 		mutex_enter(&DEVI(devi)->devi_lock);
1021 		if (devi == clone_dip) {
1022 			dv = dv_clone_mknod(ddv, mnm + 1);
1023 		} else {
1024 			/*
1025 			 * Find minor node and make a dv_node
1026 			 */
1027 			dmd = kmem_zalloc(sizeof (*dmd), KM_SLEEP);
1028 			if (dv_find_leafnode(devi, mnm + 1, dmd) == 0) {
1029 				dv = dv_mknod(ddv, devi, nm, dmd);
				/* drop the hold dv_find_leafnode() took */
1030 				if (dmd->ddm_node_priv)
1031 					dpfree(dmd->ddm_node_priv);
1032 			}
1033 			kmem_free(dmd, sizeof (*dmd));
1034 		}
1035 		mutex_exit(&DEVI(devi)->devi_lock);
1036 	}
1037 	/*
1038 	 * Release hold from ndi_devi_config_one()
1039 	 */
1040 	ndi_rele_devi(devi);
1041 
	/*
	 * If no minor node matched, dv was not assigned above and is still
	 * NULL from the failed dv_findbyname() in step (a).
	 */
1042 	if (dv == NULL) {
1043 		rv = ENOENT;
1044 		goto notfound;
1045 	}
1046 
1047 	/*
1048 	 * We have released the dv_contents lock, need to check
1049 	 * if another thread already created a duplicate node
1050 	 */
1051 	rw_enter(&ddv->dv_contents, RW_WRITER);
1052 	if ((dup = dv_findbyname(ddv, nm)) == NULL) {
1053 		dv_insert(ddv, dv);
1054 	} else {
1055 		/*
1056 		 * Duplicate found, use the existing node
1057 		 */
1058 		VN_RELE(DVTOV(dv));
1059 		dv_destroy(dv, 0);
1060 		dv = dup;
1061 	}
1062 	goto founddv;
1063 	/*NOTREACHED*/
1064 
1065 found:
1066 	/*
1067 	 * Skip non-kernel lookups of internal nodes.
1068 	 * This use of kcred to distinguish between user and
1069 	 * internal kernel lookups is unfortunate.  The information
1070 	 * provided by the seg argument to lookupnameat should
1071 	 * evolve into a lookup flag for filesystems that need
1072 	 * this distinction.
1073 	 */
1074 	if ((dv->dv_flags & DV_INTERNAL) && (cred != kcred)) {
1075 		VN_RELE(vp);
1076 		rv = ENOENT;
1077 		goto notfound;
1078 	}
1079 
1080 	dcmn_err2(("dv_find: returning vp for nm %s\n", nm));
1081 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
1082 		/*
1083 		 * If vnode is a device, return special vnode instead
1084 		 * (though it knows all about -us- via sp->s_realvp,
1085 		 * sp->s_devvp, and sp->s_dip)
1086 		 */
1087 		*vpp = specvp_devfs(vp, vp->v_rdev, vp->v_type, cred,
1088 			dv->dv_devi);
1089 		VN_RELE(vp);
1090 		if (*vpp == NULLVP)
1091 			rv = ENOSYS;
1092 	} else
1093 		*vpp = vp;
1094 
	/*
	 * Common exit for success and failure: undo the busy count, if one
	 * was taken before the directory lock was dropped in step (b).
	 */
1095 notfound:
1096 	rw_enter(&ddv->dv_contents, RW_WRITER);
1097 	if (was_busy)
1098 		ddv->dv_busy--;
1099 	rw_exit(&ddv->dv_contents);
1100 	return (rv);
1101 }
1102 
1103 /*
1104  * The given directory node is out-of-date; that is, it has been
1105  * marked as needing to be rebuilt, possibly because some new devinfo
1106  * node has come into existence, or possibly because this is the first
1107  * time we've been here.
1108  */
1109 void
1110 dv_filldir(struct dv_node *ddv)
1111 {
1112 	struct dv_node *dv;
1113 	dev_info_t *devi, *pdevi;
1114 	struct ddi_minor_data *dmd;
1115 	char devnm[MAXNAMELEN];
1116 	int circ;
1117 
1118 	ASSERT(DVTOV(ddv)->v_type == VDIR);
1119 	ASSERT(RW_WRITE_HELD(&ddv->dv_contents));
1120 	ASSERT(ddv->dv_flags & DV_BUILD);
1121 
1122 	dcmn_err3(("dv_filldir: %s\n", ddv->dv_name));
1123 	if (DV_STALE(ddv))
1124 		return;
1125 	pdevi = ddv->dv_devi;
1126 
1127 	if (ndi_devi_config(pdevi, NDI_NO_EVENT) != NDI_SUCCESS) {
1128 		dcmn_err3(("dv_filldir: config error %s\n",
1129 			ddv->dv_name));
1130 	}
1131 
1132 	ndi_devi_enter(pdevi, &circ);
1133 	for (devi = ddi_get_child(pdevi); devi;
1134 	    devi = ddi_get_next_sibling(devi)) {
1135 		if (i_ddi_node_state(devi) < DS_PROBED)
1136 			continue;
1137 
1138 		dcmn_err3(("dv_filldir: node %s\n", ddi_node_name(devi)));
1139 
1140 		mutex_enter(&DEVI(devi)->devi_lock);
1141 		for (dmd = DEVI(devi)->devi_minor; dmd; dmd = dmd->next) {
1142 			char *addr;
1143 
1144 			/*
1145 			 * Skip alias nodes, internal nodes, and nodes
1146 			 * without a name.  We allow DDM_DEFAULT nodes
1147 			 * to appear in readdir.
1148 			 */
1149 			if ((dmd->type == DDM_ALIAS) ||
1150 			    (dmd->type == DDM_INTERNAL_PATH) ||
1151 			    (dmd->ddm_name == NULL))
1152 				continue;
1153 
1154 			addr = ddi_get_name_addr(devi);
1155 			if (addr && *addr)
1156 				(void) sprintf(devnm, "%s@%s:%s",
1157 				    ddi_node_name(devi), addr, dmd->ddm_name);
1158 			else
1159 				(void) sprintf(devnm, "%s:%s",
1160 				    ddi_node_name(devi), dmd->ddm_name);
1161 
1162 			if ((dv = dv_findbyname(ddv, devnm)) != NULL) {
1163 				/* dv_node already exists */
1164 				VN_RELE(DVTOV(dv));
1165 				continue;
1166 			}
1167 
1168 			dv = dv_mknod(ddv, devi, devnm, dmd);
1169 			dv_insert(ddv, dv);
1170 			VN_RELE(DVTOV(dv));
1171 		}
1172 		mutex_exit(&DEVI(devi)->devi_lock);
1173 
1174 		(void) ddi_deviname(devi, devnm);
1175 		if ((dv = dv_findbyname(ddv, devnm + 1)) == NULL) {
1176 			/* directory doesn't exist */
1177 			dv = dv_mkdir(ddv, devi, devnm + 1);
1178 			dv_insert(ddv, dv);
1179 		}
1180 		VN_RELE(DVTOV(dv));
1181 	}
1182 	ndi_devi_exit(pdevi, circ);
1183 
1184 	ddv->dv_flags &= ~DV_BUILD;
1185 }
1186 
1187 /*
1188  * Given a directory node, clean out all the nodes beneath.
1189  *
1190  * VDIR:	Reinvoke to clean them, then delete the directory.
1191  * VCHR, VBLK:	Just blow them away.
1192  *
1193  * Mark the directories touched as in need of a rebuild, in case
1194  * we fall over part way through. When DV_CLEAN_FORCE is specified,
1195  * we mark referenced empty directories as stale to facilitate DR.
1196  */
1197 int
1198 dv_cleandir(struct dv_node *ddv, char *devnm, uint_t flags)
1199 {
1200 	struct dv_node *dv, *prev = NULL, *next = NULL;
1201 	struct vnode *vp;
1202 	int retval = 0, set_stale = 0;
1203 
1204 	dcmn_err3(("dv_cleandir: %s\n", ddv->dv_name));
1205 
1206 	/*
1207 	 * If devnm is not NULL, we return immediately on busy.
1208 	 * Otherwise, we try our best to destroy all unused dv_node's
1209 	 */
1210 	rw_enter(&ddv->dv_contents, RW_WRITER);
1211 	for (dv = ddv->dv_dot; dv; prev = dv, dv = next) {
1212 		int error = 0;
1213 		next = dv->dv_next;
1214 
1215 		if (devnm && (strncmp(devnm, dv->dv_name, strlen(devnm)) ||
1216 		    (dv->dv_name[strlen(devnm)] != ':' &&
1217 		    dv->dv_name[strlen(devnm)] != '\0')))
1218 			/*
1219 			 * If devnm is specified, the non-minor
1220 			 * portion of the name must match devnm
1221 			 */
1222 			continue;
1223 
1224 		vp = DVTOV(dv);
1225 		if (vp->v_type == VDIR) {
1226 			if ((dv_cleandir(dv, NULL, flags) != 0) ||
1227 			    (dv->dv_nlink != 2)) {
1228 				error = EBUSY;
1229 			} else if (vp->v_count > 0) {
1230 				/*
1231 				 * The directory is empty but the directory
1232 				 * vnode is being held. If DV_CLEAN_FORCE is
1233 				 * specified, we force the directory to become
1234 				 * stale so that DR will succeed even if a
1235 				 * shell has /devices/xxx as current directory.
1236 				 */
1237 				rw_enter(&dv->dv_contents, RW_WRITER);
1238 				if (((flags & DV_CLEAN_FORCE) == 0) ||
1239 				    (dv->dv_busy != 0)) {
1240 					error = EBUSY;
1241 					rw_exit(&dv->dv_contents);
1242 				} else {
1243 					/*
1244 					 * mark the node stale later,
1245 					 * after unlinking from tree.
1246 					 * Hold the dv_contents lock
1247 					 * to prevent further lookup
1248 					 * before we mark it stale.
1249 					 */
1250 					set_stale = 1;
1251 				}
1252 			}
1253 		} else {
1254 			ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);
1255 			mutex_enter(&vp->v_lock);
1256 			if (vp->v_count > 0) {
1257 				error = EBUSY;
1258 			}
1259 			mutex_exit(&vp->v_lock);
1260 		}
1261 
1262 		if (error != 0) {
1263 			retval = error;
1264 			if (devnm)
1265 				break;
1266 			continue;
1267 		}
1268 
1269 		/* unlink from directory */
1270 		if (vp->v_type == VDIR) {
1271 			ddv->dv_nlink--;	/* .. to above */
1272 			dv->dv_nlink--;		/* . to self */
1273 		}
1274 		if (prev)
1275 			prev->dv_next = dv->dv_next;
1276 		else
1277 			ddv->dv_dot = dv->dv_next;
1278 		dv->dv_next = NULL;
1279 		dv->dv_nlink--;			/* name, back to zero */
1280 
1281 		if (set_stale) {
1282 			/* only directories can be stale */
1283 			ASSERT(vp->v_type == VDIR);
1284 
1285 			/*
1286 			 * If v_count != 0, someone else has a reference
1287 			 * to this node. We mark the node stale and let
1288 			 * devfs_inactive() free the node.
1289 			 *
1290 			 * If v_count is already zero, no one else has
1291 			 * a reference to this node, do nothing and
1292 			 * dv_destroy() will free the node.
1293 			 *
1294 			 * We hold the vp->v_lock to synchronize with
1295 			 * devfs_inactive() to prevent double free.
1296 			 */
1297 			mutex_enter(&vp->v_lock);
1298 			if (vp->v_count != 0) {
1299 				ASSERT(!DV_STALE(dv));
1300 				ndi_rele_devi(dv->dv_devi);
1301 				dv->dv_devi = NULL;
1302 				/* release dv_contents held in set_scale */
1303 				rw_exit(&dv->dv_contents);
1304 				mutex_exit(&vp->v_lock);
1305 				/* don't touch dv after setting it stale */
1306 			} else {
1307 				/* release dv_contents held in set_scale */
1308 				rw_exit(&dv->dv_contents);
1309 				mutex_exit(&vp->v_lock);
1310 				dv_destroy(dv, flags);
1311 			}
1312 			set_stale = 0;
1313 		} else {
1314 			dv_destroy(dv, flags);
1315 		}
1316 		dv = prev;	/* reset dv/prev for next loop */
1317 	}
1318 	/*
1319 	 * This code may be invoked to inform devfs that a new node has
1320 	 * been created in the kernel device tree. So we always set
1321 	 * the DV_BUILD flag to allow the next dv_filldir() to pick
1322 	 * the new devinfo nodes.
1323 	 */
1324 	ddv->dv_flags |= DV_BUILD;
1325 
1326 	rw_exit(&ddv->dv_contents);
1327 
1328 	return (retval);
1329 }
1330 
1331 /*
1332  * Walk through the devfs hierarchy, correcting the permissions of
1333  * devices with default permissions that do not match those specified
1334  * by minor perm.  This can only be done for all drivers for now.
1335  */
1336 static int
1337 dv_reset_perm_dir(struct dv_node *ddv, uint_t flags)
1338 {
1339 	struct dv_node *dv, *next = NULL;
1340 	struct vnode *vp;
1341 	int retval = 0;
1342 	struct vattr *attrp;
1343 	mperm_t mp;
1344 	char *nm;
1345 	uid_t old_uid;
1346 	gid_t old_gid;
1347 	mode_t old_mode;
1348 
1349 	rw_enter(&ddv->dv_contents, RW_WRITER);
1350 	for (dv = ddv->dv_dot; dv; dv = next) {
1351 		int error = 0;
1352 		next = dv->dv_next;
1353 		nm = dv->dv_name;
1354 
1355 		rw_enter(&dv->dv_contents, RW_READER);
1356 		vp = DVTOV(dv);
1357 		if (vp->v_type == VDIR) {
1358 			rw_exit(&dv->dv_contents);
1359 			if (dv_reset_perm_dir(dv, flags) != 0) {
1360 				error = EBUSY;
1361 			}
1362 		} else {
1363 			ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);
1364 
1365 			/*
1366 			 * Check for permissions from minor_perm
1367 			 * If there are none, we're done
1368 			 */
1369 			rw_exit(&dv->dv_contents);
1370 			if (dev_minorperm(dv->dv_devi, nm, &mp) != 0)
1371 				continue;
1372 
1373 			rw_enter(&dv->dv_contents, RW_READER);
1374 
1375 			/*
1376 			 * Allow a node's permissions to be altered
1377 			 * permanently from the defaults by chmod,
1378 			 * using the shadow node as backing store.
1379 			 * Otherwise, update node to minor_perm permissions.
1380 			 */
1381 			if (dv->dv_attrvp == NULLVP) {
1382 				/*
1383 				 * No attribute vp, try to find one.
1384 				 */
1385 				dv_shadow_node(DVTOV(ddv), nm, vp,
1386 					NULL, NULLVP, kcred, 0);
1387 			}
1388 			if (dv->dv_attrvp != NULLVP || dv->dv_attr == NULL) {
1389 				rw_exit(&dv->dv_contents);
1390 				continue;
1391 			}
1392 
1393 			attrp = dv->dv_attr;
1394 
1395 			if (VATTRP_MP_CMP(attrp, mp) == 0) {
1396 				dcmn_err5(("%s: no perm change: "
1397 				    "%d %d 0%o\n", nm, attrp->va_uid,
1398 				    attrp->va_gid, attrp->va_mode));
1399 				rw_exit(&dv->dv_contents);
1400 				continue;
1401 			}
1402 
1403 			old_uid = attrp->va_uid;
1404 			old_gid = attrp->va_gid;
1405 			old_mode = attrp->va_mode;
1406 
1407 			VATTRP_MP_MERGE(attrp, mp);
1408 			mutex_enter(&vp->v_lock);
1409 			if (vp->v_count > 0) {
1410 				error = EBUSY;
1411 			}
1412 			mutex_exit(&vp->v_lock);
1413 
1414 			dcmn_err5(("%s: perm %d/%d/0%o -> %d/%d/0%o (%d)\n",
1415 			    nm, old_uid, old_gid, old_mode, attrp->va_uid,
1416 			    attrp->va_gid, attrp->va_mode, error));
1417 
1418 			rw_exit(&dv->dv_contents);
1419 		}
1420 
1421 		if (error != 0) {
1422 			retval = error;
1423 		}
1424 	}
1425 
1426 	ddv->dv_flags |= DV_BUILD;
1427 
1428 	rw_exit(&ddv->dv_contents);
1429 
1430 	return (retval);
1431 }
1432 
1433 int
1434 devfs_reset_perm(uint_t flags)
1435 {
1436 	struct dv_node *dvp;
1437 	int rval;
1438 
1439 	if ((dvp = devfs_dip_to_dvnode(ddi_root_node())) == NULL)
1440 		return (0);
1441 
1442 	VN_HOLD(DVTOV(dvp));
1443 	rval = dv_reset_perm_dir(dvp, flags);
1444 	VN_RELE(DVTOV(dvp));
1445 	return (rval);
1446 }
1447 
1448 /*
1449  * Clean up dangling devfs shadow nodes for removed
1450  * drivers so that, in the event the driver is re-added
1451  * to the system, newly created nodes won't incorrectly
1452  * pick up these stale shadow node permissions.
1453  *
1454  * This is accomplished by walking down the pathname
1455  * to the directory, starting at the root's attribute
1456  * node, then removing all minors matching the specified
1457  * node name.  Care must be taken to remove all entries
1458  * in a directory before the directory itself, so that
1459  * the clean-up associated with rem_drv'ing a nexus driver
1460  * does not inadvertently result in an inconsistent
1461  * filesystem underlying devfs.
1462  */
1463 
1464 static int
1465 devfs_remdrv_rmdir(vnode_t *dirvp, const char *dir, vnode_t *rootvp)
1466 {
1467 	int error;
1468 	vnode_t *vp;
1469 	int eof;
1470 	struct iovec iov;
1471 	struct uio uio;
1472 	struct dirent64 *dp;
1473 	dirent64_t *dbuf;
1474 	size_t dlen;
1475 	size_t dbuflen;
1476 	int ndirents = 64;
1477 	char *nm;
1478 
1479 	VN_HOLD(dirvp);
1480 
1481 	dlen = ndirents * (sizeof (*dbuf));
1482 	dbuf = kmem_alloc(dlen, KM_SLEEP);
1483 
1484 	uio.uio_iov = &iov;
1485 	uio.uio_iovcnt = 1;
1486 	uio.uio_segflg = UIO_SYSSPACE;
1487 	uio.uio_fmode = 0;
1488 	uio.uio_extflg = UIO_COPY_CACHED;
1489 	uio.uio_loffset = 0;
1490 	uio.uio_llimit = MAXOFFSET_T;
1491 
1492 	eof = 0;
1493 	error = 0;
1494 	while (!error && !eof) {
1495 		uio.uio_resid = dlen;
1496 		iov.iov_base = (char *)dbuf;
1497 		iov.iov_len = dlen;
1498 
1499 		(void) VOP_RWLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1500 		error = VOP_READDIR(dirvp, &uio, kcred, &eof);
1501 		VOP_RWUNLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1502 
1503 		dbuflen = dlen - uio.uio_resid;
1504 
1505 		if (error || dbuflen == 0)
1506 			break;
1507 
1508 		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
1509 			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
1510 
1511 			nm = dp->d_name;
1512 
1513 			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
1514 				continue;
1515 
1516 			error = VOP_LOOKUP(dirvp,
1517 				nm, &vp, NULL, 0, NULL, kcred);
1518 
1519 			dsysdebug(error,
1520 			    ("rem_drv %s/%s lookup (%d)\n",
1521 			    dir, nm, error));
1522 
1523 			if (error)
1524 				continue;
1525 
1526 			ASSERT(vp->v_type == VDIR ||
1527 				vp->v_type == VCHR || vp->v_type == VBLK);
1528 
1529 			if (vp->v_type == VDIR) {
1530 				error = devfs_remdrv_rmdir(vp, nm, rootvp);
1531 				if (error == 0) {
1532 					error = VOP_RMDIR(dirvp,
1533 					    (char *)nm, rootvp, kcred);
1534 					dsysdebug(error,
1535 					    ("rem_drv %s/%s rmdir (%d)\n",
1536 					    dir, nm, error));
1537 				}
1538 			} else {
1539 				error = VOP_REMOVE(dirvp, (char *)nm, kcred);
1540 				dsysdebug(error,
1541 				    ("rem_drv %s/%s remove (%d)\n",
1542 				    dir, nm, error));
1543 			}
1544 
1545 			VN_RELE(vp);
1546 			if (error) {
1547 				goto exit;
1548 			}
1549 		}
1550 	}
1551 
1552 exit:
1553 	VN_RELE(dirvp);
1554 	kmem_free(dbuf, dlen);
1555 
1556 	return (error);
1557 }
1558 
1559 int
1560 devfs_remdrv_cleanup(const char *dir, const char *nodename)
1561 {
1562 	int error;
1563 	vnode_t *vp;
1564 	vnode_t *dirvp;
1565 	int eof;
1566 	struct iovec iov;
1567 	struct uio uio;
1568 	struct dirent64 *dp;
1569 	dirent64_t *dbuf;
1570 	size_t dlen;
1571 	size_t dbuflen;
1572 	int ndirents = 64;
1573 	int nodenamelen = strlen(nodename);
1574 	char *nm;
1575 	struct pathname pn;
1576 
1577 	dcmn_err5(("devfs_remdrv_cleanup: %s %s\n", dir, nodename));
1578 
1579 	if (error = pn_get((char *)dir, UIO_SYSSPACE, &pn))
1580 		return (0);
1581 
1582 	rootvp = dvroot->dv_attrvp;
1583 	ASSERT(rootvp != NULL);
1584 	VN_HOLD(rootvp);
1585 
1586 	pn_skipslash(&pn);
1587 	dirvp = rootvp;
1588 	VN_HOLD(dirvp);
1589 
1590 	nm = kmem_alloc(MAXNAMELEN, KM_SLEEP);
1591 
1592 	while (pn_pathleft(&pn)) {
1593 		ASSERT(dirvp->v_type == VDIR);
1594 		(void) pn_getcomponent(&pn, nm);
1595 		ASSERT((strcmp(nm, ".") != 0) && (strcmp(nm, "..") != 0));
1596 		error = VOP_LOOKUP(dirvp, nm, &vp, NULL, 0, rootvp, kcred);
1597 		if (error) {
1598 			dcmn_err5(("remdrv_cleanup %s lookup error %d\n",
1599 			    nm, error));
1600 			VN_RELE(dirvp);
1601 			if (dirvp != rootvp)
1602 				VN_RELE(rootvp);
1603 			pn_free(&pn);
1604 			kmem_free(nm, MAXNAMELEN);
1605 			return (0);
1606 		}
1607 		VN_RELE(dirvp);
1608 		dirvp = vp;
1609 		pn_skipslash(&pn);
1610 	}
1611 
1612 	ASSERT(dirvp->v_type == VDIR);
1613 	if (dirvp != rootvp)
1614 		VN_RELE(rootvp);
1615 	pn_free(&pn);
1616 	kmem_free(nm, MAXNAMELEN);
1617 
1618 	dlen = ndirents * (sizeof (*dbuf));
1619 	dbuf = kmem_alloc(dlen, KM_SLEEP);
1620 
1621 	uio.uio_iov = &iov;
1622 	uio.uio_iovcnt = 1;
1623 	uio.uio_segflg = UIO_SYSSPACE;
1624 	uio.uio_fmode = 0;
1625 	uio.uio_extflg = UIO_COPY_CACHED;
1626 	uio.uio_loffset = 0;
1627 	uio.uio_llimit = MAXOFFSET_T;
1628 
1629 	eof = 0;
1630 	error = 0;
1631 	while (!error && !eof) {
1632 		uio.uio_resid = dlen;
1633 		iov.iov_base = (char *)dbuf;
1634 		iov.iov_len = dlen;
1635 
1636 		(void) VOP_RWLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1637 		error = VOP_READDIR(dirvp, &uio, kcred, &eof);
1638 		VOP_RWUNLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1639 
1640 		dbuflen = dlen - uio.uio_resid;
1641 
1642 		if (error || dbuflen == 0)
1643 			break;
1644 
1645 		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
1646 			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
1647 
1648 			nm = dp->d_name;
1649 
1650 			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
1651 				continue;
1652 
1653 			if (strncmp(nm, nodename, nodenamelen) != 0)
1654 				continue;
1655 
1656 			error = VOP_LOOKUP(dirvp, nm, &vp,
1657 			    NULL, 0, NULL, kcred);
1658 
1659 			dsysdebug(error,
1660 			    ("rem_drv %s/%s lookup (%d)\n",
1661 			    dir, nm, error));
1662 
1663 			if (error)
1664 				continue;
1665 
1666 			ASSERT(vp->v_type == VDIR ||
1667 				vp->v_type == VCHR || vp->v_type == VBLK);
1668 
1669 			if (vp->v_type == VDIR) {
1670 				error = devfs_remdrv_rmdir(vp, nm, rootvp);
1671 				if (error == 0) {
1672 					error = VOP_RMDIR(dirvp,
1673 					    (char *)nm, rootvp, kcred);
1674 					dsysdebug(error,
1675 					    ("rem_drv %s/%s rmdir (%d)\n",
1676 					    dir, nm, error));
1677 				}
1678 			} else {
1679 				error = VOP_REMOVE(dirvp, (char *)nm, kcred);
1680 				dsysdebug(error,
1681 				    ("rem_drv %s/%s remove (%d)\n",
1682 				    dir, nm, error));
1683 			}
1684 
1685 			VN_RELE(vp);
1686 			if (error)
1687 				goto exit;
1688 		}
1689 	}
1690 
1691 exit:
1692 	VN_RELE(dirvp);
1693 
1694 	kmem_free(dbuf, dlen);
1695 
1696 	return (0);
1697 }
1698 
/*
 * Element of the singly linked list in which dv_walk() collects the
 * directory nodes it still has to descend into.
 */
struct dv_list {
	struct dv_node	*dv;		/* directory node to descend into */
	struct dv_list	*next;		/* next element, NULL at the tail */
};
1703 
1704 void
1705 dv_walk(
1706 	struct dv_node	*ddv,
1707 	char		*devnm,
1708 	void		(*callback)(struct dv_node *, void *),
1709 	void		*arg)
1710 {
1711 	struct vnode	*dvp;
1712 	struct dv_node	*dv;
1713 	struct dv_list	*head, *tail, *next;
1714 
1715 	dcmn_err3(("dv_walk: ddv = %s, devnm = %s\n",
1716 	    ddv->dv_name, devnm ? devnm : "<null>"));
1717 
1718 	dvp = DVTOV(ddv);
1719 
1720 	ASSERT(dvp->v_type == VDIR);
1721 
1722 	head = tail = next = NULL;
1723 
1724 	mutex_enter(&dvp->v_lock);
1725 
1726 	rw_enter(&ddv->dv_contents, RW_READER);
1727 	for (dv = ddv->dv_dot; dv; dv = dv->dv_next) {
1728 
1729 		int len;
1730 
1731 		/*
1732 		 * If devnm is not NULL and is not the empty string,
1733 		 * select only dv_nodes with matching non-minor name
1734 		 */
1735 		if (devnm && (len = strlen(devnm)) &&
1736 		    (strncmp(devnm, dv->dv_name, len) ||
1737 		    (dv->dv_name[len] != ':' && dv->dv_name[len] != '\0')))
1738 			continue;
1739 
1740 		callback(dv, arg);
1741 
1742 		if (DVTOV(dv)->v_type != VDIR)
1743 			continue;
1744 
1745 		next = kmem_zalloc(sizeof (*next), KM_SLEEP);
1746 		next->dv = dv;
1747 
1748 		if (tail)
1749 			tail->next = next;
1750 		else
1751 			head = next;
1752 
1753 		tail = next;
1754 	}
1755 
1756 	while (head) {
1757 		dv_walk(head->dv, NULL, callback, arg);
1758 		next = head->next;
1759 		kmem_free(head, sizeof (*head));
1760 		head = next;
1761 	}
1762 	rw_exit(&ddv->dv_contents);
1763 
1764 	mutex_exit(&dvp->v_lock);
1765 }
1766