1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * miscellaneous routines for the devfs
31  */
32 
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/t_lock.h>
36 #include <sys/systm.h>
37 #include <sys/sysmacros.h>
38 #include <sys/user.h>
39 #include <sys/time.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/file.h>
43 #include <sys/fcntl.h>
44 #include <sys/flock.h>
45 #include <sys/kmem.h>
46 #include <sys/uio.h>
47 #include <sys/errno.h>
48 #include <sys/stat.h>
49 #include <sys/cred.h>
50 #include <sys/dirent.h>
51 #include <sys/pathname.h>
52 #include <sys/cmn_err.h>
53 #include <sys/debug.h>
54 #include <sys/modctl.h>
55 #include <fs/fs_subr.h>
56 #include <sys/fs/dv_node.h>
57 #include <sys/fs/snode.h>
58 #include <sys/sunndi.h>
59 #include <sys/sunmdi.h>
60 #include <sys/conf.h>
61 
62 #ifdef DEBUG
63 int devfs_debug = 0x0;
64 #endif
65 
66 const char	dvnm[] = "devfs";
67 kmem_cache_t	*dv_node_cache;	/* dv_node cache */
68 uint_t		devfs_clean_key;
69 struct dv_node *dvroot;
70 
71 /* prototype memory vattrs */
72 vattr_t dv_vattr_dir = {
73 	AT_TYPE|AT_MODE|AT_UID|AT_GID, 		/* va_mask */
74 	VDIR,					/* va_type */
75 	DV_DIRMODE_DEFAULT,			/* va_mode */
76 	DV_UID_DEFAULT,				/* va_uid */
77 	DV_GID_DEFAULT,				/* va_gid */
78 	0,					/* va_fsid; */
79 	0,					/* va_nodeid; */
80 	0,					/* va_nlink; */
81 	0,					/* va_size; */
82 	0,					/* va_atime; */
83 	0,					/* va_mtime; */
84 	0,					/* va_ctime; */
85 	0,					/* va_rdev; */
86 	0,					/* va_blksize; */
87 	0,					/* va_nblocks; */
88 	0,					/* va_seq; */
89 };
90 
91 vattr_t dv_vattr_file = {
92 	AT_TYPE|AT_MODE|AT_SIZE|AT_UID|AT_GID|AT_RDEV,	/* va_mask */
93 	0,					/* va_type */
94 	DV_DEVMODE_DEFAULT,			/* va_mode */
95 	DV_UID_DEFAULT,				/* va_uid */
96 	DV_GID_DEFAULT,				/* va_gid */
97 	0,					/* va_fsid; */
98 	0,					/* va_nodeid; */
99 	0,					/* va_nlink; */
100 	0,					/* va_size; */
101 	0,					/* va_atime; */
102 	0,					/* va_mtime; */
103 	0,					/* va_ctime; */
104 	0,					/* va_rdev; */
105 	0,					/* va_blksize; */
106 	0,					/* va_nblocks; */
107 	0,					/* va_seq; */
108 };
109 
110 vattr_t dv_vattr_priv = {
111 	AT_TYPE|AT_MODE|AT_SIZE|AT_UID|AT_GID|AT_RDEV,	/* va_mask */
112 	0,					/* va_type */
113 	DV_DEVMODE_PRIV,			/* va_mode */
114 	DV_UID_DEFAULT,				/* va_uid */
115 	DV_GID_DEFAULT,				/* va_gid */
116 	0,					/* va_fsid; */
117 	0,					/* va_nodeid; */
118 	0,					/* va_nlink; */
119 	0,					/* va_size; */
120 	0,					/* va_atime; */
121 	0,					/* va_mtime; */
122 	0,					/* va_ctime; */
123 	0,					/* va_rdev; */
124 	0,					/* va_blksize; */
125 	0,					/* va_nblocks; */
126 	0,					/* va_seq; */
127 };
128 
129 extern dev_info_t	*clone_dip;
130 extern major_t		clone_major;
131 extern struct dev_ops	*ddi_hold_driver(major_t);
132 
133 /*
134  * dv_node cache constructor, destructor, and cache creation
135  */
136 /*ARGSUSED1*/
137 static int
138 i_dv_node_ctor(void *buf, void *cfarg, int flag)
139 {
140 	struct dv_node	*dv = (struct dv_node *)buf;
141 	struct vnode	*vp;
142 
143 	bzero(buf, sizeof (struct dv_node));
144 
145 	/* initialize persistent parts of dv_node */
146 	rw_init(&dv->dv_contents, NULL, RW_DEFAULT, NULL);
147 
148 	/* allocate vnode and initialize link back to dv_node */
149 	dv->dv_vnode = vn_alloc(KM_SLEEP);
150 	vp = DVTOV(dv);
151 	vp->v_data = (caddr_t)dv;
152 	return (0);
153 }
154 
155 /* dv_node destructor for kmem cache */
156 /*ARGSUSED1*/
157 static void
158 i_dv_node_dtor(void *buf, void *arg)
159 {
160 	struct dv_node	*dv = (struct dv_node *)buf;
161 	struct vnode	*vp = DVTOV(dv);
162 
163 	rw_destroy(&dv->dv_contents);
164 	vn_invalid(vp);
165 	vn_free(vp);
166 }
167 
168 
169 /* initialize dv_node cache */
170 void
171 dv_node_cache_init()
172 {
173 	ASSERT(dv_node_cache == NULL);
174 	dv_node_cache = kmem_cache_create("dv_node_cache",
175 	    sizeof (struct dv_node), 0, i_dv_node_ctor, i_dv_node_dtor,
176 	    NULL, NULL, NULL, 0);
177 
178 	tsd_create(&devfs_clean_key, NULL);
179 }
180 
181 /* destroy dv_node cache */
182 void
183 dv_node_cache_fini()
184 {
185 	ASSERT(dv_node_cache != NULL);
186 	kmem_cache_destroy(dv_node_cache);
187 	dv_node_cache = NULL;
188 
189 	tsd_destroy(&devfs_clean_key);
190 }
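
/*
 * Note (an illustrative sketch, not compiled as part of devfs): the
 * constructor/destructor pair above follows the kmem object-caching
 * contract - state set up in i_dv_node_ctor (the dv_contents lock and
 * the vnode with its back pointer) persists across allocation cycles,
 * so callers only reinitialize the per-use fields:
 *
 *	struct dv_node *dv = kmem_cache_alloc(dv_node_cache, KM_SLEEP);
 *	vn_reinit(DVTOV(dv));		the vnode is already wired to dv
 *	...
 *	kmem_cache_free(dv_node_cache, dv);
 */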
191 
192 /*
193  * dv_mkino - Generate a unique inode number for devfs nodes.
194  *
195  * Although ino_t is 64 bits, the inode number is truncated to 32 bits for 32
196  * bit non-LARGEFILE applications. This means that there is a requirement to
197  * maintain the inode number as a 32 bit value or applications will have
198  * stat(2) calls fail with EOVERFLOW.  We form a 32 bit inode number from the
199  * dev_t, but if the minor number is larger than L_MAXMIN32 the extra minor
 * bits need to be folded in (see the note on 64-bit dev_t support below).
200  *
201  * To generate inode numbers for directories, we assume that we will never use
202  * more than half the major space - this allows for ~8190 drivers. We use this
203  * upper major number space to allocate inode numbers for directories by
204  * encoding the major and instance into this space.
205  *
206  * We also skew the result so that inode 2 is reserved for the root of the file
207  * system.
208  *
209  * As part of the future support for 64-bit dev_t APIs, the upper minor bits
210  * should be folded into the high inode bits by adding the following code
211  * after "ino |= 1":
212  *
213  * #if (L_BITSMINOR32 != L_BITSMINOR)
214  *		|* fold overflow minor bits into high bits of inode number *|
215  *		ino |= ((ino_t)(minor >> L_BITSMINOR32)) << L_BITSMINOR;
216  * #endif |* (L_BITSMINOR32 != L_BITSMINOR) *|
217  *
218  * This way only applications that use devices that overflow their minor
219  * space will have an application level impact.
220  */
221 static ino_t
222 dv_mkino(dev_info_t *devi, vtype_t typ, dev_t dev)
223 {
224 	major_t		major;
225 	minor_t		minor;
226 	ino_t		ino;
227 	static int	warn;
228 
229 	if (typ == VDIR) {
230 		major = ((L_MAXMAJ32 + 1) >> 1) + DEVI(devi)->devi_major;
231 		minor = ddi_get_instance(devi);
232 
233 		/* makedevice32 in high half of major number space */
234 		ino = (ino_t)((major << L_BITSMINOR32) | (minor & L_MAXMIN32));
235 
236 		major = DEVI(devi)->devi_major;
237 	} else {
238 		major = getmajor(dev);
239 		minor = getminor(dev);
240 
241 		/* makedevice32 */
242 		ino = (ino_t)((major << L_BITSMINOR32) | (minor & L_MAXMIN32));
243 
244 		/* make ino for VCHR different than VBLK */
245 		ino <<= 1;
246 		if (typ == VCHR)
247 			ino |= 1;
248 	}
249 
250 	ino += DV_ROOTINO + 1;		/* skew */
251 
252 	/*
253 	 * diagnose things a little early because adding the skew to a large
254 	 * minor number could roll over the major.
255 	 */
256 	if ((major >= (L_MAXMAJ32 >> 1)) && (warn == 0)) {
257 		warn = 1;
258 		cmn_err(CE_WARN, "%s: inode numbers are not unique", dvnm);
259 	}
260 
261 	return (ino);
262 }
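
/*
 * Worked example (illustrative only): composing the inode number for a
 * VCHR minor the way dv_mkino does.  This assumes the conventional
 * 32-bit dev_t layout (L_BITSMINOR32 == 18, L_MAXMIN32 == 0x3ffff) and
 * DV_ROOTINO == 2; consult sysmacros.h and dv_node.h if in doubt.
 *
 *	major_t maj = getmajor(dev);
 *	minor_t min = getminor(dev);
 *	ino_t ino;
 *
 *	ino = (ino_t)((maj << L_BITSMINOR32) | (min & L_MAXMIN32));
 *	ino <<= 1;			distinguish VCHR from VBLK
 *	ino |= 1;			odd for VCHR, even for VBLK
 *	ino += DV_ROOTINO + 1;		skew past the reserved root inode
 *
 * For example, major 27, minor 8 on a character device yields
 * ((27 << 18) | 8) * 2 + 1 + 3 = 14155796.
 */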
263 
264 /*
265  * dv_mkroot
266  *
267  * Build the first VDIR dv_node.
268  */
269 struct dv_node *
270 dv_mkroot(struct vfs *vfsp, dev_t devfsdev)
271 {
272 	struct dv_node *dv;
273 	struct vnode *vp;
274 
275 	ASSERT(ddi_root_node() != NULL);
276 	ASSERT(dv_node_cache != NULL);
277 
278 	dcmn_err3(("dv_mkroot\n"));
279 	dv = kmem_cache_alloc(dv_node_cache, KM_SLEEP);
280 	vp = DVTOV(dv);
281 	vn_reinit(vp);
282 	vp->v_flag = VROOT;
283 	vp->v_vfsp = vfsp;
284 	vp->v_type = VDIR;
285 	vp->v_rdev = devfsdev;
286 	vn_setops(vp, dv_vnodeops);
287 	vn_exists(vp);
288 
289 	dvroot = dv;
290 
291 	dv->dv_name = NULL;		/* not needed */
292 	dv->dv_namelen = 0;
293 
294 	dv->dv_devi = ddi_root_node();
295 
296 	dv->dv_ino = DV_ROOTINO;
297 	dv->dv_nlink = 2;		/* name + . (no dv_insert) */
298 	dv->dv_dotdot = dv;		/* .. == self */
299 	dv->dv_attrvp = NULLVP;
300 	dv->dv_attr = NULL;
301 	dv->dv_flags = DV_BUILD;
302 	dv->dv_priv = NULL;
303 	dv->dv_busy = 0;
304 	dv->dv_dflt_mode = 0;
305 
306 	return (dv);
307 }
308 
309 /*
310  * dv_mkdir
311  *
312  * Given a probed or attached nexus node, create a VDIR dv_node.
313  * No dv_attrvp is created at this point.
314  */
315 struct dv_node *
316 dv_mkdir(struct dv_node *ddv, dev_info_t *devi, char *nm)
317 {
318 	struct dv_node *dv;
319 	struct vnode *vp;
320 	size_t nmlen;
321 
322 	ASSERT((devi));
323 	dcmn_err4(("dv_mkdir: %s\n", nm));
324 
325 	dv = kmem_cache_alloc(dv_node_cache, KM_SLEEP);
326 	nmlen = strlen(nm) + 1;
327 	dv->dv_name = kmem_alloc(nmlen, KM_SLEEP);
328 	bcopy(nm, dv->dv_name, nmlen);
329 	dv->dv_namelen = nmlen - 1;	/* '\0' not included */
330 	vp = DVTOV(dv);
331 	vn_reinit(vp);
332 	vp->v_flag = 0;
333 	vp->v_vfsp = DVTOV(ddv)->v_vfsp;
334 	vp->v_type = VDIR;
335 	vp->v_rdev = DVTOV(ddv)->v_rdev;
336 	vn_setops(vp, vn_getops(DVTOV(ddv)));
337 	vn_exists(vp);
338 
339 	dv->dv_devi = devi;
340 	ndi_hold_devi(devi);
341 
342 	dv->dv_ino = dv_mkino(devi, VDIR, NODEV);
343 	dv->dv_nlink = 0;		/* updated on insert */
344 	dv->dv_dotdot = ddv;
345 	dv->dv_attrvp = NULLVP;
346 	dv->dv_attr = NULL;
347 	dv->dv_flags = DV_BUILD;
348 	dv->dv_priv = NULL;
349 	dv->dv_busy = 0;
350 	dv->dv_dflt_mode = 0;
351 
352 	return (dv);
353 }
354 
355 /*
356  * dv_mknod
357  *
358  * Given a minor node, create a VCHR or VBLK dv_node.
359  * No dv_attrvp is created at this point.
360  */
361 static struct dv_node *
362 dv_mknod(struct dv_node *ddv, dev_info_t *devi, char *nm,
363 	struct ddi_minor_data *dmd)
364 {
365 	struct dv_node *dv;
366 	struct vnode *vp;
367 	size_t nmlen;
368 
369 	dcmn_err4(("dv_mknod: %s\n", nm));
370 
371 	dv = kmem_cache_alloc(dv_node_cache, KM_SLEEP);
372 	nmlen = strlen(nm) + 1;
373 	dv->dv_name = kmem_alloc(nmlen, KM_SLEEP);
374 	bcopy(nm, dv->dv_name, nmlen);
375 	dv->dv_namelen = nmlen - 1;	/* no '\0' */
376 	vp = DVTOV(dv);
377 	vn_reinit(vp);
378 	vp->v_flag = 0;
379 	vp->v_vfsp = DVTOV(ddv)->v_vfsp;
380 	vp->v_type = dmd->ddm_spec_type == S_IFCHR ? VCHR : VBLK;
381 	vp->v_rdev = dmd->ddm_dev;
382 	vn_setops(vp, vn_getops(DVTOV(ddv)));
383 	vn_exists(vp);
384 
385 	ASSERT(MUTEX_HELD(&DEVI(devi)->devi_lock));
386 	dv->dv_devi = devi;
387 	DEVI(devi)->devi_ref++;
388 
389 	dv->dv_ino = dv_mkino(devi, vp->v_type, vp->v_rdev);
390 	dv->dv_nlink = 0;		/* updated on insert */
391 	dv->dv_dotdot = ddv;
392 	dv->dv_attrvp = NULLVP;
393 	dv->dv_attr = NULL;
394 	dv->dv_flags = 0;
395 
396 	if (dmd->type == DDM_INTERNAL_PATH)
397 		dv->dv_flags |= DV_INTERNAL;
398 	if (dmd->ddm_flags & DM_NO_FSPERM)
399 		dv->dv_flags |= DV_NO_FSPERM;
400 
401 	dv->dv_priv = dmd->ddm_node_priv;
402 	if (dv->dv_priv)
403 		dphold(dv->dv_priv);
404 
405 	/*
406 	 * Minors created with ddi_create_priv_minor_node can specify
407 	 * a default mode permission other than the devfs default.
408 	 */
409 	if (dv->dv_priv || dv->dv_flags & DV_NO_FSPERM) {
410 		dcmn_err5(("%s: dv_mknod default priv mode 0%o\n",
411 		    dv->dv_name, dmd->ddm_priv_mode));
412 		dv->dv_flags |= DV_DFLT_MODE;
413 		dv->dv_dflt_mode = dmd->ddm_priv_mode & S_IAMB;
414 	}
415 
416 	return (dv);
417 }
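
/*
 * Illustrative sketch (hypothetical driver code, not part of devfs): how
 * a minor ends up with DV_DFLT_MODE above.  A driver's attach(9E) routine
 * might create a privileged minor roughly as follows, via
 * ddi_create_priv_minor_node(9F); the minor name, privileges and mode
 * here are made up for the example:
 *
 *	if (ddi_create_priv_minor_node(dip, "ctl", S_IFCHR, 0, DDI_PSEUDO,
 *	    0, PRIV_SYS_CONFIG, PRIV_SYS_CONFIG, 0600) != DDI_SUCCESS)
 *		return (DDI_FAILURE);
 *
 * The resulting ddi_minor_data carries node privilege data and a 0600
 * ddm_priv_mode, so dv_mknod sets DV_DFLT_MODE and uses 0600 instead of
 * the dv_vattr_file default whenever no shadow node overrides it.
 */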
418 
419 /*
420  * dv_destroy
421  *
422  * Destroy what we created in dv_mkdir or dv_mknod.
423  * In the case of a *referenced* directory, do nothing.
424  */
425 /*ARGSUSED1*/
426 void
427 dv_destroy(struct dv_node *dv, uint_t flags)
428 {
429 	vnode_t *vp = DVTOV(dv);
430 	ASSERT(dv->dv_nlink == 0);		/* no references */
431 	ASSERT(dv->dv_next == NULL);		/* unlinked from directory */
432 
433 	dcmn_err4(("dv_destroy: %s\n", dv->dv_name));
434 
435 	/*
436 	 * We may be asked to unlink referenced directories.
437 	 * In this case, there is nothing to be done.
438 	 * The eventual memory free will be done in
439 	 * devfs_inactive.
440 	 */
441 	if (vp->v_count != 0) {
442 		ASSERT(vp->v_type == VDIR);
443 		ASSERT(flags & DV_CLEAN_FORCE);
444 		ASSERT(DV_STALE(dv));
445 		return;
446 	}
447 
448 	if (dv->dv_attrvp != NULLVP)
449 		VN_RELE(dv->dv_attrvp);
450 	if (dv->dv_attr != NULL)
451 		kmem_free(dv->dv_attr, sizeof (struct vattr));
452 	if (dv->dv_name != NULL)
453 		kmem_free(dv->dv_name, dv->dv_namelen + 1);
454 	if (dv->dv_devi != NULL) {
455 		ndi_rele_devi(dv->dv_devi);
456 	}
457 	if (dv->dv_priv != NULL) {
458 		dpfree(dv->dv_priv);
459 	}
460 
461 	kmem_cache_free(dv_node_cache, dv);
462 }
463 
464 /*
465  * Find and hold dv_node by name
466  */
467 struct dv_node *
468 dv_findbyname(struct dv_node *ddv, char *nm)
469 {
470 	struct dv_node	*dv;
471 	size_t		nmlen = strlen(nm);
472 
473 	ASSERT(RW_LOCK_HELD(&ddv->dv_contents));
474 	dcmn_err3(("dv_findbyname: %s\n", nm));
475 	for (dv = ddv->dv_dot; dv; dv = dv->dv_next) {
476 		if (dv->dv_namelen != nmlen)
477 			continue;
478 		if (strcmp(dv->dv_name, nm) == 0) {
479 			VN_HOLD(DVTOV(dv));
480 			return (dv);
481 		}
482 	}
483 	return (NULL);
484 }
485 
486 /*
487  * Inserts a new dv_node in a parent directory
488  */
489 void
490 dv_insert(struct dv_node *ddv, struct dv_node *dv)
491 {
492 	ASSERT(RW_WRITE_HELD(&ddv->dv_contents));
493 	ASSERT(DVTOV(ddv)->v_type == VDIR);
494 	ASSERT(ddv->dv_nlink >= 2);
495 	ASSERT(dv->dv_nlink == 0);
496 
497 	dcmn_err3(("dv_insert: %s\n", dv->dv_name));
498 
499 	dv->dv_dotdot = ddv;
500 	dv->dv_next = ddv->dv_dot;
501 	ddv->dv_dot = dv;
502 	if (DVTOV(dv)->v_type == VDIR) {
503 		ddv->dv_nlink++;	/* .. to containing directory */
504 		dv->dv_nlink = 2;	/* name + . */
505 	} else {
506 		dv->dv_nlink = 1;	/* name */
507 	}
508 }
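
/*
 * Link count example (illustrative): a directory starts life with
 * dv_nlink == 2 (its name plus "."), each child directory inserted by
 * dv_insert adds one more link for that child's "..", and leaf
 * VCHR/VBLK nodes add nothing to the parent's count.  So a nexus
 * directory holding three child directories and ten minor nodes has
 * dv_nlink == 2 + 3 == 5; dv_unlink below undoes the same arithmetic.
 */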
509 
510 /*
511  * Unlink a dv_node from a parent directory
512  */
513 void
514 dv_unlink(struct dv_node *ddv, struct dv_node *dv, struct dv_node **dv_pprev)
515 {
516 	/* verify linkage of arguments */
517 	ASSERT(ddv && dv && dv_pprev);
518 	ASSERT(dv->dv_dotdot == ddv);
519 	ASSERT(*dv_pprev == dv);
520 	ASSERT(RW_WRITE_HELD(&ddv->dv_contents));
521 	ASSERT(DVTOV(ddv)->v_type == VDIR);
522 
523 	dcmn_err3(("dv_unlink: %s\n", dv->dv_name));
524 
525 	if (DVTOV(dv)->v_type == VDIR) {
526 		ddv->dv_nlink--;	/* .. to containing directory */
527 		dv->dv_nlink -= 2;	/* name + . */
528 	} else {
529 		dv->dv_nlink -= 1;	/* name */
530 	}
531 	ASSERT(ddv->dv_nlink >= 2);
532 	ASSERT(dv->dv_nlink == 0);
533 
534 	/* update ddv->dv_dot/dv_next */
535 	*dv_pprev = dv->dv_next;
536 
537 	dv->dv_dotdot = NULL;
538 	dv->dv_next = NULL;
539 	dv->dv_dot = NULL;
540 }
541 
542 /*
543  * Merge devfs node specific information into an attribute structure.
544  *
545  * NOTE: specfs provides ATIME,MTIME,CTIME,SIZE,BLKSIZE,NBLOCKS on leaf node.
546  */
547 void
548 dv_vattr_merge(struct dv_node *dv, struct vattr *vap)
549 {
550 	struct vnode *vp = DVTOV(dv);
551 
552 	vap->va_nodeid = dv->dv_ino;
553 	vap->va_nlink = dv->dv_nlink;
554 
555 	if (vp->v_type == VDIR) {
556 		vap->va_rdev = 0;
557 		vap->va_fsid = vp->v_rdev;
558 	} else {
559 		vap->va_rdev = vp->v_rdev;
560 		vap->va_fsid = DVTOV(dv->dv_dotdot)->v_rdev;
561 		vap->va_type = vp->v_type;
562 		/* don't trust the shadow file type */
563 		vap->va_mode &= ~S_IFMT;
564 		if (vap->va_type == VCHR)
565 			vap->va_mode |= S_IFCHR;
566 		else
567 			vap->va_mode |= S_IFBLK;
568 	}
569 }
570 
571 /*
572  * Free a vsecattr
573  */
574 static void
575 dv_free_vsa(struct vsecattr *vsap)
576 {
577 	if (vsap->vsa_aclcnt > 0 && vsap->vsa_aclentp)
578 		kmem_free(vsap->vsa_aclentp,
579 		    vsap->vsa_aclcnt * sizeof (aclent_t));
580 	if (vsap->vsa_dfaclcnt > 0 && vsap->vsa_dfaclentp)
581 		kmem_free(vsap->vsa_dfaclentp,
582 		    vsap->vsa_dfaclcnt * sizeof (aclent_t));
583 }
584 
585 /*
586  * dv_shadow_node
587  *
588  * Given a VDIR dv_node, find/create the associated VDIR
589  * node in the shadow attribute filesystem.
590  *
591  * Given a VCHR/VBLK dv_node, find the associated VREG
592  * node in the shadow attribute filesystem.  These nodes
593  * are only created to persist non-default attributes.
594  * Lack of such a node implies the default permissions
595  * are sufficient.
596  *
597  * Managing the attribute file entries is slightly tricky (mostly
598  * because we can't intercept VN_HOLD and VN_RELE except on the last
599  * release).
600  *
601  * We assert that if the dv_attrvp pointer is non-NULL, it points
602  * to a singly-held (by us) vnode that represents the shadow entry
603  * in the underlying filesystem.  To avoid store-ordering issues,
604  * we assert that the pointer can only be tested under the dv_contents
605  * READERS lock.
606  */
607 
608 void
609 dv_shadow_node(
610 	struct vnode *dvp,	/* devfs parent directory vnode */
611 	char *nm,		/* name component */
612 	struct vnode *vp,	/* devfs vnode */
613 	struct pathname *pnp,	/* the path .. */
614 	struct vnode *rdir,	/* the root .. */
615 	struct cred *cred,	/* who's asking? */
616 	int flags)		/* optionally create shadow node */
617 {
618 	struct dv_node	*dv;	/* dv_node of named directory */
619 	struct vnode	*rdvp;	/* shadow parent directory vnode */
620 	struct vnode	*rvp;	/* shadow vnode */
621 	struct vnode	*rrvp;	/* realvp of shadow vnode */
622 	struct vattr	vattr;
623 	int		create_tried;
624 	int		error;
625 	mperm_t		mp;
626 	struct vsecattr	vsa;
627 
628 	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
629 	dv = VTODV(vp);
630 	dcmn_err3(("dv_shadow_node: name %s attr %p\n",
631 	    nm, (void *)dv->dv_attrvp));
632 
633 	if ((flags & DV_SHADOW_WRITE_HELD) == 0) {
634 		ASSERT(RW_READ_HELD(&dv->dv_contents));
635 		if (dv->dv_attrvp != NULLVP)
636 			return;
637 		if (!rw_tryupgrade(&dv->dv_contents)) {
638 			rw_exit(&dv->dv_contents);
639 			rw_enter(&dv->dv_contents, RW_WRITER);
640 			if (dv->dv_attrvp != NULLVP) {
641 				rw_downgrade(&dv->dv_contents);
642 				return;
643 			}
644 		}
645 	} else {
646 		ASSERT(RW_WRITE_HELD(&dv->dv_contents));
647 		if (dv->dv_attrvp != NULLVP)
648 			return;
649 	}
650 
651 	ASSERT(RW_WRITE_HELD(&dv->dv_contents) && dv->dv_attrvp == NULL);
652 
653 	rdvp = VTODV(dvp)->dv_attrvp;
654 	create_tried = 0;
655 lookup:
656 	if (rdvp && (dv->dv_flags & DV_NO_FSPERM) == 0) {
657 		error = VOP_LOOKUP(rdvp, nm, &rvp, pnp, LOOKUP_DIR, rdir, cred);
658 
659 		/* factor out the snode since we only want the attribute node */
660 		if ((error == 0) && (VOP_REALVP(rvp, &rrvp) == 0)) {
661 			VN_HOLD(rrvp);
662 			VN_RELE(rvp);
663 			rvp = rrvp;
664 		}
665 	} else
666 		error = EROFS;		/* no parent, no entry */
667 
668 	/*
669 	 * All we want is the permissions (and maybe ACLs and
670 	 * extended attributes), and we want to perform lookups
671 	 * by name.  Drivers occasionally change their minor
672 	 * number space.  If something changes, there's not
673 	 * much we can do about it here.
674 	 */
675 
676 	/* The shadow node checks out. We are done */
677 	if (error == 0) {
678 		dv->dv_attrvp = rvp;	/* with one hold */
679 
680 		/*
681 		 * Determine if we have (non-trivial) ACLs on this node.
682 		 * NB: This should be changed to call fs_acl_nontrivial for
683 		 * new ACE flavor ACLs.
684 		 */
685 		vsa.vsa_mask = VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT;
686 		error = VOP_GETSECATTR(rvp, &vsa, 0, cred);
687 		dv->dv_flags &= ~DV_ACL;
688 		if (error == 0) {
689 			if (vsa.vsa_aclcnt > MIN_ACL_ENTRIES) {
690 				dv->dv_flags |= DV_ACL;	/* non-trivial ACL */
691 			}
692 			dv_free_vsa(&vsa);
693 		}
694 
695 		/*
696 		 * If we have synced out the memory attributes, free
697 		 * them and switch back to using the persistent store.
698 		 */
699 		if (rvp && dv->dv_attr) {
700 			kmem_free(dv->dv_attr, sizeof (struct vattr));
701 			dv->dv_attr = NULL;
702 		}
703 		if ((flags & DV_SHADOW_WRITE_HELD) == 0)
704 			rw_downgrade(&dv->dv_contents);
705 		ASSERT(RW_LOCK_HELD(&dv->dv_contents));
706 		return;
707 	}
708 
709 	/*
710 	 * Failed to find attribute in persistent backing store,
711 	 * get default permission bits.  For minors not created by
712 	 * ddi_create_priv_minor_node(), use devfs defaults.
713 	 */
714 	if (vp->v_type == VDIR) {
715 		vattr = dv_vattr_dir;
716 	} else if (dv->dv_flags & DV_NO_FSPERM) {
717 		vattr = dv_vattr_priv;
718 	} else {
719 		/*
720 		 * look up perm bits from minor_perm
721 		 */
722 		vattr = dv_vattr_file;
723 		if (dev_minorperm(dv->dv_devi, dv->dv_name, &mp) == 0) {
724 			VATTR_MP_MERGE(vattr, mp);
725 			dcmn_err5(("%s: minor perm mode 0%o\n",
726 			    dv->dv_name, vattr.va_mode));
727 		} else if (dv->dv_flags & DV_DFLT_MODE) {
728 			ASSERT((dv->dv_dflt_mode & ~S_IAMB) == 0);
729 			vattr.va_mode &= ~S_IAMB;
730 			vattr.va_mode |= dv->dv_dflt_mode;
731 			dcmn_err5(("%s: priv mode 0%o\n",
732 			    dv->dv_name, vattr.va_mode));
733 		}
734 	}
735 
736 	dv_vattr_merge(dv, &vattr);
737 	gethrestime(&vattr.va_atime);
738 	vattr.va_mtime = vattr.va_atime;
739 	vattr.va_ctime = vattr.va_atime;
740 
741 	/*
742 	 * Try to create shadow dir. This is necessary in case
743 	 * we need to create a shadow leaf node later, when the user
744 	 * executes chmod.
745 	 */
746 	if ((error == ENOENT) && !create_tried) {
747 		switch (vp->v_type) {
748 		case VDIR:
749 			error = VOP_MKDIR(rdvp, nm, &vattr, &rvp, kcred);
750 			dsysdebug(error, ("vop_mkdir %s %s %d\n",
751 			    VTODV(dvp)->dv_name, nm, error));
752 			create_tried = 1;
753 			break;
754 
755 		case VCHR:
756 		case VBLK:
757 			/*
758 			 * Shadow nodes are only created on demand
759 			 */
760 			if (flags & DV_SHADOW_CREATE) {
761 				error = VOP_CREATE(rdvp, nm, &vattr, NONEXCL,
762 				    VREAD|VWRITE, &rvp, kcred, 0);
763 				dsysdebug(error, ("vop_create %s %s %d\n",
764 				    VTODV(dvp)->dv_name, nm, error));
765 				create_tried = 1;
766 			}
767 			break;
768 
769 		default:
770 			cmn_err(CE_PANIC, "devfs: %s: create", dvnm);
771 			/*NOTREACHED*/
772 		}
773 
774 		if (create_tried &&
775 		    ((error == 0) || (error == EEXIST))) {
776 			VN_RELE(rvp);
777 			goto lookup;
778 		}
779 	}
780 
781 	/* Store attribute in memory */
782 	if (dv->dv_attr == NULL) {
783 		dv->dv_attr = kmem_alloc(sizeof (struct vattr), KM_SLEEP);
784 		*(dv->dv_attr) = vattr;
785 	}
786 
787 	if ((flags & DV_SHADOW_WRITE_HELD) == 0)
788 		rw_downgrade(&dv->dv_contents);
789 	ASSERT(RW_LOCK_HELD(&dv->dv_contents));
790 }
791 
792 /*
793  * Given a devinfo node, and a name, returns the appropriate
794  * minor information for that named node, if it exists.
795  */
796 static int
797 dv_find_leafnode(dev_info_t *devi, char *minor_nm, struct ddi_minor_data *r_mi)
798 {
799 	struct ddi_minor_data *dmd;
800 
801 	ASSERT(i_ddi_node_state(devi) >= DS_ATTACHED);
802 	ASSERT(MUTEX_HELD(&DEVI(devi)->devi_lock));
803 
804 	dcmn_err3(("dv_find_leafnode: %s\n", minor_nm));
805 	for (dmd = DEVI(devi)->devi_minor; dmd; dmd = dmd->next) {
806 
807 		/*
808 		 * Skip alias nodes and nodes without a name.
809 		 */
810 		if ((dmd->type == DDM_ALIAS) || (dmd->ddm_name == NULL))
811 			continue;
812 
813 		dcmn_err4(("dv_find_leafnode: (%s,%s)\n",
814 			minor_nm, dmd->ddm_name));
815 		if (strcmp(minor_nm, dmd->ddm_name) == 0) {
816 			r_mi->ddm_dev = dmd->ddm_dev;
817 			r_mi->ddm_spec_type = dmd->ddm_spec_type;
818 			r_mi->type = dmd->type;
819 			r_mi->ddm_flags = dmd->ddm_flags;
820 			r_mi->ddm_node_priv = dmd->ddm_node_priv;
821 			r_mi->ddm_priv_mode = dmd->ddm_priv_mode;
822 			if (r_mi->ddm_node_priv)
823 				dphold(r_mi->ddm_node_priv);
824 			return (0);
825 		}
826 	}
827 
828 	dcmn_err3(("dv_find_leafnode: %s: ENOENT\n", minor_nm));
829 	return (ENOENT);
830 }
831 
832 /*
833  * Special handling for clone node:
834  *	Clone minor name is a driver name; the minor number will
835  *	be the major number of the driver. There is no minor
836  *	node under the clone driver, so we'll manufacture the
837  *	dev_t.
838  */
839 static struct dv_node *
840 dv_clone_mknod(struct dv_node *ddv, char *drvname)
841 {
842 	major_t	major;
843 	struct dv_node *dvp;
844 	char *devnm;
845 	struct ddi_minor_data *dmd;
846 
847 	/*
848 	 * Make sure drvname is a STREAMS driver. We load the driver,
849 	 * but don't attach to any instances. This makes stat(2)
850 	 * relatively cheap.
851 	 */
852 	major = ddi_name_to_major(drvname);
853 	if (major == (major_t)-1)
854 		return (NULL);
855 
856 	if (ddi_hold_driver(major) == NULL)
857 		return (NULL);
858 
859 	if (STREAMSTAB(major) == NULL) {
860 		ddi_rele_driver(major);
861 		return (NULL);
862 	}
863 
864 	ddi_rele_driver(major);
865 	devnm = kmem_alloc(MAXNAMELEN, KM_SLEEP);
866 	(void) snprintf(devnm, MAXNAMELEN, "clone@0:%s", drvname);
867 	dmd = kmem_zalloc(sizeof (*dmd), KM_SLEEP);
868 	dmd->ddm_dev = makedevice(clone_major, (minor_t)major);
869 	dmd->ddm_spec_type = S_IFCHR;
870 	dvp = dv_mknod(ddv, clone_dip, devnm, dmd);
871 	kmem_free(dmd, sizeof (*dmd));
872 	kmem_free(devnm, MAXNAMELEN);
873 	return (dvp);
874 }
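
/*
 * Example (illustrative): looking up /devices/pseudo/clone@0:tcp lands
 * here with drvname "tcp".  dv_clone_mknod resolves "tcp" to its major
 * number (say 42 - the value is system dependent), verifies it is a
 * STREAMS driver, and manufactures a VCHR node named "clone@0:tcp" with
 * dev_t makedevice(clone_major, 42).  Opening that node goes through the
 * clone driver, which hands back a fresh minor of the tcp driver.
 */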
875 
876 /*
877  * Given the parent directory node, and a name in it, returns the
878  * named dv_node to the caller (as a vnode).
879  *
880  * (We need pnp and rdir for doing shadow lookups; they can be NULL)
881  */
882 int
883 dv_find(struct dv_node *ddv, char *nm, struct vnode **vpp, struct pathname *pnp,
884 	struct vnode *rdir, struct cred *cred, uint_t ndi_flags)
885 {
886 	extern int isminiroot;	/* see modctl.c */
887 
888 	int rv = 0, was_busy = 0, nmlen;
889 	struct vnode *vp;
890 	struct dv_node *dv, *dup;
891 	dev_info_t *pdevi, *devi = NULL;
892 	char *mnm;
893 	struct ddi_minor_data *dmd;
894 
895 	dcmn_err3(("dv_find %s\n", nm));
896 
897 	rw_enter(&ddv->dv_contents, RW_READER);
898 start:
899 	if (DV_STALE(ddv)) {
900 		rw_exit(&ddv->dv_contents);
901 		return (ESTALE);
902 	}
903 
904 	/*
905 	 * Empty name or ., return node itself.
906 	 */
907 	nmlen = strlen(nm);
908 	if ((nmlen == 0) || ((nmlen == 1) && (nm[0] == '.'))) {
909 		*vpp = DVTOV(ddv);
910 		rw_exit(&ddv->dv_contents);
911 		VN_HOLD(*vpp);
912 		return (0);
913 	}
914 
915 	/*
916 	 * .., return the parent directory
917 	 */
918 	if ((nmlen == 2) && (strcmp(nm, "..") == 0)) {
919 		*vpp = DVTOV(ddv->dv_dotdot);
920 		rw_exit(&ddv->dv_contents);
921 		VN_HOLD(*vpp);
922 		return (0);
923 	}
924 
925 	/*
926 	 * Fail anything without a valid device name component
927 	 */
928 	if (nm[0] == '@' || nm[0] == ':') {
929 		dcmn_err3(("devfs: no driver '%s'\n", nm));
930 		rw_exit(&ddv->dv_contents);
931 		return (ENOENT);
932 	}
933 
934 	/*
935 	 * So, now we have to deal with the trickier stuff.
936 	 *
937 	 * (a) search the existing list of dv_nodes on this directory
938 	 */
939 	if ((dv = dv_findbyname(ddv, nm)) != NULL) {
940 founddv:
941 		ASSERT(RW_LOCK_HELD(&ddv->dv_contents));
942 		rw_enter(&dv->dv_contents, RW_READER);
943 		vp = DVTOV(dv);
944 		if ((dv->dv_attrvp != NULLVP) ||
945 		    (vp->v_type != VDIR && dv->dv_attr != NULL)) {
946 			/*
947 			 * Common case - we already have attributes
948 			 */
949 			rw_exit(&dv->dv_contents);
950 			rw_exit(&ddv->dv_contents);
951 			goto found;
952 		}
953 
954 		/*
955 		 * No attribute vp, try and build one.
956 		 */
957 		dv_shadow_node(DVTOV(ddv), nm, vp, pnp, rdir, cred, 0);
958 		rw_exit(&dv->dv_contents);
959 		rw_exit(&ddv->dv_contents);
960 		goto found;
961 	}
962 
963 	/*
964 	 * (b) Search the child devinfo nodes of our parent directory,
965 	 * looking for the named node.  If we find it, build a new
966 	 * node, then grab the writers lock, search the directory again
967 	 * and, if it's still not there, insert it.
968 	 *
969 	 * We drop the devfs locks before accessing the device tree.
970 	 * Take care to mark the node BUSY so that a forced devfs_clean
971 	 * doesn't mark the directory node stale.
972 	 *
973 	 * Also, check if we are called as part of devfs_clean or
974 	 * reset_perm. If so, simply return not found because there
975 	 * is nothing to clean.
976 	 */
977 	if (tsd_get(devfs_clean_key)) {
978 		rw_exit(&ddv->dv_contents);
979 		return (ENOENT);
980 	}
981 
982 	/*
983 	 * We could be either READ or WRITE locked at
984 	 * this point. Upgrade if we are read locked.
985 	 */
986 	ASSERT(RW_LOCK_HELD(&ddv->dv_contents));
987 	if (rw_read_locked(&ddv->dv_contents) &&
988 	    !rw_tryupgrade(&ddv->dv_contents)) {
989 		rw_exit(&ddv->dv_contents);
990 		rw_enter(&ddv->dv_contents, RW_WRITER);
991 		/*
992 		 * Things may have changed when we dropped
993 		 * the contents lock, so start from top again
994 		 */
995 		goto start;
996 	}
997 	ddv->dv_busy++;		/* mark busy before dropping lock */
998 	was_busy++;
999 	rw_exit(&ddv->dv_contents);
1000 
1001 	pdevi = ddv->dv_devi;
1002 	ASSERT(pdevi != NULL);
1003 
1004 	mnm = strchr(nm, ':');
1005 	if (mnm)
1006 		*mnm = (char)0;
1007 
1008 	/*
1009 	 * Configure one nexus child; this will call the nexus's bus_ops.
1010 	 * If successful, devi is held upon returning.
1011 	 * Note: devfs lookup should not be configuring grandchildren.
1012 	 */
1013 	ASSERT((ndi_flags & NDI_CONFIG) == 0);
1014 
1015 	rv = ndi_devi_config_one(pdevi, nm, &devi, ndi_flags | NDI_NO_EVENT);
1016 	if (mnm)
1017 		*mnm = ':';
1018 	if (rv != NDI_SUCCESS) {
1019 		rv = ENOENT;
1020 		goto notfound;
1021 	}
1022 
1023 	/*
1024 	 * Don't make vhci clients visible under phci, unless we
1025 	 * are in miniroot.
1026 	 */
1027 	if (isminiroot == 0 && ddi_get_parent(devi) != pdevi) {
1028 		ndi_rele_devi(devi);
1029 		rv = ENOENT;
1030 		goto notfound;
1031 	}
1032 
1033 	ASSERT(devi && (i_ddi_node_state(devi) >= DS_ATTACHED));
1034 
1035 	/*
1036 	 * Invalidate cache to notice newly created minor nodes.
1037 	 */
1038 	rw_enter(&ddv->dv_contents, RW_WRITER);
1039 	ddv->dv_flags |= DV_BUILD;
1040 	rw_exit(&ddv->dv_contents);
1041 
1042 	/*
1043 	 * mkdir for nexus drivers and leaf nodes as well.  If we are racing
1044 	 * and create a duplicate, the duplicate will be destroyed below.
1045 	 */
1046 	if (mnm == NULL) {
1047 		dv = dv_mkdir(ddv, devi, nm);
1048 	} else {
1049 		/*
1050 		 * For clone minors, load the driver indicated by minor name.
1051 		 */
1052 		mutex_enter(&DEVI(devi)->devi_lock);
1053 		if (devi == clone_dip) {
1054 			dv = dv_clone_mknod(ddv, mnm + 1);
1055 		} else {
1056 			/*
1057 			 * Find minor node and make a dv_node
1058 			 */
1059 			dmd = kmem_zalloc(sizeof (*dmd), KM_SLEEP);
1060 			if (dv_find_leafnode(devi, mnm + 1, dmd) == 0) {
1061 				dv = dv_mknod(ddv, devi, nm, dmd);
1062 				if (dmd->ddm_node_priv)
1063 					dpfree(dmd->ddm_node_priv);
1064 			}
1065 			kmem_free(dmd, sizeof (*dmd));
1066 		}
1067 		mutex_exit(&DEVI(devi)->devi_lock);
1068 	}
1069 	/*
1070 	 * Release hold from ndi_devi_config_one()
1071 	 */
1072 	ndi_rele_devi(devi);
1073 
1074 	if (dv == NULL) {
1075 		rv = ENOENT;
1076 		goto notfound;
1077 	}
1078 
1079 	/*
1080 	 * We have released the dv_contents lock, need to check
1081 	 * if another thread already created a duplicate node
1082 	 */
1083 	rw_enter(&ddv->dv_contents, RW_WRITER);
1084 	if ((dup = dv_findbyname(ddv, nm)) == NULL) {
1085 		dv_insert(ddv, dv);
1086 	} else {
1087 		/*
1088 		 * Duplicate found, use the existing node
1089 		 */
1090 		VN_RELE(DVTOV(dv));
1091 		dv_destroy(dv, 0);
1092 		dv = dup;
1093 	}
1094 	goto founddv;
1095 	/*NOTREACHED*/
1096 
1097 found:
1098 	/*
1099 	 * Skip non-kernel lookups of internal nodes.
1100 	 * This use of kcred to distinguish between user and
1101 	 * internal kernel lookups is unfortunate.  The information
1102 	 * provided by the seg argument to lookupnameat should
1103 	 * evolve into a lookup flag for filesystems that need
1104 	 * this distinction.
1105 	 */
1106 	if ((dv->dv_flags & DV_INTERNAL) && (cred != kcred)) {
1107 		VN_RELE(vp);
1108 		rv = ENOENT;
1109 		goto notfound;
1110 	}
1111 
1112 	dcmn_err2(("dv_find: returning vp for nm %s\n", nm));
1113 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
1114 		/*
1115 		 * If vnode is a device, return special vnode instead
1116 		 * (though it knows all about -us- via sp->s_realvp,
1117 		 * sp->s_devvp, and sp->s_dip)
1118 		 */
1119 		*vpp = specvp_devfs(vp, vp->v_rdev, vp->v_type, cred,
1120 			dv->dv_devi);
1121 		VN_RELE(vp);
1122 		if (*vpp == NULLVP)
1123 			rv = ENOSYS;
1124 	} else
1125 		*vpp = vp;
1126 
1127 notfound:
1128 	rw_enter(&ddv->dv_contents, RW_WRITER);
1129 	if (was_busy)
1130 		ddv->dv_busy--;
1131 	rw_exit(&ddv->dv_contents);
1132 	return (rv);
1133 }
1134 
1135 /*
1136  * The given directory node is out-of-date; that is, it has been
1137  * marked as needing to be rebuilt, possibly because some new devinfo
1138  * node has come into existence, or possibly because this is the first
1139  * time we've been here.
1140  */
1141 void
1142 dv_filldir(struct dv_node *ddv)
1143 {
1144 	struct dv_node *dv;
1145 	dev_info_t *devi, *pdevi;
1146 	struct ddi_minor_data *dmd;
1147 	char devnm[MAXNAMELEN];
1148 	int circ;
1149 
1150 	ASSERT(DVTOV(ddv)->v_type == VDIR);
1151 	ASSERT(RW_WRITE_HELD(&ddv->dv_contents));
1152 	ASSERT(ddv->dv_flags & DV_BUILD);
1153 
1154 	dcmn_err3(("dv_filldir: %s\n", ddv->dv_name));
1155 	if (DV_STALE(ddv))
1156 		return;
1157 	pdevi = ddv->dv_devi;
1158 
1159 	if (ndi_devi_config(pdevi, NDI_NO_EVENT) != NDI_SUCCESS) {
1160 		dcmn_err3(("dv_filldir: config error %s\n",
1161 			ddv->dv_name));
1162 	}
1163 
1164 	ndi_devi_enter(pdevi, &circ);
1165 	for (devi = ddi_get_child(pdevi); devi;
1166 	    devi = ddi_get_next_sibling(devi)) {
1167 		if (i_ddi_node_state(devi) < DS_PROBED)
1168 			continue;
1169 
1170 		dcmn_err3(("dv_filldir: node %s\n", ddi_node_name(devi)));
1171 
1172 		mutex_enter(&DEVI(devi)->devi_lock);
1173 		for (dmd = DEVI(devi)->devi_minor; dmd; dmd = dmd->next) {
1174 			char *addr;
1175 
1176 			/*
1177 			 * Skip alias nodes, internal nodes, and nodes
1178 			 * without a name.  We allow DDM_DEFAULT nodes
1179 			 * to appear in readdir.
1180 			 */
1181 			if ((dmd->type == DDM_ALIAS) ||
1182 			    (dmd->type == DDM_INTERNAL_PATH) ||
1183 			    (dmd->ddm_name == NULL))
1184 				continue;
1185 
1186 			addr = ddi_get_name_addr(devi);
1187 			if (addr && *addr)
1188 				(void) sprintf(devnm, "%s@%s:%s",
1189 				    ddi_node_name(devi), addr, dmd->ddm_name);
1190 			else
1191 				(void) sprintf(devnm, "%s:%s",
1192 				    ddi_node_name(devi), dmd->ddm_name);
1193 
1194 			if ((dv = dv_findbyname(ddv, devnm)) != NULL) {
1195 				/* dv_node already exists */
1196 				VN_RELE(DVTOV(dv));
1197 				continue;
1198 			}
1199 
1200 			dv = dv_mknod(ddv, devi, devnm, dmd);
1201 			dv_insert(ddv, dv);
1202 			VN_RELE(DVTOV(dv));
1203 		}
1204 		mutex_exit(&DEVI(devi)->devi_lock);
1205 
1206 		(void) ddi_deviname(devi, devnm);
1207 		if ((dv = dv_findbyname(ddv, devnm + 1)) == NULL) {
1208 			/* directory doesn't exist */
1209 			dv = dv_mkdir(ddv, devi, devnm + 1);
1210 			dv_insert(ddv, dv);
1211 		}
1212 		VN_RELE(DVTOV(dv));
1213 	}
1214 	ndi_devi_exit(pdevi, circ);
1215 
1216 	ddv->dv_flags &= ~DV_BUILD;
1217 }
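
/*
 * Example (illustrative): for a disk instance with node name "sd",
 * unit-address "1,0" and minor names "a" and "a,raw", dv_filldir creates
 * the leaf nodes "sd@1,0:a" and "sd@1,0:a,raw" and, from ddi_deviname
 * ("/sd@1,0" with the leading slash skipped), the per-instance directory
 * "sd@1,0" - all inserted into the parent nexus directory.
 */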
1218 
1219 /*
1220  * Given a directory node, clean out all the nodes beneath.
1221  *
1222  * VDIR:	Reinvoke to clean them, then delete the directory.
1223  * VCHR, VBLK:	Just blow them away.
1224  *
1225  * Mark the directories touched as in need of a rebuild, in case
1226  * we fall over part way through. When DV_CLEAN_FORCE is specified,
1227  * we mark referenced empty directories as stale to facilitate DR.
1228  */
1229 int
1230 dv_cleandir(struct dv_node *ddv, char *devnm, uint_t flags)
1231 {
1232 	struct dv_node *dv;
1233 	struct dv_node **pprev, **npprev;
1234 	struct vnode *vp;
1235 	int busy = 0;
1236 
1237 	dcmn_err3(("dv_cleandir: %s\n", ddv->dv_name));
1238 
1239 	if (!(flags & DV_CLEANDIR_LCK))
1240 		rw_enter(&ddv->dv_contents, RW_WRITER);
1241 	for (pprev = &ddv->dv_dot, dv = *pprev; dv;
1242 	    pprev = npprev, dv = *pprev) {
1243 		npprev = &dv->dv_next;
1244 
1245 		/*
1246 		 * If devnm is specified, the non-minor portion of the
1247 		 * name must match devnm.
1248 		 */
1249 		if (devnm &&
1250 		    (strncmp(devnm, dv->dv_name, strlen(devnm)) ||
1251 		    (dv->dv_name[strlen(devnm)] != ':' &&
1252 		    dv->dv_name[strlen(devnm)] != '\0')))
1253 			continue;
1254 
1255 		/* check type of what we are cleaning */
1256 		vp = DVTOV(dv);
1257 		if (vp->v_type == VDIR) {
1258 			/* recurse on directories */
1259 			rw_enter(&dv->dv_contents, RW_WRITER);
1260 			if (dv_cleandir(dv, NULL,
1261 			    flags | DV_CLEANDIR_LCK) == EBUSY) {
1262 				rw_exit(&dv->dv_contents);
1263 				goto set_busy;
1264 			}
1265 
1266 			/* A clean directory is an empty directory... */
1267 			ASSERT(dv->dv_nlink == 2);
1268 			mutex_enter(&vp->v_lock);
1269 			if (vp->v_count > 0) {
1270 				/*
1271 				 * ... but an empty directory can still have
1272 				 * references to it. If we have dv_busy or
1273 				 * DV_CLEAN_FORCE is *not* specified then a
1274 				 * referenced directory is considered busy.
1275 				 */
1276 				if (dv->dv_busy || !(flags & DV_CLEAN_FORCE)) {
1277 					mutex_exit(&vp->v_lock);
1278 					rw_exit(&dv->dv_contents);
1279 					goto set_busy;
1280 				}
1281 
1282 				/*
1283 				 * Mark referenced directory stale so that DR
1284 				 * will succeed even if a shell has
1285 				 * /devices/xxx as current directory (causing
1286 				 * VN_HOLD reference to an empty directory).
1287 				 */
1288 				ASSERT(!DV_STALE(dv));
1289 				ndi_rele_devi(dv->dv_devi);
1290 				dv->dv_devi = NULL;	/* mark DV_STALE */
1291 			}
1292 		} else {
1293 			ASSERT((vp->v_type == VCHR) || (vp->v_type == VBLK));
1294 			ASSERT(dv->dv_nlink == 1);	/* no hard links */
1295 			mutex_enter(&vp->v_lock);
1296 			if (vp->v_count > 0) {
1297 				mutex_exit(&vp->v_lock);
1298 				goto set_busy;
1299 			}
1300 		}
1301 
1302 		/* unlink from directory */
1303 		dv_unlink(ddv, dv, pprev);
1304 
1305 		/* drop locks */
1306 		mutex_exit(&vp->v_lock);
1307 		if (vp->v_type == VDIR)
1308 			rw_exit(&dv->dv_contents);
1309 
1310 		/* destroy vnode if ref count is zero */
1311 		if (vp->v_count == 0)
1312 			dv_destroy(dv, flags);
1313 
1314 		/* pointer to previous stays unchanged */
1315 		npprev = pprev;
1316 		continue;
1317 
1318 		/*
1319 		 * If devnm is not NULL we return immediately on busy,
1320 		 * otherwise we continue destroying unused dv_node's.
1321 		 */
1322 set_busy:	busy++;
1323 		if (devnm)
1324 			break;
1325 	}
1326 
1327 	/*
1328 	 * This code may be invoked to inform devfs that a new node has
1329 	 * been created in the kernel device tree. So we always set
1330 	 * the DV_BUILD flag to allow the next dv_filldir() to pick
1331 	 * up the new devinfo nodes.
1332 	 */
1333 	ddv->dv_flags |= DV_BUILD;
1334 
1335 	if (!(flags & DV_CLEANDIR_LCK))
1336 		rw_exit(&ddv->dv_contents);
1337 
1338 	return (busy ? EBUSY : 0);
1339 }
1340 
1341 /*
1342  * Walk through the devfs hierarchy, correcting the permissions of
1343  * devices with default permissions that do not match those specified
1344  * by minor perm.  For now, this can only be done for all drivers at once.
1345  */
1346 static int
1347 dv_reset_perm_dir(struct dv_node *ddv, uint_t flags)
1348 {
1349 	struct dv_node *dv, *next = NULL;
1350 	struct vnode *vp;
1351 	int retval = 0;
1352 	struct vattr *attrp;
1353 	mperm_t mp;
1354 	char *nm;
1355 	uid_t old_uid;
1356 	gid_t old_gid;
1357 	mode_t old_mode;
1358 
1359 	rw_enter(&ddv->dv_contents, RW_WRITER);
1360 	for (dv = ddv->dv_dot; dv; dv = next) {
1361 		int error = 0;
1362 		next = dv->dv_next;
1363 		nm = dv->dv_name;
1364 
1365 		rw_enter(&dv->dv_contents, RW_READER);
1366 		vp = DVTOV(dv);
1367 		if (vp->v_type == VDIR) {
1368 			rw_exit(&dv->dv_contents);
1369 			if (dv_reset_perm_dir(dv, flags) != 0) {
1370 				error = EBUSY;
1371 			}
1372 		} else {
1373 			ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);
1374 
1375 			/*
1376 			 * Check for permissions from minor_perm
1377 			 * If there are none, we're done
1378 			 */
1379 			rw_exit(&dv->dv_contents);
1380 			if (dev_minorperm(dv->dv_devi, nm, &mp) != 0)
1381 				continue;
1382 
1383 			rw_enter(&dv->dv_contents, RW_READER);
1384 
1385 			/*
1386 			 * Allow a node's permissions to be altered
1387 			 * permanently from the defaults by chmod,
1388 			 * using the shadow node as backing store.
1389 			 * Otherwise, update node to minor_perm permissions.
1390 			 */
1391 			if (dv->dv_attrvp == NULLVP) {
1392 				/*
1393 				 * No attribute vp, try to find one.
1394 				 */
1395 				dv_shadow_node(DVTOV(ddv), nm, vp,
1396 					NULL, NULLVP, kcred, 0);
1397 			}
1398 			if (dv->dv_attrvp != NULLVP || dv->dv_attr == NULL) {
1399 				rw_exit(&dv->dv_contents);
1400 				continue;
1401 			}
1402 
1403 			attrp = dv->dv_attr;
1404 
1405 			if (VATTRP_MP_CMP(attrp, mp) == 0) {
1406 				dcmn_err5(("%s: no perm change: "
1407 				    "%d %d 0%o\n", nm, attrp->va_uid,
1408 				    attrp->va_gid, attrp->va_mode));
1409 				rw_exit(&dv->dv_contents);
1410 				continue;
1411 			}
1412 
1413 			old_uid = attrp->va_uid;
1414 			old_gid = attrp->va_gid;
1415 			old_mode = attrp->va_mode;
1416 
1417 			VATTRP_MP_MERGE(attrp, mp);
1418 			mutex_enter(&vp->v_lock);
1419 			if (vp->v_count > 0) {
1420 				error = EBUSY;
1421 			}
1422 			mutex_exit(&vp->v_lock);
1423 
1424 			dcmn_err5(("%s: perm %d/%d/0%o -> %d/%d/0%o (%d)\n",
1425 			    nm, old_uid, old_gid, old_mode, attrp->va_uid,
1426 			    attrp->va_gid, attrp->va_mode, error));
1427 
1428 			rw_exit(&dv->dv_contents);
1429 		}
1430 
1431 		if (error != 0) {
1432 			retval = error;
1433 		}
1434 	}
1435 
1436 	ddv->dv_flags |= DV_BUILD;
1437 
1438 	rw_exit(&ddv->dv_contents);
1439 
1440 	return (retval);
1441 }
1442 
1443 int
1444 devfs_reset_perm(uint_t flags)
1445 {
1446 	struct dv_node *dvp;
1447 	int rval;
1448 
1449 	if ((dvp = devfs_dip_to_dvnode(ddi_root_node())) == NULL)
1450 		return (0);
1451 
1452 	VN_HOLD(DVTOV(dvp));
1453 	rval = dv_reset_perm_dir(dvp, flags);
1454 	VN_RELE(DVTOV(dvp));
1455 	return (rval);
1456 }
1457 
1458 /*
1459  * Clean up dangling devfs shadow nodes for removed
1460  * drivers so that, in the event the driver is re-added
1461  * to the system, newly created nodes won't incorrectly
1462  * pick up these stale shadow node permissions.
1463  *
1464  * This is accomplished by walking down the pathname
1465  * to the directory, starting at the root's attribute
1466  * node, then removing all minors matching the specified
1467  * node name.  Care must be taken to remove all entries
1468  * in a directory before the directory itself, so that
1469  * the clean-up associated with rem_drv'ing a nexus driver
1470  * does not inadvertently result in an inconsistent
1471  * filesystem underlying devfs.
1472  */
1473 
1474 static int
1475 devfs_remdrv_rmdir(vnode_t *dirvp, const char *dir, vnode_t *rvp)
1476 {
1477 	int error;
1478 	vnode_t *vp;
1479 	int eof;
1480 	struct iovec iov;
1481 	struct uio uio;
1482 	struct dirent64 *dp;
1483 	dirent64_t *dbuf;
1484 	size_t dlen;
1485 	size_t dbuflen;
1486 	int ndirents = 64;
1487 	char *nm;
1488 
1489 	VN_HOLD(dirvp);
1490 
1491 	dlen = ndirents * (sizeof (*dbuf));
1492 	dbuf = kmem_alloc(dlen, KM_SLEEP);
1493 
1494 	uio.uio_iov = &iov;
1495 	uio.uio_iovcnt = 1;
1496 	uio.uio_segflg = UIO_SYSSPACE;
1497 	uio.uio_fmode = 0;
1498 	uio.uio_extflg = UIO_COPY_CACHED;
1499 	uio.uio_loffset = 0;
1500 	uio.uio_llimit = MAXOFFSET_T;
1501 
1502 	eof = 0;
1503 	error = 0;
1504 	while (!error && !eof) {
1505 		uio.uio_resid = dlen;
1506 		iov.iov_base = (char *)dbuf;
1507 		iov.iov_len = dlen;
1508 
1509 		(void) VOP_RWLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1510 		error = VOP_READDIR(dirvp, &uio, kcred, &eof);
1511 		VOP_RWUNLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1512 
1513 		dbuflen = dlen - uio.uio_resid;
1514 
1515 		if (error || dbuflen == 0)
1516 			break;
1517 
1518 		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
1519 			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
1520 
1521 			nm = dp->d_name;
1522 
1523 			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
1524 				continue;
1525 
1526 			error = VOP_LOOKUP(dirvp,
1527 				nm, &vp, NULL, 0, NULL, kcred);
1528 
1529 			dsysdebug(error,
1530 			    ("rem_drv %s/%s lookup (%d)\n",
1531 			    dir, nm, error));
1532 
1533 			if (error)
1534 				continue;
1535 
1536 			ASSERT(vp->v_type == VDIR ||
1537 				vp->v_type == VCHR || vp->v_type == VBLK);
1538 
1539 			if (vp->v_type == VDIR) {
1540 				error = devfs_remdrv_rmdir(vp, nm, rvp);
1541 				if (error == 0) {
1542 					error = VOP_RMDIR(dirvp,
1543 					    (char *)nm, rvp, kcred);
1544 					dsysdebug(error,
1545 					    ("rem_drv %s/%s rmdir (%d)\n",
1546 					    dir, nm, error));
1547 				}
1548 			} else {
1549 				error = VOP_REMOVE(dirvp, (char *)nm, kcred);
1550 				dsysdebug(error,
1551 				    ("rem_drv %s/%s remove (%d)\n",
1552 				    dir, nm, error));
1553 			}
1554 
1555 			VN_RELE(vp);
1556 			if (error) {
1557 				goto exit;
1558 			}
1559 		}
1560 	}
1561 
1562 exit:
1563 	VN_RELE(dirvp);
1564 	kmem_free(dbuf, dlen);
1565 
1566 	return (error);
1567 }
1568 
1569 int
1570 devfs_remdrv_cleanup(const char *dir, const char *nodename)
1571 {
1572 	int error;
1573 	vnode_t *vp;
1574 	vnode_t *dirvp;
1575 	int eof;
1576 	struct iovec iov;
1577 	struct uio uio;
1578 	struct dirent64 *dp;
1579 	dirent64_t *dbuf;
1580 	size_t dlen;
1581 	size_t dbuflen;
1582 	int ndirents = 64;
1583 	int nodenamelen = strlen(nodename);
1584 	char *nm;
1585 	struct pathname pn;
1586 	vnode_t *rvp;		/* root node of the underlying attribute fs */
1587 
1588 	dcmn_err5(("devfs_remdrv_cleanup: %s %s\n", dir, nodename));
1589 
1590 	if (error = pn_get((char *)dir, UIO_SYSSPACE, &pn))
1591 		return (0);
1592 
1593 	rvp = dvroot->dv_attrvp;
1594 	ASSERT(rvp != NULL);
1595 	VN_HOLD(rvp);
1596 
1597 	pn_skipslash(&pn);
1598 	dirvp = rvp;
1599 	VN_HOLD(dirvp);
1600 
1601 	nm = kmem_alloc(MAXNAMELEN, KM_SLEEP);
1602 
1603 	while (pn_pathleft(&pn)) {
1604 		ASSERT(dirvp->v_type == VDIR);
1605 		(void) pn_getcomponent(&pn, nm);
1606 		ASSERT((strcmp(nm, ".") != 0) && (strcmp(nm, "..") != 0));
1607 		error = VOP_LOOKUP(dirvp, nm, &vp, NULL, 0, rvp, kcred);
1608 		if (error) {
1609 			dcmn_err5(("remdrv_cleanup %s lookup error %d\n",
1610 			    nm, error));
1611 			VN_RELE(dirvp);
1612 			if (dirvp != rvp)
1613 				VN_RELE(rvp);
1614 			pn_free(&pn);
1615 			kmem_free(nm, MAXNAMELEN);
1616 			return (0);
1617 		}
1618 		VN_RELE(dirvp);
1619 		dirvp = vp;
1620 		pn_skipslash(&pn);
1621 	}
1622 
1623 	ASSERT(dirvp->v_type == VDIR);
1624 	if (dirvp != rvp)
1625 		VN_RELE(rvp);
1626 	pn_free(&pn);
1627 	kmem_free(nm, MAXNAMELEN);
1628 
1629 	dlen = ndirents * (sizeof (*dbuf));
1630 	dbuf = kmem_alloc(dlen, KM_SLEEP);
1631 
1632 	uio.uio_iov = &iov;
1633 	uio.uio_iovcnt = 1;
1634 	uio.uio_segflg = UIO_SYSSPACE;
1635 	uio.uio_fmode = 0;
1636 	uio.uio_extflg = UIO_COPY_CACHED;
1637 	uio.uio_loffset = 0;
1638 	uio.uio_llimit = MAXOFFSET_T;
1639 
1640 	eof = 0;
1641 	error = 0;
1642 	while (!error && !eof) {
1643 		uio.uio_resid = dlen;
1644 		iov.iov_base = (char *)dbuf;
1645 		iov.iov_len = dlen;
1646 
1647 		(void) VOP_RWLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1648 		error = VOP_READDIR(dirvp, &uio, kcred, &eof);
1649 		VOP_RWUNLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1650 
1651 		dbuflen = dlen - uio.uio_resid;
1652 
1653 		if (error || dbuflen == 0)
1654 			break;
1655 
1656 		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
1657 			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
1658 
1659 			nm = dp->d_name;
1660 
1661 			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
1662 				continue;
1663 
1664 			if (strncmp(nm, nodename, nodenamelen) != 0)
1665 				continue;
1666 
1667 			error = VOP_LOOKUP(dirvp, nm, &vp,
1668 			    NULL, 0, NULL, kcred);
1669 
1670 			dsysdebug(error,
1671 			    ("rem_drv %s/%s lookup (%d)\n",
1672 			    dir, nm, error));
1673 
1674 			if (error)
1675 				continue;
1676 
1677 			ASSERT(vp->v_type == VDIR ||
1678 				vp->v_type == VCHR || vp->v_type == VBLK);
1679 
1680 			if (vp->v_type == VDIR) {
1681 				error = devfs_remdrv_rmdir(vp, nm, rvp);
1682 				if (error == 0) {
1683 					error = VOP_RMDIR(dirvp,
1684 					    (char *)nm, rvp, kcred);
1685 					dsysdebug(error,
1686 					    ("rem_drv %s/%s rmdir (%d)\n",
1687 					    dir, nm, error));
1688 				}
1689 			} else {
1690 				error = VOP_REMOVE(dirvp, (char *)nm, kcred);
1691 				dsysdebug(error,
1692 				    ("rem_drv %s/%s remove (%d)\n",
1693 				    dir, nm, error));
1694 			}
1695 
1696 			VN_RELE(vp);
1697 			if (error)
1698 				goto exit;
1699 		}
1700 	}
1701 
1702 exit:
1703 	VN_RELE(dirvp);
1704 
1705 	kmem_free(dbuf, dlen);
1706 
1707 	return (0);
1708 }
1709 
1710 struct dv_list {
1711 	struct dv_node	*dv;
1712 	struct dv_list	*next;
1713 };
1714 
1715 void
1716 dv_walk(
1717 	struct dv_node	*ddv,
1718 	char		*devnm,
1719 	void		(*callback)(struct dv_node *, void *),
1720 	void		*arg)
1721 {
1722 	struct vnode	*dvp;
1723 	struct dv_node	*dv;
1724 	struct dv_list	*head, *tail, *next;
1725 	int		len;
1726 
1727 	dcmn_err3(("dv_walk: ddv = %s, devnm = %s\n",
1728 	    ddv->dv_name, devnm ? devnm : "<null>"));
1729 
1730 	dvp = DVTOV(ddv);
1731 
1732 	ASSERT(dvp->v_type == VDIR);
1733 
1734 	head = tail = next = NULL;
1735 
1736 	rw_enter(&ddv->dv_contents, RW_READER);
1737 	mutex_enter(&dvp->v_lock);
1738 	for (dv = ddv->dv_dot; dv; dv = dv->dv_next) {
1739 		/*
1740 		 * If devnm is not NULL and is not the empty string,
1741 		 * select only dv_nodes with matching non-minor name
1742 		 */
1743 		if (devnm && (len = strlen(devnm)) &&
1744 		    (strncmp(devnm, dv->dv_name, len) ||
1745 		    (dv->dv_name[len] != ':' && dv->dv_name[len] != '\0')))
1746 			continue;
1747 
1748 		callback(dv, arg);
1749 
1750 		if (DVTOV(dv)->v_type != VDIR)
1751 			continue;
1752 
1753 		next = kmem_zalloc(sizeof (*next), KM_SLEEP);
1754 		next->dv = dv;
1755 
1756 		if (tail)
1757 			tail->next = next;
1758 		else
1759 			head = next;
1760 
1761 		tail = next;
1762 	}
1763 
1764 	while (head) {
1765 		dv_walk(head->dv, NULL, callback, arg);
1766 		next = head->next;
1767 		kmem_free(head, sizeof (*head));
1768 		head = next;
1769 	}
1770 	rw_exit(&ddv->dv_contents);
1771 	mutex_exit(&dvp->v_lock);
1772 }
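
/*
 * Usage sketch (illustrative, hypothetical caller): dv_walk applies the
 * callback to every entry of ddv and then recurses into the collected
 * sub-directories, so counting the minor nodes below a directory could
 * look like this:
 *
 *	static void
 *	count_minors(struct dv_node *dv, void *arg)
 *	{
 *		if (DVTOV(dv)->v_type != VDIR)
 *			(*(uint_t *)arg)++;
 *	}
 *
 *	uint_t count = 0;
 *	dv_walk(ddv, NULL, count_minors, &count);
 *
 * Note that the callback runs with ddv's dv_contents lock and vnode
 * v_lock held, so it must not block or re-enter devfs.
 */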
1773