xref: /titanic_50/usr/src/uts/common/fs/devfs/devfs_subr.c (revision c1c6f601cc48d067049ea58a07349897a8e225f2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * miscellaneous routines for the devfs
31  */
32 
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/t_lock.h>
36 #include <sys/systm.h>
37 #include <sys/sysmacros.h>
38 #include <sys/user.h>
39 #include <sys/time.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/file.h>
43 #include <sys/fcntl.h>
44 #include <sys/flock.h>
45 #include <sys/kmem.h>
46 #include <sys/uio.h>
47 #include <sys/errno.h>
48 #include <sys/stat.h>
49 #include <sys/cred.h>
50 #include <sys/dirent.h>
51 #include <sys/pathname.h>
52 #include <sys/cmn_err.h>
53 #include <sys/debug.h>
54 #include <sys/modctl.h>
55 #include <fs/fs_subr.h>
56 #include <sys/fs/dv_node.h>
57 #include <sys/fs/snode.h>
58 #include <sys/sunndi.h>
59 #include <sys/sunmdi.h>
60 #include <sys/conf.h>
61 
#ifdef DEBUG
/*
 * Debug control word for devfs, zero by default.
 * NOTE(review): presumably consulted by the dcmn_err*() debug macros;
 * confirm against sys/fs/dv_node.h.
 */
int devfs_debug = 0x0;
#endif

/* file system name; used as a prefix in cmn_err() messages in this file */
const char	dvnm[] = "devfs";
/* created by dv_node_cache_init(), destroyed by dv_node_cache_fini() */
kmem_cache_t	*dv_node_cache;	/* dv_node cache */
/*
 * Thread-specific-data key created in dv_node_cache_init().  dv_find()
 * tests it with tsd_get() to detect that it is being called as part of
 * devfs_clean or reset_perm.
 */
uint_t		devfs_clean_key;
/* root dv_node of the file system; recorded by dv_mkroot() */
struct dv_node *dvroot;
70 
/*
 * Prototype memory vattrs.
 *
 * dv_shadow_node() copies one of these templates when it cannot find an
 * attribute node in the persistent backing store.  dv_vattr_dir is the
 * template used for VDIR nodes.
 *
 * NOTE(review): the initializers are positional, so the per-line field
 * labels are accurate only if they match the member order (and nesting)
 * of vattr_t in <sys/vnode.h>.  Everything after va_gid is zero, so a
 * mismatch would be harmless, but confirm before adding non-zero values.
 */
vattr_t dv_vattr_dir = {
	AT_TYPE|AT_MODE|AT_UID|AT_GID, 		/* va_mask */
	VDIR,					/* va_type */
	DV_DIRMODE_DEFAULT,			/* va_mode */
	DV_UID_DEFAULT,				/* va_uid */
	DV_GID_DEFAULT,				/* va_gid */
	0,					/* va_fsid; */
	0,					/* va_nodeid; */
	0,					/* va_nlink; */
	0,					/* va_size; */
	0,					/* va_atime; */
	0,					/* va_mtime; */
	0,					/* va_ctime; */
	0,					/* va_rdev; */
	0,					/* va_blksize; */
	0,					/* va_nblocks; */
	0,					/* va_seq; */
};
90 
/*
 * Template attributes for device (VCHR/VBLK) nodes that are not flagged
 * DV_NO_FSPERM.  dv_shadow_node() copies this and then overlays the
 * minor_perm entry or the node's default mode when one applies.  va_type
 * is left zero here; dv_vattr_merge() fills it in from the vnode.
 */
vattr_t dv_vattr_file = {
	AT_TYPE|AT_MODE|AT_SIZE|AT_UID|AT_GID|AT_RDEV,	/* va_mask */
	0,					/* va_type */
	DV_DEVMODE_DEFAULT,			/* va_mode */
	DV_UID_DEFAULT,				/* va_uid */
	DV_GID_DEFAULT,				/* va_gid */
	0,					/* va_fsid; */
	0,					/* va_nodeid; */
	0,					/* va_nlink; */
	0,					/* va_size; */
	0,					/* va_atime; */
	0,					/* va_mtime; */
	0,					/* va_ctime; */
	0,					/* va_rdev; */
	0,					/* va_blksize; */
	0,					/* va_nblocks; */
	0,					/* va_seq; */
};
109 
/*
 * Template attributes for device nodes flagged DV_NO_FSPERM; selected by
 * dv_shadow_node().  Identical to dv_vattr_file except that the mode is
 * DV_DEVMODE_PRIV.  va_type is left zero here; dv_vattr_merge() fills it
 * in from the vnode.
 */
vattr_t dv_vattr_priv = {
	AT_TYPE|AT_MODE|AT_SIZE|AT_UID|AT_GID|AT_RDEV,	/* va_mask */
	0,					/* va_type */
	DV_DEVMODE_PRIV,			/* va_mode */
	DV_UID_DEFAULT,				/* va_uid */
	DV_GID_DEFAULT,				/* va_gid */
	0,					/* va_fsid; */
	0,					/* va_nodeid; */
	0,					/* va_nlink; */
	0,					/* va_size; */
	0,					/* va_atime; */
	0,					/* va_mtime; */
	0,					/* va_ctime; */
	0,					/* va_rdev; */
	0,					/* va_blksize; */
	0,					/* va_nblocks; */
	0,					/* va_seq; */
};
128 
129 extern dev_info_t	*clone_dip;
130 extern major_t		clone_major;
131 extern struct dev_ops	*ddi_hold_driver(major_t);
132 
133 /*
134  * dv_node cache constructor, destructor, can cache creation
135  */
136 /*ARGSUSED1*/
137 static int
138 i_dv_node_ctor(void *buf, void *cfarg, int flag)
139 {
140 	struct dv_node	*dv = (struct dv_node *)buf;
141 	struct vnode	*vp;
142 
143 	bzero(buf, sizeof (struct dv_node));
144 
145 	/* initialize persistent parts of dv_node */
146 	rw_init(&dv->dv_contents, NULL, RW_DEFAULT, NULL);
147 
148 	/* allocate vnode and initialize link back to dv_node */
149 	dv->dv_vnode = vn_alloc(KM_SLEEP);
150 	vp = DVTOV(dv);
151 	vp->v_data = (caddr_t)dv;
152 	return (0);
153 }
154 
155 /* dev_info node destructor for kmem cache */
156 /*ARGSUSED1*/
157 static void
158 i_dv_node_dtor(void *buf, void *arg)
159 {
160 	struct dv_node	*dv = (struct dv_node *)buf;
161 	struct vnode	*vp = DVTOV(dv);
162 
163 	rw_destroy(&dv->dv_contents);
164 	vn_invalid(vp);
165 	vn_free(vp);
166 }
167 
168 
169 /* initialize dev_info node cache */
170 void
171 dv_node_cache_init()
172 {
173 	ASSERT(dv_node_cache == NULL);
174 	dv_node_cache = kmem_cache_create("dv_node_cache",
175 	    sizeof (struct dv_node), 0, i_dv_node_ctor, i_dv_node_dtor,
176 	    NULL, NULL, NULL, 0);
177 
178 	tsd_create(&devfs_clean_key, NULL);
179 }
180 
181 /* initialize dev_info node cache */
182 void
183 dv_node_cache_fini()
184 {
185 	ASSERT(dv_node_cache != NULL);
186 	kmem_cache_destroy(dv_node_cache);
187 	dv_node_cache = NULL;
188 
189 	tsd_destroy(&devfs_clean_key);
190 }
191 
192 /*
193  * dv_mkino - Generate a unique inode number for devfs nodes.
194  *
195  * Although ino_t is 64 bits, the inode number is truncated to 32 bits for 32
196  * bit non-LARGEFILE applications. This means that there is a requirement to
197  * maintain the inode number as a 32 bit value or applications will have
198  * stat(2) calls fail with EOVERFLOW.  We form a 32 bit inode number from the
199  * dev_t. but if the minor number is larger than L_MAXMIN32 we fold extra minor
200  *
201  * To generate inode numbers for directories, we assume that we will never use
202  * more than half the major space - this allows for ~8190 drivers. We use this
203  * upper major number space to allocate inode numbers for directories by
204  * encoding the major and instance into this space.
205  *
206  * We also skew the result so that inode 2 is reserved for the root of the file
207  * system.
208  *
209  * As part of the future support for 64-bit dev_t APIs, the upper minor bits
210  * should be folded into the high inode bits by adding the following code
211  * after "ino |= 1":
212  *
213  * #if (L_BITSMINOR32 != L_BITSMINOR)
214  *		|* fold overflow minor bits into high bits of inode number *|
215  *		ino |= ((ino_t)(minor >> L_BITSMINOR32)) << L_BITSMINOR;
216  * #endif |* (L_BITSMINOR32 != L_BITSMINOR) *|
217  *
218  * This way only applications that use devices that overflow their minor
219  * space will have an application level impact.
220  */
221 static ino_t
222 dv_mkino(dev_info_t *devi, vtype_t typ, dev_t dev)
223 {
224 	major_t		major;
225 	minor_t		minor;
226 	ino_t		ino;
227 	static int	warn;
228 
229 	if (typ == VDIR) {
230 		major = ((L_MAXMAJ32 + 1) >> 1) + DEVI(devi)->devi_major;
231 		minor = ddi_get_instance(devi);
232 
233 		/* makedevice32 in high half of major number space */
234 		ino = (ino_t)((major << L_BITSMINOR32) | (minor & L_MAXMIN32));
235 
236 		major = DEVI(devi)->devi_major;
237 	} else {
238 		major = getmajor(dev);
239 		minor = getminor(dev);
240 
241 		/* makedevice32 */
242 		ino = (ino_t)((major << L_BITSMINOR32) | (minor & L_MAXMIN32));
243 
244 		/* make ino for VCHR different than VBLK */
245 		ino <<= 1;
246 		if (typ == VCHR)
247 			ino |= 1;
248 	}
249 
250 	ino += DV_ROOTINO + 1;		/* skew */
251 
252 	/*
253 	 * diagnose things a little early because adding the skew to a large
254 	 * minor number could roll over the major.
255 	 */
256 	if ((major >= (L_MAXMAJ32 >> 1)) && (warn == 0)) {
257 		warn = 1;
258 		cmn_err(CE_WARN, "%s: inode numbers are not unique", dvnm);
259 	}
260 
261 	return (ino);
262 }
263 
264 /*
265  * dv_mkroot
266  *
267  * Build the first VDIR dv_node.
268  */
269 struct dv_node *
270 dv_mkroot(struct vfs *vfsp, dev_t devfsdev)
271 {
272 	struct dv_node *dv;
273 	struct vnode *vp;
274 
275 	ASSERT(ddi_root_node() != NULL);
276 	ASSERT(dv_node_cache != NULL);
277 
278 	dcmn_err3(("dv_mkroot\n"));
279 	dv = kmem_cache_alloc(dv_node_cache, KM_SLEEP);
280 	vp = DVTOV(dv);
281 	vn_reinit(vp);
282 	vp->v_flag = VROOT;
283 	vp->v_vfsp = vfsp;
284 	vp->v_type = VDIR;
285 	vp->v_rdev = devfsdev;
286 	vn_setops(vp, dv_vnodeops);
287 	vn_exists(vp);
288 
289 	dvroot = dv;
290 
291 	dv->dv_name = NULL;		/* not needed */
292 	dv->dv_namelen = 0;
293 
294 	dv->dv_devi = ddi_root_node();
295 
296 	dv->dv_ino = DV_ROOTINO;
297 	dv->dv_nlink = 2;		/* name + . (no dv_insert) */
298 	dv->dv_dotdot = dv;		/* .. == self */
299 	dv->dv_attrvp = NULLVP;
300 	dv->dv_attr = NULL;
301 	dv->dv_flags = DV_BUILD;
302 	dv->dv_priv = NULL;
303 	dv->dv_busy = 0;
304 	dv->dv_dflt_mode = 0;
305 
306 	return (dv);
307 }
308 
309 /*
310  * dv_mkdir
311  *
312  * Given an probed or attached nexus node, create a VDIR dv_node.
313  * No dv_attrvp is created at this point.
314  */
315 struct dv_node *
316 dv_mkdir(struct dv_node *ddv, dev_info_t *devi, char *nm)
317 {
318 	struct dv_node *dv;
319 	struct vnode *vp;
320 	size_t nmlen;
321 
322 	ASSERT((devi));
323 	dcmn_err4(("dv_mkdir: %s\n", nm));
324 
325 	dv = kmem_cache_alloc(dv_node_cache, KM_SLEEP);
326 	nmlen = strlen(nm) + 1;
327 	dv->dv_name = kmem_alloc(nmlen, KM_SLEEP);
328 	bcopy(nm, dv->dv_name, nmlen);
329 	dv->dv_namelen = nmlen - 1;	/* '\0' not included */
330 	vp = DVTOV(dv);
331 	vn_reinit(vp);
332 	vp->v_flag = 0;
333 	vp->v_vfsp = DVTOV(ddv)->v_vfsp;
334 	vp->v_type = VDIR;
335 	vp->v_rdev = DVTOV(ddv)->v_rdev;
336 	vn_setops(vp, vn_getops(DVTOV(ddv)));
337 	vn_exists(vp);
338 
339 	dv->dv_devi = devi;
340 	ndi_hold_devi(devi);
341 
342 	dv->dv_ino = dv_mkino(devi, VDIR, NODEV);
343 	dv->dv_nlink = 0;		/* updated on insert */
344 	dv->dv_dotdot = ddv;
345 	dv->dv_attrvp = NULLVP;
346 	dv->dv_attr = NULL;
347 	dv->dv_flags = DV_BUILD;
348 	dv->dv_priv = NULL;
349 	dv->dv_busy = 0;
350 	dv->dv_dflt_mode = 0;
351 
352 	return (dv);
353 }
354 
355 /*
356  * dv_mknod
357  *
358  * Given a minor node, create a VCHR or VBLK dv_node.
359  * No dv_attrvp is created at this point.
360  */
361 static struct dv_node *
362 dv_mknod(struct dv_node *ddv, dev_info_t *devi, char *nm,
363 	struct ddi_minor_data *dmd)
364 {
365 	struct dv_node *dv;
366 	struct vnode *vp;
367 	size_t nmlen;
368 
369 	dcmn_err4(("dv_mknod: %s\n", nm));
370 
371 	dv = kmem_cache_alloc(dv_node_cache, KM_SLEEP);
372 	nmlen = strlen(nm) + 1;
373 	dv->dv_name = kmem_alloc(nmlen, KM_SLEEP);
374 	bcopy(nm, dv->dv_name, nmlen);
375 	dv->dv_namelen = nmlen - 1;	/* no '\0' */
376 	vp = DVTOV(dv);
377 	vn_reinit(vp);
378 	vp->v_flag = 0;
379 	vp->v_vfsp = DVTOV(ddv)->v_vfsp;
380 	vp->v_type = dmd->ddm_spec_type == S_IFCHR ? VCHR : VBLK;
381 	vp->v_rdev = dmd->ddm_dev;
382 	vn_setops(vp, vn_getops(DVTOV(ddv)));
383 	vn_exists(vp);
384 
385 	ASSERT(MUTEX_HELD(&DEVI(devi)->devi_lock));
386 	dv->dv_devi = devi;
387 	DEVI(devi)->devi_ref++;
388 
389 	dv->dv_ino = dv_mkino(devi, vp->v_type, vp->v_rdev);
390 	dv->dv_nlink = 0;		/* updated on insert */
391 	dv->dv_dotdot = ddv;
392 	dv->dv_attrvp = NULLVP;
393 	dv->dv_attr = NULL;
394 	dv->dv_flags = 0;
395 
396 	if (dmd->type == DDM_INTERNAL_PATH)
397 		dv->dv_flags |= DV_INTERNAL;
398 	if (dmd->ddm_flags & DM_NO_FSPERM)
399 		dv->dv_flags |= DV_NO_FSPERM;
400 
401 	dv->dv_priv = dmd->ddm_node_priv;
402 	if (dv->dv_priv)
403 		dphold(dv->dv_priv);
404 
405 	/*
406 	 * Minors created with ddi_create_priv_minor_node can specify
407 	 * a default mode permission other than the devfs default.
408 	 */
409 	if (dv->dv_priv || dv->dv_flags & DV_NO_FSPERM) {
410 		dcmn_err5(("%s: dv_mknod default priv mode 0%o\n",
411 		    dv->dv_name, dmd->ddm_priv_mode));
412 		dv->dv_flags |= DV_DFLT_MODE;
413 		dv->dv_dflt_mode = dmd->ddm_priv_mode & S_IAMB;
414 	}
415 
416 	return (dv);
417 }
418 
419 /*
420  * dv_destroy
421  *
422  * Destroy what we created in dv_mkdir or dv_mknod.
423  * In the case of a *referenced* directory, do nothing.
424  */
425 /*ARGSUSED1*/
426 void
427 dv_destroy(struct dv_node *dv, uint_t flags)
428 {
429 	vnode_t *vp = DVTOV(dv);
430 	ASSERT(dv->dv_nlink == 0);		/* no references */
431 	ASSERT(dv->dv_next == NULL);		/* unlinked from directory */
432 
433 	dcmn_err4(("dv_destroy: %s\n", dv->dv_name));
434 
435 	/*
436 	 * We may be asked to unlink referenced directories.
437 	 * In this case, there is nothing to be done.
438 	 * The eventual memory free will be done in
439 	 * devfs_inactive.
440 	 */
441 	if (vp->v_count != 0) {
442 		ASSERT(vp->v_type == VDIR);
443 		ASSERT(flags & DV_CLEAN_FORCE);
444 		ASSERT(DV_STALE(dv));
445 		return;
446 	}
447 
448 	if (dv->dv_attrvp != NULLVP)
449 		VN_RELE(dv->dv_attrvp);
450 	if (dv->dv_attr != NULL)
451 		kmem_free(dv->dv_attr, sizeof (struct vattr));
452 	if (dv->dv_name != NULL)
453 		kmem_free(dv->dv_name, dv->dv_namelen + 1);
454 	if (dv->dv_devi != NULL) {
455 		ndi_rele_devi(dv->dv_devi);
456 	}
457 	if (dv->dv_priv != NULL) {
458 		dpfree(dv->dv_priv);
459 	}
460 
461 	kmem_cache_free(dv_node_cache, dv);
462 }
463 
464 /*
465  * Find and hold dv_node by name
466  */
467 struct dv_node *
468 dv_findbyname(struct dv_node *ddv, char *nm)
469 {
470 	struct dv_node	*dv;
471 	size_t		nmlen = strlen(nm);
472 
473 	ASSERT(RW_LOCK_HELD(&ddv->dv_contents));
474 	dcmn_err3(("dv_findbyname: %s\n", nm));
475 	for (dv = ddv->dv_dot; dv; dv = dv->dv_next) {
476 		if (dv->dv_namelen != nmlen)
477 			continue;
478 		if (strcmp(dv->dv_name, nm) == 0) {
479 			VN_HOLD(DVTOV(dv));
480 			return (dv);
481 		}
482 	}
483 	return (NULL);
484 }
485 
486 /*
487  * Inserts a new dv_node in a parent directory
488  */
489 void
490 dv_insert(struct dv_node *ddv, struct dv_node *dv)
491 {
492 	ASSERT(RW_WRITE_HELD(&ddv->dv_contents));
493 	ASSERT(DVTOV(ddv)->v_type == VDIR);
494 	ASSERT(ddv->dv_nlink >= 2);
495 	ASSERT(dv->dv_nlink == 0);
496 
497 	dcmn_err3(("dv_insert: %s\n", dv->dv_name));
498 
499 	dv->dv_dotdot = ddv;
500 	dv->dv_next = ddv->dv_dot;
501 	ddv->dv_dot = dv;
502 	if (DVTOV(dv)->v_type == VDIR) {
503 		ddv->dv_nlink++;	/* .. to containing directory */
504 		dv->dv_nlink = 2;	/* name + . */
505 	} else {
506 		dv->dv_nlink = 1;	/* name */
507 	}
508 }
509 
510 /*
511  * Unlink a dv_node from a perent directory
512  */
513 void
514 dv_unlink(struct dv_node *ddv, struct dv_node *dv, struct dv_node **dv_pprev)
515 {
516 	/* verify linkage of arguments */
517 	ASSERT(ddv && dv && dv_pprev);
518 	ASSERT(dv->dv_dotdot == ddv);
519 	ASSERT(*dv_pprev == dv);
520 	ASSERT(RW_WRITE_HELD(&ddv->dv_contents));
521 	ASSERT(DVTOV(ddv)->v_type == VDIR);
522 
523 	dcmn_err3(("dv_unlink: %s\n", dv->dv_name));
524 
525 	if (DVTOV(dv)->v_type == VDIR) {
526 		ddv->dv_nlink--;	/* .. to containing directory */
527 		dv->dv_nlink -= 2;	/* name + . */
528 	} else {
529 		dv->dv_nlink -= 1;	/* name */
530 	}
531 	ASSERT(ddv->dv_nlink >= 2);
532 	ASSERT(dv->dv_nlink == 0);
533 
534 	/* update ddv->dv_dot/dv_next */
535 	*dv_pprev = dv->dv_next;
536 
537 	dv->dv_dotdot = NULL;
538 	dv->dv_next = NULL;
539 	dv->dv_dot = NULL;
540 }
541 
542 /*
543  * Merge devfs node specific information into an attribute structure.
544  *
545  * NOTE: specfs provides ATIME,MTIME,CTIME,SIZE,BLKSIZE,NBLOCKS on leaf node.
546  */
547 void
548 dv_vattr_merge(struct dv_node *dv, struct vattr *vap)
549 {
550 	struct vnode *vp = DVTOV(dv);
551 
552 	vap->va_nodeid = dv->dv_ino;
553 	vap->va_nlink = dv->dv_nlink;
554 
555 	if (vp->v_type == VDIR) {
556 		vap->va_rdev = 0;
557 		vap->va_fsid = vp->v_rdev;
558 	} else {
559 		vap->va_rdev = vp->v_rdev;
560 		vap->va_fsid = DVTOV(dv->dv_dotdot)->v_rdev;
561 		vap->va_type = vp->v_type;
562 		/* don't trust the shadow file type */
563 		vap->va_mode &= ~S_IFMT;
564 		if (vap->va_type == VCHR)
565 			vap->va_mode |= S_IFCHR;
566 		else
567 			vap->va_mode |= S_IFBLK;
568 	}
569 }
570 
571 /*
572  * dv_shadow_node
573  *
574  * Given a VDIR dv_node, find/create the associated VDIR
575  * node in the shadow attribute filesystem.
576  *
577  * Given a VCHR/VBLK dv_node, find the associated VREG
578  * node in the shadow attribute filesystem.  These nodes
579  * are only created to persist non-default attributes.
580  * Lack of such a node implies the default permissions
581  * are sufficient.
582  *
583  * Managing the attribute file entries is slightly tricky (mostly
584  * because we can't intercept VN_HOLD and VN_RELE except on the last
585  * release).
586  *
587  * We assert that if the dv_attrvp pointer is non-NULL, it points
588  * to a singly-held (by us) vnode that represents the shadow entry
589  * in the underlying filesystem.  To avoid store-ordering issues,
590  * we assert that the pointer can only be tested under the dv_contents
591  * READERS lock.
592  */
593 
594 void
595 dv_shadow_node(
596 	struct vnode *dvp,	/* devfs parent directory vnode */
597 	char *nm,		/* name component */
598 	struct vnode *vp,	/* devfs vnode */
599 	struct pathname *pnp,	/* the path .. */
600 	struct vnode *rdir,	/* the root .. */
601 	struct cred *cred,	/* who's asking? */
602 	int flags)		/* optionally create shadow node */
603 {
604 	struct dv_node	*dv;	/* dv_node of named directory */
605 	struct vnode	*rdvp;	/* shadow parent directory vnode */
606 	struct vnode	*rvp;	/* shadow vnode */
607 	struct vnode	*rrvp;	/* realvp of shadow vnode */
608 	struct vattr	vattr;
609 	int		create_tried;
610 	int		error;
611 	mperm_t		mp;
612 
613 	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
614 	dv = VTODV(vp);
615 	dcmn_err3(("dv_shadow_node: name %s attr %p\n",
616 	    nm, (void *)dv->dv_attrvp));
617 
618 	if ((flags & DV_SHADOW_WRITE_HELD) == 0) {
619 		ASSERT(RW_READ_HELD(&dv->dv_contents));
620 		if (dv->dv_attrvp != NULLVP)
621 			return;
622 		if (!rw_tryupgrade(&dv->dv_contents)) {
623 			rw_exit(&dv->dv_contents);
624 			rw_enter(&dv->dv_contents, RW_WRITER);
625 			if (dv->dv_attrvp != NULLVP) {
626 				rw_downgrade(&dv->dv_contents);
627 				return;
628 			}
629 		}
630 	} else {
631 		ASSERT(RW_WRITE_HELD(&dv->dv_contents));
632 		if (dv->dv_attrvp != NULLVP)
633 			return;
634 	}
635 
636 	ASSERT(RW_WRITE_HELD(&dv->dv_contents) && dv->dv_attrvp == NULL);
637 
638 	rdvp = VTODV(dvp)->dv_attrvp;
639 	create_tried = 0;
640 lookup:
641 	if (rdvp && (dv->dv_flags & DV_NO_FSPERM) == 0) {
642 		error = VOP_LOOKUP(rdvp, nm, &rvp, pnp, LOOKUP_DIR, rdir, cred);
643 
644 		/* factor out the snode since we only want the attribute node */
645 		if ((error == 0) && (VOP_REALVP(rvp, &rrvp) == 0)) {
646 			VN_HOLD(rrvp);
647 			VN_RELE(rvp);
648 			rvp = rrvp;
649 		}
650 	} else
651 		error = EROFS;		/* no parent, no entry */
652 
653 	/*
654 	 * All we want is the permissions (and maybe ACLs and
655 	 * extended attributes), and we want to perform lookups
656 	 * by name.  Drivers occasionally change their minor
657 	 * number space.  If something changes, there's no
658 	 * much we can do about it here.
659 	 */
660 
661 	/* The shadow node checks out. We are done */
662 	if (error == 0) {
663 		dv->dv_attrvp = rvp;	/* with one hold */
664 
665 		/*
666 		 * Determine if we have non-trivial ACLs on this node.
667 		 * It is not necessary to VOP_RWLOCK since fs_acl_nontrivial
668 		 * only does VOP_GETSECATTR.
669 		 */
670 		dv->dv_flags &= ~DV_ACL;
671 
672 		if (fs_acl_nontrivial(rvp, cred))
673 			dv->dv_flags |= DV_ACL;
674 
675 		/*
676 		 * If we have synced out the memory attributes, free
677 		 * them and switch back to using the persistent store.
678 		 */
679 		if (rvp && dv->dv_attr) {
680 			kmem_free(dv->dv_attr, sizeof (struct vattr));
681 			dv->dv_attr = NULL;
682 		}
683 		if ((flags & DV_SHADOW_WRITE_HELD) == 0)
684 			rw_downgrade(&dv->dv_contents);
685 		ASSERT(RW_LOCK_HELD(&dv->dv_contents));
686 		return;
687 	}
688 
689 	/*
690 	 * Failed to find attribute in persistent backing store,
691 	 * get default permission bits.  For minors not created by
692 	 * ddi_create_priv_minor_node(), use devfs defaults.
693 	 */
694 	if (vp->v_type == VDIR) {
695 		vattr = dv_vattr_dir;
696 	} else if (dv->dv_flags & DV_NO_FSPERM) {
697 		vattr = dv_vattr_priv;
698 	} else {
699 		/*
700 		 * look up perm bits from minor_perm
701 		 */
702 		vattr = dv_vattr_file;
703 		if (dev_minorperm(dv->dv_devi, dv->dv_name, &mp) == 0) {
704 			VATTR_MP_MERGE(vattr, mp);
705 			dcmn_err5(("%s: minor perm mode 0%o\n",
706 			    dv->dv_name, vattr.va_mode));
707 		} else if (dv->dv_flags & DV_DFLT_MODE) {
708 			ASSERT((dv->dv_dflt_mode & ~S_IAMB) == 0);
709 			vattr.va_mode &= ~S_IAMB;
710 			vattr.va_mode |= dv->dv_dflt_mode;
711 			dcmn_err5(("%s: priv mode 0%o\n",
712 			    dv->dv_name, vattr.va_mode));
713 		}
714 	}
715 
716 	dv_vattr_merge(dv, &vattr);
717 	gethrestime(&vattr.va_atime);
718 	vattr.va_mtime = vattr.va_atime;
719 	vattr.va_ctime = vattr.va_atime;
720 
721 	/*
722 	 * Try to create shadow dir. This is necessary in case
723 	 * we need to create a shadow leaf node later, when user
724 	 * executes chmod.
725 	 */
726 	if ((error == ENOENT) && !create_tried) {
727 		switch (vp->v_type) {
728 		case VDIR:
729 			error = VOP_MKDIR(rdvp, nm, &vattr, &rvp, kcred);
730 			dsysdebug(error, ("vop_mkdir %s %s %d\n",
731 			    VTODV(dvp)->dv_name, nm, error));
732 			create_tried = 1;
733 			break;
734 
735 		case VCHR:
736 		case VBLK:
737 			/*
738 			 * Shadow nodes are only created on demand
739 			 */
740 			if (flags & DV_SHADOW_CREATE) {
741 				error = VOP_CREATE(rdvp, nm, &vattr, NONEXCL,
742 				    VREAD|VWRITE, &rvp, kcred, 0);
743 				dsysdebug(error, ("vop_create %s %s %d\n",
744 				    VTODV(dvp)->dv_name, nm, error));
745 				create_tried = 1;
746 			}
747 			break;
748 
749 		default:
750 			cmn_err(CE_PANIC, "devfs: %s: create", dvnm);
751 			/*NOTREACHED*/
752 		}
753 
754 		if (create_tried &&
755 		    (error == 0) || (error == EEXIST)) {
756 			VN_RELE(rvp);
757 			goto lookup;
758 		}
759 	}
760 
761 	/* Store attribute in memory */
762 	if (dv->dv_attr == NULL) {
763 		dv->dv_attr = kmem_alloc(sizeof (struct vattr), KM_SLEEP);
764 		*(dv->dv_attr) = vattr;
765 	}
766 
767 	if ((flags & DV_SHADOW_WRITE_HELD) == 0)
768 		rw_downgrade(&dv->dv_contents);
769 	ASSERT(RW_LOCK_HELD(&dv->dv_contents));
770 }
771 
772 /*
773  * Given a devinfo node, and a name, returns the appropriate
774  * minor information for that named node, if it exists.
775  */
776 static int
777 dv_find_leafnode(dev_info_t *devi, char *minor_nm, struct ddi_minor_data *r_mi)
778 {
779 	struct ddi_minor_data *dmd;
780 
781 	ASSERT(i_ddi_devi_attached(devi));
782 	ASSERT(MUTEX_HELD(&DEVI(devi)->devi_lock));
783 
784 	dcmn_err3(("dv_find_leafnode: %s\n", minor_nm));
785 	for (dmd = DEVI(devi)->devi_minor; dmd; dmd = dmd->next) {
786 
787 		/*
788 		 * Skip alias nodes and nodes without a name.
789 		 */
790 		if ((dmd->type == DDM_ALIAS) || (dmd->ddm_name == NULL))
791 			    continue;
792 
793 		dcmn_err4(("dv_find_leafnode: (%s,%s)\n",
794 			minor_nm, dmd->ddm_name));
795 		if (strcmp(minor_nm, dmd->ddm_name) == 0) {
796 			r_mi->ddm_dev = dmd->ddm_dev;
797 			r_mi->ddm_spec_type = dmd->ddm_spec_type;
798 			r_mi->type = dmd->type;
799 			r_mi->ddm_flags = dmd->ddm_flags;
800 			r_mi->ddm_node_priv = dmd->ddm_node_priv;
801 			r_mi->ddm_priv_mode = dmd->ddm_priv_mode;
802 			if (r_mi->ddm_node_priv)
803 				dphold(r_mi->ddm_node_priv);
804 			return (0);
805 		}
806 	}
807 
808 	dcmn_err3(("dv_find_leafnode: %s: ENOENT\n", minor_nm));
809 	return (ENOENT);
810 }
811 
812 /*
813  * Special handling for clone node:
814  *	Clone minor name is a driver name, the minor number will
815  *	be the major number of the driver. There is no minor
816  *	node under the clone driver, so we'll manufacture the
817  *	dev_t.
818  */
819 static struct dv_node *
820 dv_clone_mknod(struct dv_node *ddv, char *drvname)
821 {
822 	major_t	major;
823 	struct dv_node *dvp;
824 	char *devnm;
825 	struct ddi_minor_data *dmd;
826 
827 	/*
828 	 * Make sure drvname is a STREAMS driver. We load the driver,
829 	 * but don't attach to any instances. This makes stat(2)
830 	 * relatively cheap.
831 	 */
832 	major = ddi_name_to_major(drvname);
833 	if (major == (major_t)-1)
834 		return (NULL);
835 
836 	if (ddi_hold_driver(major) == NULL)
837 		return (NULL);
838 
839 	if (STREAMSTAB(major) == NULL) {
840 		ddi_rele_driver(major);
841 		return (NULL);
842 	}
843 
844 	ddi_rele_driver(major);
845 	devnm = kmem_alloc(MAXNAMELEN, KM_SLEEP);
846 	(void) snprintf(devnm, MAXNAMELEN, "clone@0:%s", drvname);
847 	dmd = kmem_zalloc(sizeof (*dmd), KM_SLEEP);
848 	dmd->ddm_dev = makedevice(clone_major, (minor_t)major);
849 	dmd->ddm_spec_type = S_IFCHR;
850 	dvp = dv_mknod(ddv, clone_dip, devnm, dmd);
851 	kmem_free(dmd, sizeof (*dmd));
852 	kmem_free(devnm, MAXNAMELEN);
853 	return (dvp);
854 }
855 
856 /*
857  * Given the parent directory node, and a name in it, returns the
858  * named dv_node to the caller (as a vnode).
859  *
860  * (We need pnp and rdir for doing shadow lookups; they can be NULL)
861  */
862 int
863 dv_find(struct dv_node *ddv, char *nm, struct vnode **vpp, struct pathname *pnp,
864 	struct vnode *rdir, struct cred *cred, uint_t ndi_flags)
865 {
866 	extern int isminiroot;	/* see modctl.c */
867 
868 	int rv = 0, was_busy = 0, nmlen;
869 	struct vnode *vp;
870 	struct dv_node *dv, *dup;
871 	dev_info_t *pdevi, *devi = NULL;
872 	char *mnm;
873 	struct ddi_minor_data *dmd;
874 
875 	dcmn_err3(("dv_find %s\n", nm));
876 
877 	rw_enter(&ddv->dv_contents, RW_READER);
878 start:
879 	if (DV_STALE(ddv)) {
880 		rw_exit(&ddv->dv_contents);
881 		return (ESTALE);
882 	}
883 
884 	/*
885 	 * Empty name or ., return node itself.
886 	 */
887 	nmlen = strlen(nm);
888 	if ((nmlen == 0) || ((nmlen == 1) && (nm[0] == '.'))) {
889 		*vpp = DVTOV(ddv);
890 		rw_exit(&ddv->dv_contents);
891 		VN_HOLD(*vpp);
892 		return (0);
893 	}
894 
895 	/*
896 	 * .., return the parent directory
897 	 */
898 	if ((nmlen == 2) && (strcmp(nm, "..") == 0)) {
899 		*vpp = DVTOV(ddv->dv_dotdot);
900 		rw_exit(&ddv->dv_contents);
901 		VN_HOLD(*vpp);
902 		return (0);
903 	}
904 
905 	/*
906 	 * Fail anything without a valid device name component
907 	 */
908 	if (nm[0] == '@' || nm[0] == ':') {
909 		dcmn_err3(("devfs: no driver '%s'\n", nm));
910 		rw_exit(&ddv->dv_contents);
911 		return (ENOENT);
912 	}
913 
914 	/*
915 	 * So, now we have to deal with the trickier stuff.
916 	 *
917 	 * (a) search the existing list of dv_nodes on this directory
918 	 */
919 	if ((dv = dv_findbyname(ddv, nm)) != NULL) {
920 founddv:
921 		ASSERT(RW_LOCK_HELD(&ddv->dv_contents));
922 		rw_enter(&dv->dv_contents, RW_READER);
923 		vp = DVTOV(dv);
924 		if ((dv->dv_attrvp != NULLVP) ||
925 		    (vp->v_type != VDIR && dv->dv_attr != NULL)) {
926 			/*
927 			 * Common case - we already have attributes
928 			 */
929 			rw_exit(&dv->dv_contents);
930 			rw_exit(&ddv->dv_contents);
931 			goto found;
932 		}
933 
934 		/*
935 		 * No attribute vp, try and build one.
936 		 */
937 		dv_shadow_node(DVTOV(ddv), nm, vp, pnp, rdir, cred, 0);
938 		rw_exit(&dv->dv_contents);
939 		rw_exit(&ddv->dv_contents);
940 		goto found;
941 	}
942 
943 	/*
944 	 * (b) Search the child devinfo nodes of our parent directory,
945 	 * looking for the named node.  If we find it, build a new
946 	 * node, then grab the writers lock, search the directory
947 	 * if it's still not there, then insert it.
948 	 *
949 	 * We drop the devfs locks before accessing the device tree.
950 	 * Take care to mark the node BUSY so that a forced devfs_clean
951 	 * doesn't mark the directory node stale.
952 	 *
953 	 * Also, check if we are called as part of devfs_clean or
954 	 * reset_perm. If so, simply return not found because there
955 	 * is nothing to clean.
956 	 */
957 	if (tsd_get(devfs_clean_key)) {
958 		rw_exit(&ddv->dv_contents);
959 		return (ENOENT);
960 	}
961 
962 	/*
963 	 * We could be either READ or WRITE locked at
964 	 * this point. Upgrade if we are read locked.
965 	 */
966 	ASSERT(RW_LOCK_HELD(&ddv->dv_contents));
967 	if (rw_read_locked(&ddv->dv_contents) &&
968 	    !rw_tryupgrade(&ddv->dv_contents)) {
969 		rw_exit(&ddv->dv_contents);
970 		rw_enter(&ddv->dv_contents, RW_WRITER);
971 		/*
972 		 * Things may have changed when we dropped
973 		 * the contents lock, so start from top again
974 		 */
975 		goto start;
976 	}
977 	ddv->dv_busy++;		/* mark busy before dropping lock */
978 	was_busy++;
979 	rw_exit(&ddv->dv_contents);
980 
981 	pdevi = ddv->dv_devi;
982 	ASSERT(pdevi != NULL);
983 
984 	mnm = strchr(nm, ':');
985 	if (mnm)
986 		*mnm = (char)0;
987 
988 	/*
989 	 * Configure one nexus child, will call nexus's bus_ops
990 	 * If successful, devi is held upon returning.
991 	 * Note: devfs lookup should not be configuring grandchildren.
992 	 */
993 	ASSERT((ndi_flags & NDI_CONFIG) == 0);
994 
995 	rv = ndi_devi_config_one(pdevi, nm, &devi, ndi_flags | NDI_NO_EVENT);
996 	if (mnm)
997 		*mnm = ':';
998 	if (rv != NDI_SUCCESS) {
999 		rv = ENOENT;
1000 		goto notfound;
1001 	}
1002 
1003 	/*
1004 	 * Don't make vhci clients visible under phci, unless we
1005 	 * are in miniroot.
1006 	 */
1007 	if (isminiroot == 0 && ddi_get_parent(devi) != pdevi) {
1008 		ndi_rele_devi(devi);
1009 		rv = ENOENT;
1010 		goto notfound;
1011 	}
1012 
1013 	ASSERT(devi && i_ddi_devi_attached(devi));
1014 
1015 	/*
1016 	 * Invalidate cache to notice newly created minor nodes.
1017 	 */
1018 	rw_enter(&ddv->dv_contents, RW_WRITER);
1019 	ddv->dv_flags |= DV_BUILD;
1020 	rw_exit(&ddv->dv_contents);
1021 
1022 	/*
1023 	 * mkdir for nexus drivers and leaf nodes as well.  If we are racing
1024 	 * and create a duplicate, the duplicate will be destroyed below.
1025 	 */
1026 	if (mnm == NULL) {
1027 		dv = dv_mkdir(ddv, devi, nm);
1028 	} else {
1029 		/*
1030 		 * For clone minors, load the driver indicated by minor name.
1031 		 */
1032 		mutex_enter(&DEVI(devi)->devi_lock);
1033 		if (devi == clone_dip) {
1034 			dv = dv_clone_mknod(ddv, mnm + 1);
1035 		} else {
1036 			/*
1037 			 * Find minor node and make a dv_node
1038 			 */
1039 			dmd = kmem_zalloc(sizeof (*dmd), KM_SLEEP);
1040 			if (dv_find_leafnode(devi, mnm + 1, dmd) == 0) {
1041 				dv = dv_mknod(ddv, devi, nm, dmd);
1042 				if (dmd->ddm_node_priv)
1043 					dpfree(dmd->ddm_node_priv);
1044 			}
1045 			kmem_free(dmd, sizeof (*dmd));
1046 		}
1047 		mutex_exit(&DEVI(devi)->devi_lock);
1048 	}
1049 	/*
1050 	 * Release hold from ndi_devi_config_one()
1051 	 */
1052 	ndi_rele_devi(devi);
1053 
1054 	if (dv == NULL) {
1055 		rv = ENOENT;
1056 		goto notfound;
1057 	}
1058 
1059 	/*
1060 	 * We have released the dv_contents lock, need to check
1061 	 * if another thread already created a duplicate node
1062 	 */
1063 	rw_enter(&ddv->dv_contents, RW_WRITER);
1064 	if ((dup = dv_findbyname(ddv, nm)) == NULL) {
1065 		dv_insert(ddv, dv);
1066 	} else {
1067 		/*
1068 		 * Duplicate found, use the existing node
1069 		 */
1070 		VN_RELE(DVTOV(dv));
1071 		dv_destroy(dv, 0);
1072 		dv = dup;
1073 	}
1074 	goto founddv;
1075 	/*NOTREACHED*/
1076 
1077 found:
1078 	/*
1079 	 * Skip non-kernel lookups of internal nodes.
1080 	 * This use of kcred to distinguish between user and
1081 	 * internal kernel lookups is unfortunate.  The information
1082 	 * provided by the seg argument to lookupnameat should
1083 	 * evolve into a lookup flag for filesystems that need
1084 	 * this distinction.
1085 	 */
1086 	if ((dv->dv_flags & DV_INTERNAL) && (cred != kcred)) {
1087 		VN_RELE(vp);
1088 		rv = ENOENT;
1089 		goto notfound;
1090 	}
1091 
1092 	dcmn_err2(("dv_find: returning vp for nm %s\n", nm));
1093 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
1094 		/*
1095 		 * If vnode is a device, return special vnode instead
1096 		 * (though it knows all about -us- via sp->s_realvp,
1097 		 * sp->s_devvp, and sp->s_dip)
1098 		 */
1099 		*vpp = specvp_devfs(vp, vp->v_rdev, vp->v_type, cred,
1100 			dv->dv_devi);
1101 		VN_RELE(vp);
1102 		if (*vpp == NULLVP)
1103 			rv = ENOSYS;
1104 	} else
1105 		*vpp = vp;
1106 
1107 notfound:
1108 	rw_enter(&ddv->dv_contents, RW_WRITER);
1109 	if (was_busy)
1110 		ddv->dv_busy--;
1111 	rw_exit(&ddv->dv_contents);
1112 	return (rv);
1113 }
1114 
1115 /*
1116  * The given directory node is out-of-date; that is, it has been
1117  * marked as needing to be rebuilt, possibly because some new devinfo
1118  * node has come into existence, or possibly because this is the first
1119  * time we've been here.
1120  */
1121 void
1122 dv_filldir(struct dv_node *ddv)
1123 {
1124 	struct dv_node *dv;
1125 	dev_info_t *devi, *pdevi;
1126 	struct ddi_minor_data *dmd;
1127 	char devnm[MAXNAMELEN];
1128 	int circ;
1129 
1130 	ASSERT(DVTOV(ddv)->v_type == VDIR);
1131 	ASSERT(RW_WRITE_HELD(&ddv->dv_contents));
1132 	ASSERT(ddv->dv_flags & DV_BUILD);
1133 
1134 	dcmn_err3(("dv_filldir: %s\n", ddv->dv_name));
1135 	if (DV_STALE(ddv))
1136 		return;
1137 	pdevi = ddv->dv_devi;
1138 
1139 	if (ndi_devi_config(pdevi, NDI_NO_EVENT) != NDI_SUCCESS) {
1140 		dcmn_err3(("dv_filldir: config error %s\n",
1141 			ddv->dv_name));
1142 	}
1143 
1144 	ndi_devi_enter(pdevi, &circ);
1145 	for (devi = ddi_get_child(pdevi); devi;
1146 	    devi = ddi_get_next_sibling(devi)) {
1147 		if (i_ddi_node_state(devi) < DS_PROBED)
1148 			continue;
1149 
1150 		dcmn_err3(("dv_filldir: node %s\n", ddi_node_name(devi)));
1151 
1152 		mutex_enter(&DEVI(devi)->devi_lock);
1153 		for (dmd = DEVI(devi)->devi_minor; dmd; dmd = dmd->next) {
1154 			char *addr;
1155 
1156 			/*
1157 			 * Skip alias nodes, internal nodes, and nodes
1158 			 * without a name.  We allow DDM_DEFAULT nodes
1159 			 * to appear in readdir.
1160 			 */
1161 			if ((dmd->type == DDM_ALIAS) ||
1162 			    (dmd->type == DDM_INTERNAL_PATH) ||
1163 			    (dmd->ddm_name == NULL))
1164 				continue;
1165 
1166 			addr = ddi_get_name_addr(devi);
1167 			if (addr && *addr)
1168 				(void) sprintf(devnm, "%s@%s:%s",
1169 				    ddi_node_name(devi), addr, dmd->ddm_name);
1170 			else
1171 				(void) sprintf(devnm, "%s:%s",
1172 				    ddi_node_name(devi), dmd->ddm_name);
1173 
1174 			if ((dv = dv_findbyname(ddv, devnm)) != NULL) {
1175 				/* dv_node already exists */
1176 				VN_RELE(DVTOV(dv));
1177 				continue;
1178 			}
1179 
1180 			dv = dv_mknod(ddv, devi, devnm, dmd);
1181 			dv_insert(ddv, dv);
1182 			VN_RELE(DVTOV(dv));
1183 		}
1184 		mutex_exit(&DEVI(devi)->devi_lock);
1185 
1186 		(void) ddi_deviname(devi, devnm);
1187 		if ((dv = dv_findbyname(ddv, devnm + 1)) == NULL) {
1188 			/* directory doesn't exist */
1189 			dv = dv_mkdir(ddv, devi, devnm + 1);
1190 			dv_insert(ddv, dv);
1191 		}
1192 		VN_RELE(DVTOV(dv));
1193 	}
1194 	ndi_devi_exit(pdevi, circ);
1195 
1196 	ddv->dv_flags &= ~DV_BUILD;
1197 }
1198 
1199 /*
1200  * Given a directory node, clean out all the nodes beneath.
1201  *
1202  * VDIR:	Reinvoke to clean them, then delete the directory.
1203  * VCHR, VBLK:	Just blow them away.
1204  *
1205  * Mark the directories touched as in need of a rebuild, in case
1206  * we fall over part way through. When DV_CLEAN_FORCE is specified,
1207  * we mark referenced empty directories as stale to facilitate DR.
1208  */
1209 int
1210 dv_cleandir(struct dv_node *ddv, char *devnm, uint_t flags)
1211 {
1212 	struct dv_node *dv;
1213 	struct dv_node **pprev, **npprev;
1214 	struct vnode *vp;
1215 	int busy = 0;
1216 
1217 	dcmn_err3(("dv_cleandir: %s\n", ddv->dv_name));
1218 
1219 	if (!(flags & DV_CLEANDIR_LCK))
1220 		rw_enter(&ddv->dv_contents, RW_WRITER);
1221 	for (pprev = &ddv->dv_dot, dv = *pprev; dv;
1222 	    pprev = npprev, dv = *pprev) {
1223 		npprev = &dv->dv_next;
1224 
1225 		/*
1226 		 * If devnm is specified, the non-minor portion of the
1227 		 * name must match devnm.
1228 		 */
1229 		if (devnm &&
1230 		    (strncmp(devnm, dv->dv_name, strlen(devnm)) ||
1231 		    (dv->dv_name[strlen(devnm)] != ':' &&
1232 		    dv->dv_name[strlen(devnm)] != '\0')))
1233 			continue;
1234 
1235 		/* check type of what we are cleaning */
1236 		vp = DVTOV(dv);
1237 		if (vp->v_type == VDIR) {
1238 			/* recurse on directories */
1239 			rw_enter(&dv->dv_contents, RW_WRITER);
1240 			if (dv_cleandir(dv, NULL,
1241 			    flags | DV_CLEANDIR_LCK) == EBUSY) {
1242 				rw_exit(&dv->dv_contents);
1243 				goto set_busy;
1244 			}
1245 
1246 			/* A clean directory is an empty directory... */
1247 			ASSERT(dv->dv_nlink == 2);
1248 			mutex_enter(&vp->v_lock);
1249 			if (vp->v_count > 0) {
1250 				/*
1251 				 * ... but an empty directory can still have
1252 				 * references to it. If we have dv_busy or
1253 				 * DV_CLEAN_FORCE is *not* specified then a
1254 				 * referenced directory is considered busy.
1255 				 */
1256 				if (dv->dv_busy || !(flags & DV_CLEAN_FORCE)) {
1257 					mutex_exit(&vp->v_lock);
1258 					rw_exit(&dv->dv_contents);
1259 					goto set_busy;
1260 				}
1261 
1262 				/*
1263 				 * Mark referenced directory stale so that DR
1264 				 * will succeed even if a shell has
1265 				 * /devices/xxx as current directory (causing
1266 				 * VN_HOLD reference to an empty directory).
1267 				 */
1268 				ASSERT(!DV_STALE(dv));
1269 				ndi_rele_devi(dv->dv_devi);
1270 				dv->dv_devi = NULL;	/* mark DV_STALE */
1271 			}
1272 		} else {
1273 			ASSERT((vp->v_type == VCHR) || (vp->v_type == VBLK));
1274 			ASSERT(dv->dv_nlink == 1);	/* no hard links */
1275 			mutex_enter(&vp->v_lock);
1276 			if (vp->v_count > 0) {
1277 				mutex_exit(&vp->v_lock);
1278 				goto set_busy;
1279 			}
1280 		}
1281 
1282 		/* unlink from directory */
1283 		dv_unlink(ddv, dv, pprev);
1284 
1285 		/* drop locks */
1286 		mutex_exit(&vp->v_lock);
1287 		if (vp->v_type == VDIR)
1288 			rw_exit(&dv->dv_contents);
1289 
1290 		/* destroy vnode if ref count is zero */
1291 		if (vp->v_count == 0)
1292 			dv_destroy(dv, flags);
1293 
1294 		/* pointer to previous stays unchanged */
1295 		npprev = pprev;
1296 		continue;
1297 
1298 		/*
1299 		 * If devnm is not NULL we return immediately on busy,
1300 		 * otherwise we continue destroying unused dv_node's.
1301 		 */
1302 set_busy:	busy++;
1303 		if (devnm)
1304 			break;
1305 	}
1306 
1307 	/*
1308 	 * This code may be invoked to inform devfs that a new node has
1309 	 * been created in the kernel device tree. So we always set
1310 	 * the DV_BUILD flag to allow the next dv_filldir() to pick
1311 	 * the new devinfo nodes.
1312 	 */
1313 	ddv->dv_flags |= DV_BUILD;
1314 
1315 	if (!(flags & DV_CLEANDIR_LCK))
1316 		rw_exit(&ddv->dv_contents);
1317 
1318 	return (busy ? EBUSY : 0);
1319 }
1320 
1321 /*
1322  * Walk through the devfs hierarchy, correcting the permissions of
1323  * devices with default permissions that do not match those specified
1324  * by minor perm.  This can only be done for all drivers for now.
1325  */
1326 static int
1327 dv_reset_perm_dir(struct dv_node *ddv, uint_t flags)
1328 {
1329 	struct dv_node *dv, *next = NULL;
1330 	struct vnode *vp;
1331 	int retval = 0;
1332 	struct vattr *attrp;
1333 	mperm_t mp;
1334 	char *nm;
1335 	uid_t old_uid;
1336 	gid_t old_gid;
1337 	mode_t old_mode;
1338 
1339 	rw_enter(&ddv->dv_contents, RW_WRITER);
1340 	for (dv = ddv->dv_dot; dv; dv = next) {
1341 		int error = 0;
1342 		next = dv->dv_next;
1343 		nm = dv->dv_name;
1344 
1345 		rw_enter(&dv->dv_contents, RW_READER);
1346 		vp = DVTOV(dv);
1347 		if (vp->v_type == VDIR) {
1348 			rw_exit(&dv->dv_contents);
1349 			if (dv_reset_perm_dir(dv, flags) != 0) {
1350 				error = EBUSY;
1351 			}
1352 		} else {
1353 			ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);
1354 
1355 			/*
1356 			 * Check for permissions from minor_perm
1357 			 * If there are none, we're done
1358 			 */
1359 			rw_exit(&dv->dv_contents);
1360 			if (dev_minorperm(dv->dv_devi, nm, &mp) != 0)
1361 				continue;
1362 
1363 			rw_enter(&dv->dv_contents, RW_READER);
1364 
1365 			/*
1366 			 * Allow a node's permissions to be altered
1367 			 * permanently from the defaults by chmod,
1368 			 * using the shadow node as backing store.
1369 			 * Otherwise, update node to minor_perm permissions.
1370 			 */
1371 			if (dv->dv_attrvp == NULLVP) {
1372 				/*
1373 				 * No attribute vp, try to find one.
1374 				 */
1375 				dv_shadow_node(DVTOV(ddv), nm, vp,
1376 					NULL, NULLVP, kcred, 0);
1377 			}
1378 			if (dv->dv_attrvp != NULLVP || dv->dv_attr == NULL) {
1379 				rw_exit(&dv->dv_contents);
1380 				continue;
1381 			}
1382 
1383 			attrp = dv->dv_attr;
1384 
1385 			if (VATTRP_MP_CMP(attrp, mp) == 0) {
1386 				dcmn_err5(("%s: no perm change: "
1387 				    "%d %d 0%o\n", nm, attrp->va_uid,
1388 				    attrp->va_gid, attrp->va_mode));
1389 				rw_exit(&dv->dv_contents);
1390 				continue;
1391 			}
1392 
1393 			old_uid = attrp->va_uid;
1394 			old_gid = attrp->va_gid;
1395 			old_mode = attrp->va_mode;
1396 
1397 			VATTRP_MP_MERGE(attrp, mp);
1398 			mutex_enter(&vp->v_lock);
1399 			if (vp->v_count > 0) {
1400 				error = EBUSY;
1401 			}
1402 			mutex_exit(&vp->v_lock);
1403 
1404 			dcmn_err5(("%s: perm %d/%d/0%o -> %d/%d/0%o (%d)\n",
1405 			    nm, old_uid, old_gid, old_mode, attrp->va_uid,
1406 			    attrp->va_gid, attrp->va_mode, error));
1407 
1408 			rw_exit(&dv->dv_contents);
1409 		}
1410 
1411 		if (error != 0) {
1412 			retval = error;
1413 		}
1414 	}
1415 
1416 	ddv->dv_flags |= DV_BUILD;
1417 
1418 	rw_exit(&ddv->dv_contents);
1419 
1420 	return (retval);
1421 }
1422 
1423 int
1424 devfs_reset_perm(uint_t flags)
1425 {
1426 	struct dv_node *dvp;
1427 	int rval;
1428 
1429 	if ((dvp = devfs_dip_to_dvnode(ddi_root_node())) == NULL)
1430 		return (0);
1431 
1432 	VN_HOLD(DVTOV(dvp));
1433 	rval = dv_reset_perm_dir(dvp, flags);
1434 	VN_RELE(DVTOV(dvp));
1435 	return (rval);
1436 }
1437 
1438 /*
1439  * Clean up dangling devfs shadow nodes for removed
1440  * drivers so that, in the event the driver is re-added
1441  * to the system, newly created nodes won't incorrectly
1442  * pick up these stale shadow node permissions.
1443  *
1444  * This is accomplished by walking down the pathname
1445  * to the directory, starting at the root's attribute
1446  * node, then removing all minors matching the specified
1447  * node name.  Care must be taken to remove all entries
1448  * in a directory before the directory itself, so that
1449  * the clean-up associated with rem_drv'ing a nexus driver
1450  * does not inadvertently result in an inconsistent
1451  * filesystem underlying devfs.
1452  */
1453 
1454 static int
1455 devfs_remdrv_rmdir(vnode_t *dirvp, const char *dir, vnode_t *rvp)
1456 {
1457 	int error;
1458 	vnode_t *vp;
1459 	int eof;
1460 	struct iovec iov;
1461 	struct uio uio;
1462 	struct dirent64 *dp;
1463 	dirent64_t *dbuf;
1464 	size_t dlen;
1465 	size_t dbuflen;
1466 	int ndirents = 64;
1467 	char *nm;
1468 
1469 	VN_HOLD(dirvp);
1470 
1471 	dlen = ndirents * (sizeof (*dbuf));
1472 	dbuf = kmem_alloc(dlen, KM_SLEEP);
1473 
1474 	uio.uio_iov = &iov;
1475 	uio.uio_iovcnt = 1;
1476 	uio.uio_segflg = UIO_SYSSPACE;
1477 	uio.uio_fmode = 0;
1478 	uio.uio_extflg = UIO_COPY_CACHED;
1479 	uio.uio_loffset = 0;
1480 	uio.uio_llimit = MAXOFFSET_T;
1481 
1482 	eof = 0;
1483 	error = 0;
1484 	while (!error && !eof) {
1485 		uio.uio_resid = dlen;
1486 		iov.iov_base = (char *)dbuf;
1487 		iov.iov_len = dlen;
1488 
1489 		(void) VOP_RWLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1490 		error = VOP_READDIR(dirvp, &uio, kcred, &eof);
1491 		VOP_RWUNLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1492 
1493 		dbuflen = dlen - uio.uio_resid;
1494 
1495 		if (error || dbuflen == 0)
1496 			break;
1497 
1498 		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
1499 			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
1500 
1501 			nm = dp->d_name;
1502 
1503 			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
1504 				continue;
1505 
1506 			error = VOP_LOOKUP(dirvp,
1507 				nm, &vp, NULL, 0, NULL, kcred);
1508 
1509 			dsysdebug(error,
1510 			    ("rem_drv %s/%s lookup (%d)\n",
1511 			    dir, nm, error));
1512 
1513 			if (error)
1514 				continue;
1515 
1516 			ASSERT(vp->v_type == VDIR ||
1517 				vp->v_type == VCHR || vp->v_type == VBLK);
1518 
1519 			if (vp->v_type == VDIR) {
1520 				error = devfs_remdrv_rmdir(vp, nm, rvp);
1521 				if (error == 0) {
1522 					error = VOP_RMDIR(dirvp,
1523 					    (char *)nm, rvp, kcred);
1524 					dsysdebug(error,
1525 					    ("rem_drv %s/%s rmdir (%d)\n",
1526 					    dir, nm, error));
1527 				}
1528 			} else {
1529 				error = VOP_REMOVE(dirvp, (char *)nm, kcred);
1530 				dsysdebug(error,
1531 				    ("rem_drv %s/%s remove (%d)\n",
1532 				    dir, nm, error));
1533 			}
1534 
1535 			VN_RELE(vp);
1536 			if (error) {
1537 				goto exit;
1538 			}
1539 		}
1540 	}
1541 
1542 exit:
1543 	VN_RELE(dirvp);
1544 	kmem_free(dbuf, dlen);
1545 
1546 	return (error);
1547 }
1548 
1549 int
1550 devfs_remdrv_cleanup(const char *dir, const char *nodename)
1551 {
1552 	int error;
1553 	vnode_t *vp;
1554 	vnode_t *dirvp;
1555 	int eof;
1556 	struct iovec iov;
1557 	struct uio uio;
1558 	struct dirent64 *dp;
1559 	dirent64_t *dbuf;
1560 	size_t dlen;
1561 	size_t dbuflen;
1562 	int ndirents = 64;
1563 	int nodenamelen = strlen(nodename);
1564 	char *nm;
1565 	struct pathname pn;
1566 	vnode_t *rvp;		/* root node of the underlying attribute fs */
1567 
1568 	dcmn_err5(("devfs_remdrv_cleanup: %s %s\n", dir, nodename));
1569 
1570 	if (error = pn_get((char *)dir, UIO_SYSSPACE, &pn))
1571 		return (0);
1572 
1573 	rvp = dvroot->dv_attrvp;
1574 	ASSERT(rvp != NULL);
1575 	VN_HOLD(rvp);
1576 
1577 	pn_skipslash(&pn);
1578 	dirvp = rvp;
1579 	VN_HOLD(dirvp);
1580 
1581 	nm = kmem_alloc(MAXNAMELEN, KM_SLEEP);
1582 
1583 	while (pn_pathleft(&pn)) {
1584 		ASSERT(dirvp->v_type == VDIR);
1585 		(void) pn_getcomponent(&pn, nm);
1586 		ASSERT((strcmp(nm, ".") != 0) && (strcmp(nm, "..") != 0));
1587 		error = VOP_LOOKUP(dirvp, nm, &vp, NULL, 0, rvp, kcred);
1588 		if (error) {
1589 			dcmn_err5(("remdrv_cleanup %s lookup error %d\n",
1590 			    nm, error));
1591 			VN_RELE(dirvp);
1592 			if (dirvp != rvp)
1593 				VN_RELE(rvp);
1594 			pn_free(&pn);
1595 			kmem_free(nm, MAXNAMELEN);
1596 			return (0);
1597 		}
1598 		VN_RELE(dirvp);
1599 		dirvp = vp;
1600 		pn_skipslash(&pn);
1601 	}
1602 
1603 	ASSERT(dirvp->v_type == VDIR);
1604 	if (dirvp != rvp)
1605 		VN_RELE(rvp);
1606 	pn_free(&pn);
1607 	kmem_free(nm, MAXNAMELEN);
1608 
1609 	dlen = ndirents * (sizeof (*dbuf));
1610 	dbuf = kmem_alloc(dlen, KM_SLEEP);
1611 
1612 	uio.uio_iov = &iov;
1613 	uio.uio_iovcnt = 1;
1614 	uio.uio_segflg = UIO_SYSSPACE;
1615 	uio.uio_fmode = 0;
1616 	uio.uio_extflg = UIO_COPY_CACHED;
1617 	uio.uio_loffset = 0;
1618 	uio.uio_llimit = MAXOFFSET_T;
1619 
1620 	eof = 0;
1621 	error = 0;
1622 	while (!error && !eof) {
1623 		uio.uio_resid = dlen;
1624 		iov.iov_base = (char *)dbuf;
1625 		iov.iov_len = dlen;
1626 
1627 		(void) VOP_RWLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1628 		error = VOP_READDIR(dirvp, &uio, kcred, &eof);
1629 		VOP_RWUNLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1630 
1631 		dbuflen = dlen - uio.uio_resid;
1632 
1633 		if (error || dbuflen == 0)
1634 			break;
1635 
1636 		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
1637 			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
1638 
1639 			nm = dp->d_name;
1640 
1641 			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
1642 				continue;
1643 
1644 			if (strncmp(nm, nodename, nodenamelen) != 0)
1645 				continue;
1646 
1647 			error = VOP_LOOKUP(dirvp, nm, &vp,
1648 			    NULL, 0, NULL, kcred);
1649 
1650 			dsysdebug(error,
1651 			    ("rem_drv %s/%s lookup (%d)\n",
1652 			    dir, nm, error));
1653 
1654 			if (error)
1655 				continue;
1656 
1657 			ASSERT(vp->v_type == VDIR ||
1658 				vp->v_type == VCHR || vp->v_type == VBLK);
1659 
1660 			if (vp->v_type == VDIR) {
1661 				error = devfs_remdrv_rmdir(vp, nm, rvp);
1662 				if (error == 0) {
1663 					error = VOP_RMDIR(dirvp,
1664 					    (char *)nm, rvp, kcred);
1665 					dsysdebug(error,
1666 					    ("rem_drv %s/%s rmdir (%d)\n",
1667 					    dir, nm, error));
1668 				}
1669 			} else {
1670 				error = VOP_REMOVE(dirvp, (char *)nm, kcred);
1671 				dsysdebug(error,
1672 				    ("rem_drv %s/%s remove (%d)\n",
1673 				    dir, nm, error));
1674 			}
1675 
1676 			VN_RELE(vp);
1677 			if (error)
1678 				goto exit;
1679 		}
1680 	}
1681 
1682 exit:
1683 	VN_RELE(dirvp);
1684 
1685 	kmem_free(dbuf, dlen);
1686 
1687 	return (0);
1688 }
1689 
1690 struct dv_list {
1691 	struct dv_node	*dv;
1692 	struct dv_list	*next;
1693 };
1694 
1695 void
1696 dv_walk(
1697 	struct dv_node	*ddv,
1698 	char		*devnm,
1699 	void		(*callback)(struct dv_node *, void *),
1700 	void		*arg)
1701 {
1702 	struct vnode	*dvp;
1703 	struct dv_node	*dv;
1704 	struct dv_list	*head, *tail, *next;
1705 	int		len;
1706 
1707 	dcmn_err3(("dv_walk: ddv = %s, devnm = %s\n",
1708 	    ddv->dv_name, devnm ? devnm : "<null>"));
1709 
1710 	dvp = DVTOV(ddv);
1711 
1712 	ASSERT(dvp->v_type == VDIR);
1713 
1714 	head = tail = next = NULL;
1715 
1716 	rw_enter(&ddv->dv_contents, RW_READER);
1717 	mutex_enter(&dvp->v_lock);
1718 	for (dv = ddv->dv_dot; dv; dv = dv->dv_next) {
1719 		/*
1720 		 * If devnm is not NULL and is not the empty string,
1721 		 * select only dv_nodes with matching non-minor name
1722 		 */
1723 		if (devnm && (len = strlen(devnm)) &&
1724 		    (strncmp(devnm, dv->dv_name, len) ||
1725 		    (dv->dv_name[len] != ':' && dv->dv_name[len] != '\0')))
1726 			continue;
1727 
1728 		callback(dv, arg);
1729 
1730 		if (DVTOV(dv)->v_type != VDIR)
1731 			continue;
1732 
1733 		next = kmem_zalloc(sizeof (*next), KM_SLEEP);
1734 		next->dv = dv;
1735 
1736 		if (tail)
1737 			tail->next = next;
1738 		else
1739 			head = next;
1740 
1741 		tail = next;
1742 	}
1743 
1744 	while (head) {
1745 		dv_walk(head->dv, NULL, callback, arg);
1746 		next = head->next;
1747 		kmem_free(head, sizeof (*head));
1748 		head = next;
1749 	}
1750 	rw_exit(&ddv->dv_contents);
1751 	mutex_exit(&dvp->v_lock);
1752 }
1753