xref: /illumos-gate/usr/src/uts/common/fs/dev/sdev_plugin.c (revision 63f91fbc3c024870d86dc3332a4a0080fb29bc40)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  */
15 
16 /*
17  * Dynamic directory plugin interface for sdev.
18  *
19  * The sdev plugin interface provides a means for a dynamic directory based on
20  * in-kernel state to be simply created. Traditionally, dynamic directories were
21  * built into sdev itself. While these legacy plugins are useful, it makes more
22  * sense for these pieces of functionality to live with the individual drivers.
23  *
24  * The plugin interface requires folks to implement three interfaces and
25  * provides a series of callbacks that can be made in the context of those
26  * interfaces to interrogate the sdev_node_t without having to leak
27  * implementation details of the sdev_node_t. These interfaces are:
28  *
29  *   o spo_validate
30  *
31  *   Given a particular node, answer the question as to whether or not this
32  *   entry is still valid. Here, plugins should use the name and the dev_t
33  *   associated with the node to verify that it matches something that still
34  *   exists.
35  *
36  *   o spo_filldir
37  *
38  *   Fill all the entries inside of a directory. Note that some of these entries
39  *   may already exist.
40  *
41  *   o spo_inactive
42  *
43  *   The given node is no longer being used. This allows the consumer to
44  *   potentially tear down anything that was being held open related to this.
45  *   Note that this only fires when the given sdev_node_t becomes a zombie.
46  *
47  * During these callbacks a consumer is not allowed to register or unregister a
48  * plugin, especially their own. They may call the sdev_ctx style functions. All
49  * callbacks fire in a context where blocking is allowed (eg. the spl is below
50  * LOCK_LEVEL).
51  *
52  * When a plugin is added, we create its directory in the global zone. By doing
53  * that, we ensure that something isn't already there and that nothing else can
54  * come along and try and create something without our knowledge. We only have
55  * to create it in the GZ and not for all other instances of sdev because an
56  * instance of sdev that isn't at /dev does not have dynamic directories, and
57  * second, any instance of sdev present in a non-global zone cannot create
58  * anything, therefore we know that by it not being in the global zone's
59  * instance of sdev that we're good to go.
60  *
61  * Lock Ordering
62  * -------------
63  *
64  * The global sdev_plugin_lock must be held before any of the individual
65  * sdev_plugin_t`sp_lock. Further, once any plugin related lock has been held,
66  * it is not legal to take any holds on any sdev_node_t or to grab the
67  * sdev_node_t`contents_lock in any way.
68  */
69 
70 #include <sys/types.h>
71 #include <sys/stat.h>
72 #include <sys/fs/sdev_impl.h>
73 #include <sys/fs/sdev_plugin.h>
74 #include <fs/fs_subr.h>
75 #include <sys/ddi.h>
76 #include <sys/sunddi.h>
77 #include <sys/ksynch.h>
78 #include <sys/sysmacros.h>
79 #include <sys/list.h>
80 #include <sys/ctype.h>
81 
/* Guards sdev_plugin_list; see "Lock Ordering" in the block comment above. */
kmutex_t sdev_plugin_lock;
/* Every registered plugin (legacy and non-legacy), as sdev_plugin_t. */
list_t sdev_plugin_list;
/* kmem cache that all sdev_plugin_t structures are allocated from. */
kmem_cache_t *sdev_plugin_cache;
/* vnode ops handed to every non-legacy plugin; built by sdev_plugin_init(). */
struct vnodeops *sdev_plugin_vnops;

/* Size of sdev_plugin_t`sp_name, including the terminating NUL. */
#define	SDEV_PLUGIN_NAMELEN	64
88 
/*
 * State kept for each registered plugin. Both the legacy, built-in dynamic
 * directories and plugins registered through sdev_plugin_register() are
 * described by one of these, linked onto sdev_plugin_list.
 *
 * Members tagged "E" are written when the plugin is registered and are not
 * modified anywhere else in this file. (The tag is not explained by the
 * original author; presumably it means "established at creation".)
 */
typedef struct sdev_plugin {
	list_node_t sp_link;			/* sdev_plugin_list linkage */
	char sp_name[SDEV_PLUGIN_NAMELEN];	/* E */
	int sp_nflags;				/* E */
	struct vnodeops *sp_vnops;		/* E */
	sdev_plugin_ops_t *sp_pops;		/* E */
	boolean_t sp_islegacy;			/* E */
	int (*sp_lvtor)(sdev_node_t *);		/* E */
	kmutex_t sp_lock;			/* Protects everything below */
	kcondvar_t sp_nodecv;			/* signalled as sp_nnodes drops */
	size_t sp_nnodes;			/* nodes bound to this plugin */
} sdev_plugin_t;
101 
102 /* ARGSUSED */
103 static int
104 sdev_plugin_cache_constructor(void *buf, void *arg, int tags)
105 {
106 	sdev_plugin_t *spp = buf;
107 	mutex_init(&spp->sp_lock, NULL, MUTEX_DRIVER, 0);
108 	cv_init(&spp->sp_nodecv, NULL, CV_DRIVER, NULL);
109 	return (0);
110 }
111 
112 /* ARGSUSED */
113 static void
114 sdev_plugin_cache_destructor(void *buf, void *arg)
115 {
116 	sdev_plugin_t *spp = buf;
117 	cv_destroy(&spp->sp_nodecv);
118 	mutex_destroy(&spp->sp_lock);
119 }
120 
121 enum vtype
122 sdev_ctx_vtype(sdev_ctx_t ctx)
123 {
124 	sdev_node_t *sdp = (sdev_node_t *)ctx;
125 
126 	ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
127 	return (sdp->sdev_vnode->v_type);
128 }
129 
130 const char *
131 sdev_ctx_path(sdev_ctx_t ctx)
132 {
133 	sdev_node_t *sdp = (sdev_node_t *)ctx;
134 
135 	ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
136 	return (sdp->sdev_path);
137 }
138 
139 const char *
140 sdev_ctx_name(sdev_ctx_t ctx)
141 {
142 	sdev_node_t *sdp = (sdev_node_t *)ctx;
143 
144 	ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
145 	return (sdp->sdev_name);
146 }
147 
148 int
149 sdev_ctx_minor(sdev_ctx_t ctx, minor_t *minorp)
150 {
151 	sdev_node_t *sdp = (sdev_node_t *)ctx;
152 
153 	ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
154 	ASSERT(minorp != NULL);
155 	if (sdp->sdev_vnode->v_type == VCHR ||
156 	    sdp->sdev_vnode->v_type == VBLK) {
157 		*minorp = getminor(sdp->sdev_vnode->v_rdev);
158 		return (0);
159 	}
160 
161 	return (ENODEV);
162 }
163 
164 /*
165  * Currently we only support psasing through a single flag -- SDEV_IS_GLOBAL.
166  */
167 sdev_ctx_flags_t
168 sdev_ctx_flags(sdev_ctx_t ctx)
169 {
170 	sdev_node_t *sdp = (sdev_node_t *)ctx;
171 
172 	ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
173 	return (sdp->sdev_flags & SDEV_GLOBAL);
174 }
175 
/*
 * Decide whether a plugin or node name may be used as a directory entry.
 *
 * The character set follows the rules used for zone names: alphanumerics
 * plus '-', '_', and '.'. In addition the name must be NUL-terminated within
 * the first buflen bytes, must not be empty, and must not be "." or "..",
 * since none of those can ever be a legitimate entry name.
 *
 *   c		candidate name; NULL is treated as invalid
 *   buflen	number of bytes of c that may be examined
 *
 * Returns 1 if the name is acceptable and 0 otherwise.
 */
static int
sdev_plugin_name_isvalid(const char *c, int buflen)
{
	int len;

	if (c == NULL)
		return (0);

	for (len = 0; len < buflen; len++) {
		char ch = c[len];

		if (ch == '\0')
			break;

		if (!isalnum(ch) && ch != '-' && ch != '_' && ch != '.')
			return (0);
	}

	/* Never found a null terminator (also covers buflen <= 0) */
	if (len >= buflen)
		return (0);

	/* An empty name cannot name anything */
	if (len == 0)
		return (0);

	/* "." and ".." are reserved directory entries */
	if (c[0] == '.' && (len == 1 || (len == 2 && c[1] == '.')))
		return (0);

	return (1);
}
194 
195 static int
196 sdev_plugin_mknode(sdev_plugin_t *spp, sdev_node_t *sdvp, char *name,
197     vattr_t *vap)
198 {
199 	int ret;
200 	sdev_node_t *svp;
201 
202 	ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
203 	ASSERT(spp != NULL);
204 	svp = sdev_cache_lookup(sdvp, name);
205 	if (svp != NULL) {
206 		SDEV_SIMPLE_RELE(svp);
207 		return (EEXIST);
208 	}
209 
210 	ret = sdev_mknode(sdvp, name, &svp, vap, NULL, NULL, kcred,
211 	    SDEV_READY);
212 	if (ret != 0)
213 		return (ret);
214 	SDEV_SIMPLE_RELE(svp);
215 
216 	return (0);
217 }
218 
219 /*
220  * Plugin node creation callbacks
221  */
222 int
223 sdev_plugin_mkdir(sdev_ctx_t ctx, char *name)
224 {
225 	sdev_node_t *sdvp;
226 	timestruc_t now;
227 	struct vattr vap;
228 
229 	if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0)
230 		return (EINVAL);
231 
232 	sdvp = (sdev_node_t *)ctx;
233 	ASSERT(sdvp->sdev_private != NULL);
234 	ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
235 
236 	vap = *sdev_getdefault_attr(VDIR);
237 	gethrestime(&now);
238 	vap.va_atime = now;
239 	vap.va_mtime = now;
240 	vap.va_ctime = now;
241 
242 	return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap));
243 }
244 
245 int
246 sdev_plugin_mknod(sdev_ctx_t ctx, char *name, mode_t mode, dev_t dev)
247 {
248 	sdev_node_t *sdvp;
249 	timestruc_t now;
250 	struct vattr vap;
251 	mode_t type = mode & S_IFMT;
252 	mode_t access = mode & S_IAMB;
253 
254 	if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0)
255 		return (EINVAL);
256 
257 	sdvp = (sdev_node_t *)ctx;
258 	ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
259 
260 	/*
261 	 * Ensure only type and user/group/other permission bits are present.
262 	 * Do not allow setuid, setgid, etc.
263 	 */
264 	if ((mode & ~(S_IFMT | S_IAMB)) != 0)
265 		return (EINVAL);
266 
267 	/* Disallow types other than character and block devices */
268 	if (type != S_IFCHR && type != S_IFBLK)
269 		return (EINVAL);
270 
271 	/* Disallow execute bits */
272 	if ((access & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0)
273 		return (EINVAL);
274 
275 	/* No bits other than 0666 in access */
276 	ASSERT((access &
277 	    ~(S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) == 0);
278 
279 	/* Default to relatively safe access bits if none specified. */
280 	if (access == 0)
281 		access = 0600;
282 
283 	ASSERT(sdvp->sdev_private != NULL);
284 
285 	vap = *sdev_getdefault_attr(type == S_IFCHR ? VCHR : VBLK);
286 	gethrestime(&now);
287 	vap.va_atime = now;
288 	vap.va_mtime = now;
289 	vap.va_ctime = now;
290 	vap.va_rdev = dev;
291 	vap.va_mode = type | access;
292 
293 	/* Despite the similar name, this is in fact a different function */
294 	return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap));
295 }
296 
297 static int
298 sdev_plugin_validate(sdev_node_t *sdp)
299 {
300 	int ret;
301 	sdev_plugin_t *spp;
302 
303 	ASSERT(sdp->sdev_private != NULL);
304 	spp = sdp->sdev_private;
305 	ASSERT(spp->sp_islegacy == B_FALSE);
306 	ASSERT(spp->sp_pops != NULL);
307 	rw_enter(&sdp->sdev_contents, RW_READER);
308 	ret = spp->sp_pops->spo_validate((uintptr_t)sdp);
309 	rw_exit(&sdp->sdev_contents);
310 	return (ret);
311 }
312 
/*
 * Walk every child of the directory sdvp and remove the ones that the owning
 * plugin no longer considers valid. The caller must hold sdvp's contents
 * lock as writer.
 */
static void
sdev_plugin_validate_dir(sdev_node_t *sdvp)
{
	int ret;
	sdev_node_t *svp, *next;

	ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));

	for (svp = SDEV_FIRST_ENTRY(sdvp); svp != NULL; svp = next) {

		/*
		 * Pick up the successor now: svp may be removed from the
		 * directory further down.
		 */
		next = SDEV_NEXT_ENTRY(sdvp, svp);
		ASSERT(svp->sdev_state != SDEV_ZOMBIE);
		/* skip nodes that aren't ready */
		if (svp->sdev_state == SDEV_INIT)
			continue;

		/*
		 * VALID and SKIP leave the entry alone. INVALID and STALE --
		 * and, since there is no default case, any other value the
		 * plugin might return -- fall out of the switch and lead to
		 * the entry being removed below.
		 */
		switch (sdev_plugin_validate(svp)) {
		case SDEV_VTOR_VALID:
		case SDEV_VTOR_SKIP:
			continue;
		case SDEV_VTOR_INVALID:
		case SDEV_VTOR_STALE:
			break;
		}

		/* Keep the node around until we are finished with it */
		SDEV_HOLD(svp);

		/*
		 * Clean out everything underneath this node before we
		 * remove it.
		 */
		if (svp->sdev_vnode->v_type == VDIR) {
			ret = sdev_cleandir(svp, NULL, 0);
			ASSERT(ret == 0);
		}
		/* remove the cache node */
		(void) sdev_cache_update(sdvp, &svp, svp->sdev_name,
		    SDEV_CACHE_DELETE);
		SDEV_RELE(svp);
	}
}
354 
/*
 * VOP_READDIR for directories owned by a non-legacy plugin.
 *
 * The function is entered with the directory's contents lock held as reader
 * and every return path leaves it held as reader again. When reading starts
 * at offset zero the directory is first revalidated and the plugin is asked
 * to (re)populate it via spo_filldir; that is done with the lock held as
 * writer. The actual directory entries are produced by
 * devname_readdir_func().
 *
 * Returns ENOENT if the directory has become a zombie, EPERM when a global
 * zone credential is used against a non-global sdev instance, any non-zero
 * value returned by spo_filldir, or the result of devname_readdir_func().
 */
/* ARGSUSED */
static int
sdev_plugin_vop_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
    int *eofp, caller_context_t *ct_unused, int flags_unused)
{
	int ret;
	sdev_node_t *sdvp = VTOSDEV(dvp);
	sdev_plugin_t *spp;

	ASSERT(RW_READ_HELD(&sdvp->sdev_contents));

	/* Sanity check we're not a zombie before we do anything else */
	if (sdvp->sdev_state == SDEV_ZOMBIE)
		return (ENOENT);

	spp = sdvp->sdev_private;
	ASSERT(spp != NULL);
	ASSERT(spp->sp_islegacy == B_FALSE);
	ASSERT(spp->sp_pops != NULL);

	if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp))
		return (EPERM);

	if (uiop->uio_offset == 0) {
		/*
		 * We upgrade to a write lock and grab the plugin's lock along
		 * the way. We're almost certainly going to get creation
		 * callbacks, so this is the only safe way to go.
		 *
		 * If the in-place upgrade fails the lock is dropped and
		 * re-taken as writer. The directory may have changed state
		 * while the lock was not held, so the zombie check has to be
		 * repeated in that case.
		 */
		if (rw_tryupgrade(&sdvp->sdev_contents) == 0) {
			rw_exit(&sdvp->sdev_contents);
			rw_enter(&sdvp->sdev_contents, RW_WRITER);
			if (sdvp->sdev_state == SDEV_ZOMBIE) {
				/* return holding the lock as reader */
				rw_downgrade(&sdvp->sdev_contents);
				return (ENOENT);
			}
		}

		sdev_plugin_validate_dir(sdvp);
		ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp);
		/* back to reader, which is what our caller expects */
		rw_downgrade(&sdvp->sdev_contents);
		if (ret != 0)
			return (ret);
	}

	return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
}
402 
403 /*
404  * If we don't have a callback function that returns a failure, then sdev will
405  * try to create a node for us which violates all of our basic assertions. To
406  * work around that we create our own callback for devname_lookup_func which
407  * always returns ENOENT as at this point either it was created with the filldir
408  * callback or it was not.
409  */
410 /*ARGSUSED*/
411 static int
412 sdev_plugin_vop_lookup_cb(sdev_node_t *ddv, char *nm, void **arg, cred_t *cred,
413     void *unused, char *unused2)
414 {
415 	return (ENOENT);
416 }
417 
418 /* ARGSUSED */
419 static int
420 sdev_plugin_vop_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
421     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
422     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
423 {
424 	int ret;
425 	sdev_node_t *sdvp;
426 	sdev_plugin_t *spp;
427 
428 	/* execute access is required to search the directory */
429 	if ((ret = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
430 		return (ret);
431 
432 	sdvp = VTOSDEV(dvp);
433 	spp = sdvp->sdev_private;
434 	ASSERT(spp != NULL);
435 	ASSERT(spp->sp_islegacy == B_FALSE);
436 	ASSERT(spp->sp_pops != NULL);
437 
438 	if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp))
439 		return (EPERM);
440 
441 	/*
442 	 * Go straight for the write lock.
443 	 */
444 	rw_enter(&sdvp->sdev_contents, RW_WRITER);
445 	if (sdvp->sdev_state == SDEV_ZOMBIE) {
446 		rw_exit(&sdvp->sdev_contents);
447 		return (ENOENT);
448 	}
449 	sdev_plugin_validate_dir(sdvp);
450 	ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp);
451 	rw_exit(&sdvp->sdev_contents);
452 	if (ret != 0)
453 		return (ret);
454 
455 	return (devname_lookup_func(sdvp, nm, vpp, cred,
456 	    sdev_plugin_vop_lookup_cb, SDEV_VATTR));
457 }
458 
459 /*
460  * sdev is not a good citizen. We get inactive callbacks whenever a vnode goes
461  * to zero, but isn't necessairily a zombie yet. As such, to make things easier
462  * for users, we only fire the inactive callback when the node becomes a zombie
463  * and thus will be torn down here.
464  */
465 static void
466 sdev_plugin_vop_inactive_cb(struct vnode *dvp)
467 {
468 	sdev_node_t *sdp = VTOSDEV(dvp);
469 	sdev_plugin_t *spp = sdp->sdev_private;
470 
471 	rw_enter(&sdp->sdev_contents, RW_READER);
472 	if (sdp->sdev_state != SDEV_ZOMBIE) {
473 		rw_exit(&sdp->sdev_contents);
474 		return;
475 	}
476 	spp->sp_pops->spo_inactive((uintptr_t)sdp);
477 	mutex_enter(&spp->sp_lock);
478 	VERIFY(spp->sp_nnodes > 0);
479 	spp->sp_nnodes--;
480 	cv_signal(&spp->sp_nodecv);
481 	mutex_exit(&spp->sp_lock);
482 	rw_exit(&sdp->sdev_contents);
483 }
484 
485 /*ARGSUSED*/
486 static void
487 sdev_plugin_vop_inactive(struct vnode *dvp, struct cred *cred,
488     caller_context_t *ct)
489 {
490 	sdev_node_t *sdp = VTOSDEV(dvp);
491 	sdev_plugin_t *spp = sdp->sdev_private;
492 	ASSERT(sdp->sdev_private != NULL);
493 	ASSERT(spp->sp_islegacy == B_FALSE);
494 	devname_inactive_func(dvp, cred, sdev_plugin_vop_inactive_cb);
495 }
496 
/*
 * vnode operation overrides for directories owned by non-legacy plugins.
 * sdev_plugin_init() merges this table over the standard sdev operations via
 * sdev_merge_vtab() to build sdev_plugin_vnops.
 *
 * readdir, lookup and inactive are routed to the plugin-aware
 * implementations above. Every operation that would let a user create or
 * remove entries, or set an ACL, fails through fs_nosys: the contents of a
 * plugin directory are controlled by the plugin alone.
 */
const fs_operation_def_t sdev_plugin_vnodeops_tbl[] = {
	VOPNAME_READDIR,	{ .vop_readdir = sdev_plugin_vop_readdir },
	VOPNAME_LOOKUP,		{ .vop_lookup = sdev_plugin_vop_lookup },
	VOPNAME_INACTIVE,	{ .vop_inactive = sdev_plugin_vop_inactive },
	VOPNAME_CREATE,		{ .error = fs_nosys },
	VOPNAME_REMOVE,		{ .error = fs_nosys },
	VOPNAME_MKDIR,		{ .error = fs_nosys },
	VOPNAME_RMDIR,		{ .error = fs_nosys },
	VOPNAME_SYMLINK,	{ .error = fs_nosys },
	VOPNAME_SETSECATTR,	{ .error = fs_nosys },
	/* a NULL name terminates the table */
	NULL,			NULL
};
509 
510 /*
511  * construct a new template with overrides from vtab
512  */
513 static fs_operation_def_t *
514 sdev_merge_vtab(const fs_operation_def_t tab[])
515 {
516 	fs_operation_def_t *new;
517 	const fs_operation_def_t *tab_entry;
518 
519 	/* make a copy of standard vnode ops table */
520 	new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP);
521 	bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size);
522 
523 	/* replace the overrides from tab */
524 	for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) {
525 		fs_operation_def_t *std_entry = new;
526 		while (std_entry->name) {
527 			if (strcmp(tab_entry->name, std_entry->name) == 0) {
528 				std_entry->func = tab_entry->func;
529 				break;
530 			}
531 			std_entry++;
532 		}
533 	}
534 
535 	return (new);
536 }
537 
538 /* free memory allocated by sdev_merge_vtab */
539 static void
540 sdev_free_vtab(fs_operation_def_t *new)
541 {
542 	kmem_free(new, sdev_vnodeops_tbl_size);
543 }
544 
/*
 * Register a new plugin.
 *
 *   name	name of the plugin; it becomes the directory /dev/<name>
 *   ops	the plugin's entry points; spo_version must be 1, all three
 *		callbacks must be present and spo_flags may only contain
 *		bits from SDEV_PLUGIN_FLAGS_MASK
 *   errp	optional; if non-NULL it receives 0 on success or an errno
 *		value describing the failure
 *
 * On success a handle for use with sdev_plugin_unregister() is returned. On
 * failure NULL is returned: EINVAL for a bad name or bad ops, EEXIST if the
 * name is already present in the global zone's /dev or already registered,
 * or the error from looking up /dev.
 *
 * NOTE(review): ops is dereferenced without a NULL check, and the ops pointer
 * itself (not a copy) is retained in the plugin, so it has to stay valid for
 * as long as the plugin is registered.
 */
sdev_plugin_hdl_t
sdev_plugin_register(const char *name, sdev_plugin_ops_t *ops, int *errp)
{
	char buf[sizeof ("dev")] = "";
	struct pathname pn = { 0 };
	sdev_plugin_t *spp, *iter;
	vnode_t *vp, *nvp;
	sdev_node_t *sdp, *slp;
	timestruc_t now;
	struct vattr vap;
	int ret, err;

	/*
	 * Some consumers don't care about why they failed. To keep the code
	 * simple, we'll just pretend they gave us something.
	 */
	if (errp == NULL)
		errp = &err;

	if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) {
		*errp = EINVAL;
		return ((sdev_plugin_hdl_t)NULL);
	}

	/* Version 1 is the only ops layout this code understands */
	if (ops->spo_version != 1) {
		*errp = EINVAL;
		return ((sdev_plugin_hdl_t)NULL);
	}

	/* None of the callbacks are optional */
	if (ops->spo_validate == NULL || ops->spo_filldir == NULL ||
	    ops->spo_inactive == NULL) {
		*errp = EINVAL;
		return ((sdev_plugin_hdl_t)NULL);
	}

	if ((ops->spo_flags & ~SDEV_PLUGIN_FLAGS_MASK) != 0) {
		*errp = EINVAL;
		return ((sdev_plugin_hdl_t)NULL);
	}

	/*
	 * The name was validated above, so it is known to fit in sp_name
	 * together with its terminator.
	 */
	spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP);
	(void) strlcpy(spp->sp_name, name, SDEV_PLUGIN_NAMELEN);

	/* Translate the plugin's flags into sdev node flags */
	spp->sp_pops = ops;
	spp->sp_nflags = SDEV_DYNAMIC | SDEV_VTOR;
	if (ops->spo_flags & SDEV_PLUGIN_NO_NCACHE)
		spp->sp_nflags |= SDEV_NO_NCACHE;
	if (ops->spo_flags & SDEV_PLUGIN_SUBDIR)
		spp->sp_nflags |= SDEV_SUBDIR;
	spp->sp_vnops = sdev_plugin_vnops;
	spp->sp_islegacy = B_FALSE;
	spp->sp_lvtor = NULL;
	spp->sp_nnodes = 0;

	/*
	 * Make sure our /dev entry is unique and install it.  We also need to
	 * go through and grab the sdev root node as we cannot grab any sdev
	 * node locks once we've grabbed the sdev_plugin_lock. We effectively
	 * assert that if a directory is not present in the GZ's /dev, then it
	 * doesn't exist in any of the local zones.
	 *
	 * Note that we may be in NGZ context: during a prof_filldir(".../dev/")
	 * enumeration, for example. So we have to dig as deep as lookuppnvp()
	 * to make sure we really get to the global /dev (i.e.  escape both
	 * CRED() and ->u_rdir).
	 */
	/* NOTE(review): the result of pn_get_buf() is deliberately ignored */
	(void) pn_get_buf("dev", UIO_SYSSPACE, &pn, buf, sizeof (buf));
	/* presumably lookuppnvp() consumes this hold on rootdir -- confirm */
	VN_HOLD(rootdir);
	ret = lookuppnvp(&pn, NULL, NO_FOLLOW, NULLVPP,
	    &vp, rootdir, rootdir, kcred);

	if (ret != 0) {
		*errp = ret;
		kmem_cache_free(sdev_plugin_cache, spp);
		return ((sdev_plugin_hdl_t)NULL);
	}
	/* Make sure we have the real vnode */
	if (VOP_REALVP(vp, &nvp, NULL) == 0) {
		/* swap our hold over from vp to the vnode underneath it */
		VN_HOLD(nvp);
		VN_RELE(vp);
		vp = nvp;
		nvp = NULL;
	}
	VERIFY(vp->v_op == sdev_vnodeops);
	sdp = VTOSDEV(vp);
	/*
	 * The global /dev stays write-locked from here until the plugin's
	 * directory has been created, so nothing else can create an entry
	 * with our name in the meantime.
	 */
	rw_enter(&sdp->sdev_contents, RW_WRITER);
	slp = sdev_cache_lookup(sdp, spp->sp_name);
	if (slp != NULL) {
		/* something with this name already lives in /dev */
		SDEV_RELE(slp);
		rw_exit(&sdp->sdev_contents);
		VN_RELE(vp);
		*errp = EEXIST;
		kmem_cache_free(sdev_plugin_cache, spp);
		return ((sdev_plugin_hdl_t)NULL);
	}

	/* Refuse to register the same plugin name twice */
	mutex_enter(&sdev_plugin_lock);
	for (iter = list_head(&sdev_plugin_list); iter != NULL;
	    iter = list_next(&sdev_plugin_list, iter)) {
		if (strcmp(spp->sp_name, iter->sp_name) == 0) {
			mutex_exit(&sdev_plugin_lock);
			rw_exit(&sdp->sdev_contents);
			VN_RELE(vp);
			*errp = EEXIST;
			kmem_cache_free(sdev_plugin_cache, spp);
			return ((sdev_plugin_hdl_t)NULL);
		}
	}

	list_insert_tail(&sdev_plugin_list, spp);
	mutex_exit(&sdev_plugin_lock);

	/*
	 * Now go ahead and create the top level directory for the global zone.
	 */
	vap = *sdev_getdefault_attr(VDIR);
	gethrestime(&now);
	vap.va_atime = now;
	vap.va_mtime = now;
	vap.va_ctime = now;

	/*
	 * NOTE(review): a failure to create the directory is ignored here;
	 * the plugin stays on sdev_plugin_list and success is reported to
	 * the caller regardless.
	 */
	(void) sdev_plugin_mknode(spp, sdp, spp->sp_name, &vap);

	rw_exit(&sdp->sdev_contents);
	VN_RELE(vp);

	*errp = 0;

	return ((sdev_plugin_hdl_t)spp);
}
678 
679 static void
680 sdev_plugin_unregister_cb(sdev_node_t *rdp, void *arg)
681 {
682 	sdev_plugin_t *spp = arg;
683 	sdev_node_t *sdp;
684 
685 	rw_enter(&rdp->sdev_contents, RW_WRITER);
686 	sdp = sdev_cache_lookup(rdp, spp->sp_name);
687 	/* If it doesn't exist, we're done here */
688 	if (sdp == NULL) {
689 		rw_exit(&rdp->sdev_contents);
690 		return;
691 	}
692 
693 	/*
694 	 * We first delete the directory before recursively marking everything
695 	 * else stale. This ordering should ensure that we don't accidentally
696 	 * miss anything.
697 	 */
698 	sdev_cache_update(rdp, &sdp, spp->sp_name, SDEV_CACHE_DELETE);
699 	sdev_stale(sdp);
700 	SDEV_RELE(sdp);
701 	rw_exit(&rdp->sdev_contents);
702 }
703 
704 int sdev_plugin_unregister_allowed;
705 
706 /*
707  * Remove a plugin. This will block until everything has become a zombie, thus
708  * guaranteeing the caller that nothing will call into them again once this call
709  * returns. While the call is ongoing, it could be called into. Note that while
710  * this is ongoing, it will block other mounts.
711  *
712  * NB: this is not safe when used from detach() context - we will be DEVI_BUSY,
713  * and other sdev threads may be waiting for this.  Only use the over-ride if
714  * willing to risk it.
715  */
716 int
717 sdev_plugin_unregister(sdev_plugin_hdl_t hdl)
718 {
719 	sdev_plugin_t *spp = (sdev_plugin_t *)hdl;
720 	if (spp->sp_islegacy)
721 		return (EINVAL);
722 
723 	if (!sdev_plugin_unregister_allowed)
724 		return (EBUSY);
725 
726 	mutex_enter(&sdev_plugin_lock);
727 	list_remove(&sdev_plugin_list, spp);
728 	mutex_exit(&sdev_plugin_lock);
729 
730 	sdev_mnt_walk(sdev_plugin_unregister_cb, spp);
731 	mutex_enter(&spp->sp_lock);
732 	while (spp->sp_nnodes > 0)
733 		cv_wait(&spp->sp_nodecv, &spp->sp_lock);
734 	mutex_exit(&spp->sp_lock);
735 	kmem_cache_free(sdev_plugin_cache, spp);
736 	return (0);
737 }
738 
739 /*
740  * Register an old sdev style plugin to deal with what used to be in the vtab.
741  */
742 static int
743 sdev_plugin_register_legacy(struct sdev_vop_table *vtp)
744 {
745 	sdev_plugin_t *spp;
746 
747 	spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP);
748 	(void) strlcpy(spp->sp_name, vtp->vt_name, SDEV_PLUGIN_NAMELEN);
749 	spp->sp_islegacy = B_TRUE;
750 	spp->sp_pops = NULL;
751 	spp->sp_nflags = vtp->vt_flags;
752 	spp->sp_lvtor = vtp->vt_vtor;
753 	spp->sp_nnodes = 0;
754 
755 	if (vtp->vt_service != NULL) {
756 		fs_operation_def_t *templ;
757 		templ = sdev_merge_vtab(vtp->vt_service);
758 		if (vn_make_ops(vtp->vt_name,
759 		    (const fs_operation_def_t *)templ,
760 		    &spp->sp_vnops) != 0) {
761 			cmn_err(CE_WARN, "%s: malformed vnode ops\n",
762 			    vtp->vt_name);
763 			sdev_free_vtab(templ);
764 			kmem_cache_free(sdev_plugin_cache, spp);
765 			return (1);
766 		}
767 
768 		if (vtp->vt_global_vops) {
769 			*(vtp->vt_global_vops) = spp->sp_vnops;
770 		}
771 
772 		sdev_free_vtab(templ);
773 	} else {
774 		spp->sp_vnops = sdev_vnodeops;
775 	}
776 
777 	/*
778 	 * No need to check for EEXIST here. These are loaded as a part of the
779 	 * sdev's initialization function. Further, we don't have to create them
780 	 * as that's taken care of in sdev's mount for the GZ.
781 	 */
782 	mutex_enter(&sdev_plugin_lock);
783 	list_insert_tail(&sdev_plugin_list, spp);
784 	mutex_exit(&sdev_plugin_lock);
785 
786 	return (0);
787 }
788 
789 /*
790  * We need to match off of the sdev_path, not the sdev_name. We are only allowed
791  * to exist directly under /dev.
792  */
793 static sdev_plugin_t *
794 sdev_match(sdev_node_t *dv)
795 {
796 	int vlen;
797 	const char *path;
798 	sdev_plugin_t *spp;
799 
800 	if (strlen(dv->sdev_path) <= 5)
801 		return (NULL);
802 
803 	if (strncmp(dv->sdev_path, "/dev/", 5) != 0)
804 		return (NULL);
805 	path = dv->sdev_path + 5;
806 
807 	mutex_enter(&sdev_plugin_lock);
808 
809 	for (spp = list_head(&sdev_plugin_list); spp != NULL;
810 	    spp = list_next(&sdev_plugin_list, spp)) {
811 		if (strcmp(spp->sp_name, path) == 0) {
812 			mutex_exit(&sdev_plugin_lock);
813 			return (spp);
814 		}
815 
816 		if (spp->sp_nflags & SDEV_SUBDIR) {
817 			vlen = strlen(spp->sp_name);
818 			if ((strncmp(spp->sp_name, path,
819 			    vlen - 1) == 0) && path[vlen] == '/') {
820 				mutex_exit(&sdev_plugin_lock);
821 				return (spp);
822 			}
823 
824 		}
825 	}
826 
827 	mutex_exit(&sdev_plugin_lock);
828 	return (NULL);
829 }
830 
831 void
832 sdev_set_no_negcache(sdev_node_t *dv)
833 {
834 	char *path;
835 	sdev_plugin_t *spp;
836 
837 	ASSERT(dv->sdev_path);
838 	path = dv->sdev_path + strlen("/dev/");
839 
840 	mutex_enter(&sdev_plugin_lock);
841 	for (spp = list_head(&sdev_plugin_list); spp != NULL;
842 	    spp = list_next(&sdev_plugin_list, spp)) {
843 		if (strcmp(spp->sp_name, path) == 0) {
844 			if (spp->sp_nflags & SDEV_NO_NCACHE)
845 				dv->sdev_flags |= SDEV_NO_NCACHE;
846 			break;
847 		}
848 	}
849 	mutex_exit(&sdev_plugin_lock);
850 }
851 
852 struct vnodeops *
853 sdev_get_vop(sdev_node_t *dv)
854 {
855 	char *path;
856 	sdev_plugin_t *spp;
857 
858 	path = dv->sdev_path;
859 	ASSERT(path);
860 
861 	/* gets the relative path to /dev/ */
862 	path += 5;
863 
864 	if ((spp = sdev_match(dv)) != NULL) {
865 		dv->sdev_flags |= spp->sp_nflags;
866 		if (SDEV_IS_PERSIST(dv->sdev_dotdot) &&
867 		    (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv)))
868 			dv->sdev_flags |= SDEV_PERSIST;
869 		return (spp->sp_vnops);
870 	}
871 
872 	/* child inherits the persistence of the parent */
873 	if (SDEV_IS_PERSIST(dv->sdev_dotdot))
874 		dv->sdev_flags |= SDEV_PERSIST;
875 	return (sdev_vnodeops);
876 }
877 
878 void *
879 sdev_get_vtor(sdev_node_t *dv)
880 {
881 	sdev_plugin_t *spp;
882 
883 	if (dv->sdev_private == NULL) {
884 		spp = sdev_match(dv);
885 		if (spp == NULL)
886 			return (NULL);
887 	} else {
888 		spp = dv->sdev_private;
889 	}
890 
891 	if (spp->sp_islegacy)
892 		return ((void *)spp->sp_lvtor);
893 	else
894 		return ((void *)sdev_plugin_validate);
895 }
896 
897 void
898 sdev_plugin_nodeready(sdev_node_t *sdp)
899 {
900 	sdev_plugin_t *spp;
901 
902 	ASSERT(RW_WRITE_HELD(&sdp->sdev_contents));
903 	ASSERT(sdp->sdev_private == NULL);
904 
905 	spp = sdev_match(sdp);
906 	if (spp == NULL)
907 		return;
908 	if (spp->sp_islegacy)
909 		return;
910 	sdp->sdev_private = spp;
911 	mutex_enter(&spp->sp_lock);
912 	spp->sp_nnodes++;
913 	mutex_exit(&spp->sp_lock);
914 }
915 
916 int
917 sdev_plugin_init(void)
918 {
919 	sdev_vop_table_t *vtp;
920 	fs_operation_def_t *templ;
921 
922 	sdev_plugin_cache = kmem_cache_create("sdev_plugin",
923 	    sizeof (sdev_plugin_t), 0, sdev_plugin_cache_constructor,
924 	    sdev_plugin_cache_destructor, NULL, NULL, NULL, 0);
925 	if (sdev_plugin_cache == NULL)
926 		return (1);
927 	mutex_init(&sdev_plugin_lock, NULL, MUTEX_DRIVER, NULL);
928 	list_create(&sdev_plugin_list, sizeof (sdev_plugin_t),
929 	    offsetof(sdev_plugin_t, sp_link));
930 
931 	/*
932 	 * Register all of the legacy vnops
933 	 */
934 	for (vtp = &vtab[0]; vtp->vt_name != NULL; vtp++)
935 		if (sdev_plugin_register_legacy(vtp) != 0)
936 			return (1);
937 
938 	templ = sdev_merge_vtab(sdev_plugin_vnodeops_tbl);
939 	if (vn_make_ops("sdev_plugin",
940 	    (const fs_operation_def_t *)templ,
941 	    &sdev_plugin_vnops) != 0) {
942 		sdev_free_vtab(templ);
943 		return (1);
944 	}
945 
946 	sdev_free_vtab(templ);
947 	return (0);
948 }
949