xref: /illumos-gate/usr/src/uts/common/fs/dev/sdev_zvolops.c (revision 8c69cc8fbe729fa7b091e901c4b50508ccc6bb33)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2013, 2016 Joyent, Inc.  All rights reserved.
25  * Copyright (c) 2014 by Delphix. All rights reserved.
26  */
27 
28 /* vnode ops for the /dev/zvol directory */
29 
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/ddi.h>
34 #include <sys/sunndi.h>
35 #include <sys/sunldi.h>
36 #include <fs/fs_subr.h>
37 #include <sys/fs/dv_node.h>
38 #include <sys/fs/sdev_impl.h>
39 #include <sys/zfs_ioctl.h>
40 #include <sys/policy.h>
41 #include <sys/stat.h>
42 #include <sys/vfs_opreg.h>
43 
/* vnode operations for the /dev/zvol directory */
struct vnodeops	*devzvol_vnodeops;
/* major number of the /dev/zfs device, recorded by devzvol_open_zfs() */
static major_t devzvol_major;
/* taskq entry handed to taskq_dispatch_ent() for the zclist refresh */
static taskq_ent_t devzvol_zclist_task;

static kmutex_t devzvol_mtx;
/* Below are protected by devzvol_mtx */
/* B_TRUE once devzvol_open_zfs() has succeeded and until it is closed */
static boolean_t devzvol_isopen;
/* B_TRUE while a zclist refresh has been dispatched and not yet finished */
static boolean_t devzvol_zclist_task_running = B_FALSE;
/* cookie passed to, and updated from, ZFS_IOC_POOL_CONFIGS */
static uint64_t devzvol_gen = 0;
/* address of the packed pool-config nvlist buffer, stored as an integer */
static uint64_t devzvol_zclist;
/* allocated size of the devzvol_zclist buffer (not the nvlist length) */
static size_t devzvol_zclist_size;
/* LDI identity and handle used for the open /dev/zfs device */
static ldi_ident_t devzvol_li;
static ldi_handle_t devzvol_lh;
57 
/*
 * we need to use ddi_mod* since fs/dev gets loaded early on in
 * startup(), and linking fs/dev to fs/zfs would drag in a lot of
 * other stuff (like drv/random) before the rest of the system is
 * ready to go
 */
/* handle for the fs/zfs module, opened by devzvol_open_zfs() */
ddi_modhandle_t zfs_mod;
/* resolved "zvol_create_minor" entry point, NULL while unresolved */
int (*szcm)(char *);
/* resolved "zvol_name2minor" entry point, NULL while unresolved */
int (*szn2m)(char *, minor_t *);


/*
 * Controls whether snapshot names are accepted under /dev/zvol. When
 * B_FALSE, devzvol_objset_check() rejects any name containing '@' with
 * ENOTSUP and sdev_iter_datasets() does not iterate snapshots. By default,
 * they are enabled, preserving the historic behavior.
 */
boolean_t devzvol_snaps_allowed = B_TRUE;
74 
75 int
76 sdev_zvol_create_minor(char *dsname)
77 {
78 	if (szcm == NULL)
79 		return (-1);
80 	return ((*szcm)(dsname));
81 }
82 
83 int
84 sdev_zvol_name2minor(char *dsname, minor_t *minor)
85 {
86 	if (szn2m == NULL)
87 		return (-1);
88 	return ((*szn2m)(dsname, minor));
89 }
90 
91 int
92 devzvol_open_zfs()
93 {
94 	int rc;
95 	dev_t dv;
96 
97 	devzvol_li = ldi_ident_from_anon();
98 	if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred,
99 	    &devzvol_lh, devzvol_li))
100 		return (-1);
101 	if (zfs_mod == NULL && ((zfs_mod = ddi_modopen("fs/zfs",
102 	    KRTLD_MODE_FIRST, &rc)) == NULL)) {
103 		return (rc);
104 	}
105 	ASSERT(szcm == NULL && szn2m == NULL);
106 	if ((szcm = (int (*)(char *))
107 	    ddi_modsym(zfs_mod, "zvol_create_minor", &rc)) == NULL) {
108 		cmn_err(CE_WARN, "couldn't resolve zvol_create_minor");
109 		return (rc);
110 	}
111 	if ((szn2m = (int(*)(char *, minor_t *))
112 	    ddi_modsym(zfs_mod, "zvol_name2minor", &rc)) == NULL) {
113 		cmn_err(CE_WARN, "couldn't resolve zvol_name2minor");
114 		return (rc);
115 	}
116 	if (ldi_get_dev(devzvol_lh, &dv))
117 		return (-1);
118 	devzvol_major = getmajor(dv);
119 	return (0);
120 }
121 
122 void
123 devzvol_close_zfs()
124 {
125 	szcm = NULL;
126 	szn2m = NULL;
127 	(void) ldi_close(devzvol_lh, FREAD|FWRITE, kcred);
128 	ldi_ident_release(devzvol_li);
129 	if (zfs_mod != NULL) {
130 		(void) ddi_modclose(zfs_mod);
131 		zfs_mod = NULL;
132 	}
133 }
134 
135 int
136 devzvol_handle_ioctl(int cmd, zfs_cmd_t *zc, size_t *alloc_size)
137 {
138 	uint64_t cookie;
139 	int size = 8000;
140 	int unused;
141 	int rc;
142 
143 	if (cmd != ZFS_IOC_POOL_CONFIGS)
144 		mutex_enter(&devzvol_mtx);
145 	if (!devzvol_isopen) {
146 		if ((rc = devzvol_open_zfs()) == 0) {
147 			devzvol_isopen = B_TRUE;
148 		} else {
149 			if (cmd != ZFS_IOC_POOL_CONFIGS)
150 				mutex_exit(&devzvol_mtx);
151 			return (ENXIO);
152 		}
153 	}
154 	cookie = zc->zc_cookie;
155 again:
156 	zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(size,
157 	    KM_SLEEP);
158 	zc->zc_nvlist_dst_size = size;
159 	rc = ldi_ioctl(devzvol_lh, cmd, (intptr_t)zc, FKIOCTL, kcred,
160 	    &unused);
161 	if (rc == ENOMEM) {
162 		int newsize;
163 		newsize = zc->zc_nvlist_dst_size;
164 		ASSERT(newsize > size);
165 		kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
166 		size = newsize;
167 		zc->zc_cookie = cookie;
168 		goto again;
169 	}
170 	if (alloc_size == NULL)
171 		kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
172 	else
173 		*alloc_size = size;
174 	if (cmd != ZFS_IOC_POOL_CONFIGS)
175 		mutex_exit(&devzvol_mtx);
176 	return (rc);
177 }
178 
179 /* figures out if the objset exists and returns its type */
180 int
181 devzvol_objset_check(char *dsname, dmu_objset_type_t *type)
182 {
183 	boolean_t	ispool, is_snapshot;
184 	zfs_cmd_t	*zc;
185 	int rc;
186 	nvlist_t 	*nvl;
187 	size_t nvsz;
188 
189 	ispool = (strchr(dsname, '/') == NULL);
190 	is_snapshot = (strchr(dsname, '@') != NULL);
191 
192 	if (is_snapshot && !devzvol_snaps_allowed)
193 		return (ENOTSUP);
194 
195 	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
196 	(void) strlcpy(zc->zc_name, dsname, MAXPATHLEN);
197 
198 	nvl = fnvlist_alloc();
199 	fnvlist_add_boolean_value(nvl, "cachedpropsonly", B_TRUE);
200 	zc->zc_nvlist_src = (uintptr_t)fnvlist_pack(nvl, &nvsz);
201 	zc->zc_nvlist_src_size = nvsz;
202 	fnvlist_free(nvl);
203 
204 	rc = devzvol_handle_ioctl(ispool ? ZFS_IOC_POOL_STATS :
205 	    ZFS_IOC_OBJSET_STATS, zc, NULL);
206 	if (type && rc == 0)
207 		*type = (ispool) ? DMU_OST_ZFS :
208 		    zc->zc_objset_stats.dds_type;
209 	fnvlist_pack_free((char *)(uintptr_t)zc->zc_nvlist_src, nvsz);
210 	kmem_free(zc, sizeof (zfs_cmd_t));
211 	return (rc);
212 }
213 
214 /*
215  * Returns what the zfs dataset name should be, given the /dev/zvol
216  * path and an optional name (can be NULL).
217  *
218  * Note that if the name param is NULL, then path must be an
219  * actual dataset's directory and not one of the top-level
220  * /dev/zvol/{dsk,rdsk} dirs, as these do not correspond to a
221  * specific dataset.
222  */
223 char *
224 devzvol_make_dsname(const char *path, const char *name)
225 {
226 	char *dsname;
227 	const char *ptr;
228 	int dslen;
229 
230 	if (strcmp(path, ZVOL_DIR) == 0)
231 		return (NULL);
232 	if (name && (strcmp(name, ".") == 0 || strcmp(name, "..") == 0))
233 		return (NULL);
234 	ptr = path + strlen(ZVOL_DIR);
235 	if (strncmp(ptr, "/dsk", 4) == 0)
236 		ptr += strlen("/dsk");
237 	else if (strncmp(ptr, "/rdsk", 5) == 0)
238 		ptr += strlen("/rdsk");
239 	else
240 		return (NULL);
241 
242 	if (*ptr == '/')
243 		ptr++;
244 	else if (name == NULL)
245 		return (NULL);
246 
247 	dslen = strlen(ptr);
248 	if (dslen)
249 		dslen++;			/* plus null */
250 	if (name)
251 		dslen += strlen(name) + 1;	/* plus slash */
252 	dsname = kmem_zalloc(dslen, KM_SLEEP);
253 	if (*ptr) {
254 		(void) strlcpy(dsname, ptr, dslen);
255 		if (name)
256 			(void) strlcat(dsname, "/", dslen);
257 	}
258 	if (name)
259 		(void) strlcat(dsname, name, dslen);
260 	return (dsname);
261 }
262 
263 /*
264  * check if the zvol's sdev_node is still valid, which means make
265  * sure the zvol is still valid.  zvol minors aren't proactively
266  * destroyed when the zvol is destroyed, so we use a validator to clean
267  * these up (in other words, when such nodes are encountered during
268  * subsequent lookup() and readdir() operations) so that only valid
269  * nodes are returned.  The ordering between devname_lookup_func and
270  * devzvol_validate is a little inefficient in the case of invalid
271  * or stale nodes because devname_lookup_func calls
272  * devzvol_create_{dir, link}, then the validator says it's invalid,
273  * and then the node gets cleaned up.
274  */
275 int
276 devzvol_validate(struct sdev_node *dv)
277 {
278 	vnode_t *vn = SDEVTOV(dv);
279 	dmu_objset_type_t do_type;
280 	char *dsname;
281 	char *nm = dv->sdev_name;
282 	int rc;
283 
284 	sdcmn_err13(("validating ('%s' '%s')", dv->sdev_path, nm));
285 	/*
286 	 * validate only READY nodes; if someone is sitting on the
287 	 * directory of a dataset that just got destroyed we could
288 	 * get a zombie node which we just skip.
289 	 */
290 	if (dv->sdev_state != SDEV_READY) {
291 		sdcmn_err13(("skipping '%s'", nm));
292 		return (SDEV_VTOR_SKIP);
293 	}
294 
295 	if ((strcmp(dv->sdev_path, ZVOL_DIR "/dsk") == 0) ||
296 	    (strcmp(dv->sdev_path, ZVOL_DIR "/rdsk") == 0))
297 		return (SDEV_VTOR_VALID);
298 	dsname = devzvol_make_dsname(dv->sdev_path, NULL);
299 	if (dsname == NULL)
300 		return (SDEV_VTOR_INVALID);
301 
302 	/*
303 	 * Leave any nodes alone that have been explicitly created by
304 	 * sdev profiles.
305 	 */
306 	if (!(dv->sdev_flags & SDEV_GLOBAL) && dv->sdev_origin != NULL) {
307 		kmem_free(dsname, strlen(dsname) + 1);
308 		return (SDEV_VTOR_VALID);
309 	}
310 
311 	rc = devzvol_objset_check(dsname, &do_type);
312 	sdcmn_err13(("  '%s' rc %d", dsname, rc));
313 	if (rc != 0) {
314 		sdev_node_t *parent = dv->sdev_dotdot;
315 		/*
316 		 * Explicitly passed-through zvols in our sdev profile can't
317 		 * be created as prof_* shadow nodes, because in the GZ they
318 		 * are symlinks, but in the NGZ they are actual device files.
319 		 *
320 		 * The objset_check will fail on these as they are outside
321 		 * any delegated dataset (zfs will not allow ioctl access to
322 		 * them from this zone). We still want them to work, though.
323 		 */
324 		if (!(parent->sdev_flags & SDEV_GLOBAL) &&
325 		    parent->sdev_origin != NULL &&
326 		    !(dv->sdev_flags & SDEV_GLOBAL) &&
327 		    (vn->v_type == VBLK || vn->v_type == VCHR) &&
328 		    prof_name_matched(nm, parent)) {
329 			do_type = DMU_OST_ZVOL;
330 		} else {
331 			kmem_free(dsname, strlen(dsname) + 1);
332 			return (SDEV_VTOR_INVALID);
333 		}
334 	}
335 
336 	sdcmn_err13(("  v_type %d do_type %d",
337 	    vn->v_type, do_type));
338 	if ((vn->v_type == VLNK && do_type != DMU_OST_ZVOL) ||
339 	    ((vn->v_type == VBLK || vn->v_type == VCHR) &&
340 	    do_type != DMU_OST_ZVOL) ||
341 	    (vn->v_type == VDIR && do_type == DMU_OST_ZVOL)) {
342 		kmem_free(dsname, strlen(dsname) + 1);
343 		return (SDEV_VTOR_STALE);
344 	}
345 	if (vn->v_type == VLNK) {
346 		char *ptr, *link;
347 		long val = 0;
348 		minor_t lminor, ominor;
349 
350 		rc = sdev_getlink(vn, &link);
351 		ASSERT(rc == 0);
352 
353 		ptr = strrchr(link, ':') + 1;
354 		rc = ddi_strtol(ptr, NULL, 10, &val);
355 		kmem_free(link, strlen(link) + 1);
356 		ASSERT(rc == 0 && val != 0);
357 		lminor = (minor_t)val;
358 		if (sdev_zvol_name2minor(dsname, &ominor) < 0 ||
359 		    ominor != lminor) {
360 			kmem_free(dsname, strlen(dsname) + 1);
361 			return (SDEV_VTOR_STALE);
362 		}
363 	}
364 	kmem_free(dsname, strlen(dsname) + 1);
365 	return (SDEV_VTOR_VALID);
366 }
367 
368 /*
369  * Taskq callback to update the devzvol_zclist.
370  *
371  * We need to defer this to the taskq to avoid it running with a user
372  * context that might be associated with some non-global zone, and thus
373  * not being able to list all of the pools on the entire system.
374  */
375 /*ARGSUSED*/
376 static void
377 devzvol_update_zclist_cb(void *arg)
378 {
379 	zfs_cmd_t	*zc;
380 	int		rc;
381 	size_t		size;
382 
383 	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
384 	mutex_enter(&devzvol_mtx);
385 	zc->zc_cookie = devzvol_gen;
386 
387 	rc = devzvol_handle_ioctl(ZFS_IOC_POOL_CONFIGS, zc, &size);
388 	switch (rc) {
389 		case 0:
390 			/* new generation */
391 			ASSERT(devzvol_gen != zc->zc_cookie);
392 			devzvol_gen = zc->zc_cookie;
393 			if (devzvol_zclist)
394 				kmem_free((void *)(uintptr_t)devzvol_zclist,
395 				    devzvol_zclist_size);
396 			devzvol_zclist = zc->zc_nvlist_dst;
397 			/* Keep the alloc'd size, not the nvlist size. */
398 			devzvol_zclist_size = size;
399 			break;
400 		default:
401 			/*
402 			 * Either there was no change in pool configuration
403 			 * since we last asked (rc == EEXIST) or we got a
404 			 * catastrophic error.
405 			 *
406 			 * Give up memory and exit.
407 			 */
408 			kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst,
409 			    size);
410 			break;
411 	}
412 
413 	VERIFY(devzvol_zclist_task_running == B_TRUE);
414 	devzvol_zclist_task_running = B_FALSE;
415 	mutex_exit(&devzvol_mtx);
416 
417 	kmem_free(zc, sizeof (zfs_cmd_t));
418 }
419 
420 static void
421 devzvol_update_zclist(void)
422 {
423 	mutex_enter(&devzvol_mtx);
424 	if (devzvol_zclist_task_running == B_TRUE) {
425 		mutex_exit(&devzvol_mtx);
426 		goto wait;
427 	}
428 
429 	devzvol_zclist_task_running = B_TRUE;
430 
431 	taskq_dispatch_ent(sdev_taskq, devzvol_update_zclist_cb, NULL, 0,
432 	    &devzvol_zclist_task);
433 
434 	mutex_exit(&devzvol_mtx);
435 
436 wait:
437 	taskq_wait(sdev_taskq);
438 }
439 
440 /*
441  * Creates sub-directories for each zpool as needed in response to a
442  * readdir on one of the /dev/zvol/{dsk,rdsk} directories.
443  */
444 void
445 devzvol_create_pool_dirs(struct vnode *dvp)
446 {
447 	nvlist_t *nv = NULL;
448 	nvpair_t *elem = NULL;
449 	int pools = 0;
450 	int rc;
451 
452 	sdcmn_err13(("devzvol_create_pool_dirs"));
453 
454 	devzvol_update_zclist();
455 
456 	mutex_enter(&devzvol_mtx);
457 
458 	rc = nvlist_unpack((char *)(uintptr_t)devzvol_zclist,
459 	    devzvol_zclist_size, &nv, 0);
460 	if (rc) {
461 		ASSERT(rc == 0);
462 		kmem_free((void *)(uintptr_t)devzvol_zclist,
463 		    devzvol_zclist_size);
464 		devzvol_gen = 0;
465 		devzvol_zclist = NULL;
466 		devzvol_zclist_size = 0;
467 		goto out;
468 	}
469 	mutex_exit(&devzvol_mtx);
470 	while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
471 		struct vnode *vp;
472 		ASSERT(dvp->v_count > 0);
473 		rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
474 		    NULL, kcred, NULL, 0, NULL);
475 		/* should either work, or not be visible from a zone */
476 		ASSERT(rc == 0 || rc == ENOENT);
477 		if (rc == 0)
478 			VN_RELE(vp);
479 		pools++;
480 	}
481 	nvlist_free(nv);
482 	mutex_enter(&devzvol_mtx);
483 	if (devzvol_isopen && pools == 0) {
484 		/* clean up so zfs can be unloaded */
485 		devzvol_close_zfs();
486 		devzvol_isopen = B_FALSE;
487 	}
488 out:
489 	mutex_exit(&devzvol_mtx);
490 }
491 
492 /*ARGSUSED3*/
493 static int
494 devzvol_create_dir(struct sdev_node *ddv, char *nm, void **arg,
495     cred_t *cred, void *whatever, char *whichever)
496 {
497 	timestruc_t now;
498 	struct vattr *vap = (struct vattr *)arg;
499 
500 	sdcmn_err13(("create_dir (%s) (%s) '%s'", ddv->sdev_name,
501 	    ddv->sdev_path, nm));
502 	ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR,
503 	    strlen(ZVOL_DIR)) == 0);
504 	*vap = *sdev_getdefault_attr(VDIR);
505 	gethrestime(&now);
506 	vap->va_atime = now;
507 	vap->va_mtime = now;
508 	vap->va_ctime = now;
509 	return (0);
510 }
511 
512 /*ARGSUSED3*/
513 static int
514 devzvol_create_link(struct sdev_node *ddv, char *nm,
515     void **arg, cred_t *cred, void *whatever, char *whichever)
516 {
517 	minor_t minor;
518 	char *pathname = (char *)*arg;
519 	int rc;
520 	char *dsname;
521 	char *x;
522 	char str[MAXNAMELEN];
523 	sdcmn_err13(("create_link (%s) (%s) '%s'", ddv->sdev_name,
524 	    ddv->sdev_path, nm));
525 	dsname = devzvol_make_dsname(ddv->sdev_path, nm);
526 	rc = sdev_zvol_create_minor(dsname);
527 	if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
528 	    sdev_zvol_name2minor(dsname, &minor)) {
529 		sdcmn_err13(("devzvol_create_link %d", rc));
530 		kmem_free(dsname, strlen(dsname) + 1);
531 		return (-1);
532 	}
533 	kmem_free(dsname, strlen(dsname) + 1);
534 
535 	/*
536 	 * This is a valid zvol; create a symlink that points to the
537 	 * minor which was created under /devices/pseudo/zfs@0
538 	 */
539 	*pathname = '\0';
540 	for (x = ddv->sdev_path; x = strchr(x, '/'); x++)
541 		(void) strcat(pathname, "../");
542 	(void) snprintf(str, sizeof (str), ZVOL_PSEUDO_DEV "%u", minor);
543 	(void) strncat(pathname, str, MAXPATHLEN);
544 	if (strncmp(ddv->sdev_path, ZVOL_FULL_RDEV_DIR,
545 	    strlen(ZVOL_FULL_RDEV_DIR)) == 0)
546 		(void) strcat(pathname, ",raw");
547 	return (0);
548 }
549 
550 /* Clean zvol sdev_nodes that are no longer valid.  */
551 static void
552 devzvol_prunedir(struct sdev_node *ddv)
553 {
554 	struct sdev_node *dv;
555 
556 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
557 
558 	sdcmn_err13(("prunedir '%s'", ddv->sdev_name));
559 	ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
560 	if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
561 		rw_exit(&ddv->sdev_contents);
562 		rw_enter(&ddv->sdev_contents, RW_WRITER);
563 	}
564 
565 	dv = SDEV_FIRST_ENTRY(ddv);
566 	while (dv) {
567 		sdcmn_err13(("sdev_name '%s'", dv->sdev_name));
568 
569 		switch (devzvol_validate(dv)) {
570 		case SDEV_VTOR_VALID:
571 		case SDEV_VTOR_SKIP:
572 			dv = SDEV_NEXT_ENTRY(ddv, dv);
573 			continue;
574 		case SDEV_VTOR_INVALID:
575 			sdcmn_err7(("prunedir: destroy invalid "
576 			    "node: %s\n", dv->sdev_name));
577 			break;
578 		}
579 
580 		if ((SDEVTOV(dv)->v_type == VDIR) &&
581 		    (sdev_cleandir(dv, NULL, 0) != 0)) {
582 			dv = SDEV_NEXT_ENTRY(ddv, dv);
583 			continue;
584 		}
585 		SDEV_HOLD(dv);
586 		/* remove the cache node */
587 		sdev_cache_update(ddv, &dv, dv->sdev_name,
588 		    SDEV_CACHE_DELETE);
589 		SDEV_RELE(dv);
590 		dv = SDEV_FIRST_ENTRY(ddv);
591 	}
592 	rw_downgrade(&ddv->sdev_contents);
593 }
594 
595 /*
596  * This function is used to create a dir or dev inside a zone's /dev when the
597  * zone has a zvol that is dynamically created within the zone (i.e. inside
598  * of a delegated dataset.  Since there is no /devices tree within a zone,
599  * we create the chr/blk devices directly inside the zone's /dev instead of
600  * making symlinks.
601  */
602 static int
603 devzvol_mk_ngz_node(struct sdev_node *parent, char *nm)
604 {
605 	struct vattr vattr;
606 	timestruc_t now;
607 	enum vtype expected_type = VDIR;
608 	dmu_objset_type_t do_type;
609 	struct sdev_node *dv = NULL;
610 	int res;
611 	char *dsname;
612 
613 	bzero(&vattr, sizeof (vattr));
614 	gethrestime(&now);
615 	vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
616 	vattr.va_uid = SDEV_UID_DEFAULT;
617 	vattr.va_gid = SDEV_GID_DEFAULT;
618 	vattr.va_type = VNON;
619 	vattr.va_atime = now;
620 	vattr.va_mtime = now;
621 	vattr.va_ctime = now;
622 
623 	if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL)
624 		return (ENOENT);
625 
626 	if (devzvol_objset_check(dsname, &do_type) != 0) {
627 		/*
628 		 * objset_check will succeed on any valid objset in the global
629 		 * zone, and any valid delegated dataset. It will fail, however,
630 		 * in non-global zones on explicitly whitelisted zvol devices
631 		 * that are outside any delegated dataset.
632 		 *
633 		 * The directories leading up to the zvol device itself will be
634 		 * created by prof for us in advance (and will always validate
635 		 * because of the matching check in devzvol_validate). The zvol
636 		 * device itself can't be created by prof though because in the
637 		 * GZ it's a symlink, and in the NGZ it is not. So, we create
638 		 * such zvol device files here.
639 		 */
640 		if (!(parent->sdev_flags & SDEV_GLOBAL) &&
641 		    parent->sdev_origin != NULL &&
642 		    prof_name_matched(nm, parent)) {
643 			do_type = DMU_OST_ZVOL;
644 		} else {
645 			kmem_free(dsname, strlen(dsname) + 1);
646 			return (ENOENT);
647 		}
648 	}
649 
650 	if (do_type == DMU_OST_ZVOL)
651 		expected_type = VBLK;
652 
653 	if (expected_type == VDIR) {
654 		vattr.va_type = VDIR;
655 		vattr.va_mode = SDEV_DIRMODE_DEFAULT;
656 	} else {
657 		minor_t minor;
658 		dev_t devnum;
659 		int rc;
660 
661 		rc = sdev_zvol_create_minor(dsname);
662 		if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
663 		    sdev_zvol_name2minor(dsname, &minor)) {
664 			kmem_free(dsname, strlen(dsname) + 1);
665 			return (ENOENT);
666 		}
667 
668 		devnum = makedevice(devzvol_major, minor);
669 		vattr.va_rdev = devnum;
670 
671 		if (strstr(parent->sdev_path, "/rdsk/") != NULL)
672 			vattr.va_type = VCHR;
673 		else
674 			vattr.va_type = VBLK;
675 		vattr.va_mode = SDEV_DEVMODE_DEFAULT;
676 	}
677 	kmem_free(dsname, strlen(dsname) + 1);
678 
679 	rw_enter(&parent->sdev_contents, RW_WRITER);
680 
681 	res = sdev_mknode(parent, nm, &dv, &vattr,
682 	    NULL, NULL, kcred, SDEV_READY);
683 	rw_exit(&parent->sdev_contents);
684 	if (res != 0)
685 		return (ENOENT);
686 
687 	SDEV_RELE(dv);
688 	return (0);
689 }
690 
691 /*ARGSUSED*/
692 static int
693 devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
694     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
695     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
696 {
697 	enum vtype expected_type = VDIR;
698 	struct sdev_node *parent = VTOSDEV(dvp);
699 	char *dsname;
700 	dmu_objset_type_t do_type;
701 	int error;
702 
703 	sdcmn_err13(("devzvol_lookup '%s' '%s'", parent->sdev_path, nm));
704 	*vpp = NULL;
705 	/* execute access is required to search the directory */
706 	if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
707 		return (error);
708 
709 	rw_enter(&parent->sdev_contents, RW_READER);
710 	if (!SDEV_IS_GLOBAL(parent)) {
711 		int res;
712 
713 		rw_exit(&parent->sdev_contents);
714 
715 		/*
716 		 * If we're in the global zone and reach down into a non-global
717 		 * zone's /dev/zvol then this action could trigger the creation
718 		 * of all of the zvol devices for every zone into the non-global
719 		 * zone's /dev tree. This could be a big security hole. To
720 		 * prevent this, disallow the global zone from looking inside
721 		 * a non-global zones /dev/zvol. This behavior is similar to
722 		 * delegated datasets, which cannot be used by the global zone.
723 		 */
724 		if (getzoneid() == GLOBAL_ZONEID)
725 			return (EPERM);
726 
727 		res = prof_lookup(dvp, nm, vpp, cred);
728 
729 		/*
730 		 * We won't find a zvol that was dynamically created inside
731 		 * a NGZ, within a delegated dataset, in the zone's dev profile
732 		 * but prof_lookup will also find it via sdev_cache_lookup.
733 		 */
734 		if (res == ENOENT) {
735 			/*
736 			 * We have to create the sdev node for the dymamically
737 			 * created zvol.
738 			 */
739 			if (devzvol_mk_ngz_node(parent, nm) != 0)
740 				return (ENOENT);
741 			res = prof_lookup(dvp, nm, vpp, cred);
742 		}
743 
744 		return (res);
745 	}
746 
747 	/*
748 	 * Don't let the global-zone style lookup succeed here when we're not
749 	 * running in the global zone. This can happen because prof calls into
750 	 * us (in prof_filldir) trying to create an explicitly passed-through
751 	 * zvol device outside any delegated dataset.
752 	 *
753 	 * We have to stop this here or else we will create prof shadows of
754 	 * the global zone symlink, which will make no sense at all in the
755 	 * non-global zone (it has no /devices for the symlink to point at).
756 	 *
757 	 * These zvols will be created later (at access time) by mk_ngz_node
758 	 * instead. The dirs leading up to them will be created by prof
759 	 * internally.
760 	 *
761 	 * We have to return EPERM here, because ENOENT is given special
762 	 * meaning by prof in this context.
763 	 */
764 	if (getzoneid() != GLOBAL_ZONEID) {
765 		rw_exit(&parent->sdev_contents);
766 		return (EPERM);
767 	}
768 
769 	dsname = devzvol_make_dsname(parent->sdev_path, nm);
770 	rw_exit(&parent->sdev_contents);
771 	sdcmn_err13(("rvp dsname %s", dsname ? dsname : "(null)"));
772 	if (dsname) {
773 		error = devzvol_objset_check(dsname, &do_type);
774 		if (error != 0) {
775 			error = ENOENT;
776 			goto out;
777 		}
778 		if (do_type == DMU_OST_ZVOL)
779 			expected_type = VLNK;
780 	}
781 	/*
782 	 * the callbacks expect:
783 	 *
784 	 * parent->sdev_path		   nm
785 	 * /dev/zvol			   {r}dsk
786 	 * /dev/zvol/{r}dsk		   <pool name>
787 	 * /dev/zvol/{r}dsk/<dataset name> <last ds component>
788 	 *
789 	 * sdev_name is always last path component of sdev_path
790 	 */
791 	if (expected_type == VDIR) {
792 		error = devname_lookup_func(parent, nm, vpp, cred,
793 		    devzvol_create_dir, SDEV_VATTR);
794 	} else {
795 		error = devname_lookup_func(parent, nm, vpp, cred,
796 		    devzvol_create_link, SDEV_VLINK);
797 	}
798 	sdcmn_err13(("devzvol_lookup %d %d", expected_type, error));
799 	ASSERT(error || ((*vpp)->v_type == expected_type));
800 out:
801 	if (dsname)
802 		kmem_free(dsname, strlen(dsname) + 1);
803 	sdcmn_err13(("devzvol_lookup %d", error));
804 	return (error);
805 }
806 
807 /*
808  * We allow create to find existing nodes
809  *	- if the node doesn't exist - EROFS
810  *	- creating an existing dir read-only succeeds, otherwise EISDIR
811  *	- exclusive creates fail - EEXIST
812  */
813 /*ARGSUSED2*/
814 static int
815 devzvol_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
816     int mode, struct vnode **vpp, struct cred *cred, int flag,
817     caller_context_t *ct, vsecattr_t *vsecp)
818 {
819 	int error;
820 	struct vnode *vp;
821 
822 	*vpp = NULL;
823 
824 	error = devzvol_lookup(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL,
825 	    NULL);
826 	if (error == 0) {
827 		if (excl == EXCL)
828 			error = EEXIST;
829 		else if (vp->v_type == VDIR && (mode & VWRITE))
830 			error = EISDIR;
831 		else
832 			error = VOP_ACCESS(vp, mode, 0, cred, ct);
833 
834 		if (error) {
835 			VN_RELE(vp);
836 		} else
837 			*vpp = vp;
838 	} else if (error == ENOENT) {
839 		error = EROFS;
840 	}
841 
842 	return (error);
843 }
844 
845 void sdev_iter_snapshots(struct vnode *dvp, char *name);
846 
847 void
848 sdev_iter_datasets(struct vnode *dvp, int arg, char *name)
849 {
850 	zfs_cmd_t	*zc;
851 	int rc;
852 
853 	sdcmn_err13(("iter name is '%s' (arg %x)", name, arg));
854 	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
855 	(void) strcpy(zc->zc_name, name);
856 
857 	while ((rc = devzvol_handle_ioctl(arg, zc, B_FALSE)) == 0) {
858 		struct vnode *vpp;
859 		char *ptr;
860 
861 		sdcmn_err13(("  name %s", zc->zc_name));
862 		if (strchr(zc->zc_name, '$') || strchr(zc->zc_name, '%'))
863 			goto skip;
864 		ptr = strrchr(zc->zc_name, '/') + 1;
865 		rc = devzvol_lookup(dvp, ptr, &vpp, NULL, 0, NULL,
866 		    kcred, NULL, NULL, NULL);
867 		if (rc == 0) {
868 			VN_RELE(vpp);
869 		} else if (rc == ENOENT) {
870 			goto skip;
871 		} else {
872 			/*
873 			 * EBUSY == problem with zvols's dmu holds?
874 			 * EPERM when in a NGZ and traversing up and out.
875 			 */
876 			goto skip;
877 		}
878 		if (arg == ZFS_IOC_DATASET_LIST_NEXT &&
879 		    zc->zc_objset_stats.dds_type == DMU_OST_ZVOL &&
880 		    devzvol_snaps_allowed)
881 			sdev_iter_snapshots(dvp, zc->zc_name);
882 skip:
883 		(void) strcpy(zc->zc_name, name);
884 	}
885 	kmem_free(zc, sizeof (zfs_cmd_t));
886 }
887 
888 void
889 sdev_iter_snapshots(struct vnode *dvp, char *name)
890 {
891 	sdev_iter_datasets(dvp, ZFS_IOC_SNAPSHOT_LIST_NEXT, name);
892 }
893 
894 /*ARGSUSED4*/
895 static int
896 devzvol_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
897     int *eofp, caller_context_t *ct_unused, int flags_unused)
898 {
899 	struct sdev_node *sdvp = VTOSDEV(dvp);
900 	char *ptr;
901 
902 	sdcmn_err13(("zv readdir of '%s' %s'", sdvp->sdev_path,
903 	    sdvp->sdev_name));
904 
905 	if (strcmp(sdvp->sdev_path, ZVOL_DIR) == 0) {
906 		struct vnode *vp;
907 
908 		rw_exit(&sdvp->sdev_contents);
909 		(void) devname_lookup_func(sdvp, "dsk", &vp, cred,
910 		    devzvol_create_dir, SDEV_VATTR);
911 		VN_RELE(vp);
912 		(void) devname_lookup_func(sdvp, "rdsk", &vp, cred,
913 		    devzvol_create_dir, SDEV_VATTR);
914 		VN_RELE(vp);
915 		rw_enter(&sdvp->sdev_contents, RW_READER);
916 		return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
917 	}
918 	if (uiop->uio_offset == 0)
919 		devzvol_prunedir(sdvp);
920 	ptr = sdvp->sdev_path + strlen(ZVOL_DIR);
921 	if ((strcmp(ptr, "/dsk") == 0) || (strcmp(ptr, "/rdsk") == 0)) {
922 		rw_exit(&sdvp->sdev_contents);
923 		devzvol_create_pool_dirs(dvp);
924 		rw_enter(&sdvp->sdev_contents, RW_READER);
925 		return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
926 	}
927 
928 	ptr = strchr(ptr + 1, '/');
929 	if (ptr == NULL)
930 		return (ENOENT);
931 	ptr++;
932 	rw_exit(&sdvp->sdev_contents);
933 	sdev_iter_datasets(dvp, ZFS_IOC_DATASET_LIST_NEXT, ptr);
934 	rw_enter(&sdvp->sdev_contents, RW_READER);
935 	return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
936 }
937 
/*
 * vnode operation table for /dev/zvol.  readdir, lookup and create are
 * implemented in this file; rename, mkdir, rmdir, remove and symlink are
 * all routed to fs_nosys.
 */
const fs_operation_def_t devzvol_vnodeops_tbl[] = {
	VOPNAME_READDIR,	{ .vop_readdir = devzvol_readdir },
	VOPNAME_LOOKUP,		{ .vop_lookup = devzvol_lookup },
	VOPNAME_CREATE,		{ .vop_create = devzvol_create },
	VOPNAME_RENAME,		{ .error = fs_nosys },
	VOPNAME_MKDIR,		{ .error = fs_nosys },
	VOPNAME_RMDIR,		{ .error = fs_nosys },
	VOPNAME_REMOVE,		{ .error = fs_nosys },
	VOPNAME_SYMLINK,	{ .error = fs_nosys },
	/* presumably the end-of-table marker -- confirm against consumer */
	NULL,			NULL
};
949