xref: /titanic_54/usr/src/uts/common/fs/dev/sdev_zvolops.c (revision dd9c3b29f8e9f6b99b80e1fd8fc03241abd67311)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2013 Joyent, Inc.  All rights reserved.
25  */
26 
27 /* vnode ops for the /dev/zvol directory */
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/sysmacros.h>
32 #include <sys/ddi.h>
33 #include <sys/sunndi.h>
34 #include <sys/sunldi.h>
35 #include <fs/fs_subr.h>
36 #include <sys/fs/dv_node.h>
37 #include <sys/fs/sdev_impl.h>
38 #include <sys/zfs_ioctl.h>
39 #include <sys/policy.h>
40 #include <sys/stat.h>
41 #include <sys/vfs_opreg.h>
42 
43 struct vnodeops	*devzvol_vnodeops;
44 static uint64_t devzvol_gen = 0;
45 static uint64_t devzvol_zclist;
46 static size_t devzvol_zclist_size;
47 static ldi_ident_t devzvol_li;
48 static ldi_handle_t devzvol_lh;
49 static kmutex_t devzvol_mtx;
50 static boolean_t devzvol_isopen;
51 static major_t devzvol_major;
52 
53 /*
54  * we need to use ddi_mod* since fs/dev gets loaded early on in
55  * startup(), and linking fs/dev to fs/zfs would drag in a lot of
56  * other stuff (like drv/random) before the rest of the system is
57  * ready to go
58  */
59 ddi_modhandle_t zfs_mod;
60 int (*szcm)(char *);
61 int (*szn2m)(char *, minor_t *);
62 
63 int
64 sdev_zvol_create_minor(char *dsname)
65 {
66 	if (szcm == NULL)
67 		return (-1);
68 	return ((*szcm)(dsname));
69 }
70 
71 int
72 sdev_zvol_name2minor(char *dsname, minor_t *minor)
73 {
74 	if (szn2m == NULL)
75 		return (-1);
76 	return ((*szn2m)(dsname, minor));
77 }
78 
79 int
80 devzvol_open_zfs()
81 {
82 	int rc;
83 	dev_t dv;
84 
85 	devzvol_li = ldi_ident_from_anon();
86 	if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred,
87 	    &devzvol_lh, devzvol_li))
88 		return (-1);
89 	if (zfs_mod == NULL && ((zfs_mod = ddi_modopen("fs/zfs",
90 	    KRTLD_MODE_FIRST, &rc)) == NULL)) {
91 		return (rc);
92 	}
93 	ASSERT(szcm == NULL && szn2m == NULL);
94 	if ((szcm = (int (*)(char *))
95 	    ddi_modsym(zfs_mod, "zvol_create_minor", &rc)) == NULL) {
96 		cmn_err(CE_WARN, "couldn't resolve zvol_create_minor");
97 		return (rc);
98 	}
99 	if ((szn2m = (int(*)(char *, minor_t *))
100 	    ddi_modsym(zfs_mod, "zvol_name2minor", &rc)) == NULL) {
101 		cmn_err(CE_WARN, "couldn't resolve zvol_name2minor");
102 		return (rc);
103 	}
104 	if (ldi_get_dev(devzvol_lh, &dv))
105 		return (-1);
106 	devzvol_major = getmajor(dv);
107 	return (0);
108 }
109 
110 void
111 devzvol_close_zfs()
112 {
113 	szcm = NULL;
114 	szn2m = NULL;
115 	(void) ldi_close(devzvol_lh, FREAD|FWRITE, kcred);
116 	ldi_ident_release(devzvol_li);
117 	if (zfs_mod != NULL) {
118 		(void) ddi_modclose(zfs_mod);
119 		zfs_mod = NULL;
120 	}
121 }
122 
123 int
124 devzvol_handle_ioctl(int cmd, zfs_cmd_t *zc, size_t *alloc_size)
125 {
126 	uint64_t cookie;
127 	int size = 8000;
128 	int unused;
129 	int rc;
130 
131 	if (cmd != ZFS_IOC_POOL_CONFIGS)
132 		mutex_enter(&devzvol_mtx);
133 	if (!devzvol_isopen) {
134 		if ((rc = devzvol_open_zfs()) == 0) {
135 			devzvol_isopen = B_TRUE;
136 		} else {
137 			if (cmd != ZFS_IOC_POOL_CONFIGS)
138 				mutex_exit(&devzvol_mtx);
139 			return (ENXIO);
140 		}
141 	}
142 	cookie = zc->zc_cookie;
143 again:
144 	zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(size,
145 	    KM_SLEEP);
146 	zc->zc_nvlist_dst_size = size;
147 	rc = ldi_ioctl(devzvol_lh, cmd, (intptr_t)zc, FKIOCTL, kcred,
148 	    &unused);
149 	if (rc == ENOMEM) {
150 		int newsize;
151 		newsize = zc->zc_nvlist_dst_size;
152 		ASSERT(newsize > size);
153 		kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
154 		size = newsize;
155 		zc->zc_cookie = cookie;
156 		goto again;
157 	}
158 	if (alloc_size == NULL)
159 		kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
160 	else
161 		*alloc_size = size;
162 	if (cmd != ZFS_IOC_POOL_CONFIGS)
163 		mutex_exit(&devzvol_mtx);
164 	return (rc);
165 }
166 
167 /* figures out if the objset exists and returns its type */
168 int
169 devzvol_objset_check(char *dsname, dmu_objset_type_t *type)
170 {
171 	boolean_t	ispool;
172 	zfs_cmd_t	*zc;
173 	int rc;
174 
175 	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
176 	(void) strlcpy(zc->zc_name, dsname, MAXPATHLEN);
177 
178 	ispool = (strchr(dsname, '/') == NULL) ? B_TRUE : B_FALSE;
179 	if (!ispool && sdev_zvol_name2minor(dsname, NULL) == 0) {
180 		sdcmn_err13(("found cached minor node"));
181 		if (type)
182 			*type = DMU_OST_ZVOL;
183 		kmem_free(zc, sizeof (zfs_cmd_t));
184 		return (0);
185 	}
186 	rc = devzvol_handle_ioctl(ispool ? ZFS_IOC_POOL_STATS :
187 	    ZFS_IOC_OBJSET_STATS, zc, NULL);
188 	if (type && rc == 0)
189 		*type = (ispool) ? DMU_OST_ZFS :
190 		    zc->zc_objset_stats.dds_type;
191 	kmem_free(zc, sizeof (zfs_cmd_t));
192 	return (rc);
193 }
194 
195 /*
196  * returns what the zfs dataset name should be, given the /dev/zvol
197  * path and an optional name; otherwise NULL
198  */
199 char *
200 devzvol_make_dsname(const char *path, const char *name)
201 {
202 	char *dsname;
203 	const char *ptr;
204 	int dslen;
205 
206 	if (strcmp(path, ZVOL_DIR) == 0)
207 		return (NULL);
208 	if (name && (strcmp(name, ".") == 0 || strcmp(name, "..") == 0))
209 		return (NULL);
210 	ptr = path + strlen(ZVOL_DIR);
211 	if (strncmp(ptr, "/dsk", 4) == 0)
212 		ptr += strlen("/dsk");
213 	else if (strncmp(ptr, "/rdsk", 5) == 0)
214 		ptr += strlen("/rdsk");
215 	else
216 		return (NULL);
217 	if (*ptr == '/')
218 		ptr++;
219 
220 	dslen = strlen(ptr);
221 	if (dslen)
222 		dslen++;			/* plus null */
223 	if (name)
224 		dslen += strlen(name) + 1;	/* plus slash */
225 	dsname = kmem_zalloc(dslen, KM_SLEEP);
226 	if (*ptr) {
227 		(void) strlcpy(dsname, ptr, dslen);
228 		if (name)
229 			(void) strlcat(dsname, "/", dslen);
230 	}
231 	if (name)
232 		(void) strlcat(dsname, name, dslen);
233 	return (dsname);
234 }
235 
236 /*
237  * check if the zvol's sdev_node is still valid, which means make
238  * sure the zvol is still valid.  zvol minors aren't proactively
239  * destroyed when the zvol is destroyed, so we use a validator to clean
240  * these up (in other words, when such nodes are encountered during
241  * subsequent lookup() and readdir() operations) so that only valid
242  * nodes are returned.  The ordering between devname_lookup_func and
243  * devzvol_validate is a little inefficient in the case of invalid
244  * or stale nodes because devname_lookup_func calls
245  * devzvol_create_{dir, link}, then the validator says it's invalid,
246  * and then the node gets cleaned up.
247  */
248 int
249 devzvol_validate(struct sdev_node *dv)
250 {
251 	dmu_objset_type_t do_type;
252 	char *dsname;
253 	char *nm = dv->sdev_name;
254 	int rc;
255 
256 	sdcmn_err13(("validating ('%s' '%s')", dv->sdev_path, nm));
257 	/*
258 	 * validate only READY nodes; if someone is sitting on the
259 	 * directory of a dataset that just got destroyed we could
260 	 * get a zombie node which we just skip.
261 	 */
262 	if (dv->sdev_state != SDEV_READY) {
263 		sdcmn_err13(("skipping '%s'", nm));
264 		return (SDEV_VTOR_SKIP);
265 	}
266 
267 	if ((strcmp(dv->sdev_path, ZVOL_DIR "/dsk") == 0) ||
268 	    (strcmp(dv->sdev_path, ZVOL_DIR "/rdsk") == 0))
269 		return (SDEV_VTOR_VALID);
270 	dsname = devzvol_make_dsname(dv->sdev_path, NULL);
271 	if (dsname == NULL)
272 		return (SDEV_VTOR_INVALID);
273 
274 	rc = devzvol_objset_check(dsname, &do_type);
275 	sdcmn_err13(("  '%s' rc %d", dsname, rc));
276 	if (rc != 0) {
277 		kmem_free(dsname, strlen(dsname) + 1);
278 		return (SDEV_VTOR_INVALID);
279 	}
280 	sdcmn_err13(("  v_type %d do_type %d",
281 	    SDEVTOV(dv)->v_type, do_type));
282 	if ((SDEVTOV(dv)->v_type == VLNK && do_type != DMU_OST_ZVOL) ||
283 	    ((SDEVTOV(dv)->v_type == VBLK || SDEVTOV(dv)->v_type == VCHR) &&
284 	    do_type != DMU_OST_ZVOL) ||
285 	    (SDEVTOV(dv)->v_type == VDIR && do_type == DMU_OST_ZVOL)) {
286 		kmem_free(dsname, strlen(dsname) + 1);
287 		return (SDEV_VTOR_STALE);
288 	}
289 	if (SDEVTOV(dv)->v_type == VLNK) {
290 		char *ptr, *link;
291 		long val = 0;
292 		minor_t lminor, ominor;
293 
294 		rc = sdev_getlink(SDEVTOV(dv), &link);
295 		ASSERT(rc == 0);
296 
297 		ptr = strrchr(link, ':') + 1;
298 		rc = ddi_strtol(ptr, NULL, 10, &val);
299 		kmem_free(link, strlen(link) + 1);
300 		ASSERT(rc == 0 && val != 0);
301 		lminor = (minor_t)val;
302 		if (sdev_zvol_name2minor(dsname, &ominor) < 0 ||
303 		    ominor != lminor) {
304 			kmem_free(dsname, strlen(dsname) + 1);
305 			return (SDEV_VTOR_STALE);
306 		}
307 	}
308 	kmem_free(dsname, strlen(dsname) + 1);
309 	return (SDEV_VTOR_VALID);
310 }
311 
312 /*
313  * creates directories as needed in response to a readdir
314  */
315 void
316 devzvol_create_pool_dirs(struct vnode *dvp)
317 {
318 	zfs_cmd_t	*zc;
319 	nvlist_t *nv = NULL;
320 	nvpair_t *elem = NULL;
321 	size_t size;
322 	int pools = 0;
323 	int rc;
324 
325 	sdcmn_err13(("devzvol_create_pool_dirs"));
326 	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
327 	mutex_enter(&devzvol_mtx);
328 	zc->zc_cookie = devzvol_gen;
329 
330 	rc = devzvol_handle_ioctl(ZFS_IOC_POOL_CONFIGS, zc, &size);
331 	switch (rc) {
332 		case 0:
333 			/* new generation */
334 			ASSERT(devzvol_gen != zc->zc_cookie);
335 			devzvol_gen = zc->zc_cookie;
336 			if (devzvol_zclist)
337 				kmem_free((void *)(uintptr_t)devzvol_zclist,
338 				    devzvol_zclist_size);
339 			devzvol_zclist = zc->zc_nvlist_dst;
340 			devzvol_zclist_size = size;
341 			break;
342 		case EEXIST:
343 			/*
344 			 * no change in the configuration; still need
345 			 * to do lookups in case we did a lookup in
346 			 * zvol/rdsk but not zvol/dsk (or vice versa)
347 			 */
348 			kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst,
349 			    size);
350 			break;
351 		default:
352 			kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst,
353 			    size);
354 			goto out;
355 	}
356 	rc = nvlist_unpack((char *)(uintptr_t)devzvol_zclist,
357 	    devzvol_zclist_size, &nv, 0);
358 	if (rc) {
359 		ASSERT(rc == 0);
360 		kmem_free((void *)(uintptr_t)devzvol_zclist,
361 		    devzvol_zclist_size);
362 		devzvol_gen = 0;
363 		devzvol_zclist = NULL;
364 		devzvol_zclist_size = 0;
365 		goto out;
366 	}
367 	mutex_exit(&devzvol_mtx);
368 	while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
369 		struct vnode *vp;
370 		ASSERT(dvp->v_count > 0);
371 		rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
372 		    NULL, kcred, NULL, 0, NULL);
373 		/* should either work, or not be visible from a zone */
374 		ASSERT(rc == 0 || rc == ENOENT);
375 		if (rc == 0)
376 			VN_RELE(vp);
377 		pools++;
378 	}
379 	nvlist_free(nv);
380 	mutex_enter(&devzvol_mtx);
381 	if (devzvol_isopen && pools == 0) {
382 		/* clean up so zfs can be unloaded */
383 		devzvol_close_zfs();
384 		devzvol_isopen = B_FALSE;
385 	}
386 out:
387 	mutex_exit(&devzvol_mtx);
388 	kmem_free(zc, sizeof (zfs_cmd_t));
389 }
390 
391 /*ARGSUSED3*/
392 static int
393 devzvol_create_dir(struct sdev_node *ddv, char *nm, void **arg,
394     cred_t *cred, void *whatever, char *whichever)
395 {
396 	timestruc_t now;
397 	struct vattr *vap = (struct vattr *)arg;
398 
399 	sdcmn_err13(("create_dir (%s) (%s) '%s'", ddv->sdev_name,
400 	    ddv->sdev_path, nm));
401 	ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR,
402 	    strlen(ZVOL_DIR)) == 0);
403 	*vap = *sdev_getdefault_attr(VDIR);
404 	gethrestime(&now);
405 	vap->va_atime = now;
406 	vap->va_mtime = now;
407 	vap->va_ctime = now;
408 	return (0);
409 }
410 
411 /*ARGSUSED3*/
412 static int
413 devzvol_create_link(struct sdev_node *ddv, char *nm,
414     void **arg, cred_t *cred, void *whatever, char *whichever)
415 {
416 	minor_t minor;
417 	char *pathname = (char *)*arg;
418 	int rc;
419 	char *dsname;
420 	char *x;
421 	char str[MAXNAMELEN];
422 	sdcmn_err13(("create_link (%s) (%s) '%s'", ddv->sdev_name,
423 	    ddv->sdev_path, nm));
424 	dsname = devzvol_make_dsname(ddv->sdev_path, nm);
425 	rc = sdev_zvol_create_minor(dsname);
426 	if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
427 	    sdev_zvol_name2minor(dsname, &minor)) {
428 		sdcmn_err13(("devzvol_create_link %d", rc));
429 		kmem_free(dsname, strlen(dsname) + 1);
430 		return (-1);
431 	}
432 	kmem_free(dsname, strlen(dsname) + 1);
433 
434 	/*
435 	 * This is a valid zvol; create a symlink that points to the
436 	 * minor which was created under /devices/pseudo/zfs@0
437 	 */
438 	*pathname = '\0';
439 	for (x = ddv->sdev_path; x = strchr(x, '/'); x++)
440 		(void) strcat(pathname, "../");
441 	(void) snprintf(str, sizeof (str), ZVOL_PSEUDO_DEV "%u", minor);
442 	(void) strncat(pathname, str, MAXPATHLEN);
443 	if (strncmp(ddv->sdev_path, ZVOL_FULL_RDEV_DIR,
444 	    strlen(ZVOL_FULL_RDEV_DIR)) == 0)
445 		(void) strcat(pathname, ",raw");
446 	return (0);
447 }
448 
449 /* Clean zvol sdev_nodes that are no longer valid.  */
450 static void
451 devzvol_prunedir(struct sdev_node *ddv)
452 {
453 	struct sdev_node *dv;
454 
455 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
456 
457 	sdcmn_err13(("prunedir '%s'", ddv->sdev_name));
458 	ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
459 	if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
460 		rw_exit(&ddv->sdev_contents);
461 		rw_enter(&ddv->sdev_contents, RW_WRITER);
462 	}
463 
464 	dv = SDEV_FIRST_ENTRY(ddv);
465 	while (dv) {
466 		sdcmn_err13(("sdev_name '%s'", dv->sdev_name));
467 		/* skip stale nodes */
468 		if (dv->sdev_flags & SDEV_STALE) {
469 			sdcmn_err13(("  stale"));
470 			dv = SDEV_NEXT_ENTRY(ddv, dv);
471 			continue;
472 		}
473 
474 		switch (devzvol_validate(dv)) {
475 		case SDEV_VTOR_VALID:
476 		case SDEV_VTOR_SKIP:
477 			dv = SDEV_NEXT_ENTRY(ddv, dv);
478 			continue;
479 		case SDEV_VTOR_INVALID:
480 			sdcmn_err7(("prunedir: destroy invalid "
481 			    "node: %s\n", dv->sdev_name));
482 			break;
483 		}
484 
485 		if ((SDEVTOV(dv)->v_type == VDIR) &&
486 		    (sdev_cleandir(dv, NULL, 0) != 0)) {
487 			dv = SDEV_NEXT_ENTRY(ddv, dv);
488 			continue;
489 		}
490 		SDEV_HOLD(dv);
491 		/* remove the cache node */
492 		if (sdev_cache_update(ddv, &dv, dv->sdev_name,
493 		    SDEV_CACHE_DELETE) == 0)
494 			dv = SDEV_FIRST_ENTRY(ddv);
495 		else
496 			dv = SDEV_NEXT_ENTRY(ddv, dv);
497 	}
498 	rw_downgrade(&ddv->sdev_contents);
499 }
500 
501 /*
502  * This function is used to create a dir or dev inside a zone's /dev when the
503  * zone has a zvol that is dynamically created within the zone (i.e. inside
504  * of a delegated dataset.  Since there is no /devices tree within a zone,
505  * we create the chr/blk devices directly inside the zone's /dev instead of
506  * making symlinks.
507  */
508 static int
509 devzvol_mk_ngz_node(struct sdev_node *parent, char *nm)
510 {
511 	struct vattr vattr;
512 	timestruc_t now;
513 	enum vtype expected_type = VDIR;
514 	dmu_objset_type_t do_type;
515 	struct sdev_node *dv = NULL;
516 	int res;
517 	char *dsname;
518 
519 	bzero(&vattr, sizeof (vattr));
520 	gethrestime(&now);
521 	vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
522 	vattr.va_uid = SDEV_UID_DEFAULT;
523 	vattr.va_gid = SDEV_GID_DEFAULT;
524 	vattr.va_type = VNON;
525 	vattr.va_atime = now;
526 	vattr.va_mtime = now;
527 	vattr.va_ctime = now;
528 
529 	if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL)
530 		return (ENOENT);
531 
532 	if (devzvol_objset_check(dsname, &do_type) != 0) {
533 		kmem_free(dsname, strlen(dsname) + 1);
534 		return (ENOENT);
535 	}
536 	if (do_type == DMU_OST_ZVOL)
537 		expected_type = VBLK;
538 
539 	if (expected_type == VDIR) {
540 		vattr.va_type = VDIR;
541 		vattr.va_mode = SDEV_DIRMODE_DEFAULT;
542 	} else {
543 		minor_t minor;
544 		dev_t devnum;
545 		int rc;
546 
547 		rc = sdev_zvol_create_minor(dsname);
548 		if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
549 		    sdev_zvol_name2minor(dsname, &minor)) {
550 			kmem_free(dsname, strlen(dsname) + 1);
551 			return (ENOENT);
552 		}
553 
554 		devnum = makedevice(devzvol_major, minor);
555 		vattr.va_rdev = devnum;
556 
557 		if (strstr(parent->sdev_path, "/rdsk/") != NULL)
558 			vattr.va_type = VCHR;
559 		else
560 			vattr.va_type = VBLK;
561 		vattr.va_mode = SDEV_DEVMODE_DEFAULT;
562 	}
563 	kmem_free(dsname, strlen(dsname) + 1);
564 
565 	rw_enter(&parent->sdev_contents, RW_WRITER);
566 
567 	res = sdev_mknode(parent, nm, &dv, &vattr,
568 	    NULL, NULL, kcred, SDEV_READY);
569 	rw_exit(&parent->sdev_contents);
570 	if (res != 0)
571 		return (ENOENT);
572 
573 	SDEV_RELE(dv);
574 	return (0);
575 }
576 
577 /*ARGSUSED*/
578 static int
579 devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
580     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
581     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
582 {
583 	enum vtype expected_type = VDIR;
584 	struct sdev_node *parent = VTOSDEV(dvp);
585 	char *dsname;
586 	dmu_objset_type_t do_type;
587 	int error;
588 
589 	sdcmn_err13(("devzvol_lookup '%s' '%s'", parent->sdev_path, nm));
590 	*vpp = NULL;
591 	/* execute access is required to search the directory */
592 	if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
593 		return (error);
594 
595 	rw_enter(&parent->sdev_contents, RW_READER);
596 	if (SDEV_IS_GLOBAL(parent)) {
597 		/*
598 		 * During iter_datasets, don't create GZ dev when running in
599 		 * NGZ.  We can't return ENOENT here since that could
600 		 * incorrectly trigger the creation of the dev from the
601 		 * recursive call through prof_filldir during iter_datasets.
602 		 */
603 		if (getzoneid() != GLOBAL_ZONEID) {
604 			rw_exit(&parent->sdev_contents);
605 			return (EPERM);
606 		}
607 	} else {
608 		int res;
609 
610 		rw_exit(&parent->sdev_contents);
611 
612 		/*
613 		 * If we're in the global zone and reach down into a non-global
614 		 * zone's /dev/zvol then this action could trigger the creation
615 		 * of all of the zvol devices for every zone into the non-global
616 		 * zone's /dev tree. This could be a big security hole. To
617 		 * prevent this, disallow the global zone from looking inside
618 		 * a non-global zones /dev/zvol. This behavior is similar to
619 		 * delegated datasets, which cannot be used by the global zone.
620 		 */
621 		if (getzoneid() == GLOBAL_ZONEID)
622 			return (EPERM);
623 
624 		res = prof_lookup(dvp, nm, vpp, cred);
625 
626 		/*
627 		 * We won't find a zvol that was dynamically created inside
628 		 * a NGZ, within a delegated dataset, in the zone's dev profile
629 		 * but prof_lookup will also find it via sdev_cache_lookup.
630 		 */
631 		if (res == ENOENT) {
632 			/*
633 			 * We have to create the sdev node for the dymamically
634 			 * created zvol.
635 			 */
636 			if (devzvol_mk_ngz_node(parent, nm) != 0)
637 				return (ENOENT);
638 			res = prof_lookup(dvp, nm, vpp, cred);
639 		}
640 
641 		return (res);
642 	}
643 
644 	dsname = devzvol_make_dsname(parent->sdev_path, nm);
645 	rw_exit(&parent->sdev_contents);
646 	sdcmn_err13(("rvp dsname %s", dsname ? dsname : "(null)"));
647 	if (dsname) {
648 		error = devzvol_objset_check(dsname, &do_type);
649 		if (error != 0) {
650 			error = ENOENT;
651 			goto out;
652 		}
653 		if (do_type == DMU_OST_ZVOL)
654 			expected_type = VLNK;
655 	}
656 	/*
657 	 * the callbacks expect:
658 	 *
659 	 * parent->sdev_path		   nm
660 	 * /dev/zvol			   {r}dsk
661 	 * /dev/zvol/{r}dsk		   <pool name>
662 	 * /dev/zvol/{r}dsk/<dataset name> <last ds component>
663 	 *
664 	 * sdev_name is always last path component of sdev_path
665 	 */
666 	if (expected_type == VDIR) {
667 		error = devname_lookup_func(parent, nm, vpp, cred,
668 		    devzvol_create_dir, SDEV_VATTR);
669 	} else {
670 		error = devname_lookup_func(parent, nm, vpp, cred,
671 		    devzvol_create_link, SDEV_VLINK);
672 	}
673 	sdcmn_err13(("devzvol_lookup %d %d", expected_type, error));
674 	ASSERT(error || ((*vpp)->v_type == expected_type));
675 out:
676 	if (dsname)
677 		kmem_free(dsname, strlen(dsname) + 1);
678 	sdcmn_err13(("devzvol_lookup %d", error));
679 	return (error);
680 }
681 
682 /*
683  * We allow create to find existing nodes
684  *	- if the node doesn't exist - EROFS
685  *	- creating an existing dir read-only succeeds, otherwise EISDIR
686  *	- exclusive creates fail - EEXIST
687  */
688 /*ARGSUSED2*/
689 static int
690 devzvol_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
691     int mode, struct vnode **vpp, struct cred *cred, int flag,
692     caller_context_t *ct, vsecattr_t *vsecp)
693 {
694 	int error;
695 	struct vnode *vp;
696 
697 	*vpp = NULL;
698 
699 	error = devzvol_lookup(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL,
700 	    NULL);
701 	if (error == 0) {
702 		if (excl == EXCL)
703 			error = EEXIST;
704 		else if (vp->v_type == VDIR && (mode & VWRITE))
705 			error = EISDIR;
706 		else
707 			error = VOP_ACCESS(vp, mode, 0, cred, ct);
708 
709 		if (error) {
710 			VN_RELE(vp);
711 		} else
712 			*vpp = vp;
713 	} else if (error == ENOENT) {
714 		error = EROFS;
715 	}
716 
717 	return (error);
718 }
719 
720 void sdev_iter_snapshots(struct vnode *dvp, char *name);
721 
722 void
723 sdev_iter_datasets(struct vnode *dvp, int arg, char *name)
724 {
725 	zfs_cmd_t	*zc;
726 	int rc;
727 
728 	sdcmn_err13(("iter name is '%s' (arg %x)", name, arg));
729 	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
730 	(void) strcpy(zc->zc_name, name);
731 
732 	while ((rc = devzvol_handle_ioctl(arg, zc, B_FALSE)) == 0) {
733 		struct vnode *vpp;
734 		char *ptr;
735 
736 		sdcmn_err13(("  name %s", zc->zc_name));
737 		if (strchr(zc->zc_name, '$') || strchr(zc->zc_name, '%'))
738 			goto skip;
739 		ptr = strrchr(zc->zc_name, '/') + 1;
740 		rc = devzvol_lookup(dvp, ptr, &vpp, NULL, 0, NULL,
741 		    kcred, NULL, NULL, NULL);
742 		if (rc == 0) {
743 			VN_RELE(vpp);
744 		} else if (rc == ENOENT) {
745 			goto skip;
746 		} else {
747 			/*
748 			 * EBUSY == problem with zvols's dmu holds?
749 			 * EPERM when in a NGZ and traversing up and out.
750 			 */
751 			goto skip;
752 		}
753 		if (arg == ZFS_IOC_DATASET_LIST_NEXT &&
754 		    zc->zc_objset_stats.dds_type != DMU_OST_ZFS)
755 			sdev_iter_snapshots(dvp, zc->zc_name);
756 skip:
757 		(void) strcpy(zc->zc_name, name);
758 	}
759 	kmem_free(zc, sizeof (zfs_cmd_t));
760 }
761 
762 void
763 sdev_iter_snapshots(struct vnode *dvp, char *name)
764 {
765 	sdev_iter_datasets(dvp, ZFS_IOC_SNAPSHOT_LIST_NEXT, name);
766 }
767 
768 /*ARGSUSED4*/
769 static int
770 devzvol_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
771     int *eofp, caller_context_t *ct_unused, int flags_unused)
772 {
773 	struct sdev_node *sdvp = VTOSDEV(dvp);
774 	char *ptr;
775 
776 	sdcmn_err13(("zv readdir of '%s' %s'", sdvp->sdev_path,
777 	    sdvp->sdev_name));
778 
779 	if (strcmp(sdvp->sdev_path, ZVOL_DIR) == 0) {
780 		struct vnode *vp;
781 
782 		rw_exit(&sdvp->sdev_contents);
783 		(void) devname_lookup_func(sdvp, "dsk", &vp, cred,
784 		    devzvol_create_dir, SDEV_VATTR);
785 		VN_RELE(vp);
786 		(void) devname_lookup_func(sdvp, "rdsk", &vp, cred,
787 		    devzvol_create_dir, SDEV_VATTR);
788 		VN_RELE(vp);
789 		rw_enter(&sdvp->sdev_contents, RW_READER);
790 		return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
791 	}
792 	if (uiop->uio_offset == 0)
793 		devzvol_prunedir(sdvp);
794 	ptr = sdvp->sdev_path + strlen(ZVOL_DIR);
795 	if ((strcmp(ptr, "/dsk") == 0) || (strcmp(ptr, "/rdsk") == 0)) {
796 		rw_exit(&sdvp->sdev_contents);
797 		devzvol_create_pool_dirs(dvp);
798 		rw_enter(&sdvp->sdev_contents, RW_READER);
799 		return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
800 	}
801 
802 	ptr = strchr(ptr + 1, '/') + 1;
803 	rw_exit(&sdvp->sdev_contents);
804 	sdev_iter_datasets(dvp, ZFS_IOC_DATASET_LIST_NEXT, ptr);
805 	rw_enter(&sdvp->sdev_contents, RW_READER);
806 	return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
807 }
808 
809 const fs_operation_def_t devzvol_vnodeops_tbl[] = {
810 	VOPNAME_READDIR,	{ .vop_readdir = devzvol_readdir },
811 	VOPNAME_LOOKUP,		{ .vop_lookup = devzvol_lookup },
812 	VOPNAME_CREATE,		{ .vop_create = devzvol_create },
813 	VOPNAME_RENAME,		{ .error = fs_nosys },
814 	VOPNAME_MKDIR,		{ .error = fs_nosys },
815 	VOPNAME_RMDIR,		{ .error = fs_nosys },
816 	VOPNAME_REMOVE,		{ .error = fs_nosys },
817 	VOPNAME_SYMLINK,	{ .error = fs_nosys },
818 	NULL,			NULL
819 };
820