xref: /titanic_41/usr/src/uts/common/fs/dev/sdev_zvolops.c (revision d339dfeffb229c9d6600872a9e1175f6c68a4bb3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2013 Joyent, Inc.  All rights reserved.
25  */
26 
27 /* vnode ops for the /dev/zvol directory */
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/sysmacros.h>
32 #include <sys/ddi.h>
33 #include <sys/sunndi.h>
34 #include <sys/sunldi.h>
35 #include <fs/fs_subr.h>
36 #include <sys/fs/dv_node.h>
37 #include <sys/fs/sdev_impl.h>
38 #include <sys/zfs_ioctl.h>
39 #include <sys/policy.h>
40 #include <sys/stat.h>
41 #include <sys/vfs_opreg.h>
42 
/*
 * vnode operations vector for /dev/zvol.  It is not assigned anywhere in
 * this file; presumably it is built from devzvol_vnodeops_tbl elsewhere --
 * TODO confirm.
 */
struct vnodeops	*devzvol_vnodeops;
/* cookie handed to ZFS_IOC_POOL_CONFIGS and refreshed from its reply */
static uint64_t devzvol_gen = 0;
/* address of the packed pool-config nvlist from the last successful reply */
static uint64_t devzvol_zclist;
/* allocation size of the buffer referenced by devzvol_zclist */
static size_t devzvol_zclist_size;
/* LDI identity and handle used for the layered open of /dev/zfs */
static ldi_ident_t devzvol_li;
static ldi_handle_t devzvol_lh;
/* taken around the ioctl path and the cached pool-config state above */
static kmutex_t devzvol_mtx;
/* B_TRUE once devzvol_open_zfs() has succeeded, until devzvol_close_zfs() */
static boolean_t devzvol_isopen;
/* major number of the device behind /dev/zfs, recorded at open time */
static major_t devzvol_major;

/*
 * we need to use ddi_mod* since fs/dev gets loaded early on in
 * startup(), and linking fs/dev to fs/zfs would drag in a lot of
 * other stuff (like drv/random) before the rest of the system is
 * ready to go
 */
/* handle returned by ddi_modopen("fs/zfs") */
ddi_modhandle_t zfs_mod;
/* zvol_create_minor, resolved through ddi_modsym(); NULL when unresolved */
int (*szcm)(char *);
/* zvol_name2minor, resolved through ddi_modsym(); NULL when unresolved */
int (*szn2m)(char *, minor_t *);
62 
63 int
64 sdev_zvol_create_minor(char *dsname)
65 {
66 	if (szcm == NULL)
67 		return (-1);
68 	return ((*szcm)(dsname));
69 }
70 
71 int
72 sdev_zvol_name2minor(char *dsname, minor_t *minor)
73 {
74 	if (szn2m == NULL)
75 		return (-1);
76 	return ((*szn2m)(dsname, minor));
77 }
78 
79 int
80 devzvol_open_zfs()
81 {
82 	int rc;
83 	dev_t dv;
84 
85 	devzvol_li = ldi_ident_from_anon();
86 	if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred,
87 	    &devzvol_lh, devzvol_li))
88 		return (-1);
89 	if (zfs_mod == NULL && ((zfs_mod = ddi_modopen("fs/zfs",
90 	    KRTLD_MODE_FIRST, &rc)) == NULL)) {
91 		return (rc);
92 	}
93 	ASSERT(szcm == NULL && szn2m == NULL);
94 	if ((szcm = (int (*)(char *))
95 	    ddi_modsym(zfs_mod, "zvol_create_minor", &rc)) == NULL) {
96 		cmn_err(CE_WARN, "couldn't resolve zvol_create_minor");
97 		return (rc);
98 	}
99 	if ((szn2m = (int(*)(char *, minor_t *))
100 	    ddi_modsym(zfs_mod, "zvol_name2minor", &rc)) == NULL) {
101 		cmn_err(CE_WARN, "couldn't resolve zvol_name2minor");
102 		return (rc);
103 	}
104 	if (ldi_get_dev(devzvol_lh, &dv))
105 		return (-1);
106 	devzvol_major = getmajor(dv);
107 	return (0);
108 }
109 
110 void
111 devzvol_close_zfs()
112 {
113 	szcm = NULL;
114 	szn2m = NULL;
115 	(void) ldi_close(devzvol_lh, FREAD|FWRITE, kcred);
116 	ldi_ident_release(devzvol_li);
117 	if (zfs_mod != NULL) {
118 		(void) ddi_modclose(zfs_mod);
119 		zfs_mod = NULL;
120 	}
121 }
122 
123 int
124 devzvol_handle_ioctl(int cmd, zfs_cmd_t *zc, size_t *alloc_size)
125 {
126 	uint64_t cookie;
127 	int size = 8000;
128 	int unused;
129 	int rc;
130 
131 	if (cmd != ZFS_IOC_POOL_CONFIGS)
132 		mutex_enter(&devzvol_mtx);
133 	if (!devzvol_isopen) {
134 		if ((rc = devzvol_open_zfs()) == 0) {
135 			devzvol_isopen = B_TRUE;
136 		} else {
137 			if (cmd != ZFS_IOC_POOL_CONFIGS)
138 				mutex_exit(&devzvol_mtx);
139 			return (ENXIO);
140 		}
141 	}
142 	cookie = zc->zc_cookie;
143 again:
144 	zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(size,
145 	    KM_SLEEP);
146 	zc->zc_nvlist_dst_size = size;
147 	rc = ldi_ioctl(devzvol_lh, cmd, (intptr_t)zc, FKIOCTL, kcred,
148 	    &unused);
149 	if (rc == ENOMEM) {
150 		int newsize;
151 		newsize = zc->zc_nvlist_dst_size;
152 		ASSERT(newsize > size);
153 		kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
154 		size = newsize;
155 		zc->zc_cookie = cookie;
156 		goto again;
157 	}
158 	if (alloc_size == NULL)
159 		kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
160 	else
161 		*alloc_size = size;
162 	if (cmd != ZFS_IOC_POOL_CONFIGS)
163 		mutex_exit(&devzvol_mtx);
164 	return (rc);
165 }
166 
167 /* figures out if the objset exists and returns its type */
168 int
169 devzvol_objset_check(char *dsname, dmu_objset_type_t *type)
170 {
171 	boolean_t	ispool;
172 	zfs_cmd_t	*zc;
173 	int rc;
174 
175 	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
176 	(void) strlcpy(zc->zc_name, dsname, MAXPATHLEN);
177 
178 	ispool = (strchr(dsname, '/') == NULL) ? B_TRUE : B_FALSE;
179 	if (!ispool && sdev_zvol_name2minor(dsname, NULL) == 0) {
180 		sdcmn_err13(("found cached minor node"));
181 		if (type)
182 			*type = DMU_OST_ZVOL;
183 		kmem_free(zc, sizeof (zfs_cmd_t));
184 		return (0);
185 	}
186 	rc = devzvol_handle_ioctl(ispool ? ZFS_IOC_POOL_STATS :
187 	    ZFS_IOC_OBJSET_STATS, zc, NULL);
188 	if (type && rc == 0)
189 		*type = (ispool) ? DMU_OST_ZFS :
190 		    zc->zc_objset_stats.dds_type;
191 	kmem_free(zc, sizeof (zfs_cmd_t));
192 	return (rc);
193 }
194 
195 /*
196  * returns what the zfs dataset name should be, given the /dev/zvol
197  * path and an optional name; otherwise NULL
198  */
199 char *
200 devzvol_make_dsname(const char *path, const char *name)
201 {
202 	char *dsname;
203 	const char *ptr;
204 	int dslen;
205 
206 	if (strcmp(path, ZVOL_DIR) == 0)
207 		return (NULL);
208 	if (name && (strcmp(name, ".") == 0 || strcmp(name, "..") == 0))
209 		return (NULL);
210 	ptr = path + strlen(ZVOL_DIR);
211 	if (strncmp(ptr, "/dsk", 4) == 0)
212 		ptr += strlen("/dsk");
213 	else if (strncmp(ptr, "/rdsk", 5) == 0)
214 		ptr += strlen("/rdsk");
215 	else
216 		return (NULL);
217 	if (*ptr == '/')
218 		ptr++;
219 
220 	dslen = strlen(ptr);
221 	if (dslen)
222 		dslen++;			/* plus null */
223 	if (name)
224 		dslen += strlen(name) + 1;	/* plus slash */
225 	dsname = kmem_zalloc(dslen, KM_SLEEP);
226 	if (*ptr) {
227 		(void) strlcpy(dsname, ptr, dslen);
228 		if (name)
229 			(void) strlcat(dsname, "/", dslen);
230 	}
231 	if (name)
232 		(void) strlcat(dsname, name, dslen);
233 	return (dsname);
234 }
235 
236 /*
237  * check if the zvol's sdev_node is still valid, which means make
238  * sure the zvol is still valid.  zvol minors aren't proactively
239  * destroyed when the zvol is destroyed, so we use a validator to clean
240  * these up (in other words, when such nodes are encountered during
241  * subsequent lookup() and readdir() operations) so that only valid
242  * nodes are returned.  The ordering between devname_lookup_func and
243  * devzvol_validate is a little inefficient in the case of invalid
244  * or stale nodes because devname_lookup_func calls
245  * devzvol_create_{dir, link}, then the validator says it's invalid,
246  * and then the node gets cleaned up.
247  */
248 int
249 devzvol_validate(struct sdev_node *dv)
250 {
251 	dmu_objset_type_t do_type;
252 	char *dsname;
253 	char *nm = dv->sdev_name;
254 	int rc;
255 
256 	sdcmn_err13(("validating ('%s' '%s')", dv->sdev_path, nm));
257 	/*
258 	 * validate only READY nodes; if someone is sitting on the
259 	 * directory of a dataset that just got destroyed we could
260 	 * get a zombie node which we just skip.
261 	 */
262 	if (dv->sdev_state != SDEV_READY) {
263 		sdcmn_err13(("skipping '%s'", nm));
264 		return (SDEV_VTOR_SKIP);
265 	}
266 
267 	if ((strcmp(dv->sdev_path, ZVOL_DIR "/dsk") == 0) ||
268 	    (strcmp(dv->sdev_path, ZVOL_DIR "/rdsk") == 0))
269 		return (SDEV_VTOR_VALID);
270 	dsname = devzvol_make_dsname(dv->sdev_path, NULL);
271 	if (dsname == NULL)
272 		return (SDEV_VTOR_INVALID);
273 
274 	rc = devzvol_objset_check(dsname, &do_type);
275 	sdcmn_err13(("  '%s' rc %d", dsname, rc));
276 	if (rc != 0) {
277 		kmem_free(dsname, strlen(dsname) + 1);
278 		return (SDEV_VTOR_INVALID);
279 	}
280 	sdcmn_err13(("  v_type %d do_type %d",
281 	    SDEVTOV(dv)->v_type, do_type));
282 	if ((SDEVTOV(dv)->v_type == VLNK && do_type != DMU_OST_ZVOL) ||
283 	    ((SDEVTOV(dv)->v_type == VBLK || SDEVTOV(dv)->v_type == VCHR) &&
284 	    do_type != DMU_OST_ZVOL) ||
285 	    (SDEVTOV(dv)->v_type == VDIR && do_type == DMU_OST_ZVOL)) {
286 		kmem_free(dsname, strlen(dsname) + 1);
287 		return (SDEV_VTOR_STALE);
288 	}
289 	if (SDEVTOV(dv)->v_type == VLNK) {
290 		char *ptr, *link;
291 		long val = 0;
292 		minor_t lminor, ominor;
293 
294 		rc = sdev_getlink(SDEVTOV(dv), &link);
295 		ASSERT(rc == 0);
296 
297 		ptr = strrchr(link, ':') + 1;
298 		rc = ddi_strtol(ptr, NULL, 10, &val);
299 		kmem_free(link, strlen(link) + 1);
300 		ASSERT(rc == 0 && val != 0);
301 		lminor = (minor_t)val;
302 		if (sdev_zvol_name2minor(dsname, &ominor) < 0 ||
303 		    ominor != lminor) {
304 			kmem_free(dsname, strlen(dsname) + 1);
305 			return (SDEV_VTOR_STALE);
306 		}
307 	}
308 	kmem_free(dsname, strlen(dsname) + 1);
309 	return (SDEV_VTOR_VALID);
310 }
311 
312 /*
313  * creates directories as needed in response to a readdir
314  */
315 void
316 devzvol_create_pool_dirs(struct vnode *dvp)
317 {
318 	zfs_cmd_t	*zc;
319 	nvlist_t *nv = NULL;
320 	nvpair_t *elem = NULL;
321 	size_t size;
322 	int pools = 0;
323 	int rc;
324 
325 	sdcmn_err13(("devzvol_create_pool_dirs"));
326 	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
327 	mutex_enter(&devzvol_mtx);
328 	zc->zc_cookie = devzvol_gen;
329 
330 	rc = devzvol_handle_ioctl(ZFS_IOC_POOL_CONFIGS, zc, &size);
331 	switch (rc) {
332 		case 0:
333 			/* new generation */
334 			ASSERT(devzvol_gen != zc->zc_cookie);
335 			devzvol_gen = zc->zc_cookie;
336 			if (devzvol_zclist)
337 				kmem_free((void *)(uintptr_t)devzvol_zclist,
338 				    devzvol_zclist_size);
339 			devzvol_zclist = zc->zc_nvlist_dst;
340 			devzvol_zclist_size = size;
341 			break;
342 		case EEXIST:
343 			/*
344 			 * no change in the configuration; still need
345 			 * to do lookups in case we did a lookup in
346 			 * zvol/rdsk but not zvol/dsk (or vice versa)
347 			 */
348 			kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst,
349 			    size);
350 			break;
351 		default:
352 			kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst,
353 			    size);
354 			goto out;
355 	}
356 	rc = nvlist_unpack((char *)(uintptr_t)devzvol_zclist,
357 	    devzvol_zclist_size, &nv, 0);
358 	if (rc) {
359 		ASSERT(rc == 0);
360 		kmem_free((void *)(uintptr_t)devzvol_zclist,
361 		    devzvol_zclist_size);
362 		devzvol_gen = 0;
363 		devzvol_zclist = NULL;
364 		devzvol_zclist_size = 0;
365 		goto out;
366 	}
367 	mutex_exit(&devzvol_mtx);
368 	while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
369 		struct vnode *vp;
370 		ASSERT(dvp->v_count > 0);
371 		rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
372 		    NULL, kcred, NULL, 0, NULL);
373 		/* should either work, or not be visible from a zone */
374 		ASSERT(rc == 0 || rc == ENOENT);
375 		if (rc == 0)
376 			VN_RELE(vp);
377 		pools++;
378 	}
379 	nvlist_free(nv);
380 	mutex_enter(&devzvol_mtx);
381 	if (devzvol_isopen && pools == 0) {
382 		/* clean up so zfs can be unloaded */
383 		devzvol_close_zfs();
384 		devzvol_isopen = B_FALSE;
385 	}
386 out:
387 	mutex_exit(&devzvol_mtx);
388 	kmem_free(zc, sizeof (zfs_cmd_t));
389 }
390 
391 /*ARGSUSED3*/
392 static int
393 devzvol_create_dir(struct sdev_node *ddv, char *nm, void **arg,
394     cred_t *cred, void *whatever, char *whichever)
395 {
396 	timestruc_t now;
397 	struct vattr *vap = (struct vattr *)arg;
398 
399 	sdcmn_err13(("create_dir (%s) (%s) '%s'", ddv->sdev_name,
400 	    ddv->sdev_path, nm));
401 	ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR,
402 	    strlen(ZVOL_DIR)) == 0);
403 	*vap = *sdev_getdefault_attr(VDIR);
404 	gethrestime(&now);
405 	vap->va_atime = now;
406 	vap->va_mtime = now;
407 	vap->va_ctime = now;
408 	return (0);
409 }
410 
411 /*ARGSUSED3*/
412 static int
413 devzvol_create_link(struct sdev_node *ddv, char *nm,
414     void **arg, cred_t *cred, void *whatever, char *whichever)
415 {
416 	minor_t minor;
417 	char *pathname = (char *)*arg;
418 	int rc;
419 	char *dsname;
420 	char *x;
421 	char str[MAXNAMELEN];
422 	sdcmn_err13(("create_link (%s) (%s) '%s'", ddv->sdev_name,
423 	    ddv->sdev_path, nm));
424 	dsname = devzvol_make_dsname(ddv->sdev_path, nm);
425 	rc = sdev_zvol_create_minor(dsname);
426 	if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
427 	    sdev_zvol_name2minor(dsname, &minor)) {
428 		sdcmn_err13(("devzvol_create_link %d", rc));
429 		kmem_free(dsname, strlen(dsname) + 1);
430 		return (-1);
431 	}
432 	kmem_free(dsname, strlen(dsname) + 1);
433 
434 	/*
435 	 * This is a valid zvol; create a symlink that points to the
436 	 * minor which was created under /devices/pseudo/zfs@0
437 	 */
438 	*pathname = '\0';
439 	for (x = ddv->sdev_path; x = strchr(x, '/'); x++)
440 		(void) strcat(pathname, "../");
441 	(void) snprintf(str, sizeof (str), ZVOL_PSEUDO_DEV "%u", minor);
442 	(void) strncat(pathname, str, MAXPATHLEN);
443 	if (strncmp(ddv->sdev_path, ZVOL_FULL_RDEV_DIR,
444 	    strlen(ZVOL_FULL_RDEV_DIR)) == 0)
445 		(void) strcat(pathname, ",raw");
446 	return (0);
447 }
448 
449 /* Clean zvol sdev_nodes that are no longer valid.  */
450 static void
451 devzvol_prunedir(struct sdev_node *ddv)
452 {
453 	struct sdev_node *dv;
454 
455 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
456 
457 	sdcmn_err13(("prunedir '%s'", ddv->sdev_name));
458 	ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
459 	if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
460 		rw_exit(&ddv->sdev_contents);
461 		rw_enter(&ddv->sdev_contents, RW_WRITER);
462 	}
463 
464 	dv = SDEV_FIRST_ENTRY(ddv);
465 	while (dv) {
466 		sdcmn_err13(("sdev_name '%s'", dv->sdev_name));
467 
468 		switch (devzvol_validate(dv)) {
469 		case SDEV_VTOR_VALID:
470 		case SDEV_VTOR_SKIP:
471 			dv = SDEV_NEXT_ENTRY(ddv, dv);
472 			continue;
473 		case SDEV_VTOR_INVALID:
474 			sdcmn_err7(("prunedir: destroy invalid "
475 			    "node: %s\n", dv->sdev_name));
476 			break;
477 		}
478 
479 		if ((SDEVTOV(dv)->v_type == VDIR) &&
480 		    (sdev_cleandir(dv, NULL, 0) != 0)) {
481 			dv = SDEV_NEXT_ENTRY(ddv, dv);
482 			continue;
483 		}
484 		SDEV_HOLD(dv);
485 		/* remove the cache node */
486 		sdev_cache_update(ddv, &dv, dv->sdev_name,
487 		    SDEV_CACHE_DELETE);
488 		SDEV_RELE(dv);
489 		dv = SDEV_FIRST_ENTRY(ddv);
490 	}
491 	rw_downgrade(&ddv->sdev_contents);
492 }
493 
494 /*
495  * This function is used to create a dir or dev inside a zone's /dev when the
496  * zone has a zvol that is dynamically created within the zone (i.e. inside
497  * of a delegated dataset.  Since there is no /devices tree within a zone,
498  * we create the chr/blk devices directly inside the zone's /dev instead of
499  * making symlinks.
500  */
501 static int
502 devzvol_mk_ngz_node(struct sdev_node *parent, char *nm)
503 {
504 	struct vattr vattr;
505 	timestruc_t now;
506 	enum vtype expected_type = VDIR;
507 	dmu_objset_type_t do_type;
508 	struct sdev_node *dv = NULL;
509 	int res;
510 	char *dsname;
511 
512 	bzero(&vattr, sizeof (vattr));
513 	gethrestime(&now);
514 	vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
515 	vattr.va_uid = SDEV_UID_DEFAULT;
516 	vattr.va_gid = SDEV_GID_DEFAULT;
517 	vattr.va_type = VNON;
518 	vattr.va_atime = now;
519 	vattr.va_mtime = now;
520 	vattr.va_ctime = now;
521 
522 	if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL)
523 		return (ENOENT);
524 
525 	if (devzvol_objset_check(dsname, &do_type) != 0) {
526 		kmem_free(dsname, strlen(dsname) + 1);
527 		return (ENOENT);
528 	}
529 	if (do_type == DMU_OST_ZVOL)
530 		expected_type = VBLK;
531 
532 	if (expected_type == VDIR) {
533 		vattr.va_type = VDIR;
534 		vattr.va_mode = SDEV_DIRMODE_DEFAULT;
535 	} else {
536 		minor_t minor;
537 		dev_t devnum;
538 		int rc;
539 
540 		rc = sdev_zvol_create_minor(dsname);
541 		if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
542 		    sdev_zvol_name2minor(dsname, &minor)) {
543 			kmem_free(dsname, strlen(dsname) + 1);
544 			return (ENOENT);
545 		}
546 
547 		devnum = makedevice(devzvol_major, minor);
548 		vattr.va_rdev = devnum;
549 
550 		if (strstr(parent->sdev_path, "/rdsk/") != NULL)
551 			vattr.va_type = VCHR;
552 		else
553 			vattr.va_type = VBLK;
554 		vattr.va_mode = SDEV_DEVMODE_DEFAULT;
555 	}
556 	kmem_free(dsname, strlen(dsname) + 1);
557 
558 	rw_enter(&parent->sdev_contents, RW_WRITER);
559 
560 	res = sdev_mknode(parent, nm, &dv, &vattr,
561 	    NULL, NULL, kcred, SDEV_READY);
562 	rw_exit(&parent->sdev_contents);
563 	if (res != 0)
564 		return (ENOENT);
565 
566 	SDEV_RELE(dv);
567 	return (0);
568 }
569 
570 /*ARGSUSED*/
571 static int
572 devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
573     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
574     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
575 {
576 	enum vtype expected_type = VDIR;
577 	struct sdev_node *parent = VTOSDEV(dvp);
578 	char *dsname;
579 	dmu_objset_type_t do_type;
580 	int error;
581 
582 	sdcmn_err13(("devzvol_lookup '%s' '%s'", parent->sdev_path, nm));
583 	*vpp = NULL;
584 	/* execute access is required to search the directory */
585 	if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
586 		return (error);
587 
588 	rw_enter(&parent->sdev_contents, RW_READER);
589 	if (SDEV_IS_GLOBAL(parent)) {
590 		/*
591 		 * During iter_datasets, don't create GZ dev when running in
592 		 * NGZ.  We can't return ENOENT here since that could
593 		 * incorrectly trigger the creation of the dev from the
594 		 * recursive call through prof_filldir during iter_datasets.
595 		 */
596 		if (getzoneid() != GLOBAL_ZONEID) {
597 			rw_exit(&parent->sdev_contents);
598 			return (EPERM);
599 		}
600 	} else {
601 		int res;
602 
603 		rw_exit(&parent->sdev_contents);
604 
605 		/*
606 		 * If we're in the global zone and reach down into a non-global
607 		 * zone's /dev/zvol then this action could trigger the creation
608 		 * of all of the zvol devices for every zone into the non-global
609 		 * zone's /dev tree. This could be a big security hole. To
610 		 * prevent this, disallow the global zone from looking inside
611 		 * a non-global zones /dev/zvol. This behavior is similar to
612 		 * delegated datasets, which cannot be used by the global zone.
613 		 */
614 		if (getzoneid() == GLOBAL_ZONEID)
615 			return (EPERM);
616 
617 		res = prof_lookup(dvp, nm, vpp, cred);
618 
619 		/*
620 		 * We won't find a zvol that was dynamically created inside
621 		 * a NGZ, within a delegated dataset, in the zone's dev profile
622 		 * but prof_lookup will also find it via sdev_cache_lookup.
623 		 */
624 		if (res == ENOENT) {
625 			/*
626 			 * We have to create the sdev node for the dymamically
627 			 * created zvol.
628 			 */
629 			if (devzvol_mk_ngz_node(parent, nm) != 0)
630 				return (ENOENT);
631 			res = prof_lookup(dvp, nm, vpp, cred);
632 		}
633 
634 		return (res);
635 	}
636 
637 	dsname = devzvol_make_dsname(parent->sdev_path, nm);
638 	rw_exit(&parent->sdev_contents);
639 	sdcmn_err13(("rvp dsname %s", dsname ? dsname : "(null)"));
640 	if (dsname) {
641 		error = devzvol_objset_check(dsname, &do_type);
642 		if (error != 0) {
643 			error = ENOENT;
644 			goto out;
645 		}
646 		if (do_type == DMU_OST_ZVOL)
647 			expected_type = VLNK;
648 	}
649 	/*
650 	 * the callbacks expect:
651 	 *
652 	 * parent->sdev_path		   nm
653 	 * /dev/zvol			   {r}dsk
654 	 * /dev/zvol/{r}dsk		   <pool name>
655 	 * /dev/zvol/{r}dsk/<dataset name> <last ds component>
656 	 *
657 	 * sdev_name is always last path component of sdev_path
658 	 */
659 	if (expected_type == VDIR) {
660 		error = devname_lookup_func(parent, nm, vpp, cred,
661 		    devzvol_create_dir, SDEV_VATTR);
662 	} else {
663 		error = devname_lookup_func(parent, nm, vpp, cred,
664 		    devzvol_create_link, SDEV_VLINK);
665 	}
666 	sdcmn_err13(("devzvol_lookup %d %d", expected_type, error));
667 	ASSERT(error || ((*vpp)->v_type == expected_type));
668 out:
669 	if (dsname)
670 		kmem_free(dsname, strlen(dsname) + 1);
671 	sdcmn_err13(("devzvol_lookup %d", error));
672 	return (error);
673 }
674 
675 /*
676  * We allow create to find existing nodes
677  *	- if the node doesn't exist - EROFS
678  *	- creating an existing dir read-only succeeds, otherwise EISDIR
679  *	- exclusive creates fail - EEXIST
680  */
681 /*ARGSUSED2*/
682 static int
683 devzvol_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
684     int mode, struct vnode **vpp, struct cred *cred, int flag,
685     caller_context_t *ct, vsecattr_t *vsecp)
686 {
687 	int error;
688 	struct vnode *vp;
689 
690 	*vpp = NULL;
691 
692 	error = devzvol_lookup(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL,
693 	    NULL);
694 	if (error == 0) {
695 		if (excl == EXCL)
696 			error = EEXIST;
697 		else if (vp->v_type == VDIR && (mode & VWRITE))
698 			error = EISDIR;
699 		else
700 			error = VOP_ACCESS(vp, mode, 0, cred, ct);
701 
702 		if (error) {
703 			VN_RELE(vp);
704 		} else
705 			*vpp = vp;
706 	} else if (error == ENOENT) {
707 		error = EROFS;
708 	}
709 
710 	return (error);
711 }
712 
713 void sdev_iter_snapshots(struct vnode *dvp, char *name);
714 
715 void
716 sdev_iter_datasets(struct vnode *dvp, int arg, char *name)
717 {
718 	zfs_cmd_t	*zc;
719 	int rc;
720 
721 	sdcmn_err13(("iter name is '%s' (arg %x)", name, arg));
722 	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
723 	(void) strcpy(zc->zc_name, name);
724 
725 	while ((rc = devzvol_handle_ioctl(arg, zc, B_FALSE)) == 0) {
726 		struct vnode *vpp;
727 		char *ptr;
728 
729 		sdcmn_err13(("  name %s", zc->zc_name));
730 		if (strchr(zc->zc_name, '$') || strchr(zc->zc_name, '%'))
731 			goto skip;
732 		ptr = strrchr(zc->zc_name, '/') + 1;
733 		rc = devzvol_lookup(dvp, ptr, &vpp, NULL, 0, NULL,
734 		    kcred, NULL, NULL, NULL);
735 		if (rc == 0) {
736 			VN_RELE(vpp);
737 		} else if (rc == ENOENT) {
738 			goto skip;
739 		} else {
740 			/*
741 			 * EBUSY == problem with zvols's dmu holds?
742 			 * EPERM when in a NGZ and traversing up and out.
743 			 */
744 			goto skip;
745 		}
746 		if (arg == ZFS_IOC_DATASET_LIST_NEXT &&
747 		    zc->zc_objset_stats.dds_type != DMU_OST_ZFS)
748 			sdev_iter_snapshots(dvp, zc->zc_name);
749 skip:
750 		(void) strcpy(zc->zc_name, name);
751 	}
752 	kmem_free(zc, sizeof (zfs_cmd_t));
753 }
754 
755 void
756 sdev_iter_snapshots(struct vnode *dvp, char *name)
757 {
758 	sdev_iter_datasets(dvp, ZFS_IOC_SNAPSHOT_LIST_NEXT, name);
759 }
760 
761 /*ARGSUSED4*/
762 static int
763 devzvol_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
764     int *eofp, caller_context_t *ct_unused, int flags_unused)
765 {
766 	struct sdev_node *sdvp = VTOSDEV(dvp);
767 	char *ptr;
768 
769 	sdcmn_err13(("zv readdir of '%s' %s'", sdvp->sdev_path,
770 	    sdvp->sdev_name));
771 
772 	if (strcmp(sdvp->sdev_path, ZVOL_DIR) == 0) {
773 		struct vnode *vp;
774 
775 		rw_exit(&sdvp->sdev_contents);
776 		(void) devname_lookup_func(sdvp, "dsk", &vp, cred,
777 		    devzvol_create_dir, SDEV_VATTR);
778 		VN_RELE(vp);
779 		(void) devname_lookup_func(sdvp, "rdsk", &vp, cred,
780 		    devzvol_create_dir, SDEV_VATTR);
781 		VN_RELE(vp);
782 		rw_enter(&sdvp->sdev_contents, RW_READER);
783 		return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
784 	}
785 	if (uiop->uio_offset == 0)
786 		devzvol_prunedir(sdvp);
787 	ptr = sdvp->sdev_path + strlen(ZVOL_DIR);
788 	if ((strcmp(ptr, "/dsk") == 0) || (strcmp(ptr, "/rdsk") == 0)) {
789 		rw_exit(&sdvp->sdev_contents);
790 		devzvol_create_pool_dirs(dvp);
791 		rw_enter(&sdvp->sdev_contents, RW_READER);
792 		return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
793 	}
794 
795 	ptr = strchr(ptr + 1, '/');
796 	if (ptr == NULL)
797 		return (ENOENT);
798 	ptr++;
799 	rw_exit(&sdvp->sdev_contents);
800 	sdev_iter_datasets(dvp, ZFS_IOC_DATASET_LIST_NEXT, ptr);
801 	rw_enter(&sdvp->sdev_contents, RW_READER);
802 	return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
803 }
804 
/*
 * vnode operation template for /dev/zvol directories.  readdir, lookup
 * and create are implemented in this file; rename, mkdir, rmdir, remove
 * and symlink are routed to fs_nosys.  The table ends with a NULL entry.
 */
const fs_operation_def_t devzvol_vnodeops_tbl[] = {
	VOPNAME_READDIR,	{ .vop_readdir = devzvol_readdir },
	VOPNAME_LOOKUP,		{ .vop_lookup = devzvol_lookup },
	VOPNAME_CREATE,		{ .vop_create = devzvol_create },
	VOPNAME_RENAME,		{ .error = fs_nosys },
	VOPNAME_MKDIR,		{ .error = fs_nosys },
	VOPNAME_RMDIR,		{ .error = fs_nosys },
	VOPNAME_REMOVE,		{ .error = fs_nosys },
	VOPNAME_SYMLINK,	{ .error = fs_nosys },
	NULL,			NULL
};
816