xref: /titanic_54/usr/src/uts/common/fs/dev/sdev_zvolops.c (revision 0ad555ad6a787635be8c8a424168dc59cfbce6c7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2013 Joyent, Inc.  All rights reserved.
25  */
26 
27 /* vnode ops for the /dev/zvol directory */
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/sysmacros.h>
32 #include <sys/ddi.h>
33 #include <sys/sunndi.h>
34 #include <sys/sunldi.h>
35 #include <fs/fs_subr.h>
36 #include <sys/fs/dv_node.h>
37 #include <sys/fs/sdev_impl.h>
38 #include <sys/zfs_ioctl.h>
39 #include <sys/policy.h>
40 #include <sys/stat.h>
41 #include <sys/vfs_opreg.h>
42 
/* vnode operations vector for the /dev/zvol directory */
struct vnodeops	*devzvol_vnodeops;
/* major number of the /dev/zfs device, recorded by devzvol_open_zfs() */
static major_t devzvol_major;
/* taskq entry passed to taskq_dispatch_ent() by devzvol_update_zclist() */
static taskq_ent_t devzvol_zclist_task;

static kmutex_t devzvol_mtx;
/* Below are protected by devzvol_mtx */
/* B_TRUE once devzvol_open_zfs() has succeeded, until devzvol_close_zfs() */
static boolean_t devzvol_isopen;
/* B_TRUE from dispatch of the zclist update task until its callback ends */
static boolean_t devzvol_zclist_task_running = B_FALSE;
/* cookie from the last successful ZFS_IOC_POOL_CONFIGS ("generation") */
static uint64_t devzvol_gen = 0;
/* address of the buffer holding the packed pool-config nvlist */
static uint64_t devzvol_zclist;
/* allocated size of that buffer (what kmem_free() must be given) */
static size_t devzvol_zclist_size;
/* LDI identity and handle used to issue ioctls to /dev/zfs */
static ldi_ident_t devzvol_li;
static ldi_handle_t devzvol_lh;

/*
 * we need to use ddi_mod* since fs/dev gets loaded early on in
 * startup(), and linking fs/dev to fs/zfs would drag in a lot of
 * other stuff (like drv/random) before the rest of the system is
 * ready to go
 */
ddi_modhandle_t zfs_mod;
/* zvol_create_minor(), resolved from fs/zfs by devzvol_open_zfs() */
int (*szcm)(char *);
/* zvol_name2minor(), resolved from fs/zfs by devzvol_open_zfs() */
int (*szn2m)(char *, minor_t *);
66 
67 int
68 sdev_zvol_create_minor(char *dsname)
69 {
70 	if (szcm == NULL)
71 		return (-1);
72 	return ((*szcm)(dsname));
73 }
74 
75 int
76 sdev_zvol_name2minor(char *dsname, minor_t *minor)
77 {
78 	if (szn2m == NULL)
79 		return (-1);
80 	return ((*szn2m)(dsname, minor));
81 }
82 
83 int
84 devzvol_open_zfs()
85 {
86 	int rc;
87 	dev_t dv;
88 
89 	devzvol_li = ldi_ident_from_anon();
90 	if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred,
91 	    &devzvol_lh, devzvol_li))
92 		return (-1);
93 	if (zfs_mod == NULL && ((zfs_mod = ddi_modopen("fs/zfs",
94 	    KRTLD_MODE_FIRST, &rc)) == NULL)) {
95 		return (rc);
96 	}
97 	ASSERT(szcm == NULL && szn2m == NULL);
98 	if ((szcm = (int (*)(char *))
99 	    ddi_modsym(zfs_mod, "zvol_create_minor", &rc)) == NULL) {
100 		cmn_err(CE_WARN, "couldn't resolve zvol_create_minor");
101 		return (rc);
102 	}
103 	if ((szn2m = (int(*)(char *, minor_t *))
104 	    ddi_modsym(zfs_mod, "zvol_name2minor", &rc)) == NULL) {
105 		cmn_err(CE_WARN, "couldn't resolve zvol_name2minor");
106 		return (rc);
107 	}
108 	if (ldi_get_dev(devzvol_lh, &dv))
109 		return (-1);
110 	devzvol_major = getmajor(dv);
111 	return (0);
112 }
113 
/*
 * Undo devzvol_open_zfs(): forget the resolved zvol entry points, close
 * the LDI handle on /dev/zfs, release the LDI identity and close the
 * fs/zfs module handle if one is held.
 */
void
devzvol_close_zfs()
{
	/* clear the pointers first so the sdev_zvol_*() wrappers return -1 */
	szcm = NULL;
	szn2m = NULL;
	(void) ldi_close(devzvol_lh, FREAD|FWRITE, kcred);
	ldi_ident_release(devzvol_li);
	if (zfs_mod != NULL) {
		(void) ddi_modclose(zfs_mod);
		zfs_mod = NULL;
	}
}
126 
127 int
128 devzvol_handle_ioctl(int cmd, zfs_cmd_t *zc, size_t *alloc_size)
129 {
130 	uint64_t cookie;
131 	int size = 8000;
132 	int unused;
133 	int rc;
134 
135 	if (cmd != ZFS_IOC_POOL_CONFIGS)
136 		mutex_enter(&devzvol_mtx);
137 	if (!devzvol_isopen) {
138 		if ((rc = devzvol_open_zfs()) == 0) {
139 			devzvol_isopen = B_TRUE;
140 		} else {
141 			if (cmd != ZFS_IOC_POOL_CONFIGS)
142 				mutex_exit(&devzvol_mtx);
143 			return (ENXIO);
144 		}
145 	}
146 	cookie = zc->zc_cookie;
147 again:
148 	zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(size,
149 	    KM_SLEEP);
150 	zc->zc_nvlist_dst_size = size;
151 	rc = ldi_ioctl(devzvol_lh, cmd, (intptr_t)zc, FKIOCTL, kcred,
152 	    &unused);
153 	if (rc == ENOMEM) {
154 		int newsize;
155 		newsize = zc->zc_nvlist_dst_size;
156 		ASSERT(newsize > size);
157 		kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
158 		size = newsize;
159 		zc->zc_cookie = cookie;
160 		goto again;
161 	}
162 	if (alloc_size == NULL)
163 		kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
164 	else
165 		*alloc_size = size;
166 	if (cmd != ZFS_IOC_POOL_CONFIGS)
167 		mutex_exit(&devzvol_mtx);
168 	return (rc);
169 }
170 
171 /* figures out if the objset exists and returns its type */
172 int
173 devzvol_objset_check(char *dsname, dmu_objset_type_t *type)
174 {
175 	boolean_t	ispool;
176 	zfs_cmd_t	*zc;
177 	int rc;
178 
179 	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
180 	(void) strlcpy(zc->zc_name, dsname, MAXPATHLEN);
181 
182 	ispool = (strchr(dsname, '/') == NULL) ? B_TRUE : B_FALSE;
183 	if (!ispool && sdev_zvol_name2minor(dsname, NULL) == 0) {
184 		sdcmn_err13(("found cached minor node"));
185 		if (type)
186 			*type = DMU_OST_ZVOL;
187 		kmem_free(zc, sizeof (zfs_cmd_t));
188 		return (0);
189 	}
190 	rc = devzvol_handle_ioctl(ispool ? ZFS_IOC_POOL_STATS :
191 	    ZFS_IOC_OBJSET_STATS, zc, NULL);
192 	if (type && rc == 0)
193 		*type = (ispool) ? DMU_OST_ZFS :
194 		    zc->zc_objset_stats.dds_type;
195 	kmem_free(zc, sizeof (zfs_cmd_t));
196 	return (rc);
197 }
198 
199 /*
200  * returns what the zfs dataset name should be, given the /dev/zvol
201  * path and an optional name; otherwise NULL
202  */
203 char *
204 devzvol_make_dsname(const char *path, const char *name)
205 {
206 	char *dsname;
207 	const char *ptr;
208 	int dslen;
209 
210 	if (strcmp(path, ZVOL_DIR) == 0)
211 		return (NULL);
212 	if (name && (strcmp(name, ".") == 0 || strcmp(name, "..") == 0))
213 		return (NULL);
214 	ptr = path + strlen(ZVOL_DIR);
215 	if (strncmp(ptr, "/dsk", 4) == 0)
216 		ptr += strlen("/dsk");
217 	else if (strncmp(ptr, "/rdsk", 5) == 0)
218 		ptr += strlen("/rdsk");
219 	else
220 		return (NULL);
221 	if (*ptr == '/')
222 		ptr++;
223 
224 	dslen = strlen(ptr);
225 	if (dslen)
226 		dslen++;			/* plus null */
227 	if (name)
228 		dslen += strlen(name) + 1;	/* plus slash */
229 	dsname = kmem_zalloc(dslen, KM_SLEEP);
230 	if (*ptr) {
231 		(void) strlcpy(dsname, ptr, dslen);
232 		if (name)
233 			(void) strlcat(dsname, "/", dslen);
234 	}
235 	if (name)
236 		(void) strlcat(dsname, name, dslen);
237 	return (dsname);
238 }
239 
240 /*
241  * check if the zvol's sdev_node is still valid, which means make
242  * sure the zvol is still valid.  zvol minors aren't proactively
243  * destroyed when the zvol is destroyed, so we use a validator to clean
244  * these up (in other words, when such nodes are encountered during
245  * subsequent lookup() and readdir() operations) so that only valid
246  * nodes are returned.  The ordering between devname_lookup_func and
247  * devzvol_validate is a little inefficient in the case of invalid
248  * or stale nodes because devname_lookup_func calls
249  * devzvol_create_{dir, link}, then the validator says it's invalid,
250  * and then the node gets cleaned up.
251  */
252 int
253 devzvol_validate(struct sdev_node *dv)
254 {
255 	dmu_objset_type_t do_type;
256 	char *dsname;
257 	char *nm = dv->sdev_name;
258 	int rc;
259 
260 	sdcmn_err13(("validating ('%s' '%s')", dv->sdev_path, nm));
261 	/*
262 	 * validate only READY nodes; if someone is sitting on the
263 	 * directory of a dataset that just got destroyed we could
264 	 * get a zombie node which we just skip.
265 	 */
266 	if (dv->sdev_state != SDEV_READY) {
267 		sdcmn_err13(("skipping '%s'", nm));
268 		return (SDEV_VTOR_SKIP);
269 	}
270 
271 	if ((strcmp(dv->sdev_path, ZVOL_DIR "/dsk") == 0) ||
272 	    (strcmp(dv->sdev_path, ZVOL_DIR "/rdsk") == 0))
273 		return (SDEV_VTOR_VALID);
274 	dsname = devzvol_make_dsname(dv->sdev_path, NULL);
275 	if (dsname == NULL)
276 		return (SDEV_VTOR_INVALID);
277 
278 	rc = devzvol_objset_check(dsname, &do_type);
279 	sdcmn_err13(("  '%s' rc %d", dsname, rc));
280 	if (rc != 0) {
281 		kmem_free(dsname, strlen(dsname) + 1);
282 		return (SDEV_VTOR_INVALID);
283 	}
284 	sdcmn_err13(("  v_type %d do_type %d",
285 	    SDEVTOV(dv)->v_type, do_type));
286 	if ((SDEVTOV(dv)->v_type == VLNK && do_type != DMU_OST_ZVOL) ||
287 	    ((SDEVTOV(dv)->v_type == VBLK || SDEVTOV(dv)->v_type == VCHR) &&
288 	    do_type != DMU_OST_ZVOL) ||
289 	    (SDEVTOV(dv)->v_type == VDIR && do_type == DMU_OST_ZVOL)) {
290 		kmem_free(dsname, strlen(dsname) + 1);
291 		return (SDEV_VTOR_STALE);
292 	}
293 	if (SDEVTOV(dv)->v_type == VLNK) {
294 		char *ptr, *link;
295 		long val = 0;
296 		minor_t lminor, ominor;
297 
298 		rc = sdev_getlink(SDEVTOV(dv), &link);
299 		ASSERT(rc == 0);
300 
301 		ptr = strrchr(link, ':') + 1;
302 		rc = ddi_strtol(ptr, NULL, 10, &val);
303 		kmem_free(link, strlen(link) + 1);
304 		ASSERT(rc == 0 && val != 0);
305 		lminor = (minor_t)val;
306 		if (sdev_zvol_name2minor(dsname, &ominor) < 0 ||
307 		    ominor != lminor) {
308 			kmem_free(dsname, strlen(dsname) + 1);
309 			return (SDEV_VTOR_STALE);
310 		}
311 	}
312 	kmem_free(dsname, strlen(dsname) + 1);
313 	return (SDEV_VTOR_VALID);
314 }
315 
316 /*
317  * Taskq callback to update the devzvol_zclist.
318  *
319  * We need to defer this to the taskq to avoid it running with a user
320  * context that might be associated with some non-global zone, and thus
321  * not being able to list all of the pools on the entire system.
322  */
323 /*ARGSUSED*/
324 static void
325 devzvol_update_zclist_cb(void *arg)
326 {
327 	zfs_cmd_t	*zc;
328 	int		rc;
329 	size_t		size;
330 
331 	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
332 	mutex_enter(&devzvol_mtx);
333 	zc->zc_cookie = devzvol_gen;
334 
335 	rc = devzvol_handle_ioctl(ZFS_IOC_POOL_CONFIGS, zc, &size);
336 	switch (rc) {
337 		case 0:
338 			/* new generation */
339 			ASSERT(devzvol_gen != zc->zc_cookie);
340 			devzvol_gen = zc->zc_cookie;
341 			if (devzvol_zclist)
342 				kmem_free((void *)(uintptr_t)devzvol_zclist,
343 				    devzvol_zclist_size);
344 			devzvol_zclist = zc->zc_nvlist_dst;
345 			/* Keep the alloc'd size, not the nvlist size. */
346 			devzvol_zclist_size = size;
347 			break;
348 		default:
349 			/*
350 			 * Either there was no change in pool configuration
351 			 * since we last asked (rc == EEXIST) or we got a
352 			 * catastrophic error.
353 			 *
354 			 * Give up memory and exit.
355 			 */
356 			kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst,
357 			    size);
358 			break;
359 	}
360 
361 	VERIFY(devzvol_zclist_task_running == B_TRUE);
362 	devzvol_zclist_task_running = B_FALSE;
363 	mutex_exit(&devzvol_mtx);
364 
365 	kmem_free(zc, sizeof (zfs_cmd_t));
366 }
367 
368 static void
369 devzvol_update_zclist(void)
370 {
371 	mutex_enter(&devzvol_mtx);
372 	if (devzvol_zclist_task_running == B_TRUE) {
373 		mutex_exit(&devzvol_mtx);
374 		goto wait;
375 	}
376 
377 	devzvol_zclist_task_running = B_TRUE;
378 
379 	taskq_dispatch_ent(sdev_taskq, devzvol_update_zclist_cb, NULL, 0,
380 	    &devzvol_zclist_task);
381 
382 	mutex_exit(&devzvol_mtx);
383 
384 wait:
385 	taskq_wait(sdev_taskq);
386 }
387 
388 /*
389  * Creates sub-directories for each zpool as needed in response to a
390  * readdir on one of the /dev/zvol/{dsk,rdsk} directories.
391  */
392 void
393 devzvol_create_pool_dirs(struct vnode *dvp)
394 {
395 	nvlist_t *nv = NULL;
396 	nvpair_t *elem = NULL;
397 	int pools = 0;
398 	int rc;
399 
400 	sdcmn_err13(("devzvol_create_pool_dirs"));
401 
402 	devzvol_update_zclist();
403 
404 	mutex_enter(&devzvol_mtx);
405 
406 	rc = nvlist_unpack((char *)(uintptr_t)devzvol_zclist,
407 	    devzvol_zclist_size, &nv, 0);
408 	if (rc) {
409 		ASSERT(rc == 0);
410 		kmem_free((void *)(uintptr_t)devzvol_zclist,
411 		    devzvol_zclist_size);
412 		devzvol_gen = 0;
413 		devzvol_zclist = NULL;
414 		devzvol_zclist_size = 0;
415 		goto out;
416 	}
417 	mutex_exit(&devzvol_mtx);
418 	while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
419 		struct vnode *vp;
420 		ASSERT(dvp->v_count > 0);
421 		rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
422 		    NULL, kcred, NULL, 0, NULL);
423 		/* should either work, or not be visible from a zone */
424 		ASSERT(rc == 0 || rc == ENOENT);
425 		if (rc == 0)
426 			VN_RELE(vp);
427 		pools++;
428 	}
429 	nvlist_free(nv);
430 	mutex_enter(&devzvol_mtx);
431 	if (devzvol_isopen && pools == 0) {
432 		/* clean up so zfs can be unloaded */
433 		devzvol_close_zfs();
434 		devzvol_isopen = B_FALSE;
435 	}
436 out:
437 	mutex_exit(&devzvol_mtx);
438 }
439 
440 /*ARGSUSED3*/
441 static int
442 devzvol_create_dir(struct sdev_node *ddv, char *nm, void **arg,
443     cred_t *cred, void *whatever, char *whichever)
444 {
445 	timestruc_t now;
446 	struct vattr *vap = (struct vattr *)arg;
447 
448 	sdcmn_err13(("create_dir (%s) (%s) '%s'", ddv->sdev_name,
449 	    ddv->sdev_path, nm));
450 	ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR,
451 	    strlen(ZVOL_DIR)) == 0);
452 	*vap = *sdev_getdefault_attr(VDIR);
453 	gethrestime(&now);
454 	vap->va_atime = now;
455 	vap->va_mtime = now;
456 	vap->va_ctime = now;
457 	return (0);
458 }
459 
460 /*ARGSUSED3*/
461 static int
462 devzvol_create_link(struct sdev_node *ddv, char *nm,
463     void **arg, cred_t *cred, void *whatever, char *whichever)
464 {
465 	minor_t minor;
466 	char *pathname = (char *)*arg;
467 	int rc;
468 	char *dsname;
469 	char *x;
470 	char str[MAXNAMELEN];
471 	sdcmn_err13(("create_link (%s) (%s) '%s'", ddv->sdev_name,
472 	    ddv->sdev_path, nm));
473 	dsname = devzvol_make_dsname(ddv->sdev_path, nm);
474 	rc = sdev_zvol_create_minor(dsname);
475 	if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
476 	    sdev_zvol_name2minor(dsname, &minor)) {
477 		sdcmn_err13(("devzvol_create_link %d", rc));
478 		kmem_free(dsname, strlen(dsname) + 1);
479 		return (-1);
480 	}
481 	kmem_free(dsname, strlen(dsname) + 1);
482 
483 	/*
484 	 * This is a valid zvol; create a symlink that points to the
485 	 * minor which was created under /devices/pseudo/zfs@0
486 	 */
487 	*pathname = '\0';
488 	for (x = ddv->sdev_path; x = strchr(x, '/'); x++)
489 		(void) strcat(pathname, "../");
490 	(void) snprintf(str, sizeof (str), ZVOL_PSEUDO_DEV "%u", minor);
491 	(void) strncat(pathname, str, MAXPATHLEN);
492 	if (strncmp(ddv->sdev_path, ZVOL_FULL_RDEV_DIR,
493 	    strlen(ZVOL_FULL_RDEV_DIR)) == 0)
494 		(void) strcat(pathname, ",raw");
495 	return (0);
496 }
497 
/*
 * Clean zvol sdev_nodes that are no longer valid.
 *
 * Called with ddv->sdev_contents held as reader; the lock is held as
 * writer while pruning and downgraded back to reader before returning.
 * Every child of ddv is run through devzvol_validate(); children that
 * are neither VALID nor SKIP are removed from the directory cache.
 */
static void
devzvol_prunedir(struct sdev_node *ddv)
{
	struct sdev_node *dv;

	ASSERT(RW_READ_HELD(&ddv->sdev_contents));

	sdcmn_err13(("prunedir '%s'", ddv->sdev_name));
	ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
	/*
	 * Try to upgrade in place; if that fails the lock is dropped and
	 * re-taken as writer, so it is briefly not held at all.
	 */
	if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
		rw_exit(&ddv->sdev_contents);
		rw_enter(&ddv->sdev_contents, RW_WRITER);
	}

	dv = SDEV_FIRST_ENTRY(ddv);
	while (dv) {
		sdcmn_err13(("sdev_name '%s'", dv->sdev_name));

		/*
		 * Any result without a case below (e.g. SDEV_VTOR_STALE)
		 * leaves the switch and is removed like an invalid node.
		 */
		switch (devzvol_validate(dv)) {
		case SDEV_VTOR_VALID:
		case SDEV_VTOR_SKIP:
			dv = SDEV_NEXT_ENTRY(ddv, dv);
			continue;
		case SDEV_VTOR_INVALID:
			sdcmn_err7(("prunedir: destroy invalid "
			    "node: %s\n", dv->sdev_name));
			break;
		}

		/* a directory that cannot be emptied is left in place */
		if ((SDEVTOV(dv)->v_type == VDIR) &&
		    (sdev_cleandir(dv, NULL, 0) != 0)) {
			dv = SDEV_NEXT_ENTRY(ddv, dv);
			continue;
		}
		SDEV_HOLD(dv);
		/* remove the cache node */
		sdev_cache_update(ddv, &dv, dv->sdev_name,
		    SDEV_CACHE_DELETE);
		SDEV_RELE(dv);
		/* the list changed under us; start again from the top */
		dv = SDEV_FIRST_ENTRY(ddv);
	}
	rw_downgrade(&ddv->sdev_contents);
}
542 
543 /*
544  * This function is used to create a dir or dev inside a zone's /dev when the
545  * zone has a zvol that is dynamically created within the zone (i.e. inside
546  * of a delegated dataset.  Since there is no /devices tree within a zone,
547  * we create the chr/blk devices directly inside the zone's /dev instead of
548  * making symlinks.
549  */
550 static int
551 devzvol_mk_ngz_node(struct sdev_node *parent, char *nm)
552 {
553 	struct vattr vattr;
554 	timestruc_t now;
555 	enum vtype expected_type = VDIR;
556 	dmu_objset_type_t do_type;
557 	struct sdev_node *dv = NULL;
558 	int res;
559 	char *dsname;
560 
561 	bzero(&vattr, sizeof (vattr));
562 	gethrestime(&now);
563 	vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
564 	vattr.va_uid = SDEV_UID_DEFAULT;
565 	vattr.va_gid = SDEV_GID_DEFAULT;
566 	vattr.va_type = VNON;
567 	vattr.va_atime = now;
568 	vattr.va_mtime = now;
569 	vattr.va_ctime = now;
570 
571 	if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL)
572 		return (ENOENT);
573 
574 	if (devzvol_objset_check(dsname, &do_type) != 0) {
575 		kmem_free(dsname, strlen(dsname) + 1);
576 		return (ENOENT);
577 	}
578 	if (do_type == DMU_OST_ZVOL)
579 		expected_type = VBLK;
580 
581 	if (expected_type == VDIR) {
582 		vattr.va_type = VDIR;
583 		vattr.va_mode = SDEV_DIRMODE_DEFAULT;
584 	} else {
585 		minor_t minor;
586 		dev_t devnum;
587 		int rc;
588 
589 		rc = sdev_zvol_create_minor(dsname);
590 		if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
591 		    sdev_zvol_name2minor(dsname, &minor)) {
592 			kmem_free(dsname, strlen(dsname) + 1);
593 			return (ENOENT);
594 		}
595 
596 		devnum = makedevice(devzvol_major, minor);
597 		vattr.va_rdev = devnum;
598 
599 		if (strstr(parent->sdev_path, "/rdsk/") != NULL)
600 			vattr.va_type = VCHR;
601 		else
602 			vattr.va_type = VBLK;
603 		vattr.va_mode = SDEV_DEVMODE_DEFAULT;
604 	}
605 	kmem_free(dsname, strlen(dsname) + 1);
606 
607 	rw_enter(&parent->sdev_contents, RW_WRITER);
608 
609 	res = sdev_mknode(parent, nm, &dv, &vattr,
610 	    NULL, NULL, kcred, SDEV_READY);
611 	rw_exit(&parent->sdev_contents);
612 	if (res != 0)
613 		return (ENOENT);
614 
615 	SDEV_RELE(dv);
616 	return (0);
617 }
618 
/*
 * VOP_LOOKUP for the /dev/zvol tree: look up `nm' in directory dvp and
 * return the held vnode in *vpp.
 *
 * For a global /dev instance the node is created on demand as a
 * directory, or as a symlink when the dataset is a zvol.  For a
 * non-global-zone /dev instance the zone's profile is consulted and, on
 * ENOENT, a node is created with devzvol_mk_ngz_node().
 *
 * Returns 0 on success; EPERM when the caller's zone does not match the
 * kind of /dev instance being searched; ENOENT when the dataset does
 * not exist; otherwise the error from VOP_ACCESS(), prof_lookup() or
 * devname_lookup_func().
 */
/*ARGSUSED*/
static int
devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
    struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
    caller_context_t *ct, int *direntflags, pathname_t *realpnp)
{
	enum vtype expected_type = VDIR;
	struct sdev_node *parent = VTOSDEV(dvp);
	char *dsname;
	dmu_objset_type_t do_type;
	int error;

	sdcmn_err13(("devzvol_lookup '%s' '%s'", parent->sdev_path, nm));
	*vpp = NULL;
	/* execute access is required to search the directory */
	if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
		return (error);

	rw_enter(&parent->sdev_contents, RW_READER);
	if (SDEV_IS_GLOBAL(parent)) {
		/*
		 * During iter_datasets, don't create GZ dev when running in
		 * NGZ.  We can't return ENOENT here since that could
		 * incorrectly trigger the creation of the dev from the
		 * recursive call through prof_filldir during iter_datasets.
		 */
		if (getzoneid() != GLOBAL_ZONEID) {
			rw_exit(&parent->sdev_contents);
			return (EPERM);
		}
	} else {
		int res;

		rw_exit(&parent->sdev_contents);

		/*
		 * If we're in the global zone and reach down into a non-global
		 * zone's /dev/zvol then this action could trigger the creation
		 * of all of the zvol devices for every zone into the non-global
		 * zone's /dev tree. This could be a big security hole. To
		 * prevent this, disallow the global zone from looking inside
		 * a non-global zone's /dev/zvol. This behavior is similar to
		 * delegated datasets, which cannot be used by the global zone.
		 */
		if (getzoneid() == GLOBAL_ZONEID)
			return (EPERM);

		res = prof_lookup(dvp, nm, vpp, cred);

		/*
		 * We won't find a zvol that was dynamically created inside
		 * a NGZ, within a delegated dataset, in the zone's dev profile
		 * but prof_lookup will also find it via sdev_cache_lookup.
		 */
		if (res == ENOENT) {
			/*
			 * We have to create the sdev node for the dynamically
			 * created zvol.
			 */
			if (devzvol_mk_ngz_node(parent, nm) != 0)
				return (ENOENT);
			res = prof_lookup(dvp, nm, vpp, cred);
		}

		return (res);
	}

	/* global /dev, global-zone caller: sdev_contents still held here */
	dsname = devzvol_make_dsname(parent->sdev_path, nm);
	rw_exit(&parent->sdev_contents);
	sdcmn_err13(("rvp dsname %s", dsname ? dsname : "(null)"));
	/* a NULL dsname leaves expected_type at VDIR without any check */
	if (dsname) {
		error = devzvol_objset_check(dsname, &do_type);
		if (error != 0) {
			error = ENOENT;
			goto out;
		}
		if (do_type == DMU_OST_ZVOL)
			expected_type = VLNK;
	}
	/*
	 * the callbacks expect:
	 *
	 * parent->sdev_path		   nm
	 * /dev/zvol			   {r}dsk
	 * /dev/zvol/{r}dsk		   <pool name>
	 * /dev/zvol/{r}dsk/<dataset name> <last ds component>
	 *
	 * sdev_name is always last path component of sdev_path
	 */
	if (expected_type == VDIR) {
		error = devname_lookup_func(parent, nm, vpp, cred,
		    devzvol_create_dir, SDEV_VATTR);
	} else {
		error = devname_lookup_func(parent, nm, vpp, cred,
		    devzvol_create_link, SDEV_VLINK);
	}
	sdcmn_err13(("devzvol_lookup %d %d", expected_type, error));
	ASSERT(error || ((*vpp)->v_type == expected_type));
out:
	if (dsname)
		kmem_free(dsname, strlen(dsname) + 1);
	sdcmn_err13(("devzvol_lookup %d", error));
	return (error);
}
723 
724 /*
725  * We allow create to find existing nodes
726  *	- if the node doesn't exist - EROFS
727  *	- creating an existing dir read-only succeeds, otherwise EISDIR
728  *	- exclusive creates fail - EEXIST
729  */
730 /*ARGSUSED2*/
731 static int
732 devzvol_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
733     int mode, struct vnode **vpp, struct cred *cred, int flag,
734     caller_context_t *ct, vsecattr_t *vsecp)
735 {
736 	int error;
737 	struct vnode *vp;
738 
739 	*vpp = NULL;
740 
741 	error = devzvol_lookup(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL,
742 	    NULL);
743 	if (error == 0) {
744 		if (excl == EXCL)
745 			error = EEXIST;
746 		else if (vp->v_type == VDIR && (mode & VWRITE))
747 			error = EISDIR;
748 		else
749 			error = VOP_ACCESS(vp, mode, 0, cred, ct);
750 
751 		if (error) {
752 			VN_RELE(vp);
753 		} else
754 			*vpp = vp;
755 	} else if (error == ENOENT) {
756 		error = EROFS;
757 	}
758 
759 	return (error);
760 }
761 
762 void sdev_iter_snapshots(struct vnode *dvp, char *name);
763 
764 void
765 sdev_iter_datasets(struct vnode *dvp, int arg, char *name)
766 {
767 	zfs_cmd_t	*zc;
768 	int rc;
769 
770 	sdcmn_err13(("iter name is '%s' (arg %x)", name, arg));
771 	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
772 	(void) strcpy(zc->zc_name, name);
773 
774 	while ((rc = devzvol_handle_ioctl(arg, zc, B_FALSE)) == 0) {
775 		struct vnode *vpp;
776 		char *ptr;
777 
778 		sdcmn_err13(("  name %s", zc->zc_name));
779 		if (strchr(zc->zc_name, '$') || strchr(zc->zc_name, '%'))
780 			goto skip;
781 		ptr = strrchr(zc->zc_name, '/') + 1;
782 		rc = devzvol_lookup(dvp, ptr, &vpp, NULL, 0, NULL,
783 		    kcred, NULL, NULL, NULL);
784 		if (rc == 0) {
785 			VN_RELE(vpp);
786 		} else if (rc == ENOENT) {
787 			goto skip;
788 		} else {
789 			/*
790 			 * EBUSY == problem with zvols's dmu holds?
791 			 * EPERM when in a NGZ and traversing up and out.
792 			 */
793 			goto skip;
794 		}
795 		if (arg == ZFS_IOC_DATASET_LIST_NEXT &&
796 		    zc->zc_objset_stats.dds_type != DMU_OST_ZFS)
797 			sdev_iter_snapshots(dvp, zc->zc_name);
798 skip:
799 		(void) strcpy(zc->zc_name, name);
800 	}
801 	kmem_free(zc, sizeof (zfs_cmd_t));
802 }
803 
/*
 * Iterate the snapshots of dataset `name': the same walk as
 * sdev_iter_datasets(), driven by ZFS_IOC_SNAPSHOT_LIST_NEXT.
 */
void
sdev_iter_snapshots(struct vnode *dvp, char *name)
{
	sdev_iter_datasets(dvp, ZFS_IOC_SNAPSHOT_LIST_NEXT, name);
}
809 
810 /*ARGSUSED4*/
811 static int
812 devzvol_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
813     int *eofp, caller_context_t *ct_unused, int flags_unused)
814 {
815 	struct sdev_node *sdvp = VTOSDEV(dvp);
816 	char *ptr;
817 
818 	sdcmn_err13(("zv readdir of '%s' %s'", sdvp->sdev_path,
819 	    sdvp->sdev_name));
820 
821 	if (strcmp(sdvp->sdev_path, ZVOL_DIR) == 0) {
822 		struct vnode *vp;
823 
824 		rw_exit(&sdvp->sdev_contents);
825 		(void) devname_lookup_func(sdvp, "dsk", &vp, cred,
826 		    devzvol_create_dir, SDEV_VATTR);
827 		VN_RELE(vp);
828 		(void) devname_lookup_func(sdvp, "rdsk", &vp, cred,
829 		    devzvol_create_dir, SDEV_VATTR);
830 		VN_RELE(vp);
831 		rw_enter(&sdvp->sdev_contents, RW_READER);
832 		return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
833 	}
834 	if (uiop->uio_offset == 0)
835 		devzvol_prunedir(sdvp);
836 	ptr = sdvp->sdev_path + strlen(ZVOL_DIR);
837 	if ((strcmp(ptr, "/dsk") == 0) || (strcmp(ptr, "/rdsk") == 0)) {
838 		rw_exit(&sdvp->sdev_contents);
839 		devzvol_create_pool_dirs(dvp);
840 		rw_enter(&sdvp->sdev_contents, RW_READER);
841 		return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
842 	}
843 
844 	ptr = strchr(ptr + 1, '/');
845 	if (ptr == NULL)
846 		return (ENOENT);
847 	ptr++;
848 	rw_exit(&sdvp->sdev_contents);
849 	sdev_iter_datasets(dvp, ZFS_IOC_DATASET_LIST_NEXT, ptr);
850 	rw_enter(&sdvp->sdev_contents, RW_READER);
851 	return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
852 }
853 
/*
 * vnode operation overrides for /dev/zvol: readdir, lookup and create
 * are implemented in this file; every operation that would modify the
 * directory (rename, mkdir, rmdir, remove, symlink) is routed to
 * fs_nosys.  The table is terminated by a NULL entry.
 */
const fs_operation_def_t devzvol_vnodeops_tbl[] = {
	VOPNAME_READDIR,	{ .vop_readdir = devzvol_readdir },
	VOPNAME_LOOKUP,		{ .vop_lookup = devzvol_lookup },
	VOPNAME_CREATE,		{ .vop_create = devzvol_create },
	VOPNAME_RENAME,		{ .error = fs_nosys },
	VOPNAME_MKDIR,		{ .error = fs_nosys },
	VOPNAME_RMDIR,		{ .error = fs_nosys },
	VOPNAME_REMOVE,		{ .error = fs_nosys },
	VOPNAME_SYMLINK,	{ .error = fs_nosys },
	NULL,			NULL
};
865