1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2013 Joyent, Inc. All rights reserved.
25 */
26
27 /* vnode ops for the /dev/zvol directory */
28
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/sysmacros.h>
32 #include <sys/ddi.h>
33 #include <sys/sunndi.h>
34 #include <sys/sunldi.h>
35 #include <fs/fs_subr.h>
36 #include <sys/fs/dv_node.h>
37 #include <sys/fs/sdev_impl.h>
38 #include <sys/zfs_ioctl.h>
39 #include <sys/policy.h>
40 #include <sys/stat.h>
41 #include <sys/vfs_opreg.h>
42
43 struct vnodeops *devzvol_vnodeops;
44 static major_t devzvol_major;
45 static taskq_ent_t devzvol_zclist_task;
46
47 static kmutex_t devzvol_mtx;
48 /* Below are protected by devzvol_mtx */
49 static boolean_t devzvol_isopen;
50 static boolean_t devzvol_zclist_task_running = B_FALSE;
51 static uint64_t devzvol_gen = 0;
52 static uint64_t devzvol_zclist;
53 static size_t devzvol_zclist_size;
54 static ldi_ident_t devzvol_li;
55 static ldi_handle_t devzvol_lh;
56
57 /*
58 * we need to use ddi_mod* since fs/dev gets loaded early on in
59 * startup(), and linking fs/dev to fs/zfs would drag in a lot of
60 * other stuff (like drv/random) before the rest of the system is
61 * ready to go
62 */
63 ddi_modhandle_t zfs_mod;
64 int (*szcm)(char *);
65 int (*szn2m)(char *, minor_t *);
66
67 int
sdev_zvol_create_minor(char * dsname)68 sdev_zvol_create_minor(char *dsname)
69 {
70 if (szcm == NULL)
71 return (-1);
72 return ((*szcm)(dsname));
73 }
74
75 int
sdev_zvol_name2minor(char * dsname,minor_t * minor)76 sdev_zvol_name2minor(char *dsname, minor_t *minor)
77 {
78 if (szn2m == NULL)
79 return (-1);
80 return ((*szn2m)(dsname, minor));
81 }
82
83 int
devzvol_open_zfs()84 devzvol_open_zfs()
85 {
86 int rc;
87 dev_t dv;
88
89 devzvol_li = ldi_ident_from_anon();
90 if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred,
91 &devzvol_lh, devzvol_li))
92 return (-1);
93 if (zfs_mod == NULL && ((zfs_mod = ddi_modopen("fs/zfs",
94 KRTLD_MODE_FIRST, &rc)) == NULL)) {
95 return (rc);
96 }
97 ASSERT(szcm == NULL && szn2m == NULL);
98 if ((szcm = (int (*)(char *))
99 ddi_modsym(zfs_mod, "zvol_create_minor", &rc)) == NULL) {
100 cmn_err(CE_WARN, "couldn't resolve zvol_create_minor");
101 return (rc);
102 }
103 if ((szn2m = (int(*)(char *, minor_t *))
104 ddi_modsym(zfs_mod, "zvol_name2minor", &rc)) == NULL) {
105 cmn_err(CE_WARN, "couldn't resolve zvol_name2minor");
106 return (rc);
107 }
108 if (ldi_get_dev(devzvol_lh, &dv))
109 return (-1);
110 devzvol_major = getmajor(dv);
111 return (0);
112 }
113
114 void
devzvol_close_zfs()115 devzvol_close_zfs()
116 {
117 szcm = NULL;
118 szn2m = NULL;
119 (void) ldi_close(devzvol_lh, FREAD|FWRITE, kcred);
120 ldi_ident_release(devzvol_li);
121 if (zfs_mod != NULL) {
122 (void) ddi_modclose(zfs_mod);
123 zfs_mod = NULL;
124 }
125 }
126
127 int
devzvol_handle_ioctl(int cmd,zfs_cmd_t * zc,size_t * alloc_size)128 devzvol_handle_ioctl(int cmd, zfs_cmd_t *zc, size_t *alloc_size)
129 {
130 uint64_t cookie;
131 int size = 8000;
132 int unused;
133 int rc;
134
135 if (cmd != ZFS_IOC_POOL_CONFIGS)
136 mutex_enter(&devzvol_mtx);
137 if (!devzvol_isopen) {
138 if ((rc = devzvol_open_zfs()) == 0) {
139 devzvol_isopen = B_TRUE;
140 } else {
141 if (cmd != ZFS_IOC_POOL_CONFIGS)
142 mutex_exit(&devzvol_mtx);
143 return (ENXIO);
144 }
145 }
146 cookie = zc->zc_cookie;
147 again:
148 zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(size,
149 KM_SLEEP);
150 zc->zc_nvlist_dst_size = size;
151 rc = ldi_ioctl(devzvol_lh, cmd, (intptr_t)zc, FKIOCTL, kcred,
152 &unused);
153 if (rc == ENOMEM) {
154 int newsize;
155 newsize = zc->zc_nvlist_dst_size;
156 ASSERT(newsize > size);
157 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
158 size = newsize;
159 zc->zc_cookie = cookie;
160 goto again;
161 }
162 if (alloc_size == NULL)
163 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
164 else
165 *alloc_size = size;
166 if (cmd != ZFS_IOC_POOL_CONFIGS)
167 mutex_exit(&devzvol_mtx);
168 return (rc);
169 }
170
171 /* figures out if the objset exists and returns its type */
172 int
devzvol_objset_check(char * dsname,dmu_objset_type_t * type)173 devzvol_objset_check(char *dsname, dmu_objset_type_t *type)
174 {
175 boolean_t ispool;
176 zfs_cmd_t *zc;
177 int rc;
178
179 zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
180 (void) strlcpy(zc->zc_name, dsname, MAXPATHLEN);
181
182 ispool = (strchr(dsname, '/') == NULL) ? B_TRUE : B_FALSE;
183 if (!ispool && sdev_zvol_name2minor(dsname, NULL) == 0) {
184 sdcmn_err13(("found cached minor node"));
185 if (type)
186 *type = DMU_OST_ZVOL;
187 kmem_free(zc, sizeof (zfs_cmd_t));
188 return (0);
189 }
190 rc = devzvol_handle_ioctl(ispool ? ZFS_IOC_POOL_STATS :
191 ZFS_IOC_OBJSET_STATS, zc, NULL);
192 if (type && rc == 0)
193 *type = (ispool) ? DMU_OST_ZFS :
194 zc->zc_objset_stats.dds_type;
195 kmem_free(zc, sizeof (zfs_cmd_t));
196 return (rc);
197 }
198
199 /*
200 * Returns what the zfs dataset name should be, given the /dev/zvol
201 * path and an optional name (can be NULL).
202 *
203 * Note that if the name param is NULL, then path must be an
204 * actual dataset's directory and not one of the top-level
205 * /dev/zvol/{dsk,rdsk} dirs, as these do not correspond to a
206 * specific dataset.
207 */
208 char *
devzvol_make_dsname(const char * path,const char * name)209 devzvol_make_dsname(const char *path, const char *name)
210 {
211 char *dsname;
212 const char *ptr;
213 int dslen;
214
215 if (strcmp(path, ZVOL_DIR) == 0)
216 return (NULL);
217 if (name && (strcmp(name, ".") == 0 || strcmp(name, "..") == 0))
218 return (NULL);
219 ptr = path + strlen(ZVOL_DIR);
220 if (strncmp(ptr, "/dsk", 4) == 0)
221 ptr += strlen("/dsk");
222 else if (strncmp(ptr, "/rdsk", 5) == 0)
223 ptr += strlen("/rdsk");
224 else
225 return (NULL);
226
227 if (*ptr == '/')
228 ptr++;
229 else if (name == NULL)
230 return (NULL);
231
232 dslen = strlen(ptr);
233 if (dslen)
234 dslen++; /* plus null */
235 if (name)
236 dslen += strlen(name) + 1; /* plus slash */
237 dsname = kmem_zalloc(dslen, KM_SLEEP);
238 if (*ptr) {
239 (void) strlcpy(dsname, ptr, dslen);
240 if (name)
241 (void) strlcat(dsname, "/", dslen);
242 }
243 if (name)
244 (void) strlcat(dsname, name, dslen);
245 return (dsname);
246 }
247
248 /*
249 * check if the zvol's sdev_node is still valid, which means make
250 * sure the zvol is still valid. zvol minors aren't proactively
251 * destroyed when the zvol is destroyed, so we use a validator to clean
252 * these up (in other words, when such nodes are encountered during
253 * subsequent lookup() and readdir() operations) so that only valid
254 * nodes are returned. The ordering between devname_lookup_func and
255 * devzvol_validate is a little inefficient in the case of invalid
256 * or stale nodes because devname_lookup_func calls
257 * devzvol_create_{dir, link}, then the validator says it's invalid,
258 * and then the node gets cleaned up.
259 */
260 int
devzvol_validate(struct sdev_node * dv)261 devzvol_validate(struct sdev_node *dv)
262 {
263 dmu_objset_type_t do_type;
264 char *dsname;
265 char *nm = dv->sdev_name;
266 int rc;
267
268 sdcmn_err13(("validating ('%s' '%s')", dv->sdev_path, nm));
269 /*
270 * validate only READY nodes; if someone is sitting on the
271 * directory of a dataset that just got destroyed we could
272 * get a zombie node which we just skip.
273 */
274 if (dv->sdev_state != SDEV_READY) {
275 sdcmn_err13(("skipping '%s'", nm));
276 return (SDEV_VTOR_SKIP);
277 }
278
279 if ((strcmp(dv->sdev_path, ZVOL_DIR "/dsk") == 0) ||
280 (strcmp(dv->sdev_path, ZVOL_DIR "/rdsk") == 0))
281 return (SDEV_VTOR_VALID);
282 dsname = devzvol_make_dsname(dv->sdev_path, NULL);
283 if (dsname == NULL)
284 return (SDEV_VTOR_INVALID);
285
286 rc = devzvol_objset_check(dsname, &do_type);
287 sdcmn_err13((" '%s' rc %d", dsname, rc));
288 if (rc != 0) {
289 kmem_free(dsname, strlen(dsname) + 1);
290 return (SDEV_VTOR_INVALID);
291 }
292 sdcmn_err13((" v_type %d do_type %d",
293 SDEVTOV(dv)->v_type, do_type));
294 if ((SDEVTOV(dv)->v_type == VLNK && do_type != DMU_OST_ZVOL) ||
295 ((SDEVTOV(dv)->v_type == VBLK || SDEVTOV(dv)->v_type == VCHR) &&
296 do_type != DMU_OST_ZVOL) ||
297 (SDEVTOV(dv)->v_type == VDIR && do_type == DMU_OST_ZVOL)) {
298 kmem_free(dsname, strlen(dsname) + 1);
299 return (SDEV_VTOR_STALE);
300 }
301 if (SDEVTOV(dv)->v_type == VLNK) {
302 char *ptr, *link;
303 long val = 0;
304 minor_t lminor, ominor;
305
306 rc = sdev_getlink(SDEVTOV(dv), &link);
307 ASSERT(rc == 0);
308
309 ptr = strrchr(link, ':') + 1;
310 rc = ddi_strtol(ptr, NULL, 10, &val);
311 kmem_free(link, strlen(link) + 1);
312 ASSERT(rc == 0 && val != 0);
313 lminor = (minor_t)val;
314 if (sdev_zvol_name2minor(dsname, &ominor) < 0 ||
315 ominor != lminor) {
316 kmem_free(dsname, strlen(dsname) + 1);
317 return (SDEV_VTOR_STALE);
318 }
319 }
320 kmem_free(dsname, strlen(dsname) + 1);
321 return (SDEV_VTOR_VALID);
322 }
323
324 /*
325 * Taskq callback to update the devzvol_zclist.
326 *
327 * We need to defer this to the taskq to avoid it running with a user
328 * context that might be associated with some non-global zone, and thus
329 * not being able to list all of the pools on the entire system.
330 */
331 /*ARGSUSED*/
332 static void
devzvol_update_zclist_cb(void * arg)333 devzvol_update_zclist_cb(void *arg)
334 {
335 zfs_cmd_t *zc;
336 int rc;
337 size_t size;
338
339 zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
340 mutex_enter(&devzvol_mtx);
341 zc->zc_cookie = devzvol_gen;
342
343 rc = devzvol_handle_ioctl(ZFS_IOC_POOL_CONFIGS, zc, &size);
344 switch (rc) {
345 case 0:
346 /* new generation */
347 ASSERT(devzvol_gen != zc->zc_cookie);
348 devzvol_gen = zc->zc_cookie;
349 if (devzvol_zclist)
350 kmem_free((void *)(uintptr_t)devzvol_zclist,
351 devzvol_zclist_size);
352 devzvol_zclist = zc->zc_nvlist_dst;
353 /* Keep the alloc'd size, not the nvlist size. */
354 devzvol_zclist_size = size;
355 break;
356 default:
357 /*
358 * Either there was no change in pool configuration
359 * since we last asked (rc == EEXIST) or we got a
360 * catastrophic error.
361 *
362 * Give up memory and exit.
363 */
364 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst,
365 size);
366 break;
367 }
368
369 VERIFY(devzvol_zclist_task_running == B_TRUE);
370 devzvol_zclist_task_running = B_FALSE;
371 mutex_exit(&devzvol_mtx);
372
373 kmem_free(zc, sizeof (zfs_cmd_t));
374 }
375
376 static void
devzvol_update_zclist(void)377 devzvol_update_zclist(void)
378 {
379 mutex_enter(&devzvol_mtx);
380 if (devzvol_zclist_task_running == B_TRUE) {
381 mutex_exit(&devzvol_mtx);
382 goto wait;
383 }
384
385 devzvol_zclist_task_running = B_TRUE;
386
387 taskq_dispatch_ent(sdev_taskq, devzvol_update_zclist_cb, NULL, 0,
388 &devzvol_zclist_task);
389
390 mutex_exit(&devzvol_mtx);
391
392 wait:
393 taskq_wait(sdev_taskq);
394 }
395
396 /*
397 * Creates sub-directories for each zpool as needed in response to a
398 * readdir on one of the /dev/zvol/{dsk,rdsk} directories.
399 */
400 void
devzvol_create_pool_dirs(struct vnode * dvp)401 devzvol_create_pool_dirs(struct vnode *dvp)
402 {
403 nvlist_t *nv = NULL;
404 nvpair_t *elem = NULL;
405 int pools = 0;
406 int rc;
407
408 sdcmn_err13(("devzvol_create_pool_dirs"));
409
410 devzvol_update_zclist();
411
412 mutex_enter(&devzvol_mtx);
413
414 rc = nvlist_unpack((char *)(uintptr_t)devzvol_zclist,
415 devzvol_zclist_size, &nv, 0);
416 if (rc) {
417 ASSERT(rc == 0);
418 kmem_free((void *)(uintptr_t)devzvol_zclist,
419 devzvol_zclist_size);
420 devzvol_gen = 0;
421 devzvol_zclist = NULL;
422 devzvol_zclist_size = 0;
423 goto out;
424 }
425 mutex_exit(&devzvol_mtx);
426 while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
427 struct vnode *vp;
428 ASSERT(dvp->v_count > 0);
429 rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
430 NULL, kcred, NULL, 0, NULL);
431 /* should either work, or not be visible from a zone */
432 ASSERT(rc == 0 || rc == ENOENT);
433 if (rc == 0)
434 VN_RELE(vp);
435 pools++;
436 }
437 nvlist_free(nv);
438 mutex_enter(&devzvol_mtx);
439 if (devzvol_isopen && pools == 0) {
440 /* clean up so zfs can be unloaded */
441 devzvol_close_zfs();
442 devzvol_isopen = B_FALSE;
443 }
444 out:
445 mutex_exit(&devzvol_mtx);
446 }
447
448 /*ARGSUSED3*/
449 static int
devzvol_create_dir(struct sdev_node * ddv,char * nm,void ** arg,cred_t * cred,void * whatever,char * whichever)450 devzvol_create_dir(struct sdev_node *ddv, char *nm, void **arg,
451 cred_t *cred, void *whatever, char *whichever)
452 {
453 timestruc_t now;
454 struct vattr *vap = (struct vattr *)arg;
455
456 sdcmn_err13(("create_dir (%s) (%s) '%s'", ddv->sdev_name,
457 ddv->sdev_path, nm));
458 ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR,
459 strlen(ZVOL_DIR)) == 0);
460 *vap = *sdev_getdefault_attr(VDIR);
461 gethrestime(&now);
462 vap->va_atime = now;
463 vap->va_mtime = now;
464 vap->va_ctime = now;
465 return (0);
466 }
467
468 /*ARGSUSED3*/
469 static int
devzvol_create_link(struct sdev_node * ddv,char * nm,void ** arg,cred_t * cred,void * whatever,char * whichever)470 devzvol_create_link(struct sdev_node *ddv, char *nm,
471 void **arg, cred_t *cred, void *whatever, char *whichever)
472 {
473 minor_t minor;
474 char *pathname = (char *)*arg;
475 int rc;
476 char *dsname;
477 char *x;
478 char str[MAXNAMELEN];
479 sdcmn_err13(("create_link (%s) (%s) '%s'", ddv->sdev_name,
480 ddv->sdev_path, nm));
481 dsname = devzvol_make_dsname(ddv->sdev_path, nm);
482 rc = sdev_zvol_create_minor(dsname);
483 if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
484 sdev_zvol_name2minor(dsname, &minor)) {
485 sdcmn_err13(("devzvol_create_link %d", rc));
486 kmem_free(dsname, strlen(dsname) + 1);
487 return (-1);
488 }
489 kmem_free(dsname, strlen(dsname) + 1);
490
491 /*
492 * This is a valid zvol; create a symlink that points to the
493 * minor which was created under /devices/pseudo/zfs@0
494 */
495 *pathname = '\0';
496 for (x = ddv->sdev_path; x = strchr(x, '/'); x++)
497 (void) strcat(pathname, "../");
498 (void) snprintf(str, sizeof (str), ZVOL_PSEUDO_DEV "%u", minor);
499 (void) strncat(pathname, str, MAXPATHLEN);
500 if (strncmp(ddv->sdev_path, ZVOL_FULL_RDEV_DIR,
501 strlen(ZVOL_FULL_RDEV_DIR)) == 0)
502 (void) strcat(pathname, ",raw");
503 return (0);
504 }
505
506 /* Clean zvol sdev_nodes that are no longer valid. */
507 static void
devzvol_prunedir(struct sdev_node * ddv)508 devzvol_prunedir(struct sdev_node *ddv)
509 {
510 struct sdev_node *dv;
511
512 ASSERT(RW_READ_HELD(&ddv->sdev_contents));
513
514 sdcmn_err13(("prunedir '%s'", ddv->sdev_name));
515 ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
516 if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
517 rw_exit(&ddv->sdev_contents);
518 rw_enter(&ddv->sdev_contents, RW_WRITER);
519 }
520
521 dv = SDEV_FIRST_ENTRY(ddv);
522 while (dv) {
523 sdcmn_err13(("sdev_name '%s'", dv->sdev_name));
524
525 switch (devzvol_validate(dv)) {
526 case SDEV_VTOR_VALID:
527 case SDEV_VTOR_SKIP:
528 dv = SDEV_NEXT_ENTRY(ddv, dv);
529 continue;
530 case SDEV_VTOR_INVALID:
531 sdcmn_err7(("prunedir: destroy invalid "
532 "node: %s\n", dv->sdev_name));
533 break;
534 }
535
536 if ((SDEVTOV(dv)->v_type == VDIR) &&
537 (sdev_cleandir(dv, NULL, 0) != 0)) {
538 dv = SDEV_NEXT_ENTRY(ddv, dv);
539 continue;
540 }
541 SDEV_HOLD(dv);
542 /* remove the cache node */
543 sdev_cache_update(ddv, &dv, dv->sdev_name,
544 SDEV_CACHE_DELETE);
545 SDEV_RELE(dv);
546 dv = SDEV_FIRST_ENTRY(ddv);
547 }
548 rw_downgrade(&ddv->sdev_contents);
549 }
550
551 /*
552 * This function is used to create a dir or dev inside a zone's /dev when the
553 * zone has a zvol that is dynamically created within the zone (i.e. inside
554 * of a delegated dataset. Since there is no /devices tree within a zone,
555 * we create the chr/blk devices directly inside the zone's /dev instead of
556 * making symlinks.
557 */
558 static int
devzvol_mk_ngz_node(struct sdev_node * parent,char * nm)559 devzvol_mk_ngz_node(struct sdev_node *parent, char *nm)
560 {
561 struct vattr vattr;
562 timestruc_t now;
563 enum vtype expected_type = VDIR;
564 dmu_objset_type_t do_type;
565 struct sdev_node *dv = NULL;
566 int res;
567 char *dsname;
568
569 bzero(&vattr, sizeof (vattr));
570 gethrestime(&now);
571 vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
572 vattr.va_uid = SDEV_UID_DEFAULT;
573 vattr.va_gid = SDEV_GID_DEFAULT;
574 vattr.va_type = VNON;
575 vattr.va_atime = now;
576 vattr.va_mtime = now;
577 vattr.va_ctime = now;
578
579 if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL)
580 return (ENOENT);
581
582 if (devzvol_objset_check(dsname, &do_type) != 0) {
583 kmem_free(dsname, strlen(dsname) + 1);
584 return (ENOENT);
585 }
586 if (do_type == DMU_OST_ZVOL)
587 expected_type = VBLK;
588
589 if (expected_type == VDIR) {
590 vattr.va_type = VDIR;
591 vattr.va_mode = SDEV_DIRMODE_DEFAULT;
592 } else {
593 minor_t minor;
594 dev_t devnum;
595 int rc;
596
597 rc = sdev_zvol_create_minor(dsname);
598 if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
599 sdev_zvol_name2minor(dsname, &minor)) {
600 kmem_free(dsname, strlen(dsname) + 1);
601 return (ENOENT);
602 }
603
604 devnum = makedevice(devzvol_major, minor);
605 vattr.va_rdev = devnum;
606
607 if (strstr(parent->sdev_path, "/rdsk/") != NULL)
608 vattr.va_type = VCHR;
609 else
610 vattr.va_type = VBLK;
611 vattr.va_mode = SDEV_DEVMODE_DEFAULT;
612 }
613 kmem_free(dsname, strlen(dsname) + 1);
614
615 rw_enter(&parent->sdev_contents, RW_WRITER);
616
617 res = sdev_mknode(parent, nm, &dv, &vattr,
618 NULL, NULL, kcred, SDEV_READY);
619 rw_exit(&parent->sdev_contents);
620 if (res != 0)
621 return (ENOENT);
622
623 SDEV_RELE(dv);
624 return (0);
625 }
626
627 /*ARGSUSED*/
628 static int
devzvol_lookup(struct vnode * dvp,char * nm,struct vnode ** vpp,struct pathname * pnp,int flags,struct vnode * rdir,struct cred * cred,caller_context_t * ct,int * direntflags,pathname_t * realpnp)629 devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
630 struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
631 caller_context_t *ct, int *direntflags, pathname_t *realpnp)
632 {
633 enum vtype expected_type = VDIR;
634 struct sdev_node *parent = VTOSDEV(dvp);
635 char *dsname;
636 dmu_objset_type_t do_type;
637 int error;
638
639 sdcmn_err13(("devzvol_lookup '%s' '%s'", parent->sdev_path, nm));
640 *vpp = NULL;
641 /* execute access is required to search the directory */
642 if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
643 return (error);
644
645 rw_enter(&parent->sdev_contents, RW_READER);
646 if (SDEV_IS_GLOBAL(parent)) {
647 /*
648 * During iter_datasets, don't create GZ dev when running in
649 * NGZ. We can't return ENOENT here since that could
650 * incorrectly trigger the creation of the dev from the
651 * recursive call through prof_filldir during iter_datasets.
652 */
653 if (getzoneid() != GLOBAL_ZONEID) {
654 rw_exit(&parent->sdev_contents);
655 return (EPERM);
656 }
657 } else {
658 int res;
659
660 rw_exit(&parent->sdev_contents);
661
662 /*
663 * If we're in the global zone and reach down into a non-global
664 * zone's /dev/zvol then this action could trigger the creation
665 * of all of the zvol devices for every zone into the non-global
666 * zone's /dev tree. This could be a big security hole. To
667 * prevent this, disallow the global zone from looking inside
668 * a non-global zones /dev/zvol. This behavior is similar to
669 * delegated datasets, which cannot be used by the global zone.
670 */
671 if (getzoneid() == GLOBAL_ZONEID)
672 return (EPERM);
673
674 res = prof_lookup(dvp, nm, vpp, cred);
675
676 /*
677 * We won't find a zvol that was dynamically created inside
678 * a NGZ, within a delegated dataset, in the zone's dev profile
679 * but prof_lookup will also find it via sdev_cache_lookup.
680 */
681 if (res == ENOENT) {
682 /*
683 * We have to create the sdev node for the dymamically
684 * created zvol.
685 */
686 if (devzvol_mk_ngz_node(parent, nm) != 0)
687 return (ENOENT);
688 res = prof_lookup(dvp, nm, vpp, cred);
689 }
690
691 return (res);
692 }
693
694 dsname = devzvol_make_dsname(parent->sdev_path, nm);
695 rw_exit(&parent->sdev_contents);
696 sdcmn_err13(("rvp dsname %s", dsname ? dsname : "(null)"));
697 if (dsname) {
698 error = devzvol_objset_check(dsname, &do_type);
699 if (error != 0) {
700 error = ENOENT;
701 goto out;
702 }
703 if (do_type == DMU_OST_ZVOL)
704 expected_type = VLNK;
705 }
706 /*
707 * the callbacks expect:
708 *
709 * parent->sdev_path nm
710 * /dev/zvol {r}dsk
711 * /dev/zvol/{r}dsk <pool name>
712 * /dev/zvol/{r}dsk/<dataset name> <last ds component>
713 *
714 * sdev_name is always last path component of sdev_path
715 */
716 if (expected_type == VDIR) {
717 error = devname_lookup_func(parent, nm, vpp, cred,
718 devzvol_create_dir, SDEV_VATTR);
719 } else {
720 error = devname_lookup_func(parent, nm, vpp, cred,
721 devzvol_create_link, SDEV_VLINK);
722 }
723 sdcmn_err13(("devzvol_lookup %d %d", expected_type, error));
724 ASSERT(error || ((*vpp)->v_type == expected_type));
725 out:
726 if (dsname)
727 kmem_free(dsname, strlen(dsname) + 1);
728 sdcmn_err13(("devzvol_lookup %d", error));
729 return (error);
730 }
731
732 /*
733 * We allow create to find existing nodes
734 * - if the node doesn't exist - EROFS
735 * - creating an existing dir read-only succeeds, otherwise EISDIR
736 * - exclusive creates fail - EEXIST
737 */
738 /*ARGSUSED2*/
739 static int
devzvol_create(struct vnode * dvp,char * nm,struct vattr * vap,vcexcl_t excl,int mode,struct vnode ** vpp,struct cred * cred,int flag,caller_context_t * ct,vsecattr_t * vsecp)740 devzvol_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
741 int mode, struct vnode **vpp, struct cred *cred, int flag,
742 caller_context_t *ct, vsecattr_t *vsecp)
743 {
744 int error;
745 struct vnode *vp;
746
747 *vpp = NULL;
748
749 error = devzvol_lookup(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL,
750 NULL);
751 if (error == 0) {
752 if (excl == EXCL)
753 error = EEXIST;
754 else if (vp->v_type == VDIR && (mode & VWRITE))
755 error = EISDIR;
756 else
757 error = VOP_ACCESS(vp, mode, 0, cred, ct);
758
759 if (error) {
760 VN_RELE(vp);
761 } else
762 *vpp = vp;
763 } else if (error == ENOENT) {
764 error = EROFS;
765 }
766
767 return (error);
768 }
769
770 void sdev_iter_snapshots(struct vnode *dvp, char *name);
771
772 void
sdev_iter_datasets(struct vnode * dvp,int arg,char * name)773 sdev_iter_datasets(struct vnode *dvp, int arg, char *name)
774 {
775 zfs_cmd_t *zc;
776 int rc;
777
778 sdcmn_err13(("iter name is '%s' (arg %x)", name, arg));
779 zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
780 (void) strcpy(zc->zc_name, name);
781
782 while ((rc = devzvol_handle_ioctl(arg, zc, B_FALSE)) == 0) {
783 struct vnode *vpp;
784 char *ptr;
785
786 sdcmn_err13((" name %s", zc->zc_name));
787 if (strchr(zc->zc_name, '$') || strchr(zc->zc_name, '%'))
788 goto skip;
789 ptr = strrchr(zc->zc_name, '/') + 1;
790 rc = devzvol_lookup(dvp, ptr, &vpp, NULL, 0, NULL,
791 kcred, NULL, NULL, NULL);
792 if (rc == 0) {
793 VN_RELE(vpp);
794 } else if (rc == ENOENT) {
795 goto skip;
796 } else {
797 /*
798 * EBUSY == problem with zvols's dmu holds?
799 * EPERM when in a NGZ and traversing up and out.
800 */
801 goto skip;
802 }
803 if (arg == ZFS_IOC_DATASET_LIST_NEXT &&
804 zc->zc_objset_stats.dds_type != DMU_OST_ZFS)
805 sdev_iter_snapshots(dvp, zc->zc_name);
806 skip:
807 (void) strcpy(zc->zc_name, name);
808 }
809 kmem_free(zc, sizeof (zfs_cmd_t));
810 }
811
812 void
sdev_iter_snapshots(struct vnode * dvp,char * name)813 sdev_iter_snapshots(struct vnode *dvp, char *name)
814 {
815 sdev_iter_datasets(dvp, ZFS_IOC_SNAPSHOT_LIST_NEXT, name);
816 }
817
818 /*ARGSUSED4*/
819 static int
devzvol_readdir(struct vnode * dvp,struct uio * uiop,struct cred * cred,int * eofp,caller_context_t * ct_unused,int flags_unused)820 devzvol_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
821 int *eofp, caller_context_t *ct_unused, int flags_unused)
822 {
823 struct sdev_node *sdvp = VTOSDEV(dvp);
824 char *ptr;
825
826 sdcmn_err13(("zv readdir of '%s' %s'", sdvp->sdev_path,
827 sdvp->sdev_name));
828
829 if (strcmp(sdvp->sdev_path, ZVOL_DIR) == 0) {
830 struct vnode *vp;
831
832 rw_exit(&sdvp->sdev_contents);
833 (void) devname_lookup_func(sdvp, "dsk", &vp, cred,
834 devzvol_create_dir, SDEV_VATTR);
835 VN_RELE(vp);
836 (void) devname_lookup_func(sdvp, "rdsk", &vp, cred,
837 devzvol_create_dir, SDEV_VATTR);
838 VN_RELE(vp);
839 rw_enter(&sdvp->sdev_contents, RW_READER);
840 return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
841 }
842 if (uiop->uio_offset == 0)
843 devzvol_prunedir(sdvp);
844 ptr = sdvp->sdev_path + strlen(ZVOL_DIR);
845 if ((strcmp(ptr, "/dsk") == 0) || (strcmp(ptr, "/rdsk") == 0)) {
846 rw_exit(&sdvp->sdev_contents);
847 devzvol_create_pool_dirs(dvp);
848 rw_enter(&sdvp->sdev_contents, RW_READER);
849 return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
850 }
851
852 ptr = strchr(ptr + 1, '/');
853 if (ptr == NULL)
854 return (ENOENT);
855 ptr++;
856 rw_exit(&sdvp->sdev_contents);
857 sdev_iter_datasets(dvp, ZFS_IOC_DATASET_LIST_NEXT, ptr);
858 rw_enter(&sdvp->sdev_contents, RW_READER);
859 return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
860 }
861
862 const fs_operation_def_t devzvol_vnodeops_tbl[] = {
863 VOPNAME_READDIR, { .vop_readdir = devzvol_readdir },
864 VOPNAME_LOOKUP, { .vop_lookup = devzvol_lookup },
865 VOPNAME_CREATE, { .vop_create = devzvol_create },
866 VOPNAME_RENAME, { .error = fs_nosys },
867 VOPNAME_MKDIR, { .error = fs_nosys },
868 VOPNAME_RMDIR, { .error = fs_nosys },
869 VOPNAME_REMOVE, { .error = fs_nosys },
870 VOPNAME_SYMLINK, { .error = fs_nosys },
871 NULL, NULL
872 };
873