1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * Copyright 2013, 2016 Joyent, Inc. All rights reserved. 25 * Copyright (c) 2014 by Delphix. All rights reserved. 
26 */ 27 28 /* vnode ops for the /dev/zvol directory */ 29 30 #include <sys/types.h> 31 #include <sys/param.h> 32 #include <sys/sysmacros.h> 33 #include <sys/ddi.h> 34 #include <sys/sunndi.h> 35 #include <sys/sunldi.h> 36 #include <fs/fs_subr.h> 37 #include <sys/fs/dv_node.h> 38 #include <sys/fs/sdev_impl.h> 39 #include <sys/zfs_ioctl.h> 40 #include <sys/policy.h> 41 #include <sys/stat.h> 42 #include <sys/vfs_opreg.h> 43 44 struct vnodeops *devzvol_vnodeops; 45 static major_t devzvol_major; 46 static taskq_ent_t devzvol_zclist_task; 47 48 static kmutex_t devzvol_mtx; 49 /* Below are protected by devzvol_mtx */ 50 static boolean_t devzvol_isopen; 51 static boolean_t devzvol_zclist_task_running = B_FALSE; 52 static uint64_t devzvol_gen = 0; 53 static uint64_t devzvol_zclist; 54 static size_t devzvol_zclist_size; 55 static ldi_ident_t devzvol_li; 56 static ldi_handle_t devzvol_lh; 57 58 /* 59 * we need to use ddi_mod* since fs/dev gets loaded early on in 60 * startup(), and linking fs/dev to fs/zfs would drag in a lot of 61 * other stuff (like drv/random) before the rest of the system is 62 * ready to go 63 */ 64 ddi_modhandle_t zfs_mod; 65 int (*szcm)(char *); 66 int (*szn2m)(char *, minor_t *); 67 68 69 /* 70 * Enable/disable snapshots from being created in /dev/zvol. By default, 71 * they are enabled, preserving the historic behavior. 
72 */ 73 boolean_t devzvol_snaps_allowed = B_TRUE; 74 75 int 76 sdev_zvol_create_minor(char *dsname) 77 { 78 if (szcm == NULL) 79 return (-1); 80 return ((*szcm)(dsname)); 81 } 82 83 int 84 sdev_zvol_name2minor(char *dsname, minor_t *minor) 85 { 86 if (szn2m == NULL) 87 return (-1); 88 return ((*szn2m)(dsname, minor)); 89 } 90 91 int 92 devzvol_open_zfs() 93 { 94 int rc; 95 dev_t dv; 96 97 devzvol_li = ldi_ident_from_anon(); 98 if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred, 99 &devzvol_lh, devzvol_li)) 100 return (-1); 101 if (zfs_mod == NULL && ((zfs_mod = ddi_modopen("fs/zfs", 102 KRTLD_MODE_FIRST, &rc)) == NULL)) { 103 return (rc); 104 } 105 ASSERT(szcm == NULL && szn2m == NULL); 106 if ((szcm = (int (*)(char *)) 107 ddi_modsym(zfs_mod, "zvol_create_minor", &rc)) == NULL) { 108 cmn_err(CE_WARN, "couldn't resolve zvol_create_minor"); 109 return (rc); 110 } 111 if ((szn2m = (int(*)(char *, minor_t *)) 112 ddi_modsym(zfs_mod, "zvol_name2minor", &rc)) == NULL) { 113 cmn_err(CE_WARN, "couldn't resolve zvol_name2minor"); 114 return (rc); 115 } 116 if (ldi_get_dev(devzvol_lh, &dv)) 117 return (-1); 118 devzvol_major = getmajor(dv); 119 return (0); 120 } 121 122 void 123 devzvol_close_zfs() 124 { 125 szcm = NULL; 126 szn2m = NULL; 127 (void) ldi_close(devzvol_lh, FREAD|FWRITE, kcred); 128 ldi_ident_release(devzvol_li); 129 if (zfs_mod != NULL) { 130 (void) ddi_modclose(zfs_mod); 131 zfs_mod = NULL; 132 } 133 } 134 135 int 136 devzvol_handle_ioctl(int cmd, zfs_cmd_t *zc, size_t *alloc_size) 137 { 138 uint64_t cookie; 139 int size = 8000; 140 int unused; 141 int rc; 142 143 if (cmd != ZFS_IOC_POOL_CONFIGS) 144 mutex_enter(&devzvol_mtx); 145 if (!devzvol_isopen) { 146 if ((rc = devzvol_open_zfs()) == 0) { 147 devzvol_isopen = B_TRUE; 148 } else { 149 if (cmd != ZFS_IOC_POOL_CONFIGS) 150 mutex_exit(&devzvol_mtx); 151 return (ENXIO); 152 } 153 } 154 cookie = zc->zc_cookie; 155 again: 156 zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(size, 157 KM_SLEEP); 158 
zc->zc_nvlist_dst_size = size; 159 rc = ldi_ioctl(devzvol_lh, cmd, (intptr_t)zc, FKIOCTL, kcred, 160 &unused); 161 if (rc == ENOMEM) { 162 int newsize; 163 newsize = zc->zc_nvlist_dst_size; 164 ASSERT(newsize > size); 165 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size); 166 size = newsize; 167 zc->zc_cookie = cookie; 168 goto again; 169 } 170 if (alloc_size == NULL) 171 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size); 172 else 173 *alloc_size = size; 174 if (cmd != ZFS_IOC_POOL_CONFIGS) 175 mutex_exit(&devzvol_mtx); 176 return (rc); 177 } 178 179 /* figures out if the objset exists and returns its type */ 180 int 181 devzvol_objset_check(char *dsname, dmu_objset_type_t *type) 182 { 183 boolean_t ispool, is_snapshot; 184 zfs_cmd_t *zc; 185 int rc; 186 nvlist_t *nvl; 187 size_t nvsz; 188 189 ispool = (strchr(dsname, '/') == NULL); 190 is_snapshot = (strchr(dsname, '@') != NULL); 191 192 if (is_snapshot && !devzvol_snaps_allowed) 193 return (ENOTSUP); 194 195 zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); 196 (void) strlcpy(zc->zc_name, dsname, MAXPATHLEN); 197 198 nvl = fnvlist_alloc(); 199 fnvlist_add_boolean_value(nvl, "cachedpropsonly", B_TRUE); 200 zc->zc_nvlist_src = (uintptr_t)fnvlist_pack(nvl, &nvsz); 201 zc->zc_nvlist_src_size = nvsz; 202 fnvlist_free(nvl); 203 204 rc = devzvol_handle_ioctl(ispool ? ZFS_IOC_POOL_STATS : 205 ZFS_IOC_OBJSET_STATS, zc, NULL); 206 if (type && rc == 0) 207 *type = (ispool) ? DMU_OST_ZFS : 208 zc->zc_objset_stats.dds_type; 209 fnvlist_pack_free((char *)(uintptr_t)zc->zc_nvlist_src, nvsz); 210 kmem_free(zc, sizeof (zfs_cmd_t)); 211 return (rc); 212 } 213 214 /* 215 * Returns what the zfs dataset name should be, given the /dev/zvol 216 * path and an optional name (can be NULL). 217 * 218 * Note that if the name param is NULL, then path must be an 219 * actual dataset's directory and not one of the top-level 220 * /dev/zvol/{dsk,rdsk} dirs, as these do not correspond to a 221 * specific dataset. 
222 */ 223 char * 224 devzvol_make_dsname(const char *path, const char *name) 225 { 226 char *dsname; 227 const char *ptr; 228 int dslen; 229 230 if (strcmp(path, ZVOL_DIR) == 0) 231 return (NULL); 232 if (name && (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)) 233 return (NULL); 234 ptr = path + strlen(ZVOL_DIR); 235 if (strncmp(ptr, "/dsk", 4) == 0) 236 ptr += strlen("/dsk"); 237 else if (strncmp(ptr, "/rdsk", 5) == 0) 238 ptr += strlen("/rdsk"); 239 else 240 return (NULL); 241 242 if (*ptr == '/') 243 ptr++; 244 else if (name == NULL) 245 return (NULL); 246 247 dslen = strlen(ptr); 248 if (dslen) 249 dslen++; /* plus null */ 250 if (name) 251 dslen += strlen(name) + 1; /* plus slash */ 252 dsname = kmem_zalloc(dslen, KM_SLEEP); 253 if (*ptr) { 254 (void) strlcpy(dsname, ptr, dslen); 255 if (name) 256 (void) strlcat(dsname, "/", dslen); 257 } 258 if (name) 259 (void) strlcat(dsname, name, dslen); 260 return (dsname); 261 } 262 263 /* 264 * check if the zvol's sdev_node is still valid, which means make 265 * sure the zvol is still valid. zvol minors aren't proactively 266 * destroyed when the zvol is destroyed, so we use a validator to clean 267 * these up (in other words, when such nodes are encountered during 268 * subsequent lookup() and readdir() operations) so that only valid 269 * nodes are returned. The ordering between devname_lookup_func and 270 * devzvol_validate is a little inefficient in the case of invalid 271 * or stale nodes because devname_lookup_func calls 272 * devzvol_create_{dir, link}, then the validator says it's invalid, 273 * and then the node gets cleaned up. 
274 */ 275 int 276 devzvol_validate(struct sdev_node *dv) 277 { 278 vnode_t *vn = SDEVTOV(dv); 279 dmu_objset_type_t do_type; 280 char *dsname; 281 char *nm = dv->sdev_name; 282 int rc; 283 284 sdcmn_err13(("validating ('%s' '%s')", dv->sdev_path, nm)); 285 /* 286 * validate only READY nodes; if someone is sitting on the 287 * directory of a dataset that just got destroyed we could 288 * get a zombie node which we just skip. 289 */ 290 if (dv->sdev_state != SDEV_READY) { 291 sdcmn_err13(("skipping '%s'", nm)); 292 return (SDEV_VTOR_SKIP); 293 } 294 295 if ((strcmp(dv->sdev_path, ZVOL_DIR "/dsk") == 0) || 296 (strcmp(dv->sdev_path, ZVOL_DIR "/rdsk") == 0)) 297 return (SDEV_VTOR_VALID); 298 dsname = devzvol_make_dsname(dv->sdev_path, NULL); 299 if (dsname == NULL) 300 return (SDEV_VTOR_INVALID); 301 302 /* 303 * Leave any nodes alone that have been explicitly created by 304 * sdev profiles. 305 */ 306 if (!(dv->sdev_flags & SDEV_GLOBAL) && dv->sdev_origin != NULL) { 307 kmem_free(dsname, strlen(dsname) + 1); 308 return (SDEV_VTOR_VALID); 309 } 310 311 rc = devzvol_objset_check(dsname, &do_type); 312 sdcmn_err13((" '%s' rc %d", dsname, rc)); 313 if (rc != 0) { 314 sdev_node_t *parent = dv->sdev_dotdot; 315 /* 316 * Explicitly passed-through zvols in our sdev profile can't 317 * be created as prof_* shadow nodes, because in the GZ they 318 * are symlinks, but in the NGZ they are actual device files. 319 * 320 * The objset_check will fail on these as they are outside 321 * any delegated dataset (zfs will not allow ioctl access to 322 * them from this zone). We still want them to work, though. 
323 */ 324 if (!(parent->sdev_flags & SDEV_GLOBAL) && 325 parent->sdev_origin != NULL && 326 !(dv->sdev_flags & SDEV_GLOBAL) && 327 (vn->v_type == VBLK || vn->v_type == VCHR) && 328 prof_name_matched(nm, parent)) { 329 do_type = DMU_OST_ZVOL; 330 } else { 331 kmem_free(dsname, strlen(dsname) + 1); 332 return (SDEV_VTOR_INVALID); 333 } 334 } 335 336 sdcmn_err13((" v_type %d do_type %d", 337 vn->v_type, do_type)); 338 if ((vn->v_type == VLNK && do_type != DMU_OST_ZVOL) || 339 ((vn->v_type == VBLK || vn->v_type == VCHR) && 340 do_type != DMU_OST_ZVOL) || 341 (vn->v_type == VDIR && do_type == DMU_OST_ZVOL)) { 342 kmem_free(dsname, strlen(dsname) + 1); 343 return (SDEV_VTOR_STALE); 344 } 345 if (vn->v_type == VLNK) { 346 char *ptr, *link; 347 long val = 0; 348 minor_t lminor, ominor; 349 350 rc = sdev_getlink(vn, &link); 351 ASSERT(rc == 0); 352 353 ptr = strrchr(link, ':') + 1; 354 rc = ddi_strtol(ptr, NULL, 10, &val); 355 kmem_free(link, strlen(link) + 1); 356 ASSERT(rc == 0 && val != 0); 357 lminor = (minor_t)val; 358 if (sdev_zvol_name2minor(dsname, &ominor) < 0 || 359 ominor != lminor) { 360 kmem_free(dsname, strlen(dsname) + 1); 361 return (SDEV_VTOR_STALE); 362 } 363 } 364 kmem_free(dsname, strlen(dsname) + 1); 365 return (SDEV_VTOR_VALID); 366 } 367 368 /* 369 * Taskq callback to update the devzvol_zclist. 370 * 371 * We need to defer this to the taskq to avoid it running with a user 372 * context that might be associated with some non-global zone, and thus 373 * not being able to list all of the pools on the entire system. 
374 */ 375 /*ARGSUSED*/ 376 static void 377 devzvol_update_zclist_cb(void *arg) 378 { 379 zfs_cmd_t *zc; 380 int rc; 381 size_t size; 382 383 zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); 384 mutex_enter(&devzvol_mtx); 385 zc->zc_cookie = devzvol_gen; 386 387 rc = devzvol_handle_ioctl(ZFS_IOC_POOL_CONFIGS, zc, &size); 388 switch (rc) { 389 case 0: 390 /* new generation */ 391 ASSERT(devzvol_gen != zc->zc_cookie); 392 devzvol_gen = zc->zc_cookie; 393 if (devzvol_zclist) 394 kmem_free((void *)(uintptr_t)devzvol_zclist, 395 devzvol_zclist_size); 396 devzvol_zclist = zc->zc_nvlist_dst; 397 /* Keep the alloc'd size, not the nvlist size. */ 398 devzvol_zclist_size = size; 399 break; 400 default: 401 /* 402 * Either there was no change in pool configuration 403 * since we last asked (rc == EEXIST) or we got a 404 * catastrophic error. 405 * 406 * Give up memory and exit. 407 */ 408 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, 409 size); 410 break; 411 } 412 413 VERIFY(devzvol_zclist_task_running == B_TRUE); 414 devzvol_zclist_task_running = B_FALSE; 415 mutex_exit(&devzvol_mtx); 416 417 kmem_free(zc, sizeof (zfs_cmd_t)); 418 } 419 420 static void 421 devzvol_update_zclist(void) 422 { 423 mutex_enter(&devzvol_mtx); 424 if (devzvol_zclist_task_running == B_TRUE) { 425 mutex_exit(&devzvol_mtx); 426 goto wait; 427 } 428 429 devzvol_zclist_task_running = B_TRUE; 430 431 taskq_dispatch_ent(sdev_taskq, devzvol_update_zclist_cb, NULL, 0, 432 &devzvol_zclist_task); 433 434 mutex_exit(&devzvol_mtx); 435 436 wait: 437 taskq_wait(sdev_taskq); 438 } 439 440 /* 441 * Creates sub-directories for each zpool as needed in response to a 442 * readdir on one of the /dev/zvol/{dsk,rdsk} directories. 
443 */ 444 void 445 devzvol_create_pool_dirs(struct vnode *dvp) 446 { 447 nvlist_t *nv = NULL; 448 nvpair_t *elem = NULL; 449 int pools = 0; 450 int rc; 451 452 sdcmn_err13(("devzvol_create_pool_dirs")); 453 454 devzvol_update_zclist(); 455 456 mutex_enter(&devzvol_mtx); 457 458 rc = nvlist_unpack((char *)(uintptr_t)devzvol_zclist, 459 devzvol_zclist_size, &nv, 0); 460 if (rc) { 461 ASSERT(rc == 0); 462 kmem_free((void *)(uintptr_t)devzvol_zclist, 463 devzvol_zclist_size); 464 devzvol_gen = 0; 465 devzvol_zclist = NULL; 466 devzvol_zclist_size = 0; 467 goto out; 468 } 469 mutex_exit(&devzvol_mtx); 470 while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) { 471 struct vnode *vp; 472 ASSERT(dvp->v_count > 0); 473 rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0, 474 NULL, kcred, NULL, 0, NULL); 475 /* should either work, or not be visible from a zone */ 476 ASSERT(rc == 0 || rc == ENOENT); 477 if (rc == 0) 478 VN_RELE(vp); 479 pools++; 480 } 481 nvlist_free(nv); 482 mutex_enter(&devzvol_mtx); 483 if (devzvol_isopen && pools == 0) { 484 /* clean up so zfs can be unloaded */ 485 devzvol_close_zfs(); 486 devzvol_isopen = B_FALSE; 487 } 488 out: 489 mutex_exit(&devzvol_mtx); 490 } 491 492 /*ARGSUSED3*/ 493 static int 494 devzvol_create_dir(struct sdev_node *ddv, char *nm, void **arg, 495 cred_t *cred, void *whatever, char *whichever) 496 { 497 timestruc_t now; 498 struct vattr *vap = (struct vattr *)arg; 499 500 sdcmn_err13(("create_dir (%s) (%s) '%s'", ddv->sdev_name, 501 ddv->sdev_path, nm)); 502 ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, 503 strlen(ZVOL_DIR)) == 0); 504 *vap = *sdev_getdefault_attr(VDIR); 505 gethrestime(&now); 506 vap->va_atime = now; 507 vap->va_mtime = now; 508 vap->va_ctime = now; 509 return (0); 510 } 511 512 /*ARGSUSED3*/ 513 static int 514 devzvol_create_link(struct sdev_node *ddv, char *nm, 515 void **arg, cred_t *cred, void *whatever, char *whichever) 516 { 517 minor_t minor; 518 char *pathname = (char *)*arg; 519 int rc; 520 char *dsname; 
521 char *x; 522 char str[MAXNAMELEN]; 523 sdcmn_err13(("create_link (%s) (%s) '%s'", ddv->sdev_name, 524 ddv->sdev_path, nm)); 525 dsname = devzvol_make_dsname(ddv->sdev_path, nm); 526 rc = sdev_zvol_create_minor(dsname); 527 if ((rc != 0 && rc != EEXIST && rc != EBUSY) || 528 sdev_zvol_name2minor(dsname, &minor)) { 529 sdcmn_err13(("devzvol_create_link %d", rc)); 530 kmem_free(dsname, strlen(dsname) + 1); 531 return (-1); 532 } 533 kmem_free(dsname, strlen(dsname) + 1); 534 535 /* 536 * This is a valid zvol; create a symlink that points to the 537 * minor which was created under /devices/pseudo/zfs@0 538 */ 539 *pathname = '\0'; 540 for (x = ddv->sdev_path; x = strchr(x, '/'); x++) 541 (void) strcat(pathname, "../"); 542 (void) snprintf(str, sizeof (str), ZVOL_PSEUDO_DEV "%u", minor); 543 (void) strncat(pathname, str, MAXPATHLEN); 544 if (strncmp(ddv->sdev_path, ZVOL_FULL_RDEV_DIR, 545 strlen(ZVOL_FULL_RDEV_DIR)) == 0) 546 (void) strcat(pathname, ",raw"); 547 return (0); 548 } 549 550 /* Clean zvol sdev_nodes that are no longer valid. 
*/
static void
devzvol_prunedir(struct sdev_node *ddv)
{
	struct sdev_node *node;

	/* Entered with the directory's contents lock held as reader. */
	ASSERT(RW_READ_HELD(&ddv->sdev_contents));

	sdcmn_err13(("prunedir '%s'", ddv->sdev_name));
	ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);

	/* Become writer; if the upgrade fails, drop and re-take the lock. */
	if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
		rw_exit(&ddv->sdev_contents);
		rw_enter(&ddv->sdev_contents, RW_WRITER);
	}

	for (node = SDEV_FIRST_ENTRY(ddv); node != NULL; ) {
		int verdict;

		sdcmn_err13(("sdev_name '%s'", node->sdev_name));

		verdict = devzvol_validate(node);
		if (verdict == SDEV_VTOR_VALID || verdict == SDEV_VTOR_SKIP) {
			node = SDEV_NEXT_ENTRY(ddv, node);
			continue;
		}
		if (verdict == SDEV_VTOR_INVALID) {
			sdcmn_err7(("prunedir: destroy invalid "
			    "node: %s\n", node->sdev_name));
		}

		/*
		 * Every other verdict falls through to removal.  A
		 * directory is removed only if it can be emptied first;
		 * otherwise it is left in place.
		 */
		if ((SDEVTOV(node)->v_type == VDIR) &&
		    (sdev_cleandir(node, NULL, 0) != 0)) {
			node = SDEV_NEXT_ENTRY(ddv, node);
			continue;
		}

		SDEV_HOLD(node);
		/* remove the cache node */
		sdev_cache_update(ddv, &node, node->sdev_name,
		    SDEV_CACHE_DELETE);
		SDEV_RELE(node);

		/* The list changed; restart the scan from the beginning. */
		node = SDEV_FIRST_ENTRY(ddv);
	}

	/* Return to the caller holding the lock as reader again. */
	rw_downgrade(&ddv->sdev_contents);
}

/*
 * This function is used to create a dir or dev inside a zone's /dev when the
 * zone has a zvol that is dynamically created within the zone (i.e. inside
 * of a delegated dataset).  Since there is no /devices tree within a zone,
 * we create the chr/blk devices directly inside the zone's /dev instead of
 * making symlinks.
601 */ 602 static int 603 devzvol_mk_ngz_node(struct sdev_node *parent, char *nm) 604 { 605 struct vattr vattr; 606 timestruc_t now; 607 enum vtype expected_type = VDIR; 608 dmu_objset_type_t do_type; 609 struct sdev_node *dv = NULL; 610 int res; 611 char *dsname; 612 613 bzero(&vattr, sizeof (vattr)); 614 gethrestime(&now); 615 vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID; 616 vattr.va_uid = SDEV_UID_DEFAULT; 617 vattr.va_gid = SDEV_GID_DEFAULT; 618 vattr.va_type = VNON; 619 vattr.va_atime = now; 620 vattr.va_mtime = now; 621 vattr.va_ctime = now; 622 623 if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL) 624 return (ENOENT); 625 626 if (devzvol_objset_check(dsname, &do_type) != 0) { 627 /* 628 * objset_check will succeed on any valid objset in the global 629 * zone, and any valid delegated dataset. It will fail, however, 630 * in non-global zones on explicitly whitelisted zvol devices 631 * that are outside any delegated dataset. 632 * 633 * The directories leading up to the zvol device itself will be 634 * created by prof for us in advance (and will always validate 635 * because of the matching check in devzvol_validate). The zvol 636 * device itself can't be created by prof though because in the 637 * GZ it's a symlink, and in the NGZ it is not. So, we create 638 * such zvol device files here. 
639 */ 640 if (!(parent->sdev_flags & SDEV_GLOBAL) && 641 parent->sdev_origin != NULL && 642 prof_name_matched(nm, parent)) { 643 do_type = DMU_OST_ZVOL; 644 } else { 645 kmem_free(dsname, strlen(dsname) + 1); 646 return (ENOENT); 647 } 648 } 649 650 if (do_type == DMU_OST_ZVOL) 651 expected_type = VBLK; 652 653 if (expected_type == VDIR) { 654 vattr.va_type = VDIR; 655 vattr.va_mode = SDEV_DIRMODE_DEFAULT; 656 } else { 657 minor_t minor; 658 dev_t devnum; 659 int rc; 660 661 rc = sdev_zvol_create_minor(dsname); 662 if ((rc != 0 && rc != EEXIST && rc != EBUSY) || 663 sdev_zvol_name2minor(dsname, &minor)) { 664 kmem_free(dsname, strlen(dsname) + 1); 665 return (ENOENT); 666 } 667 668 devnum = makedevice(devzvol_major, minor); 669 vattr.va_rdev = devnum; 670 671 if (strstr(parent->sdev_path, "/rdsk/") != NULL) 672 vattr.va_type = VCHR; 673 else 674 vattr.va_type = VBLK; 675 vattr.va_mode = SDEV_DEVMODE_DEFAULT; 676 } 677 kmem_free(dsname, strlen(dsname) + 1); 678 679 rw_enter(&parent->sdev_contents, RW_WRITER); 680 681 res = sdev_mknode(parent, nm, &dv, &vattr, 682 NULL, NULL, kcred, SDEV_READY); 683 rw_exit(&parent->sdev_contents); 684 if (res != 0) 685 return (ENOENT); 686 687 SDEV_RELE(dv); 688 return (0); 689 } 690 691 /*ARGSUSED*/ 692 static int 693 devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, 694 struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred, 695 caller_context_t *ct, int *direntflags, pathname_t *realpnp) 696 { 697 enum vtype expected_type = VDIR; 698 struct sdev_node *parent = VTOSDEV(dvp); 699 char *dsname; 700 dmu_objset_type_t do_type; 701 int error; 702 703 sdcmn_err13(("devzvol_lookup '%s' '%s'", parent->sdev_path, nm)); 704 *vpp = NULL; 705 /* execute access is required to search the directory */ 706 if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) 707 return (error); 708 709 rw_enter(&parent->sdev_contents, RW_READER); 710 if (!SDEV_IS_GLOBAL(parent)) { 711 int res; 712 713 
rw_exit(&parent->sdev_contents); 714 715 /* 716 * If we're in the global zone and reach down into a non-global 717 * zone's /dev/zvol then this action could trigger the creation 718 * of all of the zvol devices for every zone into the non-global 719 * zone's /dev tree. This could be a big security hole. To 720 * prevent this, disallow the global zone from looking inside 721 * a non-global zones /dev/zvol. This behavior is similar to 722 * delegated datasets, which cannot be used by the global zone. 723 */ 724 if (getzoneid() == GLOBAL_ZONEID) 725 return (EPERM); 726 727 res = prof_lookup(dvp, nm, vpp, cred); 728 729 /* 730 * We won't find a zvol that was dynamically created inside 731 * a NGZ, within a delegated dataset, in the zone's dev profile 732 * but prof_lookup will also find it via sdev_cache_lookup. 733 */ 734 if (res == ENOENT) { 735 /* 736 * We have to create the sdev node for the dymamically 737 * created zvol. 738 */ 739 if (devzvol_mk_ngz_node(parent, nm) != 0) 740 return (ENOENT); 741 res = prof_lookup(dvp, nm, vpp, cred); 742 } 743 744 return (res); 745 } 746 747 /* 748 * Don't let the global-zone style lookup succeed here when we're not 749 * running in the global zone. This can happen because prof calls into 750 * us (in prof_filldir) trying to create an explicitly passed-through 751 * zvol device outside any delegated dataset. 752 * 753 * We have to stop this here or else we will create prof shadows of 754 * the global zone symlink, which will make no sense at all in the 755 * non-global zone (it has no /devices for the symlink to point at). 756 * 757 * These zvols will be created later (at access time) by mk_ngz_node 758 * instead. The dirs leading up to them will be created by prof 759 * internally. 760 * 761 * We have to return EPERM here, because ENOENT is given special 762 * meaning by prof in this context. 
763 */ 764 if (getzoneid() != GLOBAL_ZONEID) { 765 rw_exit(&parent->sdev_contents); 766 return (EPERM); 767 } 768 769 dsname = devzvol_make_dsname(parent->sdev_path, nm); 770 rw_exit(&parent->sdev_contents); 771 sdcmn_err13(("rvp dsname %s", dsname ? dsname : "(null)")); 772 if (dsname) { 773 error = devzvol_objset_check(dsname, &do_type); 774 if (error != 0) { 775 error = ENOENT; 776 goto out; 777 } 778 if (do_type == DMU_OST_ZVOL) 779 expected_type = VLNK; 780 } 781 /* 782 * the callbacks expect: 783 * 784 * parent->sdev_path nm 785 * /dev/zvol {r}dsk 786 * /dev/zvol/{r}dsk <pool name> 787 * /dev/zvol/{r}dsk/<dataset name> <last ds component> 788 * 789 * sdev_name is always last path component of sdev_path 790 */ 791 if (expected_type == VDIR) { 792 error = devname_lookup_func(parent, nm, vpp, cred, 793 devzvol_create_dir, SDEV_VATTR); 794 } else { 795 error = devname_lookup_func(parent, nm, vpp, cred, 796 devzvol_create_link, SDEV_VLINK); 797 } 798 sdcmn_err13(("devzvol_lookup %d %d", expected_type, error)); 799 ASSERT(error || ((*vpp)->v_type == expected_type)); 800 out: 801 if (dsname) 802 kmem_free(dsname, strlen(dsname) + 1); 803 sdcmn_err13(("devzvol_lookup %d", error)); 804 return (error); 805 } 806 807 /* 808 * We allow create to find existing nodes 809 * - if the node doesn't exist - EROFS 810 * - creating an existing dir read-only succeeds, otherwise EISDIR 811 * - exclusive creates fail - EEXIST 812 */ 813 /*ARGSUSED2*/ 814 static int 815 devzvol_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl, 816 int mode, struct vnode **vpp, struct cred *cred, int flag, 817 caller_context_t *ct, vsecattr_t *vsecp) 818 { 819 int error; 820 struct vnode *vp; 821 822 *vpp = NULL; 823 824 error = devzvol_lookup(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL, 825 NULL); 826 if (error == 0) { 827 if (excl == EXCL) 828 error = EEXIST; 829 else if (vp->v_type == VDIR && (mode & VWRITE)) 830 error = EISDIR; 831 else 832 error = VOP_ACCESS(vp, mode, 0, cred, 
ct); 833 834 if (error) { 835 VN_RELE(vp); 836 } else 837 *vpp = vp; 838 } else if (error == ENOENT) { 839 error = EROFS; 840 } 841 842 return (error); 843 } 844 845 void sdev_iter_snapshots(struct vnode *dvp, char *name); 846 847 void 848 sdev_iter_datasets(struct vnode *dvp, int arg, char *name) 849 { 850 zfs_cmd_t *zc; 851 int rc; 852 853 sdcmn_err13(("iter name is '%s' (arg %x)", name, arg)); 854 zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); 855 (void) strcpy(zc->zc_name, name); 856 857 while ((rc = devzvol_handle_ioctl(arg, zc, B_FALSE)) == 0) { 858 struct vnode *vpp; 859 char *ptr; 860 861 sdcmn_err13((" name %s", zc->zc_name)); 862 if (strchr(zc->zc_name, '$') || strchr(zc->zc_name, '%')) 863 goto skip; 864 ptr = strrchr(zc->zc_name, '/') + 1; 865 rc = devzvol_lookup(dvp, ptr, &vpp, NULL, 0, NULL, 866 kcred, NULL, NULL, NULL); 867 if (rc == 0) { 868 VN_RELE(vpp); 869 } else if (rc == ENOENT) { 870 goto skip; 871 } else { 872 /* 873 * EBUSY == problem with zvols's dmu holds? 874 * EPERM when in a NGZ and traversing up and out. 
875 */ 876 goto skip; 877 } 878 if (arg == ZFS_IOC_DATASET_LIST_NEXT && 879 zc->zc_objset_stats.dds_type == DMU_OST_ZVOL && 880 devzvol_snaps_allowed) 881 sdev_iter_snapshots(dvp, zc->zc_name); 882 skip: 883 (void) strcpy(zc->zc_name, name); 884 } 885 kmem_free(zc, sizeof (zfs_cmd_t)); 886 } 887 888 void 889 sdev_iter_snapshots(struct vnode *dvp, char *name) 890 { 891 sdev_iter_datasets(dvp, ZFS_IOC_SNAPSHOT_LIST_NEXT, name); 892 } 893 894 /*ARGSUSED4*/ 895 static int 896 devzvol_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, 897 int *eofp, caller_context_t *ct_unused, int flags_unused) 898 { 899 struct sdev_node *sdvp = VTOSDEV(dvp); 900 char *ptr; 901 902 sdcmn_err13(("zv readdir of '%s' %s'", sdvp->sdev_path, 903 sdvp->sdev_name)); 904 905 if (strcmp(sdvp->sdev_path, ZVOL_DIR) == 0) { 906 struct vnode *vp; 907 908 rw_exit(&sdvp->sdev_contents); 909 (void) devname_lookup_func(sdvp, "dsk", &vp, cred, 910 devzvol_create_dir, SDEV_VATTR); 911 VN_RELE(vp); 912 (void) devname_lookup_func(sdvp, "rdsk", &vp, cred, 913 devzvol_create_dir, SDEV_VATTR); 914 VN_RELE(vp); 915 rw_enter(&sdvp->sdev_contents, RW_READER); 916 return (devname_readdir_func(dvp, uiop, cred, eofp, 0)); 917 } 918 if (uiop->uio_offset == 0) 919 devzvol_prunedir(sdvp); 920 ptr = sdvp->sdev_path + strlen(ZVOL_DIR); 921 if ((strcmp(ptr, "/dsk") == 0) || (strcmp(ptr, "/rdsk") == 0)) { 922 rw_exit(&sdvp->sdev_contents); 923 devzvol_create_pool_dirs(dvp); 924 rw_enter(&sdvp->sdev_contents, RW_READER); 925 return (devname_readdir_func(dvp, uiop, cred, eofp, 0)); 926 } 927 928 ptr = strchr(ptr + 1, '/'); 929 if (ptr == NULL) 930 return (ENOENT); 931 ptr++; 932 rw_exit(&sdvp->sdev_contents); 933 sdev_iter_datasets(dvp, ZFS_IOC_DATASET_LIST_NEXT, ptr); 934 rw_enter(&sdvp->sdev_contents, RW_READER); 935 return (devname_readdir_func(dvp, uiop, cred, eofp, 0)); 936 } 937 938 const fs_operation_def_t devzvol_vnodeops_tbl[] = { 939 VOPNAME_READDIR, { .vop_readdir = devzvol_readdir }, 940 
VOPNAME_LOOKUP, { .vop_lookup = devzvol_lookup }, 941 VOPNAME_CREATE, { .vop_create = devzvol_create }, 942 VOPNAME_RENAME, { .error = fs_nosys }, 943 VOPNAME_MKDIR, { .error = fs_nosys }, 944 VOPNAME_RMDIR, { .error = fs_nosys }, 945 VOPNAME_REMOVE, { .error = fs_nosys }, 946 VOPNAME_SYMLINK, { .error = fs_nosys }, 947 NULL, NULL 948 }; 949