/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/pathname.h>
#include <sys/kmem.h>
#include <sys/cred.h>
#include <sys/statvfs.h>
#include <sys/fs/lofs_info.h>
#include <sys/fs/lofs_node.h>
#include <sys/mount.h>
#include <sys/mntent.h>
#include <sys/mkdev.h>
#include <sys/priv.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/policy.h>
#include <sys/tsol/label.h>
#include "fs/fs_subr.h"

/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>

/* Mount options table; tentative definition, initialized further down. */
static mntopts_t lofs_mntopts;

/* File system initialization routine; defined at the end of this file. */
static int lofsinit(int, char *);

/*
 * File system definition for lofs.  It is referenced by the modlfs
 * linkage structure below and names the init routine and the mount
 * options table.
 */
static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"lofs",				/* file system type name */
	lofsinit,			/* initialization routine */
	VSW_HASPROTO|VSW_STATS,
	&lofs_mntopts			/* mount options table */
};

/*
 * Stuff needed to support "zonedevfs" mode.
69 */ 70 static major_t lofs_major; 71 static minor_t lofs_minor; 72 static kmutex_t lofs_minor_lock; 73 74 /* 75 * LOFS mount options table 76 */ 77 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; 78 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; 79 static char *zonedevfs_cancel[] = { MNTOPT_LOFS_NOZONEDEVFS, NULL }; 80 static char *nozonedevfs_cancel[] = { MNTOPT_LOFS_ZONEDEVFS, NULL }; 81 static char *sub_cancel[] = { MNTOPT_LOFS_NOSUB, NULL }; 82 static char *nosub_cancel[] = { MNTOPT_LOFS_SUB, NULL }; 83 84 static mntopt_t mntopts[] = { 85 /* 86 * option name cancel option default arg flags 87 * private data 88 */ 89 { MNTOPT_XATTR, xattr_cancel, NULL, 0, 90 (void *)0 }, 91 { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, 92 (void *)0 }, 93 { MNTOPT_LOFS_ZONEDEVFS, zonedevfs_cancel, NULL, 0, 94 (void *)0 }, 95 { MNTOPT_LOFS_NOZONEDEVFS, nozonedevfs_cancel, NULL, 0, 96 (void *)0 }, 97 { MNTOPT_LOFS_SUB, sub_cancel, NULL, 0, 98 (void *)0 }, 99 { MNTOPT_LOFS_NOSUB, nosub_cancel, NULL, 0, 100 (void *)0 }, 101 }; 102 103 static mntopts_t lofs_mntopts = { 104 sizeof (mntopts) / sizeof (mntopt_t), 105 mntopts 106 }; 107 108 /* 109 * Module linkage information for the kernel. 110 */ 111 112 static struct modlfs modlfs = { 113 &mod_fsops, "filesystem for lofs", &vfw 114 }; 115 116 static struct modlinkage modlinkage = { 117 MODREV_1, (void *)&modlfs, NULL 118 }; 119 120 /* 121 * This is the module initialization routine. 122 */ 123 124 int 125 _init(void) 126 { 127 int status; 128 129 lofs_subrinit(); 130 status = mod_install(&modlinkage); 131 if (status != 0) { 132 /* 133 * Cleanup previously initialized work. 134 */ 135 lofs_subrfini(); 136 } 137 138 return (status); 139 } 140 141 /* 142 * Don't allow the lofs module to be unloaded for now. 143 * There is a memory leak if it gets unloaded. 
144 */ 145 146 int 147 _fini(void) 148 { 149 return (EBUSY); 150 } 151 152 int 153 _info(struct modinfo *modinfop) 154 { 155 return (mod_info(&modlinkage, modinfop)); 156 } 157 158 159 static int lofsfstype; 160 vfsops_t *lo_vfsops; 161 162 /* 163 * lo mount vfsop 164 * Set up mount info record and attach it to vfs struct. 165 */ 166 /*ARGSUSED*/ 167 static int 168 lo_mount(struct vfs *vfsp, 169 struct vnode *vp, 170 struct mounta *uap, 171 struct cred *cr) 172 { 173 int error; 174 struct vnode *srootvp = NULL; /* the server's root */ 175 struct vnode *realrootvp; 176 struct loinfo *li; 177 int is_zonedevfs = 0; 178 int nodev; 179 180 nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL); 181 182 if ((error = secpolicy_fs_mount(cr, vp, vfsp)) != 0) 183 return (EPERM); 184 185 /* 186 * Loopback devices which get "nodevices" added can be done without 187 * "nodevices" set because we cannot import devices into a zone 188 * with loopback. Note that we have all zone privileges when 189 * this happens; if not, we'd have gotten "nosuid". 190 */ 191 if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 192 vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY); 193 194 /* 195 * We must ensure that only the global zone applies the 'zonedevfs' 196 * option; we don't want non-global zones to be able to establish 197 * lofs mounts using the special dev_t we use to ensure that the 198 * contents of a zone's /dev cannot be victim to link(2) or rename(2). 199 * See below, where we set all of this up. 200 * 201 * Since this is more like a privilege check, we use crgetzoneid(cr) 202 * instead of getzoneid(). 
203 */ 204 is_zonedevfs = vfs_optionisset(vfsp, MNTOPT_LOFS_ZONEDEVFS, NULL); 205 if (crgetzoneid(cr) != GLOBAL_ZONEID && is_zonedevfs) 206 return (EPERM); 207 208 mutex_enter(&vp->v_lock); 209 if (!(uap->flags & MS_OVERLAY) && 210 (vp->v_count != 1 || (vp->v_flag & VROOT))) { 211 mutex_exit(&vp->v_lock); 212 return (EBUSY); 213 } 214 mutex_exit(&vp->v_lock); 215 216 /* 217 * Find real root, and make vfs point to real vfs 218 */ 219 if (error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ? 220 UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP, 221 &realrootvp)) 222 return (error); 223 224 /* 225 * Enforce MAC policy if needed. 226 * 227 * Loopback mounts must not allow writing up. The dominance test 228 * is intended to prevent a global zone caller from accidentally 229 * creating write-up conditions between two labeled zones. 230 * Local zones can't violate MAC on their own without help from 231 * the global zone because they can't name a pathname that 232 * they don't already have. 233 * 234 * The special case check for the NET_MAC_AWARE process flag is 235 * to support the case of the automounter in the global zone. We 236 * permit automounting of local zone directories such as home 237 * directories, into the global zone as required by setlabel, 238 * zonecopy, and saving of desktop sessions. Such mounts are 239 * trusted not to expose the contents of one zone's directories 240 * to another by leaking them through the global zone. 
241 */ 242 if (is_system_labeled() && crgetzoneid(cr) == GLOBAL_ZONEID) { 243 void *specname; 244 zone_t *from_zptr; 245 zone_t *to_zptr; 246 247 if (uap->flags & MS_SYSSPACE) { 248 specname = uap->spec; 249 } else { 250 specname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 251 error = copyinstr(uap->spec, specname, MAXPATHLEN, 252 NULL); 253 if (error) { 254 kmem_free(specname, MAXPATHLEN); 255 return (error); 256 } 257 } 258 from_zptr = zone_find_by_path(specname); 259 if (!(uap->flags & MS_SYSSPACE)) 260 kmem_free(specname, MAXPATHLEN); 261 262 to_zptr = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 263 264 /* 265 * Special case for zone devfs: the zone for /dev will 266 * incorrectly appear as the global zone since it's not 267 * under the zone rootpath. So for zone devfs check allow 268 * read-write mounts. 269 */ 270 271 if (from_zptr != to_zptr && !is_zonedevfs) { 272 /* 273 * We know at this point that the labels aren't equal 274 * because the zone pointers aren't equal, and zones 275 * can't share a label. 276 * 277 * If the source is the global zone then making 278 * it available to a local zone must be done in 279 * read-only mode as the label will become admin_low. 280 * 281 * If it is a mount between local zones then if 282 * the current process is in the global zone and has 283 * the NET_MAC_AWARE flag, then regular read-write 284 * access is allowed. If it's in some other zone, but 285 * the label on the mount point dominates the original 286 * source, then allow the mount as read-only 287 * ("read-down"). 
288 */ 289 if (from_zptr->zone_id == GLOBAL_ZONEID) { 290 /* make the mount read-only */ 291 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 292 } else { /* cross-zone mount */ 293 if (to_zptr->zone_id == GLOBAL_ZONEID && 294 /* LINTED: no consequent */ 295 getpflags(NET_MAC_AWARE, cr) != 0) { 296 /* Allow the mount as read-write */ 297 } else if (bldominates( 298 label2bslabel(to_zptr->zone_slabel), 299 label2bslabel(from_zptr->zone_slabel))) { 300 /* make the mount read-only */ 301 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 302 } else { 303 zone_rele(to_zptr); 304 zone_rele(from_zptr); 305 return (EACCES); 306 } 307 } 308 } 309 zone_rele(to_zptr); 310 zone_rele(from_zptr); 311 } 312 313 /* 314 * realrootvp may be an AUTOFS node, in which case we 315 * perform a VOP_ACCESS() to trigger the mount of the 316 * intended filesystem, so we loopback mount the intended 317 * filesystem instead of the AUTOFS filesystem. 318 */ 319 (void) VOP_ACCESS(realrootvp, 0, 0, cr); 320 321 /* 322 * We're interested in the top most filesystem. 323 * This is specially important when uap->spec is a trigger 324 * AUTOFS node, since we're really interested in mounting the 325 * filesystem AUTOFS mounted as result of the VOP_ACCESS() 326 * call not the AUTOFS node itself. 
327 */ 328 if (vn_mountedvfs(realrootvp) != NULL) { 329 if (error = traverse(&realrootvp)) { 330 VN_RELE(realrootvp); 331 return (error); 332 } 333 } 334 335 /* 336 * Allocate a vfs info struct and attach it 337 */ 338 li = kmem_zalloc(sizeof (struct loinfo), KM_SLEEP); 339 li->li_realvfs = realrootvp->v_vfsp; 340 li->li_mountvfs = vfsp; 341 342 /* 343 * Set mount flags to be inherited by loopback vfs's 344 */ 345 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { 346 li->li_mflag |= VFS_RDONLY; 347 } 348 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 349 li->li_mflag |= (VFS_NOSETUID|VFS_NODEVICES); 350 } 351 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { 352 li->li_mflag |= VFS_NODEVICES; 353 } 354 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { 355 li->li_mflag |= VFS_NOSETUID; 356 } 357 /* 358 * Permissive flags are added to the "deny" bitmap. 359 */ 360 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { 361 li->li_dflag |= VFS_XATTR; 362 } 363 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { 364 li->li_dflag |= VFS_NBMAND; 365 } 366 367 /* 368 * Propagate inheritable mount flags from the real vfs. 369 */ 370 if ((li->li_realvfs->vfs_flag & VFS_RDONLY) && 371 !vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 372 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 373 VFS_NODISPLAY); 374 if ((li->li_realvfs->vfs_flag & VFS_NOSETUID) && 375 !vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 376 vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 377 VFS_NODISPLAY); 378 if ((li->li_realvfs->vfs_flag & VFS_NODEVICES) && 379 !vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 380 vfs_setmntopt(vfsp, MNTOPT_NODEVICES, NULL, 381 VFS_NODISPLAY); 382 /* 383 * Permissive flags such as VFS_XATTR, as opposed to restrictive flags 384 * such as VFS_RDONLY, are handled differently. An explicit 385 * MNTOPT_NOXATTR should override the underlying filesystem's VFS_XATTR. 
386 */ 387 if ((li->li_realvfs->vfs_flag & VFS_XATTR) && 388 !vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL) && 389 !vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) 390 vfs_setmntopt(vfsp, MNTOPT_XATTR, NULL, 391 VFS_NODISPLAY); 392 if ((li->li_realvfs->vfs_flag & VFS_NBMAND) && 393 !vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL) && 394 !vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) 395 vfs_setmntopt(vfsp, MNTOPT_NBMAND, NULL, 396 VFS_NODISPLAY); 397 398 li->li_refct = 0; 399 vfsp->vfs_data = (caddr_t)li; 400 vfsp->vfs_bcount = 0; 401 vfsp->vfs_fstype = lofsfstype; 402 vfsp->vfs_bsize = li->li_realvfs->vfs_bsize; 403 404 /* 405 * Test to see if we need to be in "zone /dev" mode. In zonedevfs 406 * mode, we pull a nasty trick; we make sure that the lofs dev_t does 407 * *not* reflect the underlying device, so that no renames or links 408 * can occur to or from the /dev hierarchy. 409 */ 410 if (is_zonedevfs) { 411 dev_t dev; 412 413 mutex_enter(&lofs_minor_lock); 414 do { 415 lofs_minor = (lofs_minor + 1) & MAXMIN32; 416 dev = makedevice(lofs_major, lofs_minor); 417 } while (vfs_devismounted(dev)); 418 mutex_exit(&lofs_minor_lock); 419 420 vfsp->vfs_dev = dev; 421 vfs_make_fsid(&vfsp->vfs_fsid, dev, lofsfstype); 422 423 li->li_flag |= LO_ZONEDEVFS; 424 } else { 425 vfsp->vfs_dev = li->li_realvfs->vfs_dev; 426 vfsp->vfs_fsid.val[0] = li->li_realvfs->vfs_fsid.val[0]; 427 vfsp->vfs_fsid.val[1] = li->li_realvfs->vfs_fsid.val[1]; 428 } 429 430 if (vfs_optionisset(vfsp, MNTOPT_LOFS_NOSUB, NULL)) { 431 li->li_flag |= LO_NOSUB; 432 } 433 434 /* 435 * Setup the hashtable. If the root of this mount isn't a directory, 436 * there's no point in allocating a large hashtable. A table with one 437 * bucket is sufficient. 
438 */ 439 if (realrootvp->v_type != VDIR) 440 lsetup(li, 1); 441 else 442 lsetup(li, 0); 443 444 /* 445 * Make the root vnode 446 */ 447 srootvp = makelonode(realrootvp, li, 0); 448 srootvp->v_flag |= VROOT; 449 li->li_rootvp = srootvp; 450 451 #ifdef LODEBUG 452 lo_dprint(4, "lo_mount: vfs %p realvfs %p root %p realroot %p li %p\n", 453 vfsp, li->li_realvfs, srootvp, realrootvp, li); 454 #endif 455 return (0); 456 } 457 458 /* 459 * Undo loopback mount 460 */ 461 static int 462 lo_unmount(struct vfs *vfsp, int flag, struct cred *cr) 463 { 464 struct loinfo *li; 465 466 if (secpolicy_fs_unmount(cr, vfsp) != 0) 467 return (EPERM); 468 469 /* 470 * Forced unmount is not supported by this file system 471 * and thus, ENOTSUP, is being returned. 472 */ 473 if (flag & MS_FORCE) 474 return (ENOTSUP); 475 476 li = vtoli(vfsp); 477 #ifdef LODEBUG 478 lo_dprint(4, "lo_unmount(%p) li %p\n", vfsp, li); 479 #endif 480 if (li->li_refct != 1 || li->li_rootvp->v_count != 1) { 481 #ifdef LODEBUG 482 lo_dprint(4, "refct %d v_ct %d\n", li->li_refct, 483 li->li_rootvp->v_count); 484 #endif 485 return (EBUSY); 486 } 487 VN_RELE(li->li_rootvp); 488 return (0); 489 } 490 491 /* 492 * Find root of lofs mount. 493 */ 494 static int 495 lo_root(struct vfs *vfsp, struct vnode **vpp) 496 { 497 *vpp = vtoli(vfsp)->li_rootvp; 498 #ifdef LODEBUG 499 lo_dprint(4, "lo_root(0x%p) = %p\n", vfsp, *vpp); 500 #endif 501 /* 502 * If the root of the filesystem is a special file, return the specvp 503 * version of the vnode. We don't save the specvp vnode in our 504 * hashtable since that's exclusively for lnodes. 505 */ 506 if (IS_DEVVP(*vpp)) { 507 struct vnode *svp; 508 509 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, kcred); 510 if (svp == NULL) 511 return (ENOSYS); 512 *vpp = svp; 513 } else { 514 VN_HOLD(*vpp); 515 } 516 517 return (0); 518 } 519 520 /* 521 * Get file system statistics. 
522 */ 523 static int 524 lo_statvfs(register struct vfs *vfsp, struct statvfs64 *sbp) 525 { 526 vnode_t *realrootvp; 527 528 #ifdef LODEBUG 529 lo_dprint(4, "lostatvfs %p\n", vfsp); 530 #endif 531 /* 532 * Using realrootvp->v_vfsp (instead of the realvfsp that was 533 * cached) is necessary to make lofs work woth forced UFS unmounts. 534 * In the case of a forced unmount, UFS stores a set of dummy vfsops 535 * in all the (i)vnodes in the filesystem. The dummy ops simply 536 * returns back EIO. 537 */ 538 (void) lo_realvfs(vfsp, &realrootvp); 539 if (realrootvp != NULL) 540 return (VFS_STATVFS(realrootvp->v_vfsp, sbp)); 541 else 542 return (EIO); 543 } 544 545 /* 546 * LOFS doesn't have any data or metadata to flush, pending I/O on the 547 * underlying filesystem will be flushed when such filesystem is synched. 548 */ 549 /* ARGSUSED */ 550 static int 551 lo_sync(struct vfs *vfsp, 552 short flag, 553 struct cred *cr) 554 { 555 #ifdef LODEBUG 556 lo_dprint(4, "lo_sync: %p\n", vfsp); 557 #endif 558 return (0); 559 } 560 561 /* 562 * Obtain the vnode from the underlying filesystem. 563 */ 564 static int 565 lo_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp) 566 { 567 vnode_t *realrootvp; 568 569 #ifdef LODEBUG 570 lo_dprint(4, "lo_vget: %p\n", vfsp); 571 #endif 572 (void) lo_realvfs(vfsp, &realrootvp); 573 if (realrootvp != NULL) 574 return (VFS_VGET(realrootvp->v_vfsp, vpp, fidp)); 575 else 576 return (EIO); 577 } 578 579 /* 580 * Free mount-specific data. 
581 */ 582 static void 583 lo_freevfs(struct vfs *vfsp) 584 { 585 struct loinfo *li = vtoli(vfsp); 586 587 ldestroy(li); 588 kmem_free(li, sizeof (struct loinfo)); 589 } 590 591 static int 592 lofsinit(int fstyp, char *name) 593 { 594 static const fs_operation_def_t lo_vfsops_template[] = { 595 VFSNAME_MOUNT, lo_mount, 596 VFSNAME_UNMOUNT, lo_unmount, 597 VFSNAME_ROOT, lo_root, 598 VFSNAME_STATVFS, lo_statvfs, 599 VFSNAME_SYNC, (fs_generic_func_p) lo_sync, 600 VFSNAME_VGET, lo_vget, 601 VFSNAME_FREEVFS, (fs_generic_func_p) lo_freevfs, 602 NULL, NULL 603 }; 604 int error; 605 606 error = vfs_setfsops(fstyp, lo_vfsops_template, &lo_vfsops); 607 if (error != 0) { 608 cmn_err(CE_WARN, "lofsinit: bad vfs ops template"); 609 return (error); 610 } 611 612 error = vn_make_ops(name, lo_vnodeops_template, &lo_vnodeops); 613 if (error != 0) { 614 (void) vfs_freevfsops_by_type(fstyp); 615 cmn_err(CE_WARN, "lofsinit: bad vnode ops template"); 616 return (error); 617 } 618 619 lofsfstype = fstyp; 620 621 if ((lofs_major = getudev()) == (major_t)-1) { 622 (void) vfs_freevfsops_by_type(fstyp); 623 cmn_err(CE_WARN, "lofsinit: Can't get unique device number."); 624 return (ENXIO); 625 } 626 627 lofs_minor = 0; 628 mutex_init(&lofs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 629 630 return (0); 631 } 632