1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/param.h> 29 #include <sys/errno.h> 30 #include <sys/vfs.h> 31 #include <sys/vnode.h> 32 #include <sys/uio.h> 33 #include <sys/pathname.h> 34 #include <sys/kmem.h> 35 #include <sys/cred.h> 36 #include <sys/statvfs.h> 37 #include <sys/fs/lofs_info.h> 38 #include <sys/fs/lofs_node.h> 39 #include <sys/mount.h> 40 #include <sys/mntent.h> 41 #include <sys/mkdev.h> 42 #include <sys/priv.h> 43 #include <sys/sysmacros.h> 44 #include <sys/systm.h> 45 #include <sys/cmn_err.h> 46 #include <sys/policy.h> 47 #include <sys/tsol/label.h> 48 #include "fs/fs_subr.h" 49 50 /* 51 * This is the loadable module wrapper. 52 */ 53 #include <sys/modctl.h> 54 55 static mntopts_t lofs_mntopts; 56 57 static int lofsinit(int, char *); 58 59 static vfsdef_t vfw = { 60 VFSDEF_VERSION, 61 "lofs", 62 lofsinit, 63 VSW_HASPROTO|VSW_STATS, 64 &lofs_mntopts 65 }; 66 67 /* 68 * Stuff needed to support "zonedevfs" mode. 69 */ 70 static major_t lofs_major; 71 static minor_t lofs_minor; 72 static kmutex_t lofs_minor_lock; 73 74 /* 75 * LOFS mount options table 76 */ 77 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; 78 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; 79 static char *zonedevfs_cancel[] = { MNTOPT_LOFS_NOZONEDEVFS, NULL }; 80 static char *nozonedevfs_cancel[] = { MNTOPT_LOFS_ZONEDEVFS, NULL }; 81 static char *sub_cancel[] = { MNTOPT_LOFS_NOSUB, NULL }; 82 static char *nosub_cancel[] = { MNTOPT_LOFS_SUB, NULL }; 83 84 static mntopt_t mntopts[] = { 85 /* 86 * option name cancel option default arg flags 87 * private data 88 */ 89 { MNTOPT_XATTR, xattr_cancel, NULL, 0, 90 (void *)0 }, 91 { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, 92 (void *)0 }, 93 { MNTOPT_LOFS_ZONEDEVFS, zonedevfs_cancel, NULL, 0, 94 (void *)0 }, 95 { MNTOPT_LOFS_NOZONEDEVFS, nozonedevfs_cancel, NULL, 0, 96 (void *)0 }, 97 { MNTOPT_LOFS_SUB, sub_cancel, NULL, 0, 98 (void *)0 }, 99 { MNTOPT_LOFS_NOSUB, nosub_cancel, NULL, 0, 100 (void *)0 }, 101 }; 102 103 static mntopts_t lofs_mntopts = { 104 sizeof (mntopts) / sizeof (mntopt_t), 105 mntopts 106 }; 107 108 /* 109 * Module linkage information for the kernel. 110 */ 111 112 static struct modlfs modlfs = { 113 &mod_fsops, "filesystem for lofs", &vfw 114 }; 115 116 static struct modlinkage modlinkage = { 117 MODREV_1, (void *)&modlfs, NULL 118 }; 119 120 /* 121 * This is the module initialization routine. 122 */ 123 124 int 125 _init(void) 126 { 127 int status; 128 129 lofs_subrinit(); 130 status = mod_install(&modlinkage); 131 if (status != 0) { 132 /* 133 * Cleanup previously initialized work. 134 */ 135 lofs_subrfini(); 136 } 137 138 return (status); 139 } 140 141 /* 142 * Don't allow the lofs module to be unloaded for now. 143 * There is a memory leak if it gets unloaded. 144 */ 145 146 int 147 _fini(void) 148 { 149 return (EBUSY); 150 } 151 152 int 153 _info(struct modinfo *modinfop) 154 { 155 return (mod_info(&modlinkage, modinfop)); 156 } 157 158 159 static int lofsfstype; 160 vfsops_t *lo_vfsops; 161 162 /* 163 * lo mount vfsop 164 * Set up mount info record and attach it to vfs struct. 165 */ 166 /*ARGSUSED*/ 167 static int 168 lo_mount(struct vfs *vfsp, 169 struct vnode *vp, 170 struct mounta *uap, 171 struct cred *cr) 172 { 173 int error; 174 struct vnode *srootvp = NULL; /* the server's root */ 175 struct vnode *realrootvp; 176 struct loinfo *li; 177 int is_zonedevfs = 0; 178 int nodev; 179 180 nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL); 181 182 if ((error = secpolicy_fs_mount(cr, vp, vfsp)) != 0) 183 return (EPERM); 184 185 /* 186 * Loopback devices which get "nodevices" added can be done without 187 * "nodevices" set because we cannot import devices into a zone 188 * with loopback. Note that we have all zone privileges when 189 * this happens; if not, we'd have gotten "nosuid". 190 */ 191 if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 192 vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY); 193 194 /* 195 * We must ensure that only the global zone applies the 'zonedevfs' 196 * option; we don't want non-global zones to be able to establish 197 * lofs mounts using the special dev_t we use to ensure that the 198 * contents of a zone's /dev cannot be victim to link(2) or rename(2). 199 * See below, where we set all of this up. 200 * 201 * Since this is more like a privilege check, we use crgetzoneid(cr) 202 * instead of getzoneid(). 203 */ 204 is_zonedevfs = vfs_optionisset(vfsp, MNTOPT_LOFS_ZONEDEVFS, NULL); 205 if (crgetzoneid(cr) != GLOBAL_ZONEID && is_zonedevfs) 206 return (EPERM); 207 208 mutex_enter(&vp->v_lock); 209 if (!(uap->flags & MS_OVERLAY) && 210 (vp->v_count != 1 || (vp->v_flag & VROOT))) { 211 mutex_exit(&vp->v_lock); 212 return (EBUSY); 213 } 214 mutex_exit(&vp->v_lock); 215 216 /* 217 * Find real root, and make vfs point to real vfs 218 */ 219 if (error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ? 220 UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP, 221 &realrootvp)) 222 return (error); 223 224 /* 225 * Enforce MAC policy if needed. 226 * 227 * Loopback mounts must not allow writing up. The dominance test 228 * is intended to prevent a global zone caller from accidentally 229 * creating write-up conditions between two labeled zones. 230 * Local zones can't violate MAC on their own without help from 231 * the global zone because they can't name a pathname that 232 * they don't already have. 233 * 234 * The special case check for the NET_MAC_AWARE process flag is 235 * to support the case of the automounter in the global zone. We 236 * permit automounting of local zone directories such as home 237 * directories, into the global zone as required by setlabel, 238 * zonecopy, and saving of desktop sessions. Such mounts are 239 * trusted not to expose the contents of one zone's directories 240 * to another by leaking them through the global zone. 241 */ 242 if (is_system_labeled() && crgetzoneid(cr) == GLOBAL_ZONEID) { 243 char specname[MAXPATHLEN]; 244 zone_t *from_zptr; 245 zone_t *to_zptr; 246 247 if (vnodetopath(NULL, realrootvp, specname, 248 sizeof (specname), CRED()) != 0) 249 return (EACCES); 250 251 from_zptr = zone_find_by_path(specname); 252 to_zptr = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 253 254 /* 255 * Special case for zone devfs: the zone for /dev will 256 * incorrectly appear as the global zone since it's not 257 * under the zone rootpath. So for zone devfs check allow 258 * read-write mounts. 259 */ 260 261 if (from_zptr != to_zptr && !is_zonedevfs) { 262 /* 263 * We know at this point that the labels aren't equal 264 * because the zone pointers aren't equal, and zones 265 * can't share a label. 266 * 267 * If the source is the global zone then making 268 * it available to a local zone must be done in 269 * read-only mode as the label will become admin_low. 270 * 271 * If it is a mount between local zones then if 272 * the current process is in the global zone and has 273 * the NET_MAC_AWARE flag, then regular read-write 274 * access is allowed. If it's in some other zone, but 275 * the label on the mount point dominates the original 276 * source, then allow the mount as read-only 277 * ("read-down"). 278 */ 279 if (from_zptr->zone_id == GLOBAL_ZONEID) { 280 /* make the mount read-only */ 281 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 282 } else { /* cross-zone mount */ 283 if (to_zptr->zone_id == GLOBAL_ZONEID && 284 /* LINTED: no consequent */ 285 getpflags(NET_MAC_AWARE, cr) != 0) { 286 /* Allow the mount as read-write */ 287 } else if (bldominates( 288 label2bslabel(to_zptr->zone_slabel), 289 label2bslabel(from_zptr->zone_slabel))) { 290 /* make the mount read-only */ 291 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 292 } else { 293 zone_rele(to_zptr); 294 zone_rele(from_zptr); 295 return (EACCES); 296 } 297 } 298 } 299 zone_rele(to_zptr); 300 zone_rele(from_zptr); 301 } 302 303 /* 304 * realrootvp may be an AUTOFS node, in which case we 305 * perform a VOP_ACCESS() to trigger the mount of the 306 * intended filesystem, so we loopback mount the intended 307 * filesystem instead of the AUTOFS filesystem. 308 */ 309 (void) VOP_ACCESS(realrootvp, 0, 0, cr); 310 311 /* 312 * We're interested in the top most filesystem. 313 * This is specially important when uap->spec is a trigger 314 * AUTOFS node, since we're really interested in mounting the 315 * filesystem AUTOFS mounted as result of the VOP_ACCESS() 316 * call not the AUTOFS node itself. 317 */ 318 if (vn_mountedvfs(realrootvp) != NULL) { 319 if (error = traverse(&realrootvp)) { 320 VN_RELE(realrootvp); 321 return (error); 322 } 323 } 324 325 /* 326 * Allocate a vfs info struct and attach it 327 */ 328 li = kmem_zalloc(sizeof (struct loinfo), KM_SLEEP); 329 li->li_realvfs = realrootvp->v_vfsp; 330 li->li_mountvfs = vfsp; 331 332 /* 333 * Set mount flags to be inherited by loopback vfs's 334 */ 335 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { 336 li->li_mflag |= VFS_RDONLY; 337 } 338 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 339 li->li_mflag |= (VFS_NOSETUID|VFS_NODEVICES); 340 } 341 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { 342 li->li_mflag |= VFS_NODEVICES; 343 } 344 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { 345 li->li_mflag |= VFS_NOSETUID; 346 } 347 /* 348 * Permissive flags are added to the "deny" bitmap. 349 */ 350 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { 351 li->li_dflag |= VFS_XATTR; 352 } 353 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { 354 li->li_dflag |= VFS_NBMAND; 355 } 356 357 /* 358 * Propagate inheritable mount flags from the real vfs. 359 */ 360 if ((li->li_realvfs->vfs_flag & VFS_RDONLY) && 361 !vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 362 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 363 VFS_NODISPLAY); 364 if ((li->li_realvfs->vfs_flag & VFS_NOSETUID) && 365 !vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 366 vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 367 VFS_NODISPLAY); 368 if ((li->li_realvfs->vfs_flag & VFS_NODEVICES) && 369 !vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 370 vfs_setmntopt(vfsp, MNTOPT_NODEVICES, NULL, 371 VFS_NODISPLAY); 372 /* 373 * Permissive flags such as VFS_XATTR, as opposed to restrictive flags 374 * such as VFS_RDONLY, are handled differently. An explicit 375 * MNTOPT_NOXATTR should override the underlying filesystem's VFS_XATTR. 376 */ 377 if ((li->li_realvfs->vfs_flag & VFS_XATTR) && 378 !vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL) && 379 !vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) 380 vfs_setmntopt(vfsp, MNTOPT_XATTR, NULL, 381 VFS_NODISPLAY); 382 if ((li->li_realvfs->vfs_flag & VFS_NBMAND) && 383 !vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL) && 384 !vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) 385 vfs_setmntopt(vfsp, MNTOPT_NBMAND, NULL, 386 VFS_NODISPLAY); 387 388 li->li_refct = 0; 389 vfsp->vfs_data = (caddr_t)li; 390 vfsp->vfs_bcount = 0; 391 vfsp->vfs_fstype = lofsfstype; 392 vfsp->vfs_bsize = li->li_realvfs->vfs_bsize; 393 394 /* 395 * Test to see if we need to be in "zone /dev" mode. In zonedevfs 396 * mode, we pull a nasty trick; we make sure that the lofs dev_t does 397 * *not* reflect the underlying device, so that no renames or links 398 * can occur to or from the /dev hierarchy. 399 */ 400 if (is_zonedevfs) { 401 dev_t dev; 402 403 mutex_enter(&lofs_minor_lock); 404 do { 405 lofs_minor = (lofs_minor + 1) & MAXMIN32; 406 dev = makedevice(lofs_major, lofs_minor); 407 } while (vfs_devismounted(dev)); 408 mutex_exit(&lofs_minor_lock); 409 410 vfsp->vfs_dev = dev; 411 vfs_make_fsid(&vfsp->vfs_fsid, dev, lofsfstype); 412 413 li->li_flag |= LO_ZONEDEVFS; 414 } else { 415 vfsp->vfs_dev = li->li_realvfs->vfs_dev; 416 vfsp->vfs_fsid.val[0] = li->li_realvfs->vfs_fsid.val[0]; 417 vfsp->vfs_fsid.val[1] = li->li_realvfs->vfs_fsid.val[1]; 418 } 419 420 if (vfs_optionisset(vfsp, MNTOPT_LOFS_NOSUB, NULL)) { 421 li->li_flag |= LO_NOSUB; 422 } 423 424 /* 425 * Setup the hashtable. If the root of this mount isn't a directory, 426 * there's no point in allocating a large hashtable. A table with one 427 * bucket is sufficient. 428 */ 429 if (realrootvp->v_type != VDIR) 430 lsetup(li, 1); 431 else 432 lsetup(li, 0); 433 434 /* 435 * Make the root vnode 436 */ 437 srootvp = makelonode(realrootvp, li, 0); 438 srootvp->v_flag |= VROOT; 439 li->li_rootvp = srootvp; 440 441 #ifdef LODEBUG 442 lo_dprint(4, "lo_mount: vfs %p realvfs %p root %p realroot %p li %p\n", 443 vfsp, li->li_realvfs, srootvp, realrootvp, li); 444 #endif 445 return (0); 446 } 447 448 /* 449 * Undo loopback mount 450 */ 451 static int 452 lo_unmount(struct vfs *vfsp, int flag, struct cred *cr) 453 { 454 struct loinfo *li; 455 456 if (secpolicy_fs_unmount(cr, vfsp) != 0) 457 return (EPERM); 458 459 /* 460 * Forced unmount is not supported by this file system 461 * and thus, ENOTSUP, is being returned. 462 */ 463 if (flag & MS_FORCE) 464 return (ENOTSUP); 465 466 li = vtoli(vfsp); 467 #ifdef LODEBUG 468 lo_dprint(4, "lo_unmount(%p) li %p\n", vfsp, li); 469 #endif 470 if (li->li_refct != 1 || li->li_rootvp->v_count != 1) { 471 #ifdef LODEBUG 472 lo_dprint(4, "refct %d v_ct %d\n", li->li_refct, 473 li->li_rootvp->v_count); 474 #endif 475 return (EBUSY); 476 } 477 VN_RELE(li->li_rootvp); 478 return (0); 479 } 480 481 /* 482 * Find root of lofs mount. 483 */ 484 static int 485 lo_root(struct vfs *vfsp, struct vnode **vpp) 486 { 487 *vpp = vtoli(vfsp)->li_rootvp; 488 #ifdef LODEBUG 489 lo_dprint(4, "lo_root(0x%p) = %p\n", vfsp, *vpp); 490 #endif 491 /* 492 * If the root of the filesystem is a special file, return the specvp 493 * version of the vnode. We don't save the specvp vnode in our 494 * hashtable since that's exclusively for lnodes. 495 */ 496 if (IS_DEVVP(*vpp)) { 497 struct vnode *svp; 498 499 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, kcred); 500 if (svp == NULL) 501 return (ENOSYS); 502 *vpp = svp; 503 } else { 504 VN_HOLD(*vpp); 505 } 506 507 return (0); 508 } 509 510 /* 511 * Get file system statistics. 512 */ 513 static int 514 lo_statvfs(register struct vfs *vfsp, struct statvfs64 *sbp) 515 { 516 vnode_t *realrootvp; 517 518 #ifdef LODEBUG 519 lo_dprint(4, "lostatvfs %p\n", vfsp); 520 #endif 521 /* 522 * Using realrootvp->v_vfsp (instead of the realvfsp that was 523 * cached) is necessary to make lofs work woth forced UFS unmounts. 524 * In the case of a forced unmount, UFS stores a set of dummy vfsops 525 * in all the (i)vnodes in the filesystem. The dummy ops simply 526 * returns back EIO. 527 */ 528 (void) lo_realvfs(vfsp, &realrootvp); 529 if (realrootvp != NULL) 530 return (VFS_STATVFS(realrootvp->v_vfsp, sbp)); 531 else 532 return (EIO); 533 } 534 535 /* 536 * LOFS doesn't have any data or metadata to flush, pending I/O on the 537 * underlying filesystem will be flushed when such filesystem is synched. 538 */ 539 /* ARGSUSED */ 540 static int 541 lo_sync(struct vfs *vfsp, 542 short flag, 543 struct cred *cr) 544 { 545 #ifdef LODEBUG 546 lo_dprint(4, "lo_sync: %p\n", vfsp); 547 #endif 548 return (0); 549 } 550 551 /* 552 * Obtain the vnode from the underlying filesystem. 553 */ 554 static int 555 lo_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp) 556 { 557 vnode_t *realrootvp; 558 559 #ifdef LODEBUG 560 lo_dprint(4, "lo_vget: %p\n", vfsp); 561 #endif 562 (void) lo_realvfs(vfsp, &realrootvp); 563 if (realrootvp != NULL) 564 return (VFS_VGET(realrootvp->v_vfsp, vpp, fidp)); 565 else 566 return (EIO); 567 } 568 569 /* 570 * Free mount-specific data. 571 */ 572 static void 573 lo_freevfs(struct vfs *vfsp) 574 { 575 struct loinfo *li = vtoli(vfsp); 576 577 ldestroy(li); 578 kmem_free(li, sizeof (struct loinfo)); 579 } 580 581 static int 582 lofsinit(int fstyp, char *name) 583 { 584 static const fs_operation_def_t lo_vfsops_template[] = { 585 VFSNAME_MOUNT, lo_mount, 586 VFSNAME_UNMOUNT, lo_unmount, 587 VFSNAME_ROOT, lo_root, 588 VFSNAME_STATVFS, lo_statvfs, 589 VFSNAME_SYNC, (fs_generic_func_p) lo_sync, 590 VFSNAME_VGET, lo_vget, 591 VFSNAME_FREEVFS, (fs_generic_func_p) lo_freevfs, 592 NULL, NULL 593 }; 594 int error; 595 596 error = vfs_setfsops(fstyp, lo_vfsops_template, &lo_vfsops); 597 if (error != 0) { 598 cmn_err(CE_WARN, "lofsinit: bad vfs ops template"); 599 return (error); 600 } 601 602 error = vn_make_ops(name, lo_vnodeops_template, &lo_vnodeops); 603 if (error != 0) { 604 (void) vfs_freevfsops_by_type(fstyp); 605 cmn_err(CE_WARN, "lofsinit: bad vnode ops template"); 606 return (error); 607 } 608 609 lofsfstype = fstyp; 610 611 if ((lofs_major = getudev()) == (major_t)-1) { 612 (void) vfs_freevfsops_by_type(fstyp); 613 cmn_err(CE_WARN, "lofsinit: Can't get unique device number."); 614 return (ENXIO); 615 } 616 617 lofs_minor = 0; 618 mutex_init(&lofs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 619 620 return (0); 621 } 622