1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/param.h> 29 #include <sys/errno.h> 30 #include <sys/vfs.h> 31 #include <sys/vnode.h> 32 #include <sys/uio.h> 33 #include <sys/pathname.h> 34 #include <sys/kmem.h> 35 #include <sys/cred.h> 36 #include <sys/statvfs.h> 37 #include <sys/fs/lofs_info.h> 38 #include <sys/fs/lofs_node.h> 39 #include <sys/mount.h> 40 #include <sys/mntent.h> 41 #include <sys/mkdev.h> 42 #include <sys/priv.h> 43 #include <sys/sysmacros.h> 44 #include <sys/systm.h> 45 #include <sys/cmn_err.h> 46 #include <sys/policy.h> 47 #include <sys/tsol/label.h> 48 #include "fs/fs_subr.h" 49 50 /* 51 * This is the loadable module wrapper. 52 */ 53 #include <sys/modctl.h> 54 55 static mntopts_t lofs_mntopts; 56 57 static int lofsinit(int, char *); 58 59 static vfsdef_t vfw = { 60 VFSDEF_VERSION, 61 "lofs", 62 lofsinit, 63 VSW_HASPROTO|VSW_STATS, 64 &lofs_mntopts 65 }; 66 67 /* 68 * Stuff needed to support "zonedevfs" mode. 69 */ 70 static major_t lofs_major; 71 static minor_t lofs_minor; 72 static kmutex_t lofs_minor_lock; 73 74 /* 75 * LOFS mount options table 76 */ 77 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; 78 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; 79 static char *zonedevfs_cancel[] = { MNTOPT_LOFS_NOZONEDEVFS, NULL }; 80 static char *nozonedevfs_cancel[] = { MNTOPT_LOFS_ZONEDEVFS, NULL }; 81 static char *sub_cancel[] = { MNTOPT_LOFS_NOSUB, NULL }; 82 static char *nosub_cancel[] = { MNTOPT_LOFS_SUB, NULL }; 83 84 static mntopt_t mntopts[] = { 85 /* 86 * option name cancel option default arg flags 87 * private data 88 */ 89 { MNTOPT_XATTR, xattr_cancel, NULL, 0, 90 (void *)0 }, 91 { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, 92 (void *)0 }, 93 { MNTOPT_LOFS_ZONEDEVFS, zonedevfs_cancel, NULL, 0, 94 (void *)0 }, 95 { MNTOPT_LOFS_NOZONEDEVFS, nozonedevfs_cancel, NULL, 0, 96 (void *)0 }, 97 { MNTOPT_LOFS_SUB, sub_cancel, NULL, 0, 98 (void *)0 }, 99 { MNTOPT_LOFS_NOSUB, nosub_cancel, NULL, 0, 100 (void *)0 }, 101 }; 102 103 static mntopts_t lofs_mntopts = { 104 sizeof (mntopts) / sizeof (mntopt_t), 105 mntopts 106 }; 107 108 /* 109 * Module linkage information for the kernel. 110 */ 111 112 static struct modlfs modlfs = { 113 &mod_fsops, "filesystem for lofs", &vfw 114 }; 115 116 static struct modlinkage modlinkage = { 117 MODREV_1, (void *)&modlfs, NULL 118 }; 119 120 /* 121 * This is the module initialization routine. 122 */ 123 124 int 125 _init(void) 126 { 127 int status; 128 129 lofs_subrinit(); 130 status = mod_install(&modlinkage); 131 if (status != 0) { 132 /* 133 * Cleanup previously initialized work. 134 */ 135 lofs_subrfini(); 136 } 137 138 return (status); 139 } 140 141 /* 142 * Don't allow the lofs module to be unloaded for now. 143 * There is a memory leak if it gets unloaded. 144 */ 145 146 int 147 _fini(void) 148 { 149 return (EBUSY); 150 } 151 152 int 153 _info(struct modinfo *modinfop) 154 { 155 return (mod_info(&modlinkage, modinfop)); 156 } 157 158 159 static int lofsfstype; 160 vfsops_t *lo_vfsops; 161 162 /* 163 * lo mount vfsop 164 * Set up mount info record and attach it to vfs struct. 165 */ 166 /*ARGSUSED*/ 167 static int 168 lo_mount(struct vfs *vfsp, 169 struct vnode *vp, 170 struct mounta *uap, 171 struct cred *cr) 172 { 173 int error; 174 struct vnode *srootvp = NULL; /* the server's root */ 175 struct vnode *realrootvp; 176 struct loinfo *li; 177 int is_zonedevfs = 0; 178 int nodev; 179 180 nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL); 181 182 if ((error = secpolicy_fs_mount(cr, vp, vfsp)) != 0) 183 return (EPERM); 184 185 /* 186 * Loopback devices which get "nodevices" added can be done without 187 * "nodevices" set because we cannot import devices into a zone 188 * with loopback. Note that we have all zone privileges when 189 * this happens; if not, we'd have gotten "nosuid". 190 */ 191 if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 192 vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY); 193 194 /* 195 * We must ensure that only the global zone applies the 'zonedevfs' 196 * option; we don't want non-global zones to be able to establish 197 * lofs mounts using the special dev_t we use to ensure that the 198 * contents of a zone's /dev cannot be victim to link(2) or rename(2). 199 * See below, where we set all of this up. 200 * 201 * Since this is more like a privilege check, we use crgetzoneid(cr) 202 * instead of getzoneid(). 203 */ 204 is_zonedevfs = vfs_optionisset(vfsp, MNTOPT_LOFS_ZONEDEVFS, NULL); 205 if (crgetzoneid(cr) != GLOBAL_ZONEID && is_zonedevfs) 206 return (EPERM); 207 208 mutex_enter(&vp->v_lock); 209 if (!(uap->flags & MS_OVERLAY) && 210 (vp->v_count != 1 || (vp->v_flag & VROOT))) { 211 mutex_exit(&vp->v_lock); 212 return (EBUSY); 213 } 214 mutex_exit(&vp->v_lock); 215 216 /* 217 * Find real root, and make vfs point to real vfs 218 */ 219 if (error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ? 220 UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP, 221 &realrootvp)) 222 return (error); 223 224 /* 225 * Enforce MAC policy if needed. 226 * 227 * Loopback mounts must not allow writing up. The dominance test 228 * is intended to prevent a global zone caller from accidentally 229 * creating write-up conditions between two labeled zones. 230 * Local zones can't violate MAC on their own without help from 231 * the global zone because they can't name a pathname that 232 * they don't already have. 233 * 234 * The special case check for the NET_MAC_AWARE process flag is 235 * to support the case of the automounter in the global zone. We 236 * permit automounting of local zone directories such as home 237 * directories, into the global zone as required by setlabel, 238 * zonecopy, and saving of desktop sessions. Such mounts are 239 * trusted not to expose the contents of one zone's directories 240 * to another by leaking them through the global zone. 241 */ 242 if (is_system_labeled() && crgetzoneid(cr) == GLOBAL_ZONEID) { 243 char specname[MAXPATHLEN]; 244 zone_t *from_zptr; 245 zone_t *to_zptr; 246 247 if (vnodetopath(NULL, realrootvp, specname, 248 sizeof (specname), CRED()) != 0) 249 return (EACCES); 250 251 from_zptr = zone_find_by_path(specname); 252 to_zptr = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 253 254 /* 255 * Special case for zone devfs: the zone for /dev will 256 * incorrectly appear as the global zone since it's not 257 * under the zone rootpath. So for zone devfs check allow 258 * read-write mounts. 259 * 260 * Second special case for scratch zones used for Live Upgrade: 261 * this is used to mount the zone's root from /root to /a in 262 * the scratch zone. As with the other special case, this 263 * appears to be outside of the zone because it's not under 264 * the zone rootpath, which is $ZONEPATH/lu in the scratch 265 * zone case. 266 */ 267 268 if (from_zptr != to_zptr && !is_zonedevfs && 269 !(to_zptr->zone_flags & ZF_IS_SCRATCH)) { 270 /* 271 * We know at this point that the labels aren't equal 272 * because the zone pointers aren't equal, and zones 273 * can't share a label. 274 * 275 * If the source is the global zone then making 276 * it available to a local zone must be done in 277 * read-only mode as the label will become admin_low. 278 * 279 * If it is a mount between local zones then if 280 * the current process is in the global zone and has 281 * the NET_MAC_AWARE flag, then regular read-write 282 * access is allowed. If it's in some other zone, but 283 * the label on the mount point dominates the original 284 * source, then allow the mount as read-only 285 * ("read-down"). 286 */ 287 if (from_zptr->zone_id == GLOBAL_ZONEID) { 288 /* make the mount read-only */ 289 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 290 } else { /* cross-zone mount */ 291 if (to_zptr->zone_id == GLOBAL_ZONEID && 292 /* LINTED: no consequent */ 293 getpflags(NET_MAC_AWARE, cr) != 0) { 294 /* Allow the mount as read-write */ 295 } else if (bldominates( 296 label2bslabel(to_zptr->zone_slabel), 297 label2bslabel(from_zptr->zone_slabel))) { 298 /* make the mount read-only */ 299 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 300 } else { 301 zone_rele(to_zptr); 302 zone_rele(from_zptr); 303 return (EACCES); 304 } 305 } 306 } 307 zone_rele(to_zptr); 308 zone_rele(from_zptr); 309 } 310 311 /* 312 * realrootvp may be an AUTOFS node, in which case we 313 * perform a VOP_ACCESS() to trigger the mount of the 314 * intended filesystem, so we loopback mount the intended 315 * filesystem instead of the AUTOFS filesystem. 316 */ 317 (void) VOP_ACCESS(realrootvp, 0, 0, cr); 318 319 /* 320 * We're interested in the top most filesystem. 321 * This is specially important when uap->spec is a trigger 322 * AUTOFS node, since we're really interested in mounting the 323 * filesystem AUTOFS mounted as result of the VOP_ACCESS() 324 * call not the AUTOFS node itself. 325 */ 326 if (vn_mountedvfs(realrootvp) != NULL) { 327 if (error = traverse(&realrootvp)) { 328 VN_RELE(realrootvp); 329 return (error); 330 } 331 } 332 333 /* 334 * Allocate a vfs info struct and attach it 335 */ 336 li = kmem_zalloc(sizeof (struct loinfo), KM_SLEEP); 337 li->li_realvfs = realrootvp->v_vfsp; 338 li->li_mountvfs = vfsp; 339 340 /* 341 * Set mount flags to be inherited by loopback vfs's 342 */ 343 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { 344 li->li_mflag |= VFS_RDONLY; 345 } 346 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 347 li->li_mflag |= (VFS_NOSETUID|VFS_NODEVICES); 348 } 349 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { 350 li->li_mflag |= VFS_NODEVICES; 351 } 352 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { 353 li->li_mflag |= VFS_NOSETUID; 354 } 355 /* 356 * Permissive flags are added to the "deny" bitmap. 357 */ 358 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { 359 li->li_dflag |= VFS_XATTR; 360 } 361 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { 362 li->li_dflag |= VFS_NBMAND; 363 } 364 365 /* 366 * Propagate inheritable mount flags from the real vfs. 367 */ 368 if ((li->li_realvfs->vfs_flag & VFS_RDONLY) && 369 !vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 370 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 371 VFS_NODISPLAY); 372 if ((li->li_realvfs->vfs_flag & VFS_NOSETUID) && 373 !vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 374 vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 375 VFS_NODISPLAY); 376 if ((li->li_realvfs->vfs_flag & VFS_NODEVICES) && 377 !vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 378 vfs_setmntopt(vfsp, MNTOPT_NODEVICES, NULL, 379 VFS_NODISPLAY); 380 /* 381 * Permissive flags such as VFS_XATTR, as opposed to restrictive flags 382 * such as VFS_RDONLY, are handled differently. An explicit 383 * MNTOPT_NOXATTR should override the underlying filesystem's VFS_XATTR. 384 */ 385 if ((li->li_realvfs->vfs_flag & VFS_XATTR) && 386 !vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL) && 387 !vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) 388 vfs_setmntopt(vfsp, MNTOPT_XATTR, NULL, 389 VFS_NODISPLAY); 390 if ((li->li_realvfs->vfs_flag & VFS_NBMAND) && 391 !vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL) && 392 !vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) 393 vfs_setmntopt(vfsp, MNTOPT_NBMAND, NULL, 394 VFS_NODISPLAY); 395 396 li->li_refct = 0; 397 vfsp->vfs_data = (caddr_t)li; 398 vfsp->vfs_bcount = 0; 399 vfsp->vfs_fstype = lofsfstype; 400 vfsp->vfs_bsize = li->li_realvfs->vfs_bsize; 401 402 /* 403 * Test to see if we need to be in "zone /dev" mode. In zonedevfs 404 * mode, we pull a nasty trick; we make sure that the lofs dev_t does 405 * *not* reflect the underlying device, so that no renames or links 406 * can occur to or from the /dev hierarchy. 407 */ 408 if (is_zonedevfs) { 409 dev_t dev; 410 411 mutex_enter(&lofs_minor_lock); 412 do { 413 lofs_minor = (lofs_minor + 1) & MAXMIN32; 414 dev = makedevice(lofs_major, lofs_minor); 415 } while (vfs_devismounted(dev)); 416 mutex_exit(&lofs_minor_lock); 417 418 vfsp->vfs_dev = dev; 419 vfs_make_fsid(&vfsp->vfs_fsid, dev, lofsfstype); 420 421 li->li_flag |= LO_ZONEDEVFS; 422 } else { 423 vfsp->vfs_dev = li->li_realvfs->vfs_dev; 424 vfsp->vfs_fsid.val[0] = li->li_realvfs->vfs_fsid.val[0]; 425 vfsp->vfs_fsid.val[1] = li->li_realvfs->vfs_fsid.val[1]; 426 } 427 428 if (vfs_optionisset(vfsp, MNTOPT_LOFS_NOSUB, NULL)) { 429 li->li_flag |= LO_NOSUB; 430 } 431 432 /* 433 * Setup the hashtable. If the root of this mount isn't a directory, 434 * there's no point in allocating a large hashtable. A table with one 435 * bucket is sufficient. 436 */ 437 if (realrootvp->v_type != VDIR) 438 lsetup(li, 1); 439 else 440 lsetup(li, 0); 441 442 /* 443 * Make the root vnode 444 */ 445 srootvp = makelonode(realrootvp, li, 0); 446 srootvp->v_flag |= VROOT; 447 li->li_rootvp = srootvp; 448 449 #ifdef LODEBUG 450 lo_dprint(4, "lo_mount: vfs %p realvfs %p root %p realroot %p li %p\n", 451 vfsp, li->li_realvfs, srootvp, realrootvp, li); 452 #endif 453 return (0); 454 } 455 456 /* 457 * Undo loopback mount 458 */ 459 static int 460 lo_unmount(struct vfs *vfsp, int flag, struct cred *cr) 461 { 462 struct loinfo *li; 463 464 if (secpolicy_fs_unmount(cr, vfsp) != 0) 465 return (EPERM); 466 467 /* 468 * Forced unmount is not supported by this file system 469 * and thus, ENOTSUP, is being returned. 470 */ 471 if (flag & MS_FORCE) 472 return (ENOTSUP); 473 474 li = vtoli(vfsp); 475 #ifdef LODEBUG 476 lo_dprint(4, "lo_unmount(%p) li %p\n", vfsp, li); 477 #endif 478 if (li->li_refct != 1 || li->li_rootvp->v_count != 1) { 479 #ifdef LODEBUG 480 lo_dprint(4, "refct %d v_ct %d\n", li->li_refct, 481 li->li_rootvp->v_count); 482 #endif 483 return (EBUSY); 484 } 485 VN_RELE(li->li_rootvp); 486 return (0); 487 } 488 489 /* 490 * Find root of lofs mount. 491 */ 492 static int 493 lo_root(struct vfs *vfsp, struct vnode **vpp) 494 { 495 *vpp = vtoli(vfsp)->li_rootvp; 496 #ifdef LODEBUG 497 lo_dprint(4, "lo_root(0x%p) = %p\n", vfsp, *vpp); 498 #endif 499 /* 500 * If the root of the filesystem is a special file, return the specvp 501 * version of the vnode. We don't save the specvp vnode in our 502 * hashtable since that's exclusively for lnodes. 503 */ 504 if (IS_DEVVP(*vpp)) { 505 struct vnode *svp; 506 507 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, kcred); 508 if (svp == NULL) 509 return (ENOSYS); 510 *vpp = svp; 511 } else { 512 VN_HOLD(*vpp); 513 } 514 515 return (0); 516 } 517 518 /* 519 * Get file system statistics. 520 */ 521 static int 522 lo_statvfs(register struct vfs *vfsp, struct statvfs64 *sbp) 523 { 524 vnode_t *realrootvp; 525 526 #ifdef LODEBUG 527 lo_dprint(4, "lostatvfs %p\n", vfsp); 528 #endif 529 /* 530 * Using realrootvp->v_vfsp (instead of the realvfsp that was 531 * cached) is necessary to make lofs work woth forced UFS unmounts. 532 * In the case of a forced unmount, UFS stores a set of dummy vfsops 533 * in all the (i)vnodes in the filesystem. The dummy ops simply 534 * returns back EIO. 535 */ 536 (void) lo_realvfs(vfsp, &realrootvp); 537 if (realrootvp != NULL) 538 return (VFS_STATVFS(realrootvp->v_vfsp, sbp)); 539 else 540 return (EIO); 541 } 542 543 /* 544 * LOFS doesn't have any data or metadata to flush, pending I/O on the 545 * underlying filesystem will be flushed when such filesystem is synched. 546 */ 547 /* ARGSUSED */ 548 static int 549 lo_sync(struct vfs *vfsp, 550 short flag, 551 struct cred *cr) 552 { 553 #ifdef LODEBUG 554 lo_dprint(4, "lo_sync: %p\n", vfsp); 555 #endif 556 return (0); 557 } 558 559 /* 560 * Obtain the vnode from the underlying filesystem. 561 */ 562 static int 563 lo_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp) 564 { 565 vnode_t *realrootvp; 566 567 #ifdef LODEBUG 568 lo_dprint(4, "lo_vget: %p\n", vfsp); 569 #endif 570 (void) lo_realvfs(vfsp, &realrootvp); 571 if (realrootvp != NULL) 572 return (VFS_VGET(realrootvp->v_vfsp, vpp, fidp)); 573 else 574 return (EIO); 575 } 576 577 /* 578 * Free mount-specific data. 579 */ 580 static void 581 lo_freevfs(struct vfs *vfsp) 582 { 583 struct loinfo *li = vtoli(vfsp); 584 585 ldestroy(li); 586 kmem_free(li, sizeof (struct loinfo)); 587 } 588 589 static int 590 lofsinit(int fstyp, char *name) 591 { 592 static const fs_operation_def_t lo_vfsops_template[] = { 593 VFSNAME_MOUNT, lo_mount, 594 VFSNAME_UNMOUNT, lo_unmount, 595 VFSNAME_ROOT, lo_root, 596 VFSNAME_STATVFS, lo_statvfs, 597 VFSNAME_SYNC, (fs_generic_func_p) lo_sync, 598 VFSNAME_VGET, lo_vget, 599 VFSNAME_FREEVFS, (fs_generic_func_p) lo_freevfs, 600 NULL, NULL 601 }; 602 int error; 603 604 error = vfs_setfsops(fstyp, lo_vfsops_template, &lo_vfsops); 605 if (error != 0) { 606 cmn_err(CE_WARN, "lofsinit: bad vfs ops template"); 607 return (error); 608 } 609 610 error = vn_make_ops(name, lo_vnodeops_template, &lo_vnodeops); 611 if (error != 0) { 612 (void) vfs_freevfsops_by_type(fstyp); 613 cmn_err(CE_WARN, "lofsinit: bad vnode ops template"); 614 return (error); 615 } 616 617 lofsfstype = fstyp; 618 619 if ((lofs_major = getudev()) == (major_t)-1) { 620 (void) vfs_freevfsops_by_type(fstyp); 621 cmn_err(CE_WARN, "lofsinit: Can't get unique device number."); 622 return (ENXIO); 623 } 624 625 lofs_minor = 0; 626 mutex_init(&lofs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 627 628 return (0); 629 } 630