1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/param.h> 30 #include <sys/sysmacros.h> 31 #include <sys/kmem.h> 32 #include <sys/time.h> 33 #include <sys/pathname.h> 34 #include <sys/vfs.h> 35 #include <sys/vfs_opreg.h> 36 #include <sys/vnode.h> 37 #include <sys/stat.h> 38 #include <sys/uio.h> 39 #include <sys/stat.h> 40 #include <sys/errno.h> 41 #include <sys/cmn_err.h> 42 #include <sys/cred.h> 43 #include <sys/statvfs.h> 44 #include <sys/mount.h> 45 #include <sys/debug.h> 46 #include <sys/systm.h> 47 #include <sys/mntent.h> 48 #include <fs/fs_subr.h> 49 #include <vm/page.h> 50 #include <vm/anon.h> 51 #include <sys/model.h> 52 #include <sys/policy.h> 53 54 #include <sys/fs/swapnode.h> 55 #include <sys/fs/tmp.h> 56 #include <sys/fs/tmpnode.h> 57 58 static int tmpfsfstype; 59 60 /* 61 * tmpfs vfs operations. 62 */ 63 static int tmpfsinit(int, char *); 64 static int tmp_mount(struct vfs *, struct vnode *, 65 struct mounta *, struct cred *); 66 static int tmp_unmount(struct vfs *, int, struct cred *); 67 static int tmp_root(struct vfs *, struct vnode **); 68 static int tmp_statvfs(struct vfs *, struct statvfs64 *); 69 static int tmp_vget(struct vfs *, struct vnode **, struct fid *); 70 71 /* 72 * Loadable module wrapper 73 */ 74 #include <sys/modctl.h> 75 76 static mntopts_t tmpfs_proto_opttbl; 77 78 static vfsdef_t vfw = { 79 VFSDEF_VERSION, 80 "tmpfs", 81 tmpfsinit, 82 VSW_HASPROTO|VSW_STATS, 83 &tmpfs_proto_opttbl 84 }; 85 86 /* 87 * in-kernel mnttab options 88 */ 89 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; 90 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; 91 92 static mntopt_t tmpfs_options[] = { 93 /* Option name Cancel Opt Arg Flags Data */ 94 { MNTOPT_XATTR, xattr_cancel, NULL, MO_DEFAULT, NULL}, 95 { MNTOPT_NOXATTR, noxattr_cancel, NULL, NULL, NULL}, 96 { "size", NULL, "0", MO_HASVALUE, NULL} 97 }; 98 99 100 static mntopts_t tmpfs_proto_opttbl = { 101 sizeof (tmpfs_options) / sizeof (mntopt_t), 102 tmpfs_options 103 }; 104 105 /* 106 * Module linkage information 107 */ 108 static struct modlfs modlfs = { 109 &mod_fsops, "filesystem for tmpfs", &vfw 110 }; 111 112 static struct modlinkage modlinkage = { 113 MODREV_1, &modlfs, NULL 114 }; 115 116 int 117 _init() 118 { 119 return (mod_install(&modlinkage)); 120 } 121 122 int 123 _fini() 124 { 125 int error; 126 127 error = mod_remove(&modlinkage); 128 if (error) 129 return (error); 130 /* 131 * Tear down the operations vectors 132 */ 133 (void) vfs_freevfsops_by_type(tmpfsfstype); 134 vn_freevnodeops(tmp_vnodeops); 135 return (0); 136 } 137 138 int 139 _info(struct modinfo *modinfop) 140 { 141 return (mod_info(&modlinkage, modinfop)); 142 } 143 144 /* 145 * The following are patchable variables limiting the amount of system 146 * resources tmpfs can use. 147 * 148 * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory 149 * tmpfs can use for it's data structures (e.g. tmpnodes, directory entries) 150 * It is not determined by setting a hard limit but rather as a percentage of 151 * physical memory which is determined when tmpfs is first used in the system. 152 * 153 * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for 154 * the rest of the system. In other words, if the amount of free swap space 155 * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs 156 * anon allocations will fail. 157 * 158 * There is also a per mount limit on the amount of swap space 159 * (tmount.tm_anonmax) settable via a mount option. 160 */ 161 size_t tmpfs_maxkmem = 0; 162 size_t tmpfs_minfree = 0; 163 size_t tmp_kmemspace; /* bytes of kernel heap used by all tmpfs */ 164 165 static major_t tmpfs_major; 166 static minor_t tmpfs_minor; 167 static kmutex_t tmpfs_minor_lock; 168 169 /* 170 * initialize global tmpfs locks and such 171 * called when loading tmpfs module 172 */ 173 static int 174 tmpfsinit(int fstype, char *name) 175 { 176 static const fs_operation_def_t tmp_vfsops_template[] = { 177 VFSNAME_MOUNT, { .vfs_mount = tmp_mount }, 178 VFSNAME_UNMOUNT, { .vfs_unmount = tmp_unmount }, 179 VFSNAME_ROOT, { .vfs_root = tmp_root }, 180 VFSNAME_STATVFS, { .vfs_statvfs = tmp_statvfs }, 181 VFSNAME_VGET, { .vfs_vget = tmp_vget }, 182 NULL, NULL 183 }; 184 int error; 185 extern void tmpfs_hash_init(); 186 187 tmpfs_hash_init(); 188 tmpfsfstype = fstype; 189 ASSERT(tmpfsfstype != 0); 190 191 error = vfs_setfsops(fstype, tmp_vfsops_template, NULL); 192 if (error != 0) { 193 cmn_err(CE_WARN, "tmpfsinit: bad vfs ops template"); 194 return (error); 195 } 196 197 error = vn_make_ops(name, tmp_vnodeops_template, &tmp_vnodeops); 198 if (error != 0) { 199 (void) vfs_freevfsops_by_type(fstype); 200 cmn_err(CE_WARN, "tmpfsinit: bad vnode ops template"); 201 return (error); 202 } 203 204 /* 205 * tmpfs_minfree doesn't need to be some function of configured 206 * swap space since it really is an absolute limit of swap space 207 * which still allows other processes to execute. 208 */ 209 if (tmpfs_minfree == 0) { 210 /* 211 * Set if not patched 212 */ 213 tmpfs_minfree = btopr(TMPMINFREE); 214 } 215 216 /* 217 * The maximum amount of space tmpfs can allocate is 218 * TMPMAXPROCKMEM percent of kernel memory 219 */ 220 if (tmpfs_maxkmem == 0) 221 tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM); 222 223 if ((tmpfs_major = getudev()) == (major_t)-1) { 224 cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number."); 225 tmpfs_major = 0; 226 } 227 mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 228 return (0); 229 } 230 231 static int 232 tmp_mount( 233 struct vfs *vfsp, 234 struct vnode *mvp, 235 struct mounta *uap, 236 struct cred *cr) 237 { 238 struct tmount *tm = NULL; 239 struct tmpnode *tp; 240 struct pathname dpn; 241 int error; 242 pgcnt_t anonmax; 243 struct vattr rattr; 244 int got_attrs; 245 246 char *sizestr; 247 248 if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) 249 return (error); 250 251 if (mvp->v_type != VDIR) 252 return (ENOTDIR); 253 254 mutex_enter(&mvp->v_lock); 255 if ((uap->flags & MS_OVERLAY) == 0 && 256 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { 257 mutex_exit(&mvp->v_lock); 258 return (EBUSY); 259 } 260 mutex_exit(&mvp->v_lock); 261 262 /* 263 * Having the resource be anything but "swap" doesn't make sense. 264 */ 265 vfs_setresource(vfsp, "swap"); 266 267 /* 268 * now look for options we understand... 269 */ 270 271 /* tmpfs doesn't support read-only mounts */ 272 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { 273 error = EINVAL; 274 goto out; 275 } 276 277 /* 278 * tm_anonmax is set according to the mount arguments 279 * if any. Otherwise, it is set to a maximum value. 280 */ 281 if (vfs_optionisset(vfsp, "size", &sizestr)) { 282 if ((error = tmp_convnum(sizestr, &anonmax)) != 0) 283 goto out; 284 } else { 285 anonmax = ULONG_MAX; 286 } 287 288 if (error = pn_get(uap->dir, 289 (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn)) 290 goto out; 291 292 if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) { 293 pn_free(&dpn); 294 error = ENOMEM; 295 goto out; 296 } 297 298 /* 299 * find an available minor device number for this mount 300 */ 301 mutex_enter(&tmpfs_minor_lock); 302 do { 303 tmpfs_minor = (tmpfs_minor + 1) & L_MAXMIN32; 304 tm->tm_dev = makedevice(tmpfs_major, tmpfs_minor); 305 } while (vfs_devismounted(tm->tm_dev)); 306 mutex_exit(&tmpfs_minor_lock); 307 308 /* 309 * Set but don't bother entering the mutex 310 * (tmount not on mount list yet) 311 */ 312 mutex_init(&tm->tm_contents, NULL, MUTEX_DEFAULT, NULL); 313 mutex_init(&tm->tm_renamelck, NULL, MUTEX_DEFAULT, NULL); 314 315 tm->tm_vfsp = vfsp; 316 tm->tm_anonmax = anonmax; 317 318 vfsp->vfs_data = (caddr_t)tm; 319 vfsp->vfs_fstype = tmpfsfstype; 320 vfsp->vfs_dev = tm->tm_dev; 321 vfsp->vfs_bsize = PAGESIZE; 322 vfsp->vfs_flag |= VFS_NOTRUNC; 323 vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype); 324 tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE); 325 (void) strcpy(tm->tm_mntpath, dpn.pn_path); 326 327 /* 328 * allocate and initialize root tmpnode structure 329 */ 330 bzero(&rattr, sizeof (struct vattr)); 331 rattr.va_mode = (mode_t)(S_IFDIR | 0777); /* XXX modes */ 332 rattr.va_type = VDIR; 333 rattr.va_rdev = 0; 334 tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); 335 tmpnode_init(tm, tp, &rattr, cr); 336 337 /* 338 * Get the mode, uid, and gid from the underlying mount point. 339 */ 340 rattr.va_mask = AT_MODE|AT_UID|AT_GID; /* Hint to getattr */ 341 got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL); 342 343 rw_enter(&tp->tn_rwlock, RW_WRITER); 344 TNTOV(tp)->v_flag |= VROOT; 345 346 /* 347 * If the getattr succeeded, use its results. Otherwise allow 348 * the previously set hardwired defaults to prevail. 349 */ 350 if (got_attrs == 0) { 351 tp->tn_mode = rattr.va_mode; 352 tp->tn_uid = rattr.va_uid; 353 tp->tn_gid = rattr.va_gid; 354 } 355 356 /* 357 * initialize linked list of tmpnodes so that the back pointer of 358 * the root tmpnode always points to the last one on the list 359 * and the forward pointer of the last node is null 360 */ 361 tp->tn_back = tp; 362 tp->tn_forw = NULL; 363 tp->tn_nlink = 0; 364 tm->tm_rootnode = tp; 365 366 tdirinit(tp, tp); 367 368 rw_exit(&tp->tn_rwlock); 369 370 pn_free(&dpn); 371 error = 0; 372 373 out: 374 if (error == 0) 375 vfs_set_feature(vfsp, VFSFT_XVATTR); 376 377 return (error); 378 } 379 380 static int 381 tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) 382 { 383 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); 384 struct tmpnode *tnp, *cancel; 385 struct vnode *vp; 386 int error; 387 388 if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) 389 return (error); 390 391 /* 392 * forced unmount is not supported by this file system 393 * and thus, ENOTSUP, is being returned. 394 */ 395 if (flag & MS_FORCE) 396 return (ENOTSUP); 397 398 mutex_enter(&tm->tm_contents); 399 400 /* 401 * If there are no open files, only the root node should have 402 * a reference count. 403 * With tm_contents held, nothing can be added or removed. 404 * There may be some dirty pages. To prevent fsflush from 405 * disrupting the unmount, put a hold on each node while scanning. 406 * If we find a previously referenced node, undo the holds we have 407 * placed and fail EBUSY. 408 */ 409 tnp = tm->tm_rootnode; 410 if (TNTOV(tnp)->v_count > 1) { 411 mutex_exit(&tm->tm_contents); 412 return (EBUSY); 413 } 414 415 for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) { 416 if ((vp = TNTOV(tnp))->v_count > 0) { 417 cancel = tm->tm_rootnode->tn_forw; 418 while (cancel != tnp) { 419 vp = TNTOV(cancel); 420 ASSERT(vp->v_count > 0); 421 VN_RELE(vp); 422 cancel = cancel->tn_forw; 423 } 424 mutex_exit(&tm->tm_contents); 425 return (EBUSY); 426 } 427 VN_HOLD(vp); 428 } 429 430 /* 431 * We can drop the mutex now because no one can find this mount 432 */ 433 mutex_exit(&tm->tm_contents); 434 435 /* 436 * Free all kmemalloc'd and anonalloc'd memory associated with 437 * this filesystem. To do this, we go through the file list twice, 438 * once to remove all the directory entries, and then to remove 439 * all the files. We do this because there is useful code in 440 * tmpnode_free which assumes that the directory entry has been 441 * removed before the file. 442 */ 443 /* 444 * Remove all directory entries 445 */ 446 for (tnp = tm->tm_rootnode; tnp; tnp = tnp->tn_forw) { 447 rw_enter(&tnp->tn_rwlock, RW_WRITER); 448 if (tnp->tn_type == VDIR) 449 tdirtrunc(tnp); 450 if (tnp->tn_vnode->v_flag & V_XATTRDIR) { 451 /* 452 * Account for implicit attrdir reference. 453 */ 454 ASSERT(tnp->tn_nlink > 0); 455 DECR_COUNT(&tnp->tn_nlink, &tnp->tn_tlock); 456 } 457 rw_exit(&tnp->tn_rwlock); 458 } 459 460 ASSERT(tm->tm_rootnode); 461 462 /* 463 * All links are gone, v_count is keeping nodes in place. 464 * VN_RELE should make the node disappear, unless somebody 465 * is holding pages against it. Nap and retry until it disappears. 466 * 467 * We re-acquire the lock to prevent others who have a HOLD on 468 * a tmpnode via its pages or anon slots from blowing it away 469 * (in tmp_inactive) while we're trying to get to it here. Once 470 * we have a HOLD on it we know it'll stick around. 471 * 472 */ 473 mutex_enter(&tm->tm_contents); 474 /* 475 * Remove all the files (except the rootnode) backwards. 476 */ 477 while ((tnp = tm->tm_rootnode->tn_back) != tm->tm_rootnode) { 478 mutex_exit(&tm->tm_contents); 479 /* 480 * Inhibit tmp_inactive from touching attribute directory 481 * as all nodes will be released here. 482 * Note we handled the link count in pass 2 above. 483 */ 484 rw_enter(&tnp->tn_rwlock, RW_WRITER); 485 tnp->tn_xattrdp = NULL; 486 rw_exit(&tnp->tn_rwlock); 487 vp = TNTOV(tnp); 488 VN_RELE(vp); 489 mutex_enter(&tm->tm_contents); 490 /* 491 * It's still there after the RELE. Someone else like pageout 492 * has a hold on it so wait a bit and then try again - we know 493 * they'll give it up soon. 494 */ 495 if (tnp == tm->tm_rootnode->tn_back) { 496 VN_HOLD(vp); 497 mutex_exit(&tm->tm_contents); 498 delay(hz / 4); 499 mutex_enter(&tm->tm_contents); 500 } 501 } 502 mutex_exit(&tm->tm_contents); 503 504 tm->tm_rootnode->tn_xattrdp = NULL; 505 VN_RELE(TNTOV(tm->tm_rootnode)); 506 507 ASSERT(tm->tm_mntpath); 508 509 tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); 510 511 ASSERT(tm->tm_anonmem == 0); 512 513 mutex_destroy(&tm->tm_contents); 514 mutex_destroy(&tm->tm_renamelck); 515 tmp_memfree(tm, sizeof (struct tmount)); 516 517 return (0); 518 } 519 520 /* 521 * return root tmpnode for given vnode 522 */ 523 static int 524 tmp_root(struct vfs *vfsp, struct vnode **vpp) 525 { 526 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); 527 struct tmpnode *tp = tm->tm_rootnode; 528 struct vnode *vp; 529 530 ASSERT(tp); 531 532 vp = TNTOV(tp); 533 VN_HOLD(vp); 534 *vpp = vp; 535 return (0); 536 } 537 538 static int 539 tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) 540 { 541 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); 542 ulong_t blocks; 543 dev32_t d32; 544 545 sbp->f_bsize = PAGESIZE; 546 sbp->f_frsize = PAGESIZE; 547 548 /* 549 * Find the amount of available physical and memory swap 550 */ 551 mutex_enter(&anoninfo_lock); 552 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 553 blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; 554 mutex_exit(&anoninfo_lock); 555 556 /* 557 * If tm_anonmax for this mount is less than the available swap space 558 * (minus the amount tmpfs can't use), use that instead 559 */ 560 if (blocks > tmpfs_minfree) 561 sbp->f_bfree = MIN(blocks - tmpfs_minfree, 562 tm->tm_anonmax - tm->tm_anonmem); 563 else 564 sbp->f_bfree = 0; 565 566 sbp->f_bavail = sbp->f_bfree; 567 568 /* 569 * Total number of blocks is what's available plus what's been used 570 */ 571 sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + tm->tm_anonmem); 572 573 /* 574 * The maximum number of files available is approximately the number 575 * of tmpnodes we can allocate from the remaining kernel memory 576 * available to tmpfs. This is fairly inaccurate since it doesn't 577 * take into account the names stored in the directory entries. 578 */ 579 if (tmpfs_maxkmem > tmp_kmemspace) 580 sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) / 581 (sizeof (struct tmpnode) + sizeof (struct tdirent)); 582 else 583 sbp->f_ffree = 0; 584 585 sbp->f_files = tmpfs_maxkmem / 586 (sizeof (struct tmpnode) + sizeof (struct tdirent)); 587 sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); 588 (void) cmpldev(&d32, vfsp->vfs_dev); 589 sbp->f_fsid = d32; 590 (void) strcpy(sbp->f_basetype, vfssw[tmpfsfstype].vsw_name); 591 (void) strncpy(sbp->f_fstr, tm->tm_mntpath, sizeof (sbp->f_fstr)); 592 /* 593 * ensure null termination 594 */ 595 sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; 596 sbp->f_flag = vf_to_stf(vfsp->vfs_flag); 597 sbp->f_namemax = MAXNAMELEN - 1; 598 return (0); 599 } 600 601 static int 602 tmp_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp) 603 { 604 struct tfid *tfid; 605 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); 606 struct tmpnode *tp = NULL; 607 608 tfid = (struct tfid *)fidp; 609 *vpp = NULL; 610 611 mutex_enter(&tm->tm_contents); 612 for (tp = tm->tm_rootnode; tp; tp = tp->tn_forw) { 613 mutex_enter(&tp->tn_tlock); 614 if (tp->tn_nodeid == tfid->tfid_ino) { 615 /* 616 * If the gen numbers don't match we know the 617 * file won't be found since only one tmpnode 618 * can have this number at a time. 619 */ 620 if (tp->tn_gen != tfid->tfid_gen || tp->tn_nlink == 0) { 621 mutex_exit(&tp->tn_tlock); 622 mutex_exit(&tm->tm_contents); 623 return (0); 624 } 625 *vpp = (struct vnode *)TNTOV(tp); 626 627 VN_HOLD(*vpp); 628 629 if ((tp->tn_mode & S_ISVTX) && 630 !(tp->tn_mode & (S_IXUSR | S_IFDIR))) { 631 mutex_enter(&(*vpp)->v_lock); 632 (*vpp)->v_flag |= VISSWAP; 633 mutex_exit(&(*vpp)->v_lock); 634 } 635 mutex_exit(&tp->tn_tlock); 636 mutex_exit(&tm->tm_contents); 637 return (0); 638 } 639 mutex_exit(&tp->tn_tlock); 640 } 641 mutex_exit(&tm->tm_contents); 642 return (0); 643 } 644