1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/param.h> 28 #include <sys/sysmacros.h> 29 #include <sys/kmem.h> 30 #include <sys/time.h> 31 #include <sys/pathname.h> 32 #include <sys/vfs.h> 33 #include <sys/vfs_opreg.h> 34 #include <sys/vnode.h> 35 #include <sys/stat.h> 36 #include <sys/uio.h> 37 #include <sys/stat.h> 38 #include <sys/errno.h> 39 #include <sys/cmn_err.h> 40 #include <sys/cred.h> 41 #include <sys/statvfs.h> 42 #include <sys/mount.h> 43 #include <sys/debug.h> 44 #include <sys/systm.h> 45 #include <sys/mntent.h> 46 #include <fs/fs_subr.h> 47 #include <vm/page.h> 48 #include <vm/anon.h> 49 #include <sys/model.h> 50 #include <sys/policy.h> 51 52 #include <sys/fs/swapnode.h> 53 #include <sys/fs/tmp.h> 54 #include <sys/fs/tmpnode.h> 55 56 static int tmpfsfstype; 57 58 /* 59 * tmpfs vfs operations. 60 */ 61 static int tmpfsinit(int, char *); 62 static int tmp_mount(struct vfs *, struct vnode *, 63 struct mounta *, struct cred *); 64 static int tmp_unmount(struct vfs *, int, struct cred *); 65 static int tmp_root(struct vfs *, struct vnode **); 66 static int tmp_statvfs(struct vfs *, struct statvfs64 *); 67 static int tmp_vget(struct vfs *, struct vnode **, struct fid *); 68 69 /* 70 * Loadable module wrapper 71 */ 72 #include <sys/modctl.h> 73 74 static mntopts_t tmpfs_proto_opttbl; 75 76 static vfsdef_t vfw = { 77 VFSDEF_VERSION, 78 "tmpfs", 79 tmpfsinit, 80 VSW_HASPROTO|VSW_STATS, 81 &tmpfs_proto_opttbl 82 }; 83 84 /* 85 * in-kernel mnttab options 86 */ 87 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; 88 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; 89 90 static mntopt_t tmpfs_options[] = { 91 /* Option name Cancel Opt Arg Flags Data */ 92 { MNTOPT_XATTR, xattr_cancel, NULL, MO_DEFAULT, NULL}, 93 { MNTOPT_NOXATTR, noxattr_cancel, NULL, NULL, NULL}, 94 { "size", NULL, "0", MO_HASVALUE, NULL} 95 }; 96 97 98 static mntopts_t tmpfs_proto_opttbl = { 99 sizeof (tmpfs_options) / sizeof (mntopt_t), 100 tmpfs_options 101 }; 102 103 /* 104 * Module linkage information 105 */ 106 static struct modlfs modlfs = { 107 &mod_fsops, "filesystem for tmpfs", &vfw 108 }; 109 110 static struct modlinkage modlinkage = { 111 MODREV_1, &modlfs, NULL 112 }; 113 114 int 115 _init() 116 { 117 return (mod_install(&modlinkage)); 118 } 119 120 int 121 _fini() 122 { 123 int error; 124 125 error = mod_remove(&modlinkage); 126 if (error) 127 return (error); 128 /* 129 * Tear down the operations vectors 130 */ 131 (void) vfs_freevfsops_by_type(tmpfsfstype); 132 vn_freevnodeops(tmp_vnodeops); 133 return (0); 134 } 135 136 int 137 _info(struct modinfo *modinfop) 138 { 139 return (mod_info(&modlinkage, modinfop)); 140 } 141 142 /* 143 * The following are patchable variables limiting the amount of system 144 * resources tmpfs can use. 145 * 146 * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory 147 * tmpfs can use for it's data structures (e.g. tmpnodes, directory entries) 148 * It is not determined by setting a hard limit but rather as a percentage of 149 * physical memory which is determined when tmpfs is first used in the system. 150 * 151 * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for 152 * the rest of the system. In other words, if the amount of free swap space 153 * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs 154 * anon allocations will fail. 155 * 156 * There is also a per mount limit on the amount of swap space 157 * (tmount.tm_anonmax) settable via a mount option. 158 */ 159 size_t tmpfs_maxkmem = 0; 160 size_t tmpfs_minfree = 0; 161 size_t tmp_kmemspace; /* bytes of kernel heap used by all tmpfs */ 162 163 static major_t tmpfs_major; 164 static minor_t tmpfs_minor; 165 static kmutex_t tmpfs_minor_lock; 166 167 /* 168 * initialize global tmpfs locks and such 169 * called when loading tmpfs module 170 */ 171 static int 172 tmpfsinit(int fstype, char *name) 173 { 174 static const fs_operation_def_t tmp_vfsops_template[] = { 175 VFSNAME_MOUNT, { .vfs_mount = tmp_mount }, 176 VFSNAME_UNMOUNT, { .vfs_unmount = tmp_unmount }, 177 VFSNAME_ROOT, { .vfs_root = tmp_root }, 178 VFSNAME_STATVFS, { .vfs_statvfs = tmp_statvfs }, 179 VFSNAME_VGET, { .vfs_vget = tmp_vget }, 180 NULL, NULL 181 }; 182 int error; 183 extern void tmpfs_hash_init(); 184 185 tmpfs_hash_init(); 186 tmpfsfstype = fstype; 187 ASSERT(tmpfsfstype != 0); 188 189 error = vfs_setfsops(fstype, tmp_vfsops_template, NULL); 190 if (error != 0) { 191 cmn_err(CE_WARN, "tmpfsinit: bad vfs ops template"); 192 return (error); 193 } 194 195 error = vn_make_ops(name, tmp_vnodeops_template, &tmp_vnodeops); 196 if (error != 0) { 197 (void) vfs_freevfsops_by_type(fstype); 198 cmn_err(CE_WARN, "tmpfsinit: bad vnode ops template"); 199 return (error); 200 } 201 202 /* 203 * tmpfs_minfree doesn't need to be some function of configured 204 * swap space since it really is an absolute limit of swap space 205 * which still allows other processes to execute. 206 */ 207 if (tmpfs_minfree == 0) { 208 /* 209 * Set if not patched 210 */ 211 tmpfs_minfree = btopr(TMPMINFREE); 212 } 213 214 /* 215 * The maximum amount of space tmpfs can allocate is 216 * TMPMAXPROCKMEM percent of kernel memory 217 */ 218 if (tmpfs_maxkmem == 0) 219 tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM); 220 221 if ((tmpfs_major = getudev()) == (major_t)-1) { 222 cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number."); 223 tmpfs_major = 0; 224 } 225 mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 226 return (0); 227 } 228 229 static int 230 tmp_mount( 231 struct vfs *vfsp, 232 struct vnode *mvp, 233 struct mounta *uap, 234 struct cred *cr) 235 { 236 struct tmount *tm = NULL; 237 struct tmpnode *tp; 238 struct pathname dpn; 239 int error; 240 pgcnt_t anonmax; 241 struct vattr rattr; 242 int got_attrs; 243 244 char *sizestr; 245 246 if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) 247 return (error); 248 249 if (mvp->v_type != VDIR) 250 return (ENOTDIR); 251 252 mutex_enter(&mvp->v_lock); 253 if ((uap->flags & MS_OVERLAY) == 0 && 254 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { 255 mutex_exit(&mvp->v_lock); 256 return (EBUSY); 257 } 258 mutex_exit(&mvp->v_lock); 259 260 /* 261 * Having the resource be anything but "swap" doesn't make sense. 262 */ 263 vfs_setresource(vfsp, "swap"); 264 265 /* 266 * now look for options we understand... 267 */ 268 269 /* tmpfs doesn't support read-only mounts */ 270 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { 271 error = EINVAL; 272 goto out; 273 } 274 275 /* 276 * tm_anonmax is set according to the mount arguments 277 * if any. Otherwise, it is set to a maximum value. 278 */ 279 if (vfs_optionisset(vfsp, "size", &sizestr)) { 280 if ((error = tmp_convnum(sizestr, &anonmax)) != 0) 281 goto out; 282 } else { 283 anonmax = ULONG_MAX; 284 } 285 286 if (error = pn_get(uap->dir, 287 (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn)) 288 goto out; 289 290 if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) { 291 pn_free(&dpn); 292 error = ENOMEM; 293 goto out; 294 } 295 296 /* 297 * find an available minor device number for this mount 298 */ 299 mutex_enter(&tmpfs_minor_lock); 300 do { 301 tmpfs_minor = (tmpfs_minor + 1) & L_MAXMIN32; 302 tm->tm_dev = makedevice(tmpfs_major, tmpfs_minor); 303 } while (vfs_devismounted(tm->tm_dev)); 304 mutex_exit(&tmpfs_minor_lock); 305 306 /* 307 * Set but don't bother entering the mutex 308 * (tmount not on mount list yet) 309 */ 310 mutex_init(&tm->tm_contents, NULL, MUTEX_DEFAULT, NULL); 311 mutex_init(&tm->tm_renamelck, NULL, MUTEX_DEFAULT, NULL); 312 313 tm->tm_vfsp = vfsp; 314 tm->tm_anonmax = anonmax; 315 316 vfsp->vfs_data = (caddr_t)tm; 317 vfsp->vfs_fstype = tmpfsfstype; 318 vfsp->vfs_dev = tm->tm_dev; 319 vfsp->vfs_bsize = PAGESIZE; 320 vfsp->vfs_flag |= VFS_NOTRUNC; 321 vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype); 322 tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE); 323 (void) strcpy(tm->tm_mntpath, dpn.pn_path); 324 325 /* 326 * allocate and initialize root tmpnode structure 327 */ 328 bzero(&rattr, sizeof (struct vattr)); 329 rattr.va_mode = (mode_t)(S_IFDIR | 0777); /* XXX modes */ 330 rattr.va_type = VDIR; 331 rattr.va_rdev = 0; 332 tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); 333 tmpnode_init(tm, tp, &rattr, cr); 334 335 /* 336 * Get the mode, uid, and gid from the underlying mount point. 337 */ 338 rattr.va_mask = AT_MODE|AT_UID|AT_GID; /* Hint to getattr */ 339 got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL); 340 341 rw_enter(&tp->tn_rwlock, RW_WRITER); 342 TNTOV(tp)->v_flag |= VROOT; 343 344 /* 345 * If the getattr succeeded, use its results. Otherwise allow 346 * the previously set hardwired defaults to prevail. 347 */ 348 if (got_attrs == 0) { 349 tp->tn_mode = rattr.va_mode; 350 tp->tn_uid = rattr.va_uid; 351 tp->tn_gid = rattr.va_gid; 352 } 353 354 /* 355 * initialize linked list of tmpnodes so that the back pointer of 356 * the root tmpnode always points to the last one on the list 357 * and the forward pointer of the last node is null 358 */ 359 tp->tn_back = tp; 360 tp->tn_forw = NULL; 361 tp->tn_nlink = 0; 362 tm->tm_rootnode = tp; 363 364 tdirinit(tp, tp); 365 366 rw_exit(&tp->tn_rwlock); 367 368 pn_free(&dpn); 369 error = 0; 370 371 out: 372 if (error == 0) 373 vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); 374 375 return (error); 376 } 377 378 static int 379 tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) 380 { 381 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); 382 struct tmpnode *tnp, *cancel; 383 struct vnode *vp; 384 int error; 385 386 if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) 387 return (error); 388 389 /* 390 * forced unmount is not supported by this file system 391 * and thus, ENOTSUP, is being returned. 392 */ 393 if (flag & MS_FORCE) 394 return (ENOTSUP); 395 396 mutex_enter(&tm->tm_contents); 397 398 /* 399 * If there are no open files, only the root node should have 400 * a reference count. 401 * With tm_contents held, nothing can be added or removed. 402 * There may be some dirty pages. To prevent fsflush from 403 * disrupting the unmount, put a hold on each node while scanning. 404 * If we find a previously referenced node, undo the holds we have 405 * placed and fail EBUSY. 406 */ 407 tnp = tm->tm_rootnode; 408 if (TNTOV(tnp)->v_count > 1) { 409 mutex_exit(&tm->tm_contents); 410 return (EBUSY); 411 } 412 413 for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) { 414 if ((vp = TNTOV(tnp))->v_count > 0) { 415 cancel = tm->tm_rootnode->tn_forw; 416 while (cancel != tnp) { 417 vp = TNTOV(cancel); 418 ASSERT(vp->v_count > 0); 419 VN_RELE(vp); 420 cancel = cancel->tn_forw; 421 } 422 mutex_exit(&tm->tm_contents); 423 return (EBUSY); 424 } 425 VN_HOLD(vp); 426 } 427 428 /* 429 * We can drop the mutex now because no one can find this mount 430 */ 431 mutex_exit(&tm->tm_contents); 432 433 /* 434 * Free all kmemalloc'd and anonalloc'd memory associated with 435 * this filesystem. To do this, we go through the file list twice, 436 * once to remove all the directory entries, and then to remove 437 * all the files. We do this because there is useful code in 438 * tmpnode_free which assumes that the directory entry has been 439 * removed before the file. 440 */ 441 /* 442 * Remove all directory entries 443 */ 444 for (tnp = tm->tm_rootnode; tnp; tnp = tnp->tn_forw) { 445 rw_enter(&tnp->tn_rwlock, RW_WRITER); 446 if (tnp->tn_type == VDIR) 447 tdirtrunc(tnp); 448 if (tnp->tn_vnode->v_flag & V_XATTRDIR) { 449 /* 450 * Account for implicit attrdir reference. 451 */ 452 ASSERT(tnp->tn_nlink > 0); 453 DECR_COUNT(&tnp->tn_nlink, &tnp->tn_tlock); 454 } 455 rw_exit(&tnp->tn_rwlock); 456 } 457 458 ASSERT(tm->tm_rootnode); 459 460 /* 461 * All links are gone, v_count is keeping nodes in place. 462 * VN_RELE should make the node disappear, unless somebody 463 * is holding pages against it. Nap and retry until it disappears. 464 * 465 * We re-acquire the lock to prevent others who have a HOLD on 466 * a tmpnode via its pages or anon slots from blowing it away 467 * (in tmp_inactive) while we're trying to get to it here. Once 468 * we have a HOLD on it we know it'll stick around. 469 * 470 */ 471 mutex_enter(&tm->tm_contents); 472 /* 473 * Remove all the files (except the rootnode) backwards. 474 */ 475 while ((tnp = tm->tm_rootnode->tn_back) != tm->tm_rootnode) { 476 mutex_exit(&tm->tm_contents); 477 /* 478 * Inhibit tmp_inactive from touching attribute directory 479 * as all nodes will be released here. 480 * Note we handled the link count in pass 2 above. 481 */ 482 rw_enter(&tnp->tn_rwlock, RW_WRITER); 483 tnp->tn_xattrdp = NULL; 484 rw_exit(&tnp->tn_rwlock); 485 vp = TNTOV(tnp); 486 VN_RELE(vp); 487 mutex_enter(&tm->tm_contents); 488 /* 489 * It's still there after the RELE. Someone else like pageout 490 * has a hold on it so wait a bit and then try again - we know 491 * they'll give it up soon. 492 */ 493 if (tnp == tm->tm_rootnode->tn_back) { 494 VN_HOLD(vp); 495 mutex_exit(&tm->tm_contents); 496 delay(hz / 4); 497 mutex_enter(&tm->tm_contents); 498 } 499 } 500 mutex_exit(&tm->tm_contents); 501 502 tm->tm_rootnode->tn_xattrdp = NULL; 503 VN_RELE(TNTOV(tm->tm_rootnode)); 504 505 ASSERT(tm->tm_mntpath); 506 507 tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); 508 509 ASSERT(tm->tm_anonmem == 0); 510 511 mutex_destroy(&tm->tm_contents); 512 mutex_destroy(&tm->tm_renamelck); 513 tmp_memfree(tm, sizeof (struct tmount)); 514 515 return (0); 516 } 517 518 /* 519 * return root tmpnode for given vnode 520 */ 521 static int 522 tmp_root(struct vfs *vfsp, struct vnode **vpp) 523 { 524 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); 525 struct tmpnode *tp = tm->tm_rootnode; 526 struct vnode *vp; 527 528 ASSERT(tp); 529 530 vp = TNTOV(tp); 531 VN_HOLD(vp); 532 *vpp = vp; 533 return (0); 534 } 535 536 static int 537 tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) 538 { 539 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); 540 ulong_t blocks; 541 dev32_t d32; 542 543 sbp->f_bsize = PAGESIZE; 544 sbp->f_frsize = PAGESIZE; 545 546 /* 547 * Find the amount of available physical and memory swap 548 */ 549 mutex_enter(&anoninfo_lock); 550 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 551 blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; 552 mutex_exit(&anoninfo_lock); 553 554 /* 555 * If tm_anonmax for this mount is less than the available swap space 556 * (minus the amount tmpfs can't use), use that instead 557 */ 558 if (blocks > tmpfs_minfree) 559 sbp->f_bfree = MIN(blocks - tmpfs_minfree, 560 tm->tm_anonmax - tm->tm_anonmem); 561 else 562 sbp->f_bfree = 0; 563 564 sbp->f_bavail = sbp->f_bfree; 565 566 /* 567 * Total number of blocks is what's available plus what's been used 568 */ 569 sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + tm->tm_anonmem); 570 571 /* 572 * The maximum number of files available is approximately the number 573 * of tmpnodes we can allocate from the remaining kernel memory 574 * available to tmpfs. This is fairly inaccurate since it doesn't 575 * take into account the names stored in the directory entries. 576 */ 577 if (tmpfs_maxkmem > tmp_kmemspace) 578 sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) / 579 (sizeof (struct tmpnode) + sizeof (struct tdirent)); 580 else 581 sbp->f_ffree = 0; 582 583 sbp->f_files = tmpfs_maxkmem / 584 (sizeof (struct tmpnode) + sizeof (struct tdirent)); 585 sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); 586 (void) cmpldev(&d32, vfsp->vfs_dev); 587 sbp->f_fsid = d32; 588 (void) strcpy(sbp->f_basetype, vfssw[tmpfsfstype].vsw_name); 589 (void) strncpy(sbp->f_fstr, tm->tm_mntpath, sizeof (sbp->f_fstr)); 590 /* 591 * ensure null termination 592 */ 593 sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; 594 sbp->f_flag = vf_to_stf(vfsp->vfs_flag); 595 sbp->f_namemax = MAXNAMELEN - 1; 596 return (0); 597 } 598 599 static int 600 tmp_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp) 601 { 602 struct tfid *tfid; 603 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); 604 struct tmpnode *tp = NULL; 605 606 tfid = (struct tfid *)fidp; 607 *vpp = NULL; 608 609 mutex_enter(&tm->tm_contents); 610 for (tp = tm->tm_rootnode; tp; tp = tp->tn_forw) { 611 mutex_enter(&tp->tn_tlock); 612 if (tp->tn_nodeid == tfid->tfid_ino) { 613 /* 614 * If the gen numbers don't match we know the 615 * file won't be found since only one tmpnode 616 * can have this number at a time. 617 */ 618 if (tp->tn_gen != tfid->tfid_gen || tp->tn_nlink == 0) { 619 mutex_exit(&tp->tn_tlock); 620 mutex_exit(&tm->tm_contents); 621 return (0); 622 } 623 *vpp = (struct vnode *)TNTOV(tp); 624 625 VN_HOLD(*vpp); 626 627 if ((tp->tn_mode & S_ISVTX) && 628 !(tp->tn_mode & (S_IXUSR | S_IFDIR))) { 629 mutex_enter(&(*vpp)->v_lock); 630 (*vpp)->v_flag |= VISSWAP; 631 mutex_exit(&(*vpp)->v_lock); 632 } 633 mutex_exit(&tp->tn_tlock); 634 mutex_exit(&tm->tm_contents); 635 return (0); 636 } 637 mutex_exit(&tp->tn_tlock); 638 } 639 mutex_exit(&tm->tm_contents); 640 return (0); 641 } 642