1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/param.h> 30 #include <sys/sysmacros.h> 31 #include <sys/kmem.h> 32 #include <sys/time.h> 33 #include <sys/pathname.h> 34 #include <sys/vfs.h> 35 #include <sys/vnode.h> 36 #include <sys/stat.h> 37 #include <sys/uio.h> 38 #include <sys/stat.h> 39 #include <sys/errno.h> 40 #include <sys/cmn_err.h> 41 #include <sys/cred.h> 42 #include <sys/statvfs.h> 43 #include <sys/mount.h> 44 #include <sys/debug.h> 45 #include <sys/systm.h> 46 #include <sys/mntent.h> 47 #include <fs/fs_subr.h> 48 #include <vm/page.h> 49 #include <vm/anon.h> 50 #include <sys/model.h> 51 #include <sys/policy.h> 52 53 #include <sys/fs/swapnode.h> 54 #include <sys/fs/tmp.h> 55 #include <sys/fs/tmpnode.h> 56 57 static int tmpfsfstype; 58 59 /* 60 * tmpfs vfs operations. 61 */ 62 static int tmpfsinit(int, char *); 63 static int tmp_mount(struct vfs *, struct vnode *, 64 struct mounta *, struct cred *); 65 static int tmp_unmount(struct vfs *, int, struct cred *); 66 static int tmp_root(struct vfs *, struct vnode **); 67 static int tmp_statvfs(struct vfs *, struct statvfs64 *); 68 static int tmp_vget(struct vfs *, struct vnode **, struct fid *); 69 70 /* 71 * Loadable module wrapper 72 */ 73 #include <sys/modctl.h> 74 75 static mntopts_t tmpfs_proto_opttbl; 76 77 static vfsdef_t vfw = { 78 VFSDEF_VERSION, 79 "tmpfs", 80 tmpfsinit, 81 VSW_HASPROTO|VSW_STATS, 82 &tmpfs_proto_opttbl 83 }; 84 85 /* 86 * in-kernel mnttab options 87 */ 88 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; 89 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; 90 91 static mntopt_t tmpfs_options[] = { 92 /* Option name Cancel Opt Arg Flags Data */ 93 { MNTOPT_XATTR, xattr_cancel, NULL, MO_DEFAULT, NULL}, 94 { MNTOPT_NOXATTR, noxattr_cancel, NULL, NULL, NULL}, 95 { "size", NULL, "0", MO_HASVALUE, NULL} 96 }; 97 98 99 static mntopts_t tmpfs_proto_opttbl = { 100 sizeof (tmpfs_options) / sizeof (mntopt_t), 101 tmpfs_options 102 }; 103 104 /* 105 * Module linkage information 106 */ 107 static struct modlfs modlfs = { 108 &mod_fsops, "filesystem for tmpfs", &vfw 109 }; 110 111 static struct modlinkage modlinkage = { 112 MODREV_1, &modlfs, NULL 113 }; 114 115 int 116 _init() 117 { 118 return (mod_install(&modlinkage)); 119 } 120 121 int 122 _fini() 123 { 124 int error; 125 126 error = mod_remove(&modlinkage); 127 if (error) 128 return (error); 129 /* 130 * Tear down the operations vectors 131 */ 132 (void) vfs_freevfsops_by_type(tmpfsfstype); 133 vn_freevnodeops(tmp_vnodeops); 134 return (0); 135 } 136 137 int 138 _info(struct modinfo *modinfop) 139 { 140 return (mod_info(&modlinkage, modinfop)); 141 } 142 143 /* 144 * The following are patchable variables limiting the amount of system 145 * resources tmpfs can use. 146 * 147 * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory 148 * tmpfs can use for it's data structures (e.g. tmpnodes, directory entries) 149 * It is not determined by setting a hard limit but rather as a percentage of 150 * physical memory which is determined when tmpfs is first used in the system. 151 * 152 * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for 153 * the rest of the system. In other words, if the amount of free swap space 154 * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs 155 * anon allocations will fail. 156 * 157 * There is also a per mount limit on the amount of swap space 158 * (tmount.tm_anonmax) settable via a mount option. 159 */ 160 size_t tmpfs_maxkmem = 0; 161 size_t tmpfs_minfree = 0; 162 size_t tmp_kmemspace; /* bytes of kernel heap used by all tmpfs */ 163 164 static major_t tmpfs_major; 165 static minor_t tmpfs_minor; 166 static kmutex_t tmpfs_minor_lock; 167 168 /* 169 * initialize global tmpfs locks and such 170 * called when loading tmpfs module 171 */ 172 static int 173 tmpfsinit(int fstype, char *name) 174 { 175 static const fs_operation_def_t tmp_vfsops_template[] = { 176 VFSNAME_MOUNT, tmp_mount, 177 VFSNAME_UNMOUNT, tmp_unmount, 178 VFSNAME_ROOT, tmp_root, 179 VFSNAME_STATVFS, tmp_statvfs, 180 VFSNAME_VGET, tmp_vget, 181 NULL, NULL 182 }; 183 int error; 184 extern void tmpfs_hash_init(); 185 186 tmpfs_hash_init(); 187 tmpfsfstype = fstype; 188 ASSERT(tmpfsfstype != 0); 189 190 error = vfs_setfsops(fstype, tmp_vfsops_template, NULL); 191 if (error != 0) { 192 cmn_err(CE_WARN, "tmpfsinit: bad vfs ops template"); 193 return (error); 194 } 195 196 error = vn_make_ops(name, tmp_vnodeops_template, &tmp_vnodeops); 197 if (error != 0) { 198 (void) vfs_freevfsops_by_type(fstype); 199 cmn_err(CE_WARN, "tmpfsinit: bad vnode ops template"); 200 return (error); 201 } 202 203 /* 204 * tmpfs_minfree doesn't need to be some function of configured 205 * swap space since it really is an absolute limit of swap space 206 * which still allows other processes to execute. 207 */ 208 if (tmpfs_minfree == 0) { 209 /* 210 * Set if not patched 211 */ 212 tmpfs_minfree = btopr(TMPMINFREE); 213 } 214 215 /* 216 * The maximum amount of space tmpfs can allocate is 217 * TMPMAXPROCKMEM percent of kernel memory 218 */ 219 if (tmpfs_maxkmem == 0) 220 tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM); 221 222 if ((tmpfs_major = getudev()) == (major_t)-1) { 223 cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number."); 224 tmpfs_major = 0; 225 } 226 mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 227 return (0); 228 } 229 230 static int 231 tmp_mount( 232 struct vfs *vfsp, 233 struct vnode *mvp, 234 struct mounta *uap, 235 struct cred *cr) 236 { 237 struct tmount *tm = NULL; 238 struct tmpnode *tp; 239 struct pathname dpn; 240 int error; 241 pgcnt_t anonmax; 242 struct vattr rattr; 243 int got_attrs; 244 245 char *sizestr; 246 247 if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) 248 return (error); 249 250 if (mvp->v_type != VDIR) 251 return (ENOTDIR); 252 253 mutex_enter(&mvp->v_lock); 254 if ((uap->flags & MS_OVERLAY) == 0 && 255 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { 256 mutex_exit(&mvp->v_lock); 257 return (EBUSY); 258 } 259 mutex_exit(&mvp->v_lock); 260 261 /* 262 * Having the resource be anything but "swap" doesn't make sense. 263 */ 264 vfs_setresource(vfsp, "swap"); 265 266 /* 267 * now look for options we understand... 268 */ 269 270 /* tmpfs doesn't support read-only mounts */ 271 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { 272 error = EINVAL; 273 goto out; 274 } 275 276 /* 277 * tm_anonmax is set according to the mount arguments 278 * if any. Otherwise, it is set to a maximum value. 279 */ 280 if (vfs_optionisset(vfsp, "size", &sizestr)) { 281 if ((error = tmp_convnum(sizestr, &anonmax)) != 0) 282 goto out; 283 } else { 284 anonmax = ULONG_MAX; 285 } 286 287 if (error = pn_get(uap->dir, 288 (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn)) 289 goto out; 290 291 if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) { 292 pn_free(&dpn); 293 error = ENOMEM; 294 goto out; 295 } 296 297 /* 298 * find an available minor device number for this mount 299 */ 300 mutex_enter(&tmpfs_minor_lock); 301 do { 302 tmpfs_minor = (tmpfs_minor + 1) & L_MAXMIN32; 303 tm->tm_dev = makedevice(tmpfs_major, tmpfs_minor); 304 } while (vfs_devismounted(tm->tm_dev)); 305 mutex_exit(&tmpfs_minor_lock); 306 307 /* 308 * Set but don't bother entering the mutex 309 * (tmount not on mount list yet) 310 */ 311 mutex_init(&tm->tm_contents, NULL, MUTEX_DEFAULT, NULL); 312 mutex_init(&tm->tm_renamelck, NULL, MUTEX_DEFAULT, NULL); 313 314 tm->tm_vfsp = vfsp; 315 tm->tm_anonmax = anonmax; 316 317 vfsp->vfs_data = (caddr_t)tm; 318 vfsp->vfs_fstype = tmpfsfstype; 319 vfsp->vfs_dev = tm->tm_dev; 320 vfsp->vfs_bsize = PAGESIZE; 321 vfsp->vfs_flag |= VFS_NOTRUNC; 322 vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype); 323 tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE); 324 (void) strcpy(tm->tm_mntpath, dpn.pn_path); 325 326 /* 327 * allocate and initialize root tmpnode structure 328 */ 329 bzero(&rattr, sizeof (struct vattr)); 330 rattr.va_mode = (mode_t)(S_IFDIR | 0777); /* XXX modes */ 331 rattr.va_type = VDIR; 332 rattr.va_rdev = 0; 333 tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); 334 tmpnode_init(tm, tp, &rattr, cr); 335 336 /* 337 * Get the mode, uid, and gid from the underlying mount point. 338 */ 339 rattr.va_mask = AT_MODE|AT_UID|AT_GID; /* Hint to getattr */ 340 got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr); 341 342 rw_enter(&tp->tn_rwlock, RW_WRITER); 343 TNTOV(tp)->v_flag |= VROOT; 344 345 /* 346 * If the getattr succeeded, use its results. Otherwise allow 347 * the previously set hardwired defaults to prevail. 348 */ 349 if (got_attrs == 0) { 350 tp->tn_mode = rattr.va_mode; 351 tp->tn_uid = rattr.va_uid; 352 tp->tn_gid = rattr.va_gid; 353 } 354 355 /* 356 * initialize linked list of tmpnodes so that the back pointer of 357 * the root tmpnode always points to the last one on the list 358 * and the forward pointer of the last node is null 359 */ 360 tp->tn_back = tp; 361 tp->tn_forw = NULL; 362 tp->tn_nlink = 0; 363 tm->tm_rootnode = tp; 364 365 tdirinit(tp, tp); 366 367 rw_exit(&tp->tn_rwlock); 368 369 pn_free(&dpn); 370 error = 0; 371 372 out: 373 return (error); 374 } 375 376 static int 377 tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) 378 { 379 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); 380 struct tmpnode *tnp, *cancel; 381 struct vnode *vp; 382 int error; 383 384 if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) 385 return (error); 386 387 /* 388 * forced unmount is not supported by this file system 389 * and thus, ENOTSUP, is being returned. 390 */ 391 if (flag & MS_FORCE) 392 return (ENOTSUP); 393 394 mutex_enter(&tm->tm_contents); 395 396 /* 397 * If there are no open files, only the root node should have 398 * a reference count. 399 * With tm_contents held, nothing can be added or removed. 400 * There may be some dirty pages. To prevent fsflush from 401 * disrupting the unmount, put a hold on each node while scanning. 402 * If we find a previously referenced node, undo the holds we have 403 * placed and fail EBUSY. 404 */ 405 tnp = tm->tm_rootnode; 406 if (TNTOV(tnp)->v_count > 1) { 407 mutex_exit(&tm->tm_contents); 408 return (EBUSY); 409 } 410 411 for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) { 412 if ((vp = TNTOV(tnp))->v_count > 0) { 413 cancel = tm->tm_rootnode->tn_forw; 414 while (cancel != tnp) { 415 vp = TNTOV(cancel); 416 ASSERT(vp->v_count > 0); 417 VN_RELE(vp); 418 cancel = cancel->tn_forw; 419 } 420 mutex_exit(&tm->tm_contents); 421 return (EBUSY); 422 } 423 VN_HOLD(vp); 424 } 425 426 /* 427 * We can drop the mutex now because no one can find this mount 428 */ 429 mutex_exit(&tm->tm_contents); 430 431 /* 432 * Free all kmemalloc'd and anonalloc'd memory associated with 433 * this filesystem. To do this, we go through the file list twice, 434 * once to remove all the directory entries, and then to remove 435 * all the files. We do this because there is useful code in 436 * tmpnode_free which assumes that the directory entry has been 437 * removed before the file. 438 */ 439 /* 440 * Remove all directory entries 441 */ 442 for (tnp = tm->tm_rootnode; tnp; tnp = tnp->tn_forw) { 443 rw_enter(&tnp->tn_rwlock, RW_WRITER); 444 if (tnp->tn_type == VDIR) 445 tdirtrunc(tnp); 446 if (tnp->tn_vnode->v_flag & V_XATTRDIR) { 447 /* 448 * Account for implicit attrdir reference. 449 */ 450 ASSERT(tnp->tn_nlink > 0); 451 DECR_COUNT(&tnp->tn_nlink, &tnp->tn_tlock); 452 } 453 rw_exit(&tnp->tn_rwlock); 454 } 455 456 ASSERT(tm->tm_rootnode); 457 458 /* 459 * All links are gone, v_count is keeping nodes in place. 460 * VN_RELE should make the node disappear, unless somebody 461 * is holding pages against it. Nap and retry until it disappears. 462 * 463 * We re-acquire the lock to prevent others who have a HOLD on 464 * a tmpnode via its pages or anon slots from blowing it away 465 * (in tmp_inactive) while we're trying to get to it here. Once 466 * we have a HOLD on it we know it'll stick around. 467 * 468 */ 469 mutex_enter(&tm->tm_contents); 470 /* 471 * Remove all the files (except the rootnode) backwards. 472 */ 473 while ((tnp = tm->tm_rootnode->tn_back) != tm->tm_rootnode) { 474 mutex_exit(&tm->tm_contents); 475 /* 476 * Inhibit tmp_inactive from touching attribute directory 477 * as all nodes will be released here. 478 * Note we handled the link count in pass 2 above. 479 */ 480 rw_enter(&tnp->tn_rwlock, RW_WRITER); 481 tnp->tn_xattrdp = NULL; 482 rw_exit(&tnp->tn_rwlock); 483 vp = TNTOV(tnp); 484 VN_RELE(vp); 485 mutex_enter(&tm->tm_contents); 486 /* 487 * It's still there after the RELE. Someone else like pageout 488 * has a hold on it so wait a bit and then try again - we know 489 * they'll give it up soon. 490 */ 491 if (tnp == tm->tm_rootnode->tn_back) { 492 VN_HOLD(vp); 493 mutex_exit(&tm->tm_contents); 494 delay(hz / 4); 495 mutex_enter(&tm->tm_contents); 496 } 497 } 498 mutex_exit(&tm->tm_contents); 499 500 tm->tm_rootnode->tn_xattrdp = NULL; 501 VN_RELE(TNTOV(tm->tm_rootnode)); 502 503 ASSERT(tm->tm_mntpath); 504 505 tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); 506 507 ASSERT(tm->tm_anonmem == 0); 508 509 mutex_destroy(&tm->tm_contents); 510 mutex_destroy(&tm->tm_renamelck); 511 tmp_memfree(tm, sizeof (struct tmount)); 512 513 return (0); 514 } 515 516 /* 517 * return root tmpnode for given vnode 518 */ 519 static int 520 tmp_root(struct vfs *vfsp, struct vnode **vpp) 521 { 522 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); 523 struct tmpnode *tp = tm->tm_rootnode; 524 struct vnode *vp; 525 526 ASSERT(tp); 527 528 vp = TNTOV(tp); 529 VN_HOLD(vp); 530 *vpp = vp; 531 return (0); 532 } 533 534 static int 535 tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) 536 { 537 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); 538 ulong_t blocks; 539 dev32_t d32; 540 541 sbp->f_bsize = PAGESIZE; 542 sbp->f_frsize = PAGESIZE; 543 544 /* 545 * Find the amount of available physical and memory swap 546 */ 547 mutex_enter(&anoninfo_lock); 548 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 549 blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; 550 mutex_exit(&anoninfo_lock); 551 552 /* 553 * If tm_anonmax for this mount is less than the available swap space 554 * (minus the amount tmpfs can't use), use that instead 555 */ 556 if (blocks > tmpfs_minfree) 557 sbp->f_bfree = MIN(blocks - tmpfs_minfree, 558 tm->tm_anonmax - tm->tm_anonmem); 559 else 560 sbp->f_bfree = 0; 561 562 sbp->f_bavail = sbp->f_bfree; 563 564 /* 565 * Total number of blocks is what's available plus what's been used 566 */ 567 sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + tm->tm_anonmem); 568 569 /* 570 * The maximum number of files available is approximately the number 571 * of tmpnodes we can allocate from the remaining kernel memory 572 * available to tmpfs. This is fairly inaccurate since it doesn't 573 * take into account the names stored in the directory entries. 574 */ 575 if (tmpfs_maxkmem > tmp_kmemspace) 576 sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) / 577 (sizeof (struct tmpnode) + sizeof (struct tdirent)); 578 else 579 sbp->f_ffree = 0; 580 581 sbp->f_files = tmpfs_maxkmem / 582 (sizeof (struct tmpnode) + sizeof (struct tdirent)); 583 sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); 584 (void) cmpldev(&d32, vfsp->vfs_dev); 585 sbp->f_fsid = d32; 586 (void) strcpy(sbp->f_basetype, vfssw[tmpfsfstype].vsw_name); 587 (void) strncpy(sbp->f_fstr, tm->tm_mntpath, sizeof (sbp->f_fstr)); 588 /* 589 * ensure null termination 590 */ 591 sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; 592 sbp->f_flag = vf_to_stf(vfsp->vfs_flag); 593 sbp->f_namemax = MAXNAMELEN - 1; 594 return (0); 595 } 596 597 static int 598 tmp_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp) 599 { 600 struct tfid *tfid; 601 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); 602 struct tmpnode *tp = NULL; 603 604 tfid = (struct tfid *)fidp; 605 *vpp = NULL; 606 607 mutex_enter(&tm->tm_contents); 608 for (tp = tm->tm_rootnode; tp; tp = tp->tn_forw) { 609 mutex_enter(&tp->tn_tlock); 610 if (tp->tn_nodeid == tfid->tfid_ino) { 611 /* 612 * If the gen numbers don't match we know the 613 * file won't be found since only one tmpnode 614 * can have this number at a time. 615 */ 616 if (tp->tn_gen != tfid->tfid_gen || tp->tn_nlink == 0) { 617 mutex_exit(&tp->tn_tlock); 618 mutex_exit(&tm->tm_contents); 619 return (0); 620 } 621 *vpp = (struct vnode *)TNTOV(tp); 622 623 VN_HOLD(*vpp); 624 625 if ((tp->tn_mode & S_ISVTX) && 626 !(tp->tn_mode & (S_IXUSR | S_IFDIR))) { 627 mutex_enter(&(*vpp)->v_lock); 628 (*vpp)->v_flag |= VISSWAP; 629 mutex_exit(&(*vpp)->v_lock); 630 } 631 mutex_exit(&tp->tn_tlock); 632 mutex_exit(&tm->tm_contents); 633 return (0); 634 } 635 mutex_exit(&tp->tn_tlock); 636 } 637 mutex_exit(&tm->tm_contents); 638 return (0); 639 } 640