1 /* $NetBSD: tmpfs_subr.c,v 1.35 2007/07/09 21:10:50 ad Exp $ */ 2 3 /*- 4 * SPDX-License-Identifier: BSD-2-Clause 5 * 6 * Copyright (c) 2005 The NetBSD Foundation, Inc. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to The NetBSD Foundation 10 * by Julio M. Merino Vidal, developed as part of Google's Summer of Code 11 * 2005 program. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 24 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 26 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 * POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 /* 36 * Efficient memory file system supporting functions. 
37 */ 38 #include <sys/cdefs.h> 39 __FBSDID("$FreeBSD$"); 40 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/dirent.h> 44 #include <sys/fnv_hash.h> 45 #include <sys/lock.h> 46 #include <sys/limits.h> 47 #include <sys/mount.h> 48 #include <sys/namei.h> 49 #include <sys/priv.h> 50 #include <sys/proc.h> 51 #include <sys/random.h> 52 #include <sys/refcount.h> 53 #include <sys/rwlock.h> 54 #include <sys/smr.h> 55 #include <sys/stat.h> 56 #include <sys/sysctl.h> 57 #include <sys/user.h> 58 #include <sys/vnode.h> 59 #include <sys/vmmeter.h> 60 61 #include <vm/vm.h> 62 #include <vm/vm_param.h> 63 #include <vm/vm_object.h> 64 #include <vm/vm_page.h> 65 #include <vm/vm_pageout.h> 66 #include <vm/vm_pager.h> 67 #include <vm/vm_extern.h> 68 #include <vm/swap_pager.h> 69 70 #include <fs/tmpfs/tmpfs.h> 71 #include <fs/tmpfs/tmpfs_fifoops.h> 72 #include <fs/tmpfs/tmpfs_vnops.h> 73 74 SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 75 "tmpfs file system"); 76 77 static long tmpfs_pages_reserved = TMPFS_PAGES_MINRESERVED; 78 79 MALLOC_DEFINE(M_TMPFSDIR, "tmpfs dir", "tmpfs dirent structure"); 80 static uma_zone_t tmpfs_node_pool; 81 VFS_SMR_DECLARE; 82 83 int tmpfs_pager_type = -1; 84 85 static vm_object_t 86 tmpfs_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, 87 vm_ooffset_t offset, struct ucred *cred) 88 { 89 vm_object_t object; 90 91 MPASS(handle == NULL); 92 MPASS(offset == 0); 93 object = vm_object_allocate_dyn(tmpfs_pager_type, size, 94 OBJ_COLORED | OBJ_SWAP); 95 if (!swap_pager_init_object(object, NULL, NULL, size, 0)) { 96 vm_object_deallocate(object); 97 object = NULL; 98 } 99 return (object); 100 } 101 102 /* 103 * Make sure tmpfs vnodes with writable mappings can be found on the lazy list. 104 * 105 * This allows for periodic mtime updates while only scanning vnodes which are 106 * plausibly dirty, see tmpfs_update_mtime_lazy. 
107 */ 108 static void 109 tmpfs_pager_writecount_recalc(vm_object_t object, vm_offset_t old, 110 vm_offset_t new) 111 { 112 struct vnode *vp; 113 114 VM_OBJECT_ASSERT_WLOCKED(object); 115 116 vp = VM_TO_TMPFS_VP(object); 117 118 /* 119 * Forced unmount? 120 */ 121 if (vp == NULL) { 122 KASSERT((object->flags & OBJ_TMPFS_VREF) == 0, 123 ("object %p with OBJ_TMPFS_VREF but without vnode", 124 object)); 125 VM_OBJECT_WUNLOCK(object); 126 return; 127 } 128 129 if (old == 0) { 130 VNASSERT((object->flags & OBJ_TMPFS_VREF) == 0, vp, 131 ("object without writable mappings has a reference")); 132 VNPASS(vp->v_usecount > 0, vp); 133 } else { 134 VNASSERT((object->flags & OBJ_TMPFS_VREF) != 0, vp, 135 ("object with writable mappings does not " 136 "have a reference")); 137 } 138 139 if (old == new) { 140 VM_OBJECT_WUNLOCK(object); 141 return; 142 } 143 144 if (new == 0) { 145 vm_object_clear_flag(object, OBJ_TMPFS_VREF); 146 VM_OBJECT_WUNLOCK(object); 147 vrele(vp); 148 } else { 149 if ((object->flags & OBJ_TMPFS_VREF) == 0) { 150 vref(vp); 151 vlazy(vp); 152 vm_object_set_flag(object, OBJ_TMPFS_VREF); 153 } 154 VM_OBJECT_WUNLOCK(object); 155 } 156 } 157 158 static void 159 tmpfs_pager_update_writecount(vm_object_t object, vm_offset_t start, 160 vm_offset_t end) 161 { 162 vm_offset_t new, old; 163 164 VM_OBJECT_WLOCK(object); 165 KASSERT((object->flags & OBJ_ANON) == 0, 166 ("%s: object %p with OBJ_ANON", __func__, object)); 167 old = object->un_pager.swp.writemappings; 168 object->un_pager.swp.writemappings += (vm_ooffset_t)end - start; 169 new = object->un_pager.swp.writemappings; 170 tmpfs_pager_writecount_recalc(object, old, new); 171 VM_OBJECT_ASSERT_UNLOCKED(object); 172 } 173 174 static void 175 tmpfs_pager_release_writecount(vm_object_t object, vm_offset_t start, 176 vm_offset_t end) 177 { 178 vm_offset_t new, old; 179 180 VM_OBJECT_WLOCK(object); 181 KASSERT((object->flags & OBJ_ANON) == 0, 182 ("%s: object %p with OBJ_ANON", __func__, object)); 183 old = 
object->un_pager.swp.writemappings; 184 object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start; 185 new = object->un_pager.swp.writemappings; 186 tmpfs_pager_writecount_recalc(object, old, new); 187 VM_OBJECT_ASSERT_UNLOCKED(object); 188 } 189 190 static void 191 tmpfs_pager_getvp(vm_object_t object, struct vnode **vpp, bool *vp_heldp) 192 { 193 struct vnode *vp; 194 195 /* 196 * Tmpfs VREG node, which was reclaimed, has tmpfs_pager_type 197 * type. In this case there is no v_writecount to adjust. 198 */ 199 if (vp_heldp != NULL) 200 VM_OBJECT_RLOCK(object); 201 else 202 VM_OBJECT_ASSERT_LOCKED(object); 203 if ((object->flags & OBJ_TMPFS) != 0) { 204 vp = VM_TO_TMPFS_VP(object); 205 if (vp != NULL) { 206 *vpp = vp; 207 if (vp_heldp != NULL) { 208 vhold(vp); 209 *vp_heldp = true; 210 } 211 } 212 } 213 if (vp_heldp != NULL) 214 VM_OBJECT_RUNLOCK(object); 215 } 216 217 static void 218 tmpfs_pager_freespace(vm_object_t obj, vm_pindex_t start, vm_size_t size) 219 { 220 struct tmpfs_node *node; 221 struct tmpfs_mount *tm; 222 vm_size_t c; 223 224 swap_pager_freespace(obj, start, size, &c); 225 if ((obj->flags & OBJ_TMPFS) == 0 || c == 0) 226 return; 227 228 node = obj->un_pager.swp.swp_priv; 229 MPASS(node->tn_type == VREG); 230 tm = node->tn_reg.tn_tmp; 231 232 KASSERT(tm->tm_pages_used >= c, 233 ("tmpfs tm %p pages %jd free %jd", tm, 234 (uintmax_t)tm->tm_pages_used, (uintmax_t)c)); 235 atomic_add_long(&tm->tm_pages_used, -c); 236 KASSERT(node->tn_reg.tn_pages >= c, 237 ("tmpfs node %p pages %jd free %jd", node, 238 (uintmax_t)node->tn_reg.tn_pages, (uintmax_t)c)); 239 node->tn_reg.tn_pages -= c; 240 } 241 242 static void 243 tmpfs_page_inserted(vm_object_t obj, vm_page_t m) 244 { 245 struct tmpfs_node *node; 246 struct tmpfs_mount *tm; 247 248 if ((obj->flags & OBJ_TMPFS) == 0) 249 return; 250 251 node = obj->un_pager.swp.swp_priv; 252 MPASS(node->tn_type == VREG); 253 tm = node->tn_reg.tn_tmp; 254 255 if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) { 256 
atomic_add_long(&tm->tm_pages_used, 1); 257 node->tn_reg.tn_pages += 1; 258 } 259 } 260 261 static void 262 tmpfs_page_removed(vm_object_t obj, vm_page_t m) 263 { 264 struct tmpfs_node *node; 265 struct tmpfs_mount *tm; 266 267 if ((obj->flags & OBJ_TMPFS) == 0) 268 return; 269 270 node = obj->un_pager.swp.swp_priv; 271 MPASS(node->tn_type == VREG); 272 tm = node->tn_reg.tn_tmp; 273 274 if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) { 275 KASSERT(tm->tm_pages_used >= 1, 276 ("tmpfs tm %p pages %jd free 1", tm, 277 (uintmax_t)tm->tm_pages_used)); 278 atomic_add_long(&tm->tm_pages_used, -1); 279 KASSERT(node->tn_reg.tn_pages >= 1, 280 ("tmpfs node %p pages %jd free 1", node, 281 (uintmax_t)node->tn_reg.tn_pages)); 282 node->tn_reg.tn_pages -= 1; 283 } 284 } 285 286 static boolean_t 287 tmpfs_can_alloc_page(vm_object_t obj, vm_pindex_t pindex) 288 { 289 struct tmpfs_mount *tm; 290 291 tm = VM_TO_TMPFS_MP(obj); 292 if (tm == NULL || vm_pager_has_page(obj, pindex, NULL, NULL) || 293 tm->tm_pages_max == 0) 294 return (true); 295 return (tm->tm_pages_max > atomic_load_long(&tm->tm_pages_used)); 296 } 297 298 struct pagerops tmpfs_pager_ops = { 299 .pgo_kvme_type = KVME_TYPE_VNODE, 300 .pgo_alloc = tmpfs_pager_alloc, 301 .pgo_set_writeable_dirty = vm_object_set_writeable_dirty_, 302 .pgo_update_writecount = tmpfs_pager_update_writecount, 303 .pgo_release_writecount = tmpfs_pager_release_writecount, 304 .pgo_mightbedirty = vm_object_mightbedirty_, 305 .pgo_getvp = tmpfs_pager_getvp, 306 .pgo_freespace = tmpfs_pager_freespace, 307 .pgo_page_inserted = tmpfs_page_inserted, 308 .pgo_page_removed = tmpfs_page_removed, 309 .pgo_can_alloc_page = tmpfs_can_alloc_page, 310 }; 311 312 static int 313 tmpfs_node_ctor(void *mem, int size, void *arg, int flags) 314 { 315 struct tmpfs_node *node; 316 317 node = mem; 318 node->tn_gen++; 319 node->tn_size = 0; 320 node->tn_status = 0; 321 node->tn_accessed = false; 322 node->tn_flags = 0; 323 node->tn_links = 0; 324 node->tn_vnode = 
NULL; 325 node->tn_vpstate = 0; 326 return (0); 327 } 328 329 static void 330 tmpfs_node_dtor(void *mem, int size, void *arg) 331 { 332 struct tmpfs_node *node; 333 334 node = mem; 335 node->tn_type = VNON; 336 } 337 338 static int 339 tmpfs_node_init(void *mem, int size, int flags) 340 { 341 struct tmpfs_node *node; 342 343 node = mem; 344 node->tn_id = 0; 345 mtx_init(&node->tn_interlock, "tmpfsni", NULL, MTX_DEF); 346 node->tn_gen = arc4random(); 347 return (0); 348 } 349 350 static void 351 tmpfs_node_fini(void *mem, int size) 352 { 353 struct tmpfs_node *node; 354 355 node = mem; 356 mtx_destroy(&node->tn_interlock); 357 } 358 359 int 360 tmpfs_subr_init(void) 361 { 362 tmpfs_pager_type = vm_pager_alloc_dyn_type(&tmpfs_pager_ops, 363 OBJT_SWAP); 364 if (tmpfs_pager_type == -1) 365 return (EINVAL); 366 tmpfs_node_pool = uma_zcreate("TMPFS node", 367 sizeof(struct tmpfs_node), tmpfs_node_ctor, tmpfs_node_dtor, 368 tmpfs_node_init, tmpfs_node_fini, UMA_ALIGN_PTR, 0); 369 VFS_SMR_ZONE_SET(tmpfs_node_pool); 370 return (0); 371 } 372 373 void 374 tmpfs_subr_uninit(void) 375 { 376 if (tmpfs_pager_type != -1) 377 vm_pager_free_dyn_type(tmpfs_pager_type); 378 tmpfs_pager_type = -1; 379 uma_zdestroy(tmpfs_node_pool); 380 } 381 382 static int 383 sysctl_mem_reserved(SYSCTL_HANDLER_ARGS) 384 { 385 int error; 386 long pages, bytes; 387 388 pages = *(long *)arg1; 389 bytes = pages * PAGE_SIZE; 390 391 error = sysctl_handle_long(oidp, &bytes, 0, req); 392 if (error || !req->newptr) 393 return (error); 394 395 pages = bytes / PAGE_SIZE; 396 if (pages < TMPFS_PAGES_MINRESERVED) 397 return (EINVAL); 398 399 *(long *)arg1 = pages; 400 return (0); 401 } 402 403 SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_reserved, 404 CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &tmpfs_pages_reserved, 0, 405 sysctl_mem_reserved, "L", 406 "Amount of available memory and swap below which tmpfs growth stops"); 407 408 static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a, 409 struct tmpfs_dirent *b); 
410 RB_PROTOTYPE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); 411 412 size_t 413 tmpfs_mem_avail(void) 414 { 415 size_t avail; 416 long reserved; 417 418 avail = swap_pager_avail + vm_free_count(); 419 reserved = atomic_load_long(&tmpfs_pages_reserved); 420 if (__predict_false(avail < reserved)) 421 return (0); 422 return (avail - reserved); 423 } 424 425 size_t 426 tmpfs_pages_used(struct tmpfs_mount *tmp) 427 { 428 const size_t node_size = sizeof(struct tmpfs_node) + 429 sizeof(struct tmpfs_dirent); 430 size_t meta_pages; 431 432 meta_pages = howmany((uintmax_t)tmp->tm_nodes_inuse * node_size, 433 PAGE_SIZE); 434 return (meta_pages + tmp->tm_pages_used); 435 } 436 437 bool 438 tmpfs_pages_check_avail(struct tmpfs_mount *tmp, size_t req_pages) 439 { 440 if (tmpfs_mem_avail() < req_pages) 441 return (false); 442 443 if (tmp->tm_pages_max != ULONG_MAX && 444 tmp->tm_pages_max < req_pages + tmpfs_pages_used(tmp)) 445 return (false); 446 447 return (true); 448 } 449 450 static int 451 tmpfs_partial_page_invalidate(vm_object_t object, vm_pindex_t idx, int base, 452 int end, boolean_t ignerr) 453 { 454 vm_page_t m; 455 int rv, error; 456 457 VM_OBJECT_ASSERT_WLOCKED(object); 458 KASSERT(base >= 0, ("%s: base %d", __func__, base)); 459 KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base, 460 end)); 461 error = 0; 462 463 retry: 464 m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT); 465 if (m != NULL) { 466 MPASS(vm_page_all_valid(m)); 467 } else if (vm_pager_has_page(object, idx, NULL, NULL)) { 468 m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL | 469 VM_ALLOC_WAITFAIL); 470 if (m == NULL) 471 goto retry; 472 vm_object_pip_add(object, 1); 473 VM_OBJECT_WUNLOCK(object); 474 rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); 475 VM_OBJECT_WLOCK(object); 476 vm_object_pip_wakeup(object); 477 if (rv == VM_PAGER_OK) { 478 /* 479 * Since the page was not resident, and therefore not 480 * recently accessed, immediately enqueue it for 481 * 
asynchronous laundering. The current operation is 482 * not regarded as an access. 483 */ 484 vm_page_launder(m); 485 } else { 486 vm_page_free(m); 487 m = NULL; 488 if (!ignerr) 489 error = EIO; 490 } 491 } 492 if (m != NULL) { 493 pmap_zero_page_area(m, base, end - base); 494 vm_page_set_dirty(m); 495 vm_page_xunbusy(m); 496 } 497 498 return (error); 499 } 500 501 void 502 tmpfs_ref_node(struct tmpfs_node *node) 503 { 504 #ifdef INVARIANTS 505 u_int old; 506 507 old = 508 #endif 509 refcount_acquire(&node->tn_refcount); 510 #ifdef INVARIANTS 511 KASSERT(old > 0, ("node %p zero refcount", node)); 512 #endif 513 } 514 515 /* 516 * Allocates a new node of type 'type' inside the 'tmp' mount point, with 517 * its owner set to 'uid', its group to 'gid' and its mode set to 'mode', 518 * using the credentials of the process 'p'. 519 * 520 * If the node type is set to 'VDIR', then the parent parameter must point 521 * to the parent directory of the node being created. It may only be NULL 522 * while allocating the root node. 523 * 524 * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter 525 * specifies the device the node represents. 526 * 527 * If the node type is set to 'VLNK', then the parameter target specifies 528 * the file name of the target file for the symbolic link that is being 529 * created. 530 * 531 * Note that new nodes are retrieved from the available list if it has 532 * items or, if it is empty, from the node pool as long as there is enough 533 * space to create them. 534 * 535 * Returns zero on success or an appropriate error code on failure. 
536 */ 537 int 538 tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, __enum_uint8(vtype) type, 539 uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent, 540 const char *target, dev_t rdev, struct tmpfs_node **node) 541 { 542 struct tmpfs_node *nnode; 543 char *symlink; 544 char symlink_smr; 545 546 /* If the root directory of the 'tmp' file system is not yet 547 * allocated, this must be the request to do it. */ 548 MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR)); 549 550 MPASS((type == VLNK) ^ (target == NULL)); 551 MPASS((type == VBLK || type == VCHR) ^ (rdev == VNOVAL)); 552 553 if (tmp->tm_nodes_inuse >= tmp->tm_nodes_max) 554 return (ENOSPC); 555 if (!tmpfs_pages_check_avail(tmp, 1)) 556 return (ENOSPC); 557 558 if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { 559 /* 560 * When a new tmpfs node is created for fully 561 * constructed mount point, there must be a parent 562 * node, which vnode is locked exclusively. As 563 * consequence, if the unmount is executing in 564 * parallel, vflush() cannot reclaim the parent vnode. 565 * Due to this, the check for MNTK_UNMOUNT flag is not 566 * racy: if we did not see MNTK_UNMOUNT flag, then tmp 567 * cannot be destroyed until node construction is 568 * finished and the parent vnode unlocked. 569 * 570 * Tmpfs does not need to instantiate new nodes during 571 * unmount. 572 */ 573 return (EBUSY); 574 } 575 if ((mp->mnt_kern_flag & MNT_RDONLY) != 0) 576 return (EROFS); 577 578 nnode = uma_zalloc_smr(tmpfs_node_pool, M_WAITOK); 579 580 /* Generic initialization. */ 581 nnode->tn_type = type; 582 vfs_timestamp(&nnode->tn_atime); 583 nnode->tn_birthtime = nnode->tn_ctime = nnode->tn_mtime = 584 nnode->tn_atime; 585 nnode->tn_uid = uid; 586 nnode->tn_gid = gid; 587 nnode->tn_mode = mode; 588 nnode->tn_id = alloc_unr64(&tmp->tm_ino_unr); 589 nnode->tn_refcount = 1; 590 LIST_INIT(&nnode->tn_extattrs); 591 592 /* Type-specific initialization. 
*/ 593 switch (nnode->tn_type) { 594 case VBLK: 595 case VCHR: 596 nnode->tn_rdev = rdev; 597 break; 598 599 case VDIR: 600 RB_INIT(&nnode->tn_dir.tn_dirhead); 601 LIST_INIT(&nnode->tn_dir.tn_dupindex); 602 MPASS(parent != nnode); 603 MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL)); 604 nnode->tn_dir.tn_parent = (parent == NULL) ? nnode : parent; 605 nnode->tn_dir.tn_readdir_lastn = 0; 606 nnode->tn_dir.tn_readdir_lastp = NULL; 607 nnode->tn_links++; 608 TMPFS_NODE_LOCK(nnode->tn_dir.tn_parent); 609 nnode->tn_dir.tn_parent->tn_links++; 610 TMPFS_NODE_UNLOCK(nnode->tn_dir.tn_parent); 611 break; 612 613 case VFIFO: 614 /* FALLTHROUGH */ 615 case VSOCK: 616 break; 617 618 case VLNK: 619 MPASS(strlen(target) < MAXPATHLEN); 620 nnode->tn_size = strlen(target); 621 622 symlink = NULL; 623 if (!tmp->tm_nonc) { 624 symlink = cache_symlink_alloc(nnode->tn_size + 1, 625 M_WAITOK); 626 symlink_smr = true; 627 } 628 if (symlink == NULL) { 629 symlink = malloc(nnode->tn_size + 1, M_TMPFSNAME, 630 M_WAITOK); 631 symlink_smr = false; 632 } 633 memcpy(symlink, target, nnode->tn_size + 1); 634 635 /* 636 * Allow safe symlink resolving for lockless lookup. 637 * tmpfs_fplookup_symlink references this comment. 638 * 639 * 1. nnode is not yet visible to the world 640 * 2. both tn_link_target and tn_link_smr get populated 641 * 3. release fence publishes their content 642 * 4. tn_link_target content is immutable until node 643 * destruction, where the pointer gets set to NULL 644 * 5. tn_link_smr is never changed once set 645 * 646 * As a result it is sufficient to issue load consume 647 * on the node pointer to also get the above content 648 * in a stable manner. Worst case tn_link_smr flag 649 * may be set to true despite being stale, while the 650 * target buffer is already cleared out. 
651 */ 652 atomic_store_ptr(&nnode->tn_link_target, symlink); 653 atomic_store_char((char *)&nnode->tn_link_smr, symlink_smr); 654 atomic_thread_fence_rel(); 655 break; 656 657 case VREG: 658 nnode->tn_reg.tn_aobj = 659 vm_pager_allocate(tmpfs_pager_type, NULL, 0, 660 VM_PROT_DEFAULT, 0, 661 NULL /* XXXKIB - tmpfs needs swap reservation */); 662 nnode->tn_reg.tn_aobj->un_pager.swp.swp_priv = nnode; 663 vm_object_set_flag(nnode->tn_reg.tn_aobj, OBJ_TMPFS); 664 nnode->tn_reg.tn_tmp = tmp; 665 nnode->tn_reg.tn_pages = 0; 666 break; 667 668 default: 669 panic("tmpfs_alloc_node: type %p %d", nnode, 670 (int)nnode->tn_type); 671 } 672 673 TMPFS_LOCK(tmp); 674 LIST_INSERT_HEAD(&tmp->tm_nodes_used, nnode, tn_entries); 675 nnode->tn_attached = true; 676 tmp->tm_nodes_inuse++; 677 tmp->tm_refcount++; 678 TMPFS_UNLOCK(tmp); 679 680 *node = nnode; 681 return (0); 682 } 683 684 /* 685 * Destroys the node pointed to by node from the file system 'tmp'. 686 * If the node references a directory, no entries are allowed. 
687 */ 688 void 689 tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node) 690 { 691 if (refcount_release_if_not_last(&node->tn_refcount)) 692 return; 693 694 TMPFS_LOCK(tmp); 695 TMPFS_NODE_LOCK(node); 696 if (!tmpfs_free_node_locked(tmp, node, false)) { 697 TMPFS_NODE_UNLOCK(node); 698 TMPFS_UNLOCK(tmp); 699 } 700 } 701 702 bool 703 tmpfs_free_node_locked(struct tmpfs_mount *tmp, struct tmpfs_node *node, 704 bool detach) 705 { 706 struct tmpfs_extattr *ea; 707 vm_object_t uobj; 708 char *symlink; 709 bool last; 710 711 TMPFS_MP_ASSERT_LOCKED(tmp); 712 TMPFS_NODE_ASSERT_LOCKED(node); 713 714 last = refcount_release(&node->tn_refcount); 715 if (node->tn_attached && (detach || last)) { 716 MPASS(tmp->tm_nodes_inuse > 0); 717 tmp->tm_nodes_inuse--; 718 LIST_REMOVE(node, tn_entries); 719 node->tn_attached = false; 720 } 721 if (!last) 722 return (false); 723 724 TMPFS_NODE_UNLOCK(node); 725 726 #ifdef INVARIANTS 727 MPASS(node->tn_vnode == NULL); 728 MPASS((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0); 729 730 /* 731 * Make sure this is a node type we can deal with. Everything 732 * is explicitly enumerated without the 'default' clause so 733 * the compiler can throw an error in case a new type is 734 * added. 
735 */ 736 switch (node->tn_type) { 737 case VBLK: 738 case VCHR: 739 case VDIR: 740 case VFIFO: 741 case VSOCK: 742 case VLNK: 743 case VREG: 744 break; 745 case VNON: 746 case VBAD: 747 case VMARKER: 748 panic("%s: bad type %d for node %p", __func__, 749 (int)node->tn_type, node); 750 } 751 #endif 752 753 while ((ea = LIST_FIRST(&node->tn_extattrs)) != NULL) { 754 LIST_REMOVE(ea, ea_extattrs); 755 tmpfs_extattr_free(ea); 756 } 757 758 switch (node->tn_type) { 759 case VREG: 760 uobj = node->tn_reg.tn_aobj; 761 node->tn_reg.tn_aobj = NULL; 762 if (uobj != NULL) { 763 VM_OBJECT_WLOCK(uobj); 764 KASSERT((uobj->flags & OBJ_TMPFS) != 0, 765 ("tmpfs node %p uobj %p not tmpfs", node, uobj)); 766 vm_object_clear_flag(uobj, OBJ_TMPFS); 767 KASSERT(tmp->tm_pages_used >= node->tn_reg.tn_pages, 768 ("tmpfs tmp %p node %p pages %jd free %jd", tmp, 769 node, (uintmax_t)tmp->tm_pages_used, 770 (uintmax_t)node->tn_reg.tn_pages)); 771 atomic_add_long(&tmp->tm_pages_used, 772 -node->tn_reg.tn_pages); 773 VM_OBJECT_WUNLOCK(uobj); 774 } 775 tmpfs_free_tmp(tmp); 776 777 /* 778 * vm_object_deallocate() must not be called while 779 * owning tm_allnode_lock, because deallocate might 780 * sleep. Call it after tmpfs_free_tmp() does the 781 * unlock. 
782 */ 783 if (uobj != NULL) 784 vm_object_deallocate(uobj); 785 786 break; 787 case VLNK: 788 tmpfs_free_tmp(tmp); 789 790 symlink = node->tn_link_target; 791 atomic_store_ptr(&node->tn_link_target, NULL); 792 if (atomic_load_char(&node->tn_link_smr)) { 793 cache_symlink_free(symlink, node->tn_size + 1); 794 } else { 795 free(symlink, M_TMPFSNAME); 796 } 797 break; 798 default: 799 tmpfs_free_tmp(tmp); 800 break; 801 } 802 803 uma_zfree_smr(tmpfs_node_pool, node); 804 return (true); 805 } 806 807 static __inline uint32_t 808 tmpfs_dirent_hash(const char *name, u_int len) 809 { 810 uint32_t hash; 811 812 hash = fnv_32_buf(name, len, FNV1_32_INIT + len) & TMPFS_DIRCOOKIE_MASK; 813 #ifdef TMPFS_DEBUG_DIRCOOKIE_DUP 814 hash &= 0xf; 815 #endif 816 if (hash < TMPFS_DIRCOOKIE_MIN) 817 hash += TMPFS_DIRCOOKIE_MIN; 818 819 return (hash); 820 } 821 822 static __inline off_t 823 tmpfs_dirent_cookie(struct tmpfs_dirent *de) 824 { 825 if (de == NULL) 826 return (TMPFS_DIRCOOKIE_EOF); 827 828 MPASS(de->td_cookie >= TMPFS_DIRCOOKIE_MIN); 829 830 return (de->td_cookie); 831 } 832 833 static __inline boolean_t 834 tmpfs_dirent_dup(struct tmpfs_dirent *de) 835 { 836 return ((de->td_cookie & TMPFS_DIRCOOKIE_DUP) != 0); 837 } 838 839 static __inline boolean_t 840 tmpfs_dirent_duphead(struct tmpfs_dirent *de) 841 { 842 return ((de->td_cookie & TMPFS_DIRCOOKIE_DUPHEAD) != 0); 843 } 844 845 void 846 tmpfs_dirent_init(struct tmpfs_dirent *de, const char *name, u_int namelen) 847 { 848 de->td_hash = de->td_cookie = tmpfs_dirent_hash(name, namelen); 849 memcpy(de->ud.td_name, name, namelen); 850 de->td_namelen = namelen; 851 } 852 853 /* 854 * Allocates a new directory entry for the node node with a name of name. 855 * The new directory entry is returned in *de. 856 * 857 * The link count of node is increased by one to reflect the new object 858 * referencing it. 859 * 860 * Returns zero on success or an appropriate error code on failure. 
861 */ 862 int 863 tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node, 864 const char *name, u_int len, struct tmpfs_dirent **de) 865 { 866 struct tmpfs_dirent *nde; 867 868 nde = malloc(sizeof(*nde), M_TMPFSDIR, M_WAITOK); 869 nde->td_node = node; 870 if (name != NULL) { 871 nde->ud.td_name = malloc(len, M_TMPFSNAME, M_WAITOK); 872 tmpfs_dirent_init(nde, name, len); 873 } else 874 nde->td_namelen = 0; 875 if (node != NULL) 876 node->tn_links++; 877 878 *de = nde; 879 880 return (0); 881 } 882 883 /* 884 * Frees a directory entry. It is the caller's responsibility to destroy 885 * the node referenced by it if needed. 886 * 887 * The link count of node is decreased by one to reflect the removal of an 888 * object that referenced it. This only happens if 'node_exists' is true; 889 * otherwise the function will not access the node referred to by the 890 * directory entry, as it may already have been released from the outside. 891 */ 892 void 893 tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de) 894 { 895 struct tmpfs_node *node; 896 897 node = de->td_node; 898 if (node != NULL) { 899 MPASS(node->tn_links > 0); 900 node->tn_links--; 901 } 902 if (!tmpfs_dirent_duphead(de) && de->ud.td_name != NULL) 903 free(de->ud.td_name, M_TMPFSNAME); 904 free(de, M_TMPFSDIR); 905 } 906 907 void 908 tmpfs_destroy_vobject(struct vnode *vp, vm_object_t obj) 909 { 910 bool want_vrele; 911 912 ASSERT_VOP_ELOCKED(vp, "tmpfs_destroy_vobject"); 913 if (vp->v_type != VREG || obj == NULL) 914 return; 915 916 VM_OBJECT_WLOCK(obj); 917 VI_LOCK(vp); 918 /* 919 * May be going through forced unmount. 
920 */ 921 want_vrele = false; 922 if ((obj->flags & OBJ_TMPFS_VREF) != 0) { 923 vm_object_clear_flag(obj, OBJ_TMPFS_VREF); 924 want_vrele = true; 925 } 926 927 if (vp->v_writecount < 0) 928 vp->v_writecount = 0; 929 VI_UNLOCK(vp); 930 VM_OBJECT_WUNLOCK(obj); 931 if (want_vrele) { 932 vrele(vp); 933 } 934 } 935 936 /* 937 * Allocates a new vnode for the node node or returns a new reference to 938 * an existing one if the node had already a vnode referencing it. The 939 * resulting locked vnode is returned in *vpp. 940 * 941 * Returns zero on success or an appropriate error code on failure. 942 */ 943 int 944 tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, int lkflag, 945 struct vnode **vpp) 946 { 947 struct vnode *vp; 948 enum vgetstate vs; 949 struct tmpfs_mount *tm; 950 vm_object_t object; 951 int error; 952 953 error = 0; 954 tm = VFS_TO_TMPFS(mp); 955 TMPFS_NODE_LOCK(node); 956 tmpfs_ref_node(node); 957 loop: 958 TMPFS_NODE_ASSERT_LOCKED(node); 959 if ((vp = node->tn_vnode) != NULL) { 960 MPASS((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0); 961 if ((node->tn_type == VDIR && node->tn_dir.tn_parent == NULL) || 962 (VN_IS_DOOMED(vp) && 963 (lkflag & LK_NOWAIT) != 0)) { 964 TMPFS_NODE_UNLOCK(node); 965 error = ENOENT; 966 vp = NULL; 967 goto out; 968 } 969 if (VN_IS_DOOMED(vp)) { 970 node->tn_vpstate |= TMPFS_VNODE_WRECLAIM; 971 while ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) { 972 msleep(&node->tn_vnode, TMPFS_NODE_MTX(node), 973 0, "tmpfsE", 0); 974 } 975 goto loop; 976 } 977 vs = vget_prep(vp); 978 TMPFS_NODE_UNLOCK(node); 979 error = vget_finish(vp, lkflag, vs); 980 if (error == ENOENT) { 981 TMPFS_NODE_LOCK(node); 982 goto loop; 983 } 984 if (error != 0) { 985 vp = NULL; 986 goto out; 987 } 988 989 /* 990 * Make sure the vnode is still there after 991 * getting the interlock to avoid racing a free. 
992 */ 993 if (node->tn_vnode != vp) { 994 vput(vp); 995 TMPFS_NODE_LOCK(node); 996 goto loop; 997 } 998 999 goto out; 1000 } 1001 1002 if ((node->tn_vpstate & TMPFS_VNODE_DOOMED) || 1003 (node->tn_type == VDIR && node->tn_dir.tn_parent == NULL)) { 1004 TMPFS_NODE_UNLOCK(node); 1005 error = ENOENT; 1006 vp = NULL; 1007 goto out; 1008 } 1009 1010 /* 1011 * otherwise lock the vp list while we call getnewvnode 1012 * since that can block. 1013 */ 1014 if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) { 1015 node->tn_vpstate |= TMPFS_VNODE_WANT; 1016 error = msleep((caddr_t) &node->tn_vpstate, 1017 TMPFS_NODE_MTX(node), 0, "tmpfs_alloc_vp", 0); 1018 if (error != 0) 1019 goto out; 1020 goto loop; 1021 } else 1022 node->tn_vpstate |= TMPFS_VNODE_ALLOCATING; 1023 1024 TMPFS_NODE_UNLOCK(node); 1025 1026 /* Get a new vnode and associate it with our node. */ 1027 error = getnewvnode("tmpfs", mp, VFS_TO_TMPFS(mp)->tm_nonc ? 1028 &tmpfs_vnodeop_nonc_entries : &tmpfs_vnodeop_entries, &vp); 1029 if (error != 0) 1030 goto unlock; 1031 MPASS(vp != NULL); 1032 1033 /* lkflag is ignored, the lock is exclusive */ 1034 (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1035 1036 vp->v_data = node; 1037 vp->v_type = node->tn_type; 1038 1039 /* Type-specific initialization. 
*/ 1040 switch (node->tn_type) { 1041 case VBLK: 1042 /* FALLTHROUGH */ 1043 case VCHR: 1044 /* FALLTHROUGH */ 1045 case VLNK: 1046 /* FALLTHROUGH */ 1047 case VSOCK: 1048 break; 1049 case VFIFO: 1050 vp->v_op = &tmpfs_fifoop_entries; 1051 break; 1052 case VREG: 1053 object = node->tn_reg.tn_aobj; 1054 VM_OBJECT_WLOCK(object); 1055 KASSERT((object->flags & OBJ_TMPFS_VREF) == 0, 1056 ("%s: object %p with OBJ_TMPFS_VREF but without vnode", 1057 __func__, object)); 1058 KASSERT(object->un_pager.swp.writemappings == 0, 1059 ("%s: object %p has writemappings", 1060 __func__, object)); 1061 VI_LOCK(vp); 1062 KASSERT(vp->v_object == NULL, ("Not NULL v_object in tmpfs")); 1063 vp->v_object = object; 1064 vn_irflag_set_locked(vp, (tm->tm_pgread ? VIRF_PGREAD : 0) | 1065 VIRF_TEXT_REF); 1066 VI_UNLOCK(vp); 1067 VM_OBJECT_WUNLOCK(object); 1068 break; 1069 case VDIR: 1070 MPASS(node->tn_dir.tn_parent != NULL); 1071 if (node->tn_dir.tn_parent == node) 1072 vp->v_vflag |= VV_ROOT; 1073 break; 1074 1075 default: 1076 panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type); 1077 } 1078 if (vp->v_type != VFIFO) 1079 VN_LOCK_ASHARE(vp); 1080 1081 error = insmntque1(vp, mp); 1082 if (error != 0) { 1083 /* Need to clear v_object for insmntque failure. 
*/ 1084 tmpfs_destroy_vobject(vp, vp->v_object); 1085 vp->v_object = NULL; 1086 vp->v_data = NULL; 1087 vp->v_op = &dead_vnodeops; 1088 vgone(vp); 1089 vput(vp); 1090 vp = NULL; 1091 } else { 1092 vn_set_state(vp, VSTATE_CONSTRUCTED); 1093 } 1094 1095 unlock: 1096 TMPFS_NODE_LOCK(node); 1097 1098 MPASS(node->tn_vpstate & TMPFS_VNODE_ALLOCATING); 1099 node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING; 1100 node->tn_vnode = vp; 1101 1102 if (node->tn_vpstate & TMPFS_VNODE_WANT) { 1103 node->tn_vpstate &= ~TMPFS_VNODE_WANT; 1104 TMPFS_NODE_UNLOCK(node); 1105 wakeup((caddr_t) &node->tn_vpstate); 1106 } else 1107 TMPFS_NODE_UNLOCK(node); 1108 1109 out: 1110 if (error == 0) { 1111 *vpp = vp; 1112 1113 #ifdef INVARIANTS 1114 MPASS(*vpp != NULL); 1115 ASSERT_VOP_LOCKED(*vpp, __func__); 1116 TMPFS_NODE_LOCK(node); 1117 MPASS(*vpp == node->tn_vnode); 1118 TMPFS_NODE_UNLOCK(node); 1119 #endif 1120 } 1121 tmpfs_free_node(tm, node); 1122 1123 return (error); 1124 } 1125 1126 /* 1127 * Destroys the association between the vnode vp and the node it 1128 * references. 1129 */ 1130 void 1131 tmpfs_free_vp(struct vnode *vp) 1132 { 1133 struct tmpfs_node *node; 1134 1135 node = VP_TO_TMPFS_NODE(vp); 1136 1137 TMPFS_NODE_ASSERT_LOCKED(node); 1138 node->tn_vnode = NULL; 1139 if ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) 1140 wakeup(&node->tn_vnode); 1141 node->tn_vpstate &= ~TMPFS_VNODE_WRECLAIM; 1142 vp->v_data = NULL; 1143 } 1144 1145 /* 1146 * Allocates a new file of type 'type' and adds it to the parent directory 1147 * 'dvp'; this addition is done using the component name given in 'cnp'. 1148 * The ownership of the new file is automatically assigned based on the 1149 * credentials of the caller (through 'cnp'), the group is set based on 1150 * the parent directory and the mode is determined from the 'vap' argument. 1151 * If successful, *vpp holds a vnode to the newly created file and zero 1152 * is returned. 
Otherwise *vpp is NULL and the function returns an 1153 * appropriate error code. 1154 */ 1155 int 1156 tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap, 1157 struct componentname *cnp, const char *target) 1158 { 1159 int error; 1160 struct tmpfs_dirent *de; 1161 struct tmpfs_mount *tmp; 1162 struct tmpfs_node *dnode; 1163 struct tmpfs_node *node; 1164 struct tmpfs_node *parent; 1165 1166 ASSERT_VOP_ELOCKED(dvp, "tmpfs_alloc_file"); 1167 1168 tmp = VFS_TO_TMPFS(dvp->v_mount); 1169 dnode = VP_TO_TMPFS_DIR(dvp); 1170 *vpp = NULL; 1171 1172 /* If the entry we are creating is a directory, we cannot overflow 1173 * the number of links of its parent, because it will get a new 1174 * link. */ 1175 if (vap->va_type == VDIR) { 1176 /* Ensure that we do not overflow the maximum number of links 1177 * imposed by the system. */ 1178 MPASS(dnode->tn_links <= TMPFS_LINK_MAX); 1179 if (dnode->tn_links == TMPFS_LINK_MAX) { 1180 return (EMLINK); 1181 } 1182 1183 parent = dnode; 1184 MPASS(parent != NULL); 1185 } else 1186 parent = NULL; 1187 1188 /* Allocate a node that represents the new file. */ 1189 error = tmpfs_alloc_node(dvp->v_mount, tmp, vap->va_type, 1190 cnp->cn_cred->cr_uid, dnode->tn_gid, vap->va_mode, parent, 1191 target, vap->va_rdev, &node); 1192 if (error != 0) 1193 return (error); 1194 1195 /* Allocate a directory entry that points to the new file. */ 1196 error = tmpfs_alloc_dirent(tmp, node, cnp->cn_nameptr, cnp->cn_namelen, 1197 &de); 1198 if (error != 0) { 1199 tmpfs_free_node(tmp, node); 1200 return (error); 1201 } 1202 1203 /* Allocate a vnode for the new file. */ 1204 error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp); 1205 if (error != 0) { 1206 tmpfs_free_dirent(tmp, de); 1207 tmpfs_free_node(tmp, node); 1208 return (error); 1209 } 1210 1211 /* Now that all required items are allocated, we can proceed to 1212 * insert the new node into the directory, an operation that 1213 * cannot fail. 
*/ 1214 if (cnp->cn_flags & ISWHITEOUT) 1215 tmpfs_dir_whiteout_remove(dvp, cnp); 1216 tmpfs_dir_attach(dvp, de); 1217 return (0); 1218 } 1219 1220 struct tmpfs_dirent * 1221 tmpfs_dir_first(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) 1222 { 1223 struct tmpfs_dirent *de; 1224 1225 de = RB_MIN(tmpfs_dir, &dnode->tn_dir.tn_dirhead); 1226 dc->tdc_tree = de; 1227 if (de != NULL && tmpfs_dirent_duphead(de)) 1228 de = LIST_FIRST(&de->ud.td_duphead); 1229 dc->tdc_current = de; 1230 1231 return (dc->tdc_current); 1232 } 1233 1234 struct tmpfs_dirent * 1235 tmpfs_dir_next(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) 1236 { 1237 struct tmpfs_dirent *de; 1238 1239 MPASS(dc->tdc_tree != NULL); 1240 if (tmpfs_dirent_dup(dc->tdc_current)) { 1241 dc->tdc_current = LIST_NEXT(dc->tdc_current, uh.td_dup.entries); 1242 if (dc->tdc_current != NULL) 1243 return (dc->tdc_current); 1244 } 1245 dc->tdc_tree = dc->tdc_current = RB_NEXT(tmpfs_dir, 1246 &dnode->tn_dir.tn_dirhead, dc->tdc_tree); 1247 if ((de = dc->tdc_current) != NULL && tmpfs_dirent_duphead(de)) { 1248 dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); 1249 MPASS(dc->tdc_current != NULL); 1250 } 1251 1252 return (dc->tdc_current); 1253 } 1254 1255 /* Lookup directory entry in RB-Tree. Function may return duphead entry. */ 1256 static struct tmpfs_dirent * 1257 tmpfs_dir_xlookup_hash(struct tmpfs_node *dnode, uint32_t hash) 1258 { 1259 struct tmpfs_dirent *de, dekey; 1260 1261 dekey.td_hash = hash; 1262 de = RB_FIND(tmpfs_dir, &dnode->tn_dir.tn_dirhead, &dekey); 1263 return (de); 1264 } 1265 1266 /* Lookup directory entry by cookie, initialize directory cursor accordingly. 
*/
static struct tmpfs_dirent *
tmpfs_dir_lookup_cookie(struct tmpfs_node *node, off_t cookie,
    struct tmpfs_dir_cursor *dc)
{
	struct tmpfs_dir *dirhead = &node->tn_dir.tn_dirhead;
	struct tmpfs_dirent *ent, key;

	MPASS(cookie >= TMPFS_DIRCOOKIE_MIN);

	/*
	 * Fast path: the cached position of the previous readdir.  The
	 * cache (tn_readdir_last[pn]) may be updated with only the shared
	 * vnode lock held, so re-check the cookie of the cached entry.
	 */
	if (cookie == node->tn_dir.tn_readdir_lastn) {
		ent = node->tn_dir.tn_readdir_lastp;
		if (ent != NULL && cookie == tmpfs_dirent_cookie(ent))
			goto out;
	}

	/* Duplicate cookies are resolved through the dupindex list. */
	if ((cookie & TMPFS_DIRCOOKIE_DUP) != 0) {
		LIST_FOREACH(ent, &node->tn_dir.tn_dupindex,
		    uh.td_dup.index_entries) {
			MPASS(tmpfs_dirent_dup(ent));
			if (ent->td_cookie == cookie)
				goto out;
			/* dupindex list is sorted. */
			if (ent->td_cookie < cookie) {
				ent = NULL;
				goto out;
			}
		}
		MPASS(ent == NULL);
		goto out;
	}

	/*
	 * Plain hash cookie.  RB_NFIND lets the walk recover when the
	 * entry the cookie named has been removed in the meantime.
	 */
	ent = NULL;
	if ((cookie & TMPFS_DIRCOOKIE_MASK) == cookie) {
		key.td_hash = cookie;
		ent = RB_NFIND(tmpfs_dir, dirhead, &key);
	}
	dc->tdc_tree = ent;
	dc->tdc_current = ent;
	if (ent != NULL && tmpfs_dirent_duphead(ent)) {
		dc->tdc_current = LIST_FIRST(&ent->ud.td_duphead);
		MPASS(dc->tdc_current != NULL);
	}
	return (dc->tdc_current);

out:
	/* For a dup entry the tree position is its duphead. */
	dc->tdc_tree = ent;
	dc->tdc_current = ent;
	if (ent != NULL && tmpfs_dirent_dup(ent))
		dc->tdc_tree = tmpfs_dir_xlookup_hash(node,
		    ent->td_hash);
	return (dc->tdc_current);
}

/*
 * Looks for a directory entry in the directory represented by node.
 * 'cnp' describes the name of the entry to look for.  Note that the .
 * and .. components are not allowed as they do not physically exist
 * within directories.
 *
 * Returns a pointer to the entry when found, otherwise NULL.
1331 */ 1332 struct tmpfs_dirent * 1333 tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, 1334 struct componentname *cnp) 1335 { 1336 struct tmpfs_dir_duphead *duphead; 1337 struct tmpfs_dirent *de; 1338 uint32_t hash; 1339 1340 MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.')); 1341 MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' && 1342 cnp->cn_nameptr[1] == '.'))); 1343 TMPFS_VALIDATE_DIR(node); 1344 1345 hash = tmpfs_dirent_hash(cnp->cn_nameptr, cnp->cn_namelen); 1346 de = tmpfs_dir_xlookup_hash(node, hash); 1347 if (de != NULL && tmpfs_dirent_duphead(de)) { 1348 duphead = &de->ud.td_duphead; 1349 LIST_FOREACH(de, duphead, uh.td_dup.entries) { 1350 if (TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, 1351 cnp->cn_namelen)) 1352 break; 1353 } 1354 } else if (de != NULL) { 1355 if (!TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, 1356 cnp->cn_namelen)) 1357 de = NULL; 1358 } 1359 if (de != NULL && f != NULL && de->td_node != f) 1360 de = NULL; 1361 1362 return (de); 1363 } 1364 1365 /* 1366 * Attach duplicate-cookie directory entry nde to dnode and insert to dupindex 1367 * list, allocate new cookie value. 1368 */ 1369 static void 1370 tmpfs_dir_attach_dup(struct tmpfs_node *dnode, 1371 struct tmpfs_dir_duphead *duphead, struct tmpfs_dirent *nde) 1372 { 1373 struct tmpfs_dir_duphead *dupindex; 1374 struct tmpfs_dirent *de, *pde; 1375 1376 dupindex = &dnode->tn_dir.tn_dupindex; 1377 de = LIST_FIRST(dupindex); 1378 if (de == NULL || de->td_cookie < TMPFS_DIRCOOKIE_DUP_MAX) { 1379 if (de == NULL) 1380 nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; 1381 else 1382 nde->td_cookie = de->td_cookie + 1; 1383 MPASS(tmpfs_dirent_dup(nde)); 1384 LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries); 1385 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1386 return; 1387 } 1388 1389 /* 1390 * Cookie numbers are near exhaustion. Scan dupindex list for unused 1391 * numbers. dupindex list is sorted in descending order. 
Keep it so 1392 * after inserting nde. 1393 */ 1394 while (1) { 1395 pde = de; 1396 de = LIST_NEXT(de, uh.td_dup.index_entries); 1397 if (de == NULL && pde->td_cookie != TMPFS_DIRCOOKIE_DUP_MIN) { 1398 /* 1399 * Last element of the index doesn't have minimal cookie 1400 * value, use it. 1401 */ 1402 nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; 1403 LIST_INSERT_AFTER(pde, nde, uh.td_dup.index_entries); 1404 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1405 return; 1406 } else if (de == NULL) { 1407 /* 1408 * We are so lucky have 2^30 hash duplicates in single 1409 * directory :) Return largest possible cookie value. 1410 * It should be fine except possible issues with 1411 * VOP_READDIR restart. 1412 */ 1413 nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MAX; 1414 LIST_INSERT_HEAD(dupindex, nde, 1415 uh.td_dup.index_entries); 1416 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1417 return; 1418 } 1419 if (de->td_cookie + 1 == pde->td_cookie || 1420 de->td_cookie >= TMPFS_DIRCOOKIE_DUP_MAX) 1421 continue; /* No hole or invalid cookie. */ 1422 nde->td_cookie = de->td_cookie + 1; 1423 MPASS(tmpfs_dirent_dup(nde)); 1424 MPASS(pde->td_cookie > nde->td_cookie); 1425 MPASS(nde->td_cookie > de->td_cookie); 1426 LIST_INSERT_BEFORE(de, nde, uh.td_dup.index_entries); 1427 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1428 return; 1429 } 1430 } 1431 1432 /* 1433 * Attaches the directory entry de to the directory represented by vp. 1434 * Note that this does not change the link count of the node pointed by 1435 * the directory entry, as this is done by tmpfs_alloc_dirent. 
1436 */ 1437 void 1438 tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de) 1439 { 1440 struct tmpfs_node *dnode; 1441 struct tmpfs_dirent *xde, *nde; 1442 1443 ASSERT_VOP_ELOCKED(vp, __func__); 1444 MPASS(de->td_namelen > 0); 1445 MPASS(de->td_hash >= TMPFS_DIRCOOKIE_MIN); 1446 MPASS(de->td_cookie == de->td_hash); 1447 1448 dnode = VP_TO_TMPFS_DIR(vp); 1449 dnode->tn_dir.tn_readdir_lastn = 0; 1450 dnode->tn_dir.tn_readdir_lastp = NULL; 1451 1452 MPASS(!tmpfs_dirent_dup(de)); 1453 xde = RB_INSERT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); 1454 if (xde != NULL && tmpfs_dirent_duphead(xde)) 1455 tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); 1456 else if (xde != NULL) { 1457 /* 1458 * Allocate new duphead. Swap xde with duphead to avoid 1459 * adding/removing elements with the same hash. 1460 */ 1461 MPASS(!tmpfs_dirent_dup(xde)); 1462 tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), NULL, NULL, 0, 1463 &nde); 1464 /* *nde = *xde; XXX gcc 4.2.1 may generate invalid code. */ 1465 memcpy(nde, xde, sizeof(*xde)); 1466 xde->td_cookie |= TMPFS_DIRCOOKIE_DUPHEAD; 1467 LIST_INIT(&xde->ud.td_duphead); 1468 xde->td_namelen = 0; 1469 xde->td_node = NULL; 1470 tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, nde); 1471 tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); 1472 } 1473 dnode->tn_size += sizeof(struct tmpfs_dirent); 1474 dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; 1475 dnode->tn_accessed = true; 1476 tmpfs_update(vp); 1477 } 1478 1479 /* 1480 * Detaches the directory entry de from the directory represented by vp. 1481 * Note that this does not change the link count of the node pointed by 1482 * the directory entry, as this is done by tmpfs_free_dirent. 
1483 */ 1484 void 1485 tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de) 1486 { 1487 struct tmpfs_mount *tmp; 1488 struct tmpfs_dir *head; 1489 struct tmpfs_node *dnode; 1490 struct tmpfs_dirent *xde; 1491 1492 ASSERT_VOP_ELOCKED(vp, __func__); 1493 1494 dnode = VP_TO_TMPFS_DIR(vp); 1495 head = &dnode->tn_dir.tn_dirhead; 1496 dnode->tn_dir.tn_readdir_lastn = 0; 1497 dnode->tn_dir.tn_readdir_lastp = NULL; 1498 1499 if (tmpfs_dirent_dup(de)) { 1500 /* Remove duphead if de was last entry. */ 1501 if (LIST_NEXT(de, uh.td_dup.entries) == NULL) { 1502 xde = tmpfs_dir_xlookup_hash(dnode, de->td_hash); 1503 MPASS(tmpfs_dirent_duphead(xde)); 1504 } else 1505 xde = NULL; 1506 LIST_REMOVE(de, uh.td_dup.entries); 1507 LIST_REMOVE(de, uh.td_dup.index_entries); 1508 if (xde != NULL) { 1509 if (LIST_EMPTY(&xde->ud.td_duphead)) { 1510 RB_REMOVE(tmpfs_dir, head, xde); 1511 tmp = VFS_TO_TMPFS(vp->v_mount); 1512 MPASS(xde->td_node == NULL); 1513 tmpfs_free_dirent(tmp, xde); 1514 } 1515 } 1516 de->td_cookie = de->td_hash; 1517 } else 1518 RB_REMOVE(tmpfs_dir, head, de); 1519 1520 dnode->tn_size -= sizeof(struct tmpfs_dirent); 1521 dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; 1522 dnode->tn_accessed = true; 1523 tmpfs_update(vp); 1524 } 1525 1526 void 1527 tmpfs_dir_destroy(struct tmpfs_mount *tmp, struct tmpfs_node *dnode) 1528 { 1529 struct tmpfs_dirent *de, *dde, *nde; 1530 1531 RB_FOREACH_SAFE(de, tmpfs_dir, &dnode->tn_dir.tn_dirhead, nde) { 1532 RB_REMOVE(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); 1533 /* Node may already be destroyed. */ 1534 de->td_node = NULL; 1535 if (tmpfs_dirent_duphead(de)) { 1536 while ((dde = LIST_FIRST(&de->ud.td_duphead)) != NULL) { 1537 LIST_REMOVE(dde, uh.td_dup.entries); 1538 dde->td_node = NULL; 1539 tmpfs_free_dirent(tmp, dde); 1540 } 1541 } 1542 tmpfs_free_dirent(tmp, de); 1543 } 1544 } 1545 1546 /* 1547 * Helper function for tmpfs_readdir. Creates a '.' 
entry for the given 1548 * directory and returns it in the uio space. The function returns 0 1549 * on success, -1 if there was not enough space in the uio structure to 1550 * hold the directory entry or an appropriate error code if another 1551 * error happens. 1552 */ 1553 static int 1554 tmpfs_dir_getdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node, 1555 struct uio *uio) 1556 { 1557 int error; 1558 struct dirent dent; 1559 1560 TMPFS_VALIDATE_DIR(node); 1561 MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOT); 1562 1563 dent.d_fileno = node->tn_id; 1564 dent.d_off = TMPFS_DIRCOOKIE_DOTDOT; 1565 dent.d_type = DT_DIR; 1566 dent.d_namlen = 1; 1567 dent.d_name[0] = '.'; 1568 dent.d_reclen = GENERIC_DIRSIZ(&dent); 1569 dirent_terminate(&dent); 1570 1571 if (dent.d_reclen > uio->uio_resid) 1572 error = EJUSTRETURN; 1573 else 1574 error = uiomove(&dent, dent.d_reclen, uio); 1575 1576 tmpfs_set_accessed(tm, node); 1577 1578 return (error); 1579 } 1580 1581 /* 1582 * Helper function for tmpfs_readdir. Creates a '..' entry for the given 1583 * directory and returns it in the uio space. The function returns 0 1584 * on success, -1 if there was not enough space in the uio structure to 1585 * hold the directory entry or an appropriate error code if another 1586 * error happens. 1587 */ 1588 static int 1589 tmpfs_dir_getdotdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node, 1590 struct uio *uio, off_t next) 1591 { 1592 struct tmpfs_node *parent; 1593 struct dirent dent; 1594 int error; 1595 1596 TMPFS_VALIDATE_DIR(node); 1597 MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT); 1598 1599 /* 1600 * Return ENOENT if the current node is already removed. 
1601 */ 1602 TMPFS_ASSERT_LOCKED(node); 1603 parent = node->tn_dir.tn_parent; 1604 if (parent == NULL) 1605 return (ENOENT); 1606 1607 dent.d_fileno = parent->tn_id; 1608 dent.d_off = next; 1609 dent.d_type = DT_DIR; 1610 dent.d_namlen = 2; 1611 dent.d_name[0] = '.'; 1612 dent.d_name[1] = '.'; 1613 dent.d_reclen = GENERIC_DIRSIZ(&dent); 1614 dirent_terminate(&dent); 1615 1616 if (dent.d_reclen > uio->uio_resid) 1617 error = EJUSTRETURN; 1618 else 1619 error = uiomove(&dent, dent.d_reclen, uio); 1620 1621 tmpfs_set_accessed(tm, node); 1622 1623 return (error); 1624 } 1625 1626 /* 1627 * Helper function for tmpfs_readdir. Returns as much directory entries 1628 * as can fit in the uio space. The read starts at uio->uio_offset. 1629 * The function returns 0 on success, -1 if there was not enough space 1630 * in the uio structure to hold the directory entry or an appropriate 1631 * error code if another error happens. 1632 */ 1633 int 1634 tmpfs_dir_getdents(struct tmpfs_mount *tm, struct tmpfs_node *node, 1635 struct uio *uio, int maxcookies, uint64_t *cookies, int *ncookies) 1636 { 1637 struct tmpfs_dir_cursor dc; 1638 struct tmpfs_dirent *de, *nde; 1639 off_t off; 1640 int error; 1641 1642 TMPFS_VALIDATE_DIR(node); 1643 1644 off = 0; 1645 1646 /* 1647 * Lookup the node from the current offset. The starting offset of 1648 * 0 will lookup both '.' and '..', and then the first real entry, 1649 * or EOF if there are none. Then find all entries for the dir that 1650 * fit into the buffer. Once no more entries are found (de == NULL), 1651 * the offset is set to TMPFS_DIRCOOKIE_EOF, which will cause the next 1652 * call to return 0. 
1653 */ 1654 switch (uio->uio_offset) { 1655 case TMPFS_DIRCOOKIE_DOT: 1656 error = tmpfs_dir_getdotdent(tm, node, uio); 1657 if (error != 0) 1658 return (error); 1659 uio->uio_offset = off = TMPFS_DIRCOOKIE_DOTDOT; 1660 if (cookies != NULL) 1661 cookies[(*ncookies)++] = off; 1662 /* FALLTHROUGH */ 1663 case TMPFS_DIRCOOKIE_DOTDOT: 1664 de = tmpfs_dir_first(node, &dc); 1665 off = tmpfs_dirent_cookie(de); 1666 error = tmpfs_dir_getdotdotdent(tm, node, uio, off); 1667 if (error != 0) 1668 return (error); 1669 uio->uio_offset = off; 1670 if (cookies != NULL) 1671 cookies[(*ncookies)++] = off; 1672 /* EOF. */ 1673 if (de == NULL) 1674 return (0); 1675 break; 1676 case TMPFS_DIRCOOKIE_EOF: 1677 return (0); 1678 default: 1679 de = tmpfs_dir_lookup_cookie(node, uio->uio_offset, &dc); 1680 if (de == NULL) 1681 return (EINVAL); 1682 if (cookies != NULL) 1683 off = tmpfs_dirent_cookie(de); 1684 } 1685 1686 /* 1687 * Read as much entries as possible; i.e., until we reach the end of the 1688 * directory or we exhaust uio space. 1689 */ 1690 do { 1691 struct dirent d; 1692 1693 /* 1694 * Create a dirent structure representing the current tmpfs_node 1695 * and fill it. 
1696 */ 1697 if (de->td_node == NULL) { 1698 d.d_fileno = 1; 1699 d.d_type = DT_WHT; 1700 } else { 1701 d.d_fileno = de->td_node->tn_id; 1702 switch (de->td_node->tn_type) { 1703 case VBLK: 1704 d.d_type = DT_BLK; 1705 break; 1706 1707 case VCHR: 1708 d.d_type = DT_CHR; 1709 break; 1710 1711 case VDIR: 1712 d.d_type = DT_DIR; 1713 break; 1714 1715 case VFIFO: 1716 d.d_type = DT_FIFO; 1717 break; 1718 1719 case VLNK: 1720 d.d_type = DT_LNK; 1721 break; 1722 1723 case VREG: 1724 d.d_type = DT_REG; 1725 break; 1726 1727 case VSOCK: 1728 d.d_type = DT_SOCK; 1729 break; 1730 1731 default: 1732 panic("tmpfs_dir_getdents: type %p %d", 1733 de->td_node, (int)de->td_node->tn_type); 1734 } 1735 } 1736 d.d_namlen = de->td_namelen; 1737 MPASS(de->td_namelen < sizeof(d.d_name)); 1738 (void)memcpy(d.d_name, de->ud.td_name, de->td_namelen); 1739 d.d_reclen = GENERIC_DIRSIZ(&d); 1740 1741 /* 1742 * Stop reading if the directory entry we are treating is bigger 1743 * than the amount of data that can be returned. 1744 */ 1745 if (d.d_reclen > uio->uio_resid) { 1746 error = EJUSTRETURN; 1747 break; 1748 } 1749 1750 nde = tmpfs_dir_next(node, &dc); 1751 d.d_off = tmpfs_dirent_cookie(nde); 1752 dirent_terminate(&d); 1753 1754 /* 1755 * Copy the new dirent structure into the output buffer and 1756 * advance pointers. 1757 */ 1758 error = uiomove(&d, d.d_reclen, uio); 1759 if (error == 0) { 1760 de = nde; 1761 if (cookies != NULL) { 1762 off = tmpfs_dirent_cookie(de); 1763 MPASS(*ncookies < maxcookies); 1764 cookies[(*ncookies)++] = off; 1765 } 1766 } 1767 } while (error == 0 && uio->uio_resid > 0 && de != NULL); 1768 1769 /* Skip setting off when using cookies as it is already done above. */ 1770 if (cookies == NULL) 1771 off = tmpfs_dirent_cookie(de); 1772 1773 /* Update the offset and cache. 
*/ 1774 uio->uio_offset = off; 1775 node->tn_dir.tn_readdir_lastn = off; 1776 node->tn_dir.tn_readdir_lastp = de; 1777 1778 tmpfs_set_accessed(tm, node); 1779 return (error); 1780 } 1781 1782 int 1783 tmpfs_dir_whiteout_add(struct vnode *dvp, struct componentname *cnp) 1784 { 1785 struct tmpfs_dirent *de; 1786 int error; 1787 1788 error = tmpfs_alloc_dirent(VFS_TO_TMPFS(dvp->v_mount), NULL, 1789 cnp->cn_nameptr, cnp->cn_namelen, &de); 1790 if (error != 0) 1791 return (error); 1792 tmpfs_dir_attach(dvp, de); 1793 return (0); 1794 } 1795 1796 void 1797 tmpfs_dir_whiteout_remove(struct vnode *dvp, struct componentname *cnp) 1798 { 1799 struct tmpfs_dirent *de; 1800 1801 de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp); 1802 MPASS(de != NULL && de->td_node == NULL); 1803 tmpfs_dir_detach(dvp, de); 1804 tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de); 1805 } 1806 1807 /* 1808 * Resizes the aobj associated with the regular file pointed to by 'vp' to the 1809 * size 'newsize'. 'vp' must point to a vnode that represents a regular file. 1810 * 'newsize' must be positive. 1811 * 1812 * Returns zero on success or an appropriate error code on failure. 1813 */ 1814 int 1815 tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr) 1816 { 1817 struct tmpfs_node *node; 1818 vm_object_t uobj; 1819 vm_pindex_t idx, newpages, oldpages; 1820 off_t oldsize; 1821 int base, error; 1822 1823 MPASS(vp->v_type == VREG); 1824 MPASS(newsize >= 0); 1825 1826 node = VP_TO_TMPFS_NODE(vp); 1827 uobj = node->tn_reg.tn_aobj; 1828 1829 /* 1830 * Convert the old and new sizes to the number of pages needed to 1831 * store them. It may happen that we do not need to do anything 1832 * because the last allocated page can accommodate the change on 1833 * its own. 
1834 */ 1835 oldsize = node->tn_size; 1836 oldpages = OFF_TO_IDX(oldsize + PAGE_MASK); 1837 MPASS(oldpages == uobj->size); 1838 newpages = OFF_TO_IDX(newsize + PAGE_MASK); 1839 1840 if (__predict_true(newpages == oldpages && newsize >= oldsize)) { 1841 node->tn_size = newsize; 1842 return (0); 1843 } 1844 1845 VM_OBJECT_WLOCK(uobj); 1846 if (newsize < oldsize) { 1847 /* 1848 * Zero the truncated part of the last page. 1849 */ 1850 base = newsize & PAGE_MASK; 1851 if (base != 0) { 1852 idx = OFF_TO_IDX(newsize); 1853 error = tmpfs_partial_page_invalidate(uobj, idx, base, 1854 PAGE_SIZE, ignerr); 1855 if (error != 0) { 1856 VM_OBJECT_WUNLOCK(uobj); 1857 return (error); 1858 } 1859 } 1860 1861 /* 1862 * Release any swap space and free any whole pages. 1863 */ 1864 if (newpages < oldpages) 1865 vm_object_page_remove(uobj, newpages, 0, 0); 1866 } 1867 uobj->size = newpages; 1868 VM_OBJECT_WUNLOCK(uobj); 1869 1870 node->tn_size = newsize; 1871 return (0); 1872 } 1873 1874 /* 1875 * Punch hole in the aobj associated with the regular file pointed to by 'vp'. 1876 * Requests completely beyond the end-of-file are converted to no-op. 1877 * 1878 * Returns 0 on success or error code from tmpfs_partial_page_invalidate() on 1879 * failure. 
1880 */ 1881 int 1882 tmpfs_reg_punch_hole(struct vnode *vp, off_t *offset, off_t *length) 1883 { 1884 struct tmpfs_node *node; 1885 vm_object_t object; 1886 vm_pindex_t pistart, pi, piend; 1887 int startofs, endofs, end; 1888 off_t off, len; 1889 int error; 1890 1891 KASSERT(*length <= OFF_MAX - *offset, ("%s: offset + length overflows", 1892 __func__)); 1893 node = VP_TO_TMPFS_NODE(vp); 1894 KASSERT(node->tn_type == VREG, ("%s: node is not regular file", 1895 __func__)); 1896 object = node->tn_reg.tn_aobj; 1897 off = *offset; 1898 len = omin(node->tn_size - off, *length); 1899 startofs = off & PAGE_MASK; 1900 endofs = (off + len) & PAGE_MASK; 1901 pistart = OFF_TO_IDX(off); 1902 piend = OFF_TO_IDX(off + len); 1903 pi = OFF_TO_IDX((vm_ooffset_t)off + PAGE_MASK); 1904 error = 0; 1905 1906 /* Handle the case when offset is on or beyond file size. */ 1907 if (len <= 0) { 1908 *length = 0; 1909 return (0); 1910 } 1911 1912 VM_OBJECT_WLOCK(object); 1913 1914 /* 1915 * If there is a partial page at the beginning of the hole-punching 1916 * request, fill the partial page with zeroes. 1917 */ 1918 if (startofs != 0) { 1919 end = pistart != piend ? PAGE_SIZE : endofs; 1920 error = tmpfs_partial_page_invalidate(object, pistart, startofs, 1921 end, FALSE); 1922 if (error != 0) 1923 goto out; 1924 off += end - startofs; 1925 len -= end - startofs; 1926 } 1927 1928 /* 1929 * Toss away the full pages in the affected area. 1930 */ 1931 if (pi < piend) { 1932 vm_object_page_remove(object, pi, piend, 0); 1933 off += IDX_TO_OFF(piend - pi); 1934 len -= IDX_TO_OFF(piend - pi); 1935 } 1936 1937 /* 1938 * If there is a partial page at the end of the hole-punching request, 1939 * fill the partial page with zeroes. 
1940 */ 1941 if (endofs != 0 && pistart != piend) { 1942 error = tmpfs_partial_page_invalidate(object, piend, 0, endofs, 1943 FALSE); 1944 if (error != 0) 1945 goto out; 1946 off += endofs; 1947 len -= endofs; 1948 } 1949 1950 out: 1951 VM_OBJECT_WUNLOCK(object); 1952 *offset = off; 1953 *length = len; 1954 return (error); 1955 } 1956 1957 void 1958 tmpfs_check_mtime(struct vnode *vp) 1959 { 1960 struct tmpfs_node *node; 1961 struct vm_object *obj; 1962 1963 ASSERT_VOP_ELOCKED(vp, "check_mtime"); 1964 if (vp->v_type != VREG) 1965 return; 1966 obj = vp->v_object; 1967 KASSERT(obj->type == tmpfs_pager_type && 1968 (obj->flags & (OBJ_SWAP | OBJ_TMPFS)) == 1969 (OBJ_SWAP | OBJ_TMPFS), ("non-tmpfs obj")); 1970 /* unlocked read */ 1971 if (obj->generation != obj->cleangeneration) { 1972 VM_OBJECT_WLOCK(obj); 1973 if (obj->generation != obj->cleangeneration) { 1974 obj->cleangeneration = obj->generation; 1975 node = VP_TO_TMPFS_NODE(vp); 1976 node->tn_status |= TMPFS_NODE_MODIFIED | 1977 TMPFS_NODE_CHANGED; 1978 } 1979 VM_OBJECT_WUNLOCK(obj); 1980 } 1981 } 1982 1983 /* 1984 * Change flags of the given vnode. 1985 * Caller should execute tmpfs_update on vp after a successful execution. 1986 * The vnode must be locked on entry and remain locked on exit. 1987 */ 1988 int 1989 tmpfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred, 1990 struct thread *td) 1991 { 1992 int error; 1993 struct tmpfs_node *node; 1994 1995 ASSERT_VOP_ELOCKED(vp, "chflags"); 1996 1997 node = VP_TO_TMPFS_NODE(vp); 1998 1999 if ((flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK | 2000 UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | 2001 UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE | 2002 UF_SPARSE | UF_SYSTEM)) != 0) 2003 return (EOPNOTSUPP); 2004 2005 /* Disallow this operation if the file system is mounted read-only. 
*/ 2006 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2007 return (EROFS); 2008 2009 /* 2010 * Callers may only modify the file flags on objects they 2011 * have VADMIN rights for. 2012 */ 2013 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2014 return (error); 2015 /* 2016 * Unprivileged processes are not permitted to unset system 2017 * flags, or modify flags if any system flags are set. 2018 */ 2019 if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) { 2020 if (node->tn_flags & 2021 (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { 2022 error = securelevel_gt(cred, 0); 2023 if (error) 2024 return (error); 2025 } 2026 } else { 2027 if (node->tn_flags & 2028 (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || 2029 ((flags ^ node->tn_flags) & SF_SETTABLE)) 2030 return (EPERM); 2031 } 2032 node->tn_flags = flags; 2033 node->tn_status |= TMPFS_NODE_CHANGED; 2034 2035 ASSERT_VOP_ELOCKED(vp, "chflags2"); 2036 2037 return (0); 2038 } 2039 2040 /* 2041 * Change access mode on the given vnode. 2042 * Caller should execute tmpfs_update on vp after a successful execution. 2043 * The vnode must be locked on entry and remain locked on exit. 2044 */ 2045 int 2046 tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, 2047 struct thread *td) 2048 { 2049 int error; 2050 struct tmpfs_node *node; 2051 mode_t newmode; 2052 2053 ASSERT_VOP_ELOCKED(vp, "chmod"); 2054 ASSERT_VOP_IN_SEQC(vp); 2055 2056 node = VP_TO_TMPFS_NODE(vp); 2057 2058 /* Disallow this operation if the file system is mounted read-only. */ 2059 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2060 return (EROFS); 2061 2062 /* Immutable or append-only files cannot be modified, either. */ 2063 if (node->tn_flags & (IMMUTABLE | APPEND)) 2064 return (EPERM); 2065 2066 /* 2067 * To modify the permissions on a file, must possess VADMIN 2068 * for that file. 
2069 */ 2070 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2071 return (error); 2072 2073 /* 2074 * Privileged processes may set the sticky bit on non-directories, 2075 * as well as set the setgid bit on a file with a group that the 2076 * process is not a member of. 2077 */ 2078 if (vp->v_type != VDIR && (mode & S_ISTXT)) { 2079 if (priv_check_cred(cred, PRIV_VFS_STICKYFILE)) 2080 return (EFTYPE); 2081 } 2082 if (!groupmember(node->tn_gid, cred) && (mode & S_ISGID)) { 2083 error = priv_check_cred(cred, PRIV_VFS_SETGID); 2084 if (error) 2085 return (error); 2086 } 2087 2088 newmode = node->tn_mode & ~ALLPERMS; 2089 newmode |= mode & ALLPERMS; 2090 atomic_store_short(&node->tn_mode, newmode); 2091 2092 node->tn_status |= TMPFS_NODE_CHANGED; 2093 2094 ASSERT_VOP_ELOCKED(vp, "chmod2"); 2095 2096 return (0); 2097 } 2098 2099 /* 2100 * Change ownership of the given vnode. At least one of uid or gid must 2101 * be different than VNOVAL. If one is set to that value, the attribute 2102 * is unchanged. 2103 * Caller should execute tmpfs_update on vp after a successful execution. 2104 * The vnode must be locked on entry and remain locked on exit. 2105 */ 2106 int 2107 tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, 2108 struct thread *td) 2109 { 2110 int error; 2111 struct tmpfs_node *node; 2112 uid_t ouid; 2113 gid_t ogid; 2114 mode_t newmode; 2115 2116 ASSERT_VOP_ELOCKED(vp, "chown"); 2117 ASSERT_VOP_IN_SEQC(vp); 2118 2119 node = VP_TO_TMPFS_NODE(vp); 2120 2121 /* Assign default values if they are unknown. */ 2122 MPASS(uid != VNOVAL || gid != VNOVAL); 2123 if (uid == VNOVAL) 2124 uid = node->tn_uid; 2125 if (gid == VNOVAL) 2126 gid = node->tn_gid; 2127 MPASS(uid != VNOVAL && gid != VNOVAL); 2128 2129 /* Disallow this operation if the file system is mounted read-only. */ 2130 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2131 return (EROFS); 2132 2133 /* Immutable or append-only files cannot be modified, either. 
*/ 2134 if (node->tn_flags & (IMMUTABLE | APPEND)) 2135 return (EPERM); 2136 2137 /* 2138 * To modify the ownership of a file, must possess VADMIN for that 2139 * file. 2140 */ 2141 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2142 return (error); 2143 2144 /* 2145 * To change the owner of a file, or change the group of a file to a 2146 * group of which we are not a member, the caller must have 2147 * privilege. 2148 */ 2149 if ((uid != node->tn_uid || 2150 (gid != node->tn_gid && !groupmember(gid, cred))) && 2151 (error = priv_check_cred(cred, PRIV_VFS_CHOWN))) 2152 return (error); 2153 2154 ogid = node->tn_gid; 2155 ouid = node->tn_uid; 2156 2157 node->tn_uid = uid; 2158 node->tn_gid = gid; 2159 2160 node->tn_status |= TMPFS_NODE_CHANGED; 2161 2162 if ((node->tn_mode & (S_ISUID | S_ISGID)) != 0 && 2163 (ouid != uid || ogid != gid)) { 2164 if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) { 2165 newmode = node->tn_mode & ~(S_ISUID | S_ISGID); 2166 atomic_store_short(&node->tn_mode, newmode); 2167 } 2168 } 2169 2170 ASSERT_VOP_ELOCKED(vp, "chown2"); 2171 2172 return (0); 2173 } 2174 2175 /* 2176 * Change size of the given vnode. 2177 * Caller should execute tmpfs_update on vp after a successful execution. 2178 * The vnode must be locked on entry and remain locked on exit. 2179 */ 2180 int 2181 tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred, 2182 struct thread *td) 2183 { 2184 int error; 2185 struct tmpfs_node *node; 2186 2187 ASSERT_VOP_ELOCKED(vp, "chsize"); 2188 2189 node = VP_TO_TMPFS_NODE(vp); 2190 2191 /* Decide whether this is a valid operation based on the file type. 
*/ 2192 error = 0; 2193 switch (vp->v_type) { 2194 case VDIR: 2195 return (EISDIR); 2196 2197 case VREG: 2198 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2199 return (EROFS); 2200 break; 2201 2202 case VBLK: 2203 /* FALLTHROUGH */ 2204 case VCHR: 2205 /* FALLTHROUGH */ 2206 case VFIFO: 2207 /* 2208 * Allow modifications of special files even if in the file 2209 * system is mounted read-only (we are not modifying the 2210 * files themselves, but the objects they represent). 2211 */ 2212 return (0); 2213 2214 default: 2215 /* Anything else is unsupported. */ 2216 return (EOPNOTSUPP); 2217 } 2218 2219 /* Immutable or append-only files cannot be modified, either. */ 2220 if (node->tn_flags & (IMMUTABLE | APPEND)) 2221 return (EPERM); 2222 2223 error = vn_rlimit_trunc(size, td); 2224 if (error != 0) 2225 return (error); 2226 2227 error = tmpfs_truncate(vp, size); 2228 /* 2229 * tmpfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents 2230 * for us, as will update tn_status; no need to do that here. 2231 */ 2232 2233 ASSERT_VOP_ELOCKED(vp, "chsize2"); 2234 2235 return (error); 2236 } 2237 2238 /* 2239 * Change access and modification times of the given vnode. 2240 * Caller should execute tmpfs_update on vp after a successful execution. 2241 * The vnode must be locked on entry and remain locked on exit. 2242 */ 2243 int 2244 tmpfs_chtimes(struct vnode *vp, struct vattr *vap, 2245 struct ucred *cred, struct thread *td) 2246 { 2247 int error; 2248 struct tmpfs_node *node; 2249 2250 ASSERT_VOP_ELOCKED(vp, "chtimes"); 2251 2252 node = VP_TO_TMPFS_NODE(vp); 2253 2254 /* Disallow this operation if the file system is mounted read-only. */ 2255 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2256 return (EROFS); 2257 2258 /* Immutable or append-only files cannot be modified, either. 
*/ 2259 if (node->tn_flags & (IMMUTABLE | APPEND)) 2260 return (EPERM); 2261 2262 error = vn_utimes_perm(vp, vap, cred, td); 2263 if (error != 0) 2264 return (error); 2265 2266 if (vap->va_atime.tv_sec != VNOVAL) 2267 node->tn_accessed = true; 2268 if (vap->va_mtime.tv_sec != VNOVAL) 2269 node->tn_status |= TMPFS_NODE_MODIFIED; 2270 if (vap->va_birthtime.tv_sec != VNOVAL) 2271 node->tn_status |= TMPFS_NODE_MODIFIED; 2272 tmpfs_itimes(vp, &vap->va_atime, &vap->va_mtime); 2273 if (vap->va_birthtime.tv_sec != VNOVAL) 2274 node->tn_birthtime = vap->va_birthtime; 2275 ASSERT_VOP_ELOCKED(vp, "chtimes2"); 2276 2277 return (0); 2278 } 2279 2280 void 2281 tmpfs_set_status(struct tmpfs_mount *tm, struct tmpfs_node *node, int status) 2282 { 2283 2284 if ((node->tn_status & status) == status || tm->tm_ronly) 2285 return; 2286 TMPFS_NODE_LOCK(node); 2287 node->tn_status |= status; 2288 TMPFS_NODE_UNLOCK(node); 2289 } 2290 2291 void 2292 tmpfs_set_accessed(struct tmpfs_mount *tm, struct tmpfs_node *node) 2293 { 2294 if (node->tn_accessed || tm->tm_ronly) 2295 return; 2296 atomic_store_8(&node->tn_accessed, true); 2297 } 2298 2299 /* Sync timestamps */ 2300 void 2301 tmpfs_itimes(struct vnode *vp, const struct timespec *acc, 2302 const struct timespec *mod) 2303 { 2304 struct tmpfs_node *node; 2305 struct timespec now; 2306 2307 ASSERT_VOP_LOCKED(vp, "tmpfs_itimes"); 2308 node = VP_TO_TMPFS_NODE(vp); 2309 2310 if (!node->tn_accessed && 2311 (node->tn_status & (TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED)) == 0) 2312 return; 2313 2314 vfs_timestamp(&now); 2315 TMPFS_NODE_LOCK(node); 2316 if (node->tn_accessed) { 2317 if (acc == NULL) 2318 acc = &now; 2319 node->tn_atime = *acc; 2320 } 2321 if (node->tn_status & TMPFS_NODE_MODIFIED) { 2322 if (mod == NULL) 2323 mod = &now; 2324 node->tn_mtime = *mod; 2325 } 2326 if (node->tn_status & TMPFS_NODE_CHANGED) 2327 node->tn_ctime = now; 2328 node->tn_status &= ~(TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED); 2329 node->tn_accessed = false; 2330 
TMPFS_NODE_UNLOCK(node); 2331 2332 /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ 2333 random_harvest_queue(node, sizeof(*node), RANDOM_FS_ATIME); 2334 } 2335 2336 int 2337 tmpfs_truncate(struct vnode *vp, off_t length) 2338 { 2339 struct tmpfs_node *node; 2340 int error; 2341 2342 if (length < 0) 2343 return (EINVAL); 2344 if (length > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) 2345 return (EFBIG); 2346 2347 node = VP_TO_TMPFS_NODE(vp); 2348 error = node->tn_size == length ? 0 : tmpfs_reg_resize(vp, length, 2349 FALSE); 2350 if (error == 0) 2351 node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; 2352 tmpfs_update(vp); 2353 2354 return (error); 2355 } 2356 2357 static __inline int 2358 tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b) 2359 { 2360 if (a->td_hash > b->td_hash) 2361 return (1); 2362 else if (a->td_hash < b->td_hash) 2363 return (-1); 2364 return (0); 2365 } 2366 2367 RB_GENERATE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); 2368