1 /* $NetBSD: tmpfs_subr.c,v 1.35 2007/07/09 21:10:50 ad Exp $ */ 2 3 /*- 4 * SPDX-License-Identifier: BSD-2-Clause 5 * 6 * Copyright (c) 2005 The NetBSD Foundation, Inc. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to The NetBSD Foundation 10 * by Julio M. Merino Vidal, developed as part of Google's Summer of Code 11 * 2005 program. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 24 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 26 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 * POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 /* 36 * Efficient memory file system supporting functions. 
37 */ 38 #include <sys/cdefs.h> 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/dirent.h> 42 #include <sys/fnv_hash.h> 43 #include <sys/lock.h> 44 #include <sys/limits.h> 45 #include <sys/mount.h> 46 #include <sys/namei.h> 47 #include <sys/priv.h> 48 #include <sys/proc.h> 49 #include <sys/random.h> 50 #include <sys/refcount.h> 51 #include <sys/rwlock.h> 52 #include <sys/smr.h> 53 #include <sys/stat.h> 54 #include <sys/sysctl.h> 55 #include <sys/user.h> 56 #include <sys/vnode.h> 57 #include <sys/vmmeter.h> 58 59 #include <vm/vm.h> 60 #include <vm/vm_param.h> 61 #include <vm/vm_object.h> 62 #include <vm/vm_page.h> 63 #include <vm/vm_pageout.h> 64 #include <vm/vm_pager.h> 65 #include <vm/vm_extern.h> 66 #include <vm/swap_pager.h> 67 68 #include <fs/tmpfs/tmpfs.h> 69 #include <fs/tmpfs/tmpfs_fifoops.h> 70 #include <fs/tmpfs/tmpfs_vnops.h> 71 72 SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 73 "tmpfs file system"); 74 75 static long tmpfs_pages_reserved = TMPFS_PAGES_MINRESERVED; 76 77 MALLOC_DEFINE(M_TMPFSDIR, "tmpfs dir", "tmpfs dirent structure"); 78 static uma_zone_t tmpfs_node_pool; 79 VFS_SMR_DECLARE; 80 81 int tmpfs_pager_type = -1; 82 83 static vm_object_t 84 tmpfs_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, 85 vm_ooffset_t offset, struct ucred *cred) 86 { 87 vm_object_t object; 88 89 MPASS(handle == NULL); 90 MPASS(offset == 0); 91 object = vm_object_allocate_dyn(tmpfs_pager_type, size, 92 OBJ_COLORED | OBJ_SWAP); 93 if (!swap_pager_init_object(object, NULL, NULL, size, 0)) { 94 vm_object_deallocate(object); 95 object = NULL; 96 } 97 return (object); 98 } 99 100 /* 101 * Make sure tmpfs vnodes with writable mappings can be found on the lazy list. 102 * 103 * This allows for periodic mtime updates while only scanning vnodes which are 104 * plausibly dirty, see tmpfs_update_mtime_lazy. 
105 */ 106 static void 107 tmpfs_pager_writecount_recalc(vm_object_t object, vm_offset_t old, 108 vm_offset_t new) 109 { 110 struct vnode *vp; 111 112 VM_OBJECT_ASSERT_WLOCKED(object); 113 114 vp = VM_TO_TMPFS_VP(object); 115 116 /* 117 * Forced unmount? 118 */ 119 if (vp == NULL) { 120 KASSERT((object->flags & OBJ_TMPFS_VREF) == 0, 121 ("object %p with OBJ_TMPFS_VREF but without vnode", 122 object)); 123 VM_OBJECT_WUNLOCK(object); 124 return; 125 } 126 127 if (old == 0) { 128 VNASSERT((object->flags & OBJ_TMPFS_VREF) == 0, vp, 129 ("object without writable mappings has a reference")); 130 VNPASS(vp->v_usecount > 0, vp); 131 } else { 132 VNASSERT((object->flags & OBJ_TMPFS_VREF) != 0, vp, 133 ("object with writable mappings does not " 134 "have a reference")); 135 } 136 137 if (old == new) { 138 VM_OBJECT_WUNLOCK(object); 139 return; 140 } 141 142 if (new == 0) { 143 vm_object_clear_flag(object, OBJ_TMPFS_VREF); 144 VM_OBJECT_WUNLOCK(object); 145 vrele(vp); 146 } else { 147 if ((object->flags & OBJ_TMPFS_VREF) == 0) { 148 vref(vp); 149 vlazy(vp); 150 vm_object_set_flag(object, OBJ_TMPFS_VREF); 151 } 152 VM_OBJECT_WUNLOCK(object); 153 } 154 } 155 156 static void 157 tmpfs_pager_update_writecount(vm_object_t object, vm_offset_t start, 158 vm_offset_t end) 159 { 160 vm_offset_t new, old; 161 162 VM_OBJECT_WLOCK(object); 163 KASSERT((object->flags & OBJ_ANON) == 0, 164 ("%s: object %p with OBJ_ANON", __func__, object)); 165 old = object->un_pager.swp.writemappings; 166 object->un_pager.swp.writemappings += (vm_ooffset_t)end - start; 167 new = object->un_pager.swp.writemappings; 168 tmpfs_pager_writecount_recalc(object, old, new); 169 VM_OBJECT_ASSERT_UNLOCKED(object); 170 } 171 172 static void 173 tmpfs_pager_release_writecount(vm_object_t object, vm_offset_t start, 174 vm_offset_t end) 175 { 176 vm_offset_t new, old; 177 178 VM_OBJECT_WLOCK(object); 179 KASSERT((object->flags & OBJ_ANON) == 0, 180 ("%s: object %p with OBJ_ANON", __func__, object)); 181 old = 
object->un_pager.swp.writemappings; 182 object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start; 183 new = object->un_pager.swp.writemappings; 184 tmpfs_pager_writecount_recalc(object, old, new); 185 VM_OBJECT_ASSERT_UNLOCKED(object); 186 } 187 188 static void 189 tmpfs_pager_getvp(vm_object_t object, struct vnode **vpp, bool *vp_heldp) 190 { 191 struct vnode *vp; 192 193 /* 194 * Tmpfs VREG node, which was reclaimed, has tmpfs_pager_type 195 * type. In this case there is no v_writecount to adjust. 196 */ 197 if (vp_heldp != NULL) 198 VM_OBJECT_RLOCK(object); 199 else 200 VM_OBJECT_ASSERT_LOCKED(object); 201 if ((object->flags & OBJ_TMPFS) != 0) { 202 vp = VM_TO_TMPFS_VP(object); 203 if (vp != NULL) { 204 *vpp = vp; 205 if (vp_heldp != NULL) { 206 vhold(vp); 207 *vp_heldp = true; 208 } 209 } 210 } 211 if (vp_heldp != NULL) 212 VM_OBJECT_RUNLOCK(object); 213 } 214 215 static void 216 tmpfs_pager_freespace(vm_object_t obj, vm_pindex_t start, vm_size_t size) 217 { 218 struct tmpfs_node *node; 219 struct tmpfs_mount *tm; 220 vm_size_t c; 221 222 swap_pager_freespace(obj, start, size, &c); 223 if ((obj->flags & OBJ_TMPFS) == 0 || c == 0) 224 return; 225 226 node = obj->un_pager.swp.swp_priv; 227 MPASS(node->tn_type == VREG); 228 tm = node->tn_reg.tn_tmp; 229 230 KASSERT(tm->tm_pages_used >= c, 231 ("tmpfs tm %p pages %jd free %jd", tm, 232 (uintmax_t)tm->tm_pages_used, (uintmax_t)c)); 233 atomic_add_long(&tm->tm_pages_used, -c); 234 KASSERT(node->tn_reg.tn_pages >= c, 235 ("tmpfs node %p pages %jd free %jd", node, 236 (uintmax_t)node->tn_reg.tn_pages, (uintmax_t)c)); 237 node->tn_reg.tn_pages -= c; 238 } 239 240 static void 241 tmpfs_page_inserted(vm_object_t obj, vm_page_t m) 242 { 243 struct tmpfs_node *node; 244 struct tmpfs_mount *tm; 245 246 if ((obj->flags & OBJ_TMPFS) == 0) 247 return; 248 249 node = obj->un_pager.swp.swp_priv; 250 MPASS(node->tn_type == VREG); 251 tm = node->tn_reg.tn_tmp; 252 253 if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) { 254 
atomic_add_long(&tm->tm_pages_used, 1); 255 node->tn_reg.tn_pages += 1; 256 } 257 } 258 259 static void 260 tmpfs_page_removed(vm_object_t obj, vm_page_t m) 261 { 262 struct tmpfs_node *node; 263 struct tmpfs_mount *tm; 264 265 if ((obj->flags & OBJ_TMPFS) == 0) 266 return; 267 268 node = obj->un_pager.swp.swp_priv; 269 MPASS(node->tn_type == VREG); 270 tm = node->tn_reg.tn_tmp; 271 272 if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) { 273 KASSERT(tm->tm_pages_used >= 1, 274 ("tmpfs tm %p pages %jd free 1", tm, 275 (uintmax_t)tm->tm_pages_used)); 276 atomic_add_long(&tm->tm_pages_used, -1); 277 KASSERT(node->tn_reg.tn_pages >= 1, 278 ("tmpfs node %p pages %jd free 1", node, 279 (uintmax_t)node->tn_reg.tn_pages)); 280 node->tn_reg.tn_pages -= 1; 281 } 282 } 283 284 static boolean_t 285 tmpfs_can_alloc_page(vm_object_t obj, vm_pindex_t pindex) 286 { 287 struct tmpfs_mount *tm; 288 289 tm = VM_TO_TMPFS_MP(obj); 290 if (tm == NULL || vm_pager_has_page(obj, pindex, NULL, NULL) || 291 tm->tm_pages_max == 0) 292 return (true); 293 return (tm->tm_pages_max > atomic_load_long(&tm->tm_pages_used)); 294 } 295 296 struct pagerops tmpfs_pager_ops = { 297 .pgo_kvme_type = KVME_TYPE_VNODE, 298 .pgo_alloc = tmpfs_pager_alloc, 299 .pgo_set_writeable_dirty = vm_object_set_writeable_dirty_, 300 .pgo_update_writecount = tmpfs_pager_update_writecount, 301 .pgo_release_writecount = tmpfs_pager_release_writecount, 302 .pgo_mightbedirty = vm_object_mightbedirty_, 303 .pgo_getvp = tmpfs_pager_getvp, 304 .pgo_freespace = tmpfs_pager_freespace, 305 .pgo_page_inserted = tmpfs_page_inserted, 306 .pgo_page_removed = tmpfs_page_removed, 307 .pgo_can_alloc_page = tmpfs_can_alloc_page, 308 }; 309 310 static int 311 tmpfs_node_ctor(void *mem, int size, void *arg, int flags) 312 { 313 struct tmpfs_node *node; 314 315 node = mem; 316 node->tn_gen++; 317 node->tn_size = 0; 318 node->tn_status = 0; 319 node->tn_accessed = false; 320 node->tn_flags = 0; 321 node->tn_links = 0; 322 node->tn_vnode = 
NULL; 323 node->tn_vpstate = 0; 324 return (0); 325 } 326 327 static void 328 tmpfs_node_dtor(void *mem, int size, void *arg) 329 { 330 struct tmpfs_node *node; 331 332 node = mem; 333 node->tn_type = VNON; 334 } 335 336 static int 337 tmpfs_node_init(void *mem, int size, int flags) 338 { 339 struct tmpfs_node *node; 340 341 node = mem; 342 node->tn_id = 0; 343 mtx_init(&node->tn_interlock, "tmpfsni", NULL, MTX_DEF); 344 node->tn_gen = arc4random(); 345 return (0); 346 } 347 348 static void 349 tmpfs_node_fini(void *mem, int size) 350 { 351 struct tmpfs_node *node; 352 353 node = mem; 354 mtx_destroy(&node->tn_interlock); 355 } 356 357 int 358 tmpfs_subr_init(void) 359 { 360 tmpfs_pager_type = vm_pager_alloc_dyn_type(&tmpfs_pager_ops, 361 OBJT_SWAP); 362 if (tmpfs_pager_type == -1) 363 return (EINVAL); 364 tmpfs_node_pool = uma_zcreate("TMPFS node", 365 sizeof(struct tmpfs_node), tmpfs_node_ctor, tmpfs_node_dtor, 366 tmpfs_node_init, tmpfs_node_fini, UMA_ALIGN_PTR, 0); 367 VFS_SMR_ZONE_SET(tmpfs_node_pool); 368 return (0); 369 } 370 371 void 372 tmpfs_subr_uninit(void) 373 { 374 if (tmpfs_pager_type != -1) 375 vm_pager_free_dyn_type(tmpfs_pager_type); 376 tmpfs_pager_type = -1; 377 uma_zdestroy(tmpfs_node_pool); 378 } 379 380 static int 381 sysctl_mem_reserved(SYSCTL_HANDLER_ARGS) 382 { 383 int error; 384 long pages, bytes; 385 386 pages = *(long *)arg1; 387 bytes = pages * PAGE_SIZE; 388 389 error = sysctl_handle_long(oidp, &bytes, 0, req); 390 if (error || !req->newptr) 391 return (error); 392 393 pages = bytes / PAGE_SIZE; 394 if (pages < TMPFS_PAGES_MINRESERVED) 395 return (EINVAL); 396 397 *(long *)arg1 = pages; 398 return (0); 399 } 400 401 SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_reserved, 402 CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &tmpfs_pages_reserved, 0, 403 sysctl_mem_reserved, "L", 404 "Amount of available memory and swap below which tmpfs growth stops"); 405 406 static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a, 407 struct tmpfs_dirent *b); 
408 RB_PROTOTYPE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); 409 410 size_t 411 tmpfs_mem_avail(void) 412 { 413 size_t avail; 414 long reserved; 415 416 avail = swap_pager_avail + vm_free_count(); 417 reserved = atomic_load_long(&tmpfs_pages_reserved); 418 if (__predict_false(avail < reserved)) 419 return (0); 420 return (avail - reserved); 421 } 422 423 size_t 424 tmpfs_pages_used(struct tmpfs_mount *tmp) 425 { 426 const size_t node_size = sizeof(struct tmpfs_node) + 427 sizeof(struct tmpfs_dirent); 428 size_t meta_pages; 429 430 meta_pages = howmany((uintmax_t)tmp->tm_nodes_inuse * node_size, 431 PAGE_SIZE); 432 return (meta_pages + tmp->tm_pages_used); 433 } 434 435 bool 436 tmpfs_pages_check_avail(struct tmpfs_mount *tmp, size_t req_pages) 437 { 438 if (tmpfs_mem_avail() < req_pages) 439 return (false); 440 441 if (tmp->tm_pages_max != ULONG_MAX && 442 tmp->tm_pages_max < req_pages + tmpfs_pages_used(tmp)) 443 return (false); 444 445 return (true); 446 } 447 448 static int 449 tmpfs_partial_page_invalidate(vm_object_t object, vm_pindex_t idx, int base, 450 int end, boolean_t ignerr) 451 { 452 vm_page_t m; 453 int rv, error; 454 455 VM_OBJECT_ASSERT_WLOCKED(object); 456 KASSERT(base >= 0, ("%s: base %d", __func__, base)); 457 KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base, 458 end)); 459 error = 0; 460 461 retry: 462 m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT); 463 if (m != NULL) { 464 MPASS(vm_page_all_valid(m)); 465 } else if (vm_pager_has_page(object, idx, NULL, NULL)) { 466 m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL | 467 VM_ALLOC_WAITFAIL); 468 if (m == NULL) 469 goto retry; 470 vm_object_pip_add(object, 1); 471 VM_OBJECT_WUNLOCK(object); 472 rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); 473 VM_OBJECT_WLOCK(object); 474 vm_object_pip_wakeup(object); 475 if (rv == VM_PAGER_OK) { 476 /* 477 * Since the page was not resident, and therefore not 478 * recently accessed, immediately enqueue it for 479 * 
asynchronous laundering. The current operation is 480 * not regarded as an access. 481 */ 482 vm_page_launder(m); 483 } else { 484 vm_page_free(m); 485 m = NULL; 486 if (!ignerr) 487 error = EIO; 488 } 489 } 490 if (m != NULL) { 491 pmap_zero_page_area(m, base, end - base); 492 vm_page_set_dirty(m); 493 vm_page_xunbusy(m); 494 } 495 496 return (error); 497 } 498 499 void 500 tmpfs_ref_node(struct tmpfs_node *node) 501 { 502 #ifdef INVARIANTS 503 u_int old; 504 505 old = 506 #endif 507 refcount_acquire(&node->tn_refcount); 508 #ifdef INVARIANTS 509 KASSERT(old > 0, ("node %p zero refcount", node)); 510 #endif 511 } 512 513 /* 514 * Allocates a new node of type 'type' inside the 'tmp' mount point, with 515 * its owner set to 'uid', its group to 'gid' and its mode set to 'mode', 516 * using the credentials of the process 'p'. 517 * 518 * If the node type is set to 'VDIR', then the parent parameter must point 519 * to the parent directory of the node being created. It may only be NULL 520 * while allocating the root node. 521 * 522 * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter 523 * specifies the device the node represents. 524 * 525 * If the node type is set to 'VLNK', then the parameter target specifies 526 * the file name of the target file for the symbolic link that is being 527 * created. 528 * 529 * Note that new nodes are retrieved from the available list if it has 530 * items or, if it is empty, from the node pool as long as there is enough 531 * space to create them. 532 * 533 * Returns zero on success or an appropriate error code on failure. 
534 */ 535 int 536 tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, __enum_uint8(vtype) type, 537 uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent, 538 const char *target, dev_t rdev, struct tmpfs_node **node) 539 { 540 struct tmpfs_node *nnode; 541 char *symlink; 542 char symlink_smr; 543 544 /* If the root directory of the 'tmp' file system is not yet 545 * allocated, this must be the request to do it. */ 546 MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR)); 547 548 MPASS((type == VLNK) ^ (target == NULL)); 549 MPASS((type == VBLK || type == VCHR) ^ (rdev == VNOVAL)); 550 551 if (tmp->tm_nodes_inuse >= tmp->tm_nodes_max) 552 return (ENOSPC); 553 if (!tmpfs_pages_check_avail(tmp, 1)) 554 return (ENOSPC); 555 556 if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { 557 /* 558 * When a new tmpfs node is created for fully 559 * constructed mount point, there must be a parent 560 * node, which vnode is locked exclusively. As 561 * consequence, if the unmount is executing in 562 * parallel, vflush() cannot reclaim the parent vnode. 563 * Due to this, the check for MNTK_UNMOUNT flag is not 564 * racy: if we did not see MNTK_UNMOUNT flag, then tmp 565 * cannot be destroyed until node construction is 566 * finished and the parent vnode unlocked. 567 * 568 * Tmpfs does not need to instantiate new nodes during 569 * unmount. 570 */ 571 return (EBUSY); 572 } 573 if ((mp->mnt_kern_flag & MNT_RDONLY) != 0) 574 return (EROFS); 575 576 nnode = uma_zalloc_smr(tmpfs_node_pool, M_WAITOK); 577 578 /* Generic initialization. */ 579 nnode->tn_type = type; 580 vfs_timestamp(&nnode->tn_atime); 581 nnode->tn_birthtime = nnode->tn_ctime = nnode->tn_mtime = 582 nnode->tn_atime; 583 nnode->tn_uid = uid; 584 nnode->tn_gid = gid; 585 nnode->tn_mode = mode; 586 nnode->tn_id = alloc_unr64(&tmp->tm_ino_unr); 587 nnode->tn_refcount = 1; 588 LIST_INIT(&nnode->tn_extattrs); 589 590 /* Type-specific initialization. 
*/ 591 switch (nnode->tn_type) { 592 case VBLK: 593 case VCHR: 594 nnode->tn_rdev = rdev; 595 break; 596 597 case VDIR: 598 RB_INIT(&nnode->tn_dir.tn_dirhead); 599 LIST_INIT(&nnode->tn_dir.tn_dupindex); 600 MPASS(parent != nnode); 601 MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL)); 602 nnode->tn_dir.tn_parent = (parent == NULL) ? nnode : parent; 603 nnode->tn_dir.tn_readdir_lastn = 0; 604 nnode->tn_dir.tn_readdir_lastp = NULL; 605 nnode->tn_links++; 606 TMPFS_NODE_LOCK(nnode->tn_dir.tn_parent); 607 nnode->tn_dir.tn_parent->tn_links++; 608 TMPFS_NODE_UNLOCK(nnode->tn_dir.tn_parent); 609 break; 610 611 case VFIFO: 612 /* FALLTHROUGH */ 613 case VSOCK: 614 break; 615 616 case VLNK: 617 MPASS(strlen(target) < MAXPATHLEN); 618 nnode->tn_size = strlen(target); 619 620 symlink = NULL; 621 if (!tmp->tm_nonc) { 622 symlink = cache_symlink_alloc(nnode->tn_size + 1, 623 M_WAITOK); 624 symlink_smr = true; 625 } 626 if (symlink == NULL) { 627 symlink = malloc(nnode->tn_size + 1, M_TMPFSNAME, 628 M_WAITOK); 629 symlink_smr = false; 630 } 631 memcpy(symlink, target, nnode->tn_size + 1); 632 633 /* 634 * Allow safe symlink resolving for lockless lookup. 635 * tmpfs_fplookup_symlink references this comment. 636 * 637 * 1. nnode is not yet visible to the world 638 * 2. both tn_link_target and tn_link_smr get populated 639 * 3. release fence publishes their content 640 * 4. tn_link_target content is immutable until node 641 * destruction, where the pointer gets set to NULL 642 * 5. tn_link_smr is never changed once set 643 * 644 * As a result it is sufficient to issue load consume 645 * on the node pointer to also get the above content 646 * in a stable manner. Worst case tn_link_smr flag 647 * may be set to true despite being stale, while the 648 * target buffer is already cleared out. 
649 */ 650 atomic_store_ptr(&nnode->tn_link_target, symlink); 651 atomic_store_char((char *)&nnode->tn_link_smr, symlink_smr); 652 atomic_thread_fence_rel(); 653 break; 654 655 case VREG: 656 nnode->tn_reg.tn_aobj = 657 vm_pager_allocate(tmpfs_pager_type, NULL, 0, 658 VM_PROT_DEFAULT, 0, 659 NULL /* XXXKIB - tmpfs needs swap reservation */); 660 nnode->tn_reg.tn_aobj->un_pager.swp.swp_priv = nnode; 661 vm_object_set_flag(nnode->tn_reg.tn_aobj, OBJ_TMPFS); 662 nnode->tn_reg.tn_tmp = tmp; 663 nnode->tn_reg.tn_pages = 0; 664 break; 665 666 default: 667 panic("tmpfs_alloc_node: type %p %d", nnode, 668 (int)nnode->tn_type); 669 } 670 671 TMPFS_LOCK(tmp); 672 LIST_INSERT_HEAD(&tmp->tm_nodes_used, nnode, tn_entries); 673 nnode->tn_attached = true; 674 tmp->tm_nodes_inuse++; 675 tmp->tm_refcount++; 676 TMPFS_UNLOCK(tmp); 677 678 *node = nnode; 679 return (0); 680 } 681 682 /* 683 * Destroys the node pointed to by node from the file system 'tmp'. 684 * If the node references a directory, no entries are allowed. 
685 */ 686 void 687 tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node) 688 { 689 if (refcount_release_if_not_last(&node->tn_refcount)) 690 return; 691 692 TMPFS_LOCK(tmp); 693 TMPFS_NODE_LOCK(node); 694 if (!tmpfs_free_node_locked(tmp, node, false)) { 695 TMPFS_NODE_UNLOCK(node); 696 TMPFS_UNLOCK(tmp); 697 } 698 } 699 700 bool 701 tmpfs_free_node_locked(struct tmpfs_mount *tmp, struct tmpfs_node *node, 702 bool detach) 703 { 704 struct tmpfs_extattr *ea; 705 vm_object_t uobj; 706 char *symlink; 707 bool last; 708 709 TMPFS_MP_ASSERT_LOCKED(tmp); 710 TMPFS_NODE_ASSERT_LOCKED(node); 711 712 last = refcount_release(&node->tn_refcount); 713 if (node->tn_attached && (detach || last)) { 714 MPASS(tmp->tm_nodes_inuse > 0); 715 tmp->tm_nodes_inuse--; 716 LIST_REMOVE(node, tn_entries); 717 node->tn_attached = false; 718 } 719 if (!last) 720 return (false); 721 722 TMPFS_NODE_UNLOCK(node); 723 724 #ifdef INVARIANTS 725 MPASS(node->tn_vnode == NULL); 726 MPASS((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0); 727 728 /* 729 * Make sure this is a node type we can deal with. Everything 730 * is explicitly enumerated without the 'default' clause so 731 * the compiler can throw an error in case a new type is 732 * added. 
733 */ 734 switch (node->tn_type) { 735 case VBLK: 736 case VCHR: 737 case VDIR: 738 case VFIFO: 739 case VSOCK: 740 case VLNK: 741 case VREG: 742 break; 743 case VNON: 744 case VBAD: 745 case VMARKER: 746 panic("%s: bad type %d for node %p", __func__, 747 (int)node->tn_type, node); 748 } 749 #endif 750 751 while ((ea = LIST_FIRST(&node->tn_extattrs)) != NULL) { 752 LIST_REMOVE(ea, ea_extattrs); 753 tmpfs_extattr_free(ea); 754 } 755 756 switch (node->tn_type) { 757 case VREG: 758 uobj = node->tn_reg.tn_aobj; 759 node->tn_reg.tn_aobj = NULL; 760 if (uobj != NULL) { 761 VM_OBJECT_WLOCK(uobj); 762 KASSERT((uobj->flags & OBJ_TMPFS) != 0, 763 ("tmpfs node %p uobj %p not tmpfs", node, uobj)); 764 vm_object_clear_flag(uobj, OBJ_TMPFS); 765 KASSERT(tmp->tm_pages_used >= node->tn_reg.tn_pages, 766 ("tmpfs tmp %p node %p pages %jd free %jd", tmp, 767 node, (uintmax_t)tmp->tm_pages_used, 768 (uintmax_t)node->tn_reg.tn_pages)); 769 atomic_add_long(&tmp->tm_pages_used, 770 -node->tn_reg.tn_pages); 771 VM_OBJECT_WUNLOCK(uobj); 772 } 773 tmpfs_free_tmp(tmp); 774 775 /* 776 * vm_object_deallocate() must not be called while 777 * owning tm_allnode_lock, because deallocate might 778 * sleep. Call it after tmpfs_free_tmp() does the 779 * unlock. 
780 */ 781 if (uobj != NULL) 782 vm_object_deallocate(uobj); 783 784 break; 785 case VLNK: 786 tmpfs_free_tmp(tmp); 787 788 symlink = node->tn_link_target; 789 atomic_store_ptr(&node->tn_link_target, NULL); 790 if (atomic_load_char(&node->tn_link_smr)) { 791 cache_symlink_free(symlink, node->tn_size + 1); 792 } else { 793 free(symlink, M_TMPFSNAME); 794 } 795 break; 796 default: 797 tmpfs_free_tmp(tmp); 798 break; 799 } 800 801 uma_zfree_smr(tmpfs_node_pool, node); 802 return (true); 803 } 804 805 static __inline uint32_t 806 tmpfs_dirent_hash(const char *name, u_int len) 807 { 808 uint32_t hash; 809 810 hash = fnv_32_buf(name, len, FNV1_32_INIT + len) & TMPFS_DIRCOOKIE_MASK; 811 #ifdef TMPFS_DEBUG_DIRCOOKIE_DUP 812 hash &= 0xf; 813 #endif 814 if (hash < TMPFS_DIRCOOKIE_MIN) 815 hash += TMPFS_DIRCOOKIE_MIN; 816 817 return (hash); 818 } 819 820 static __inline off_t 821 tmpfs_dirent_cookie(struct tmpfs_dirent *de) 822 { 823 if (de == NULL) 824 return (TMPFS_DIRCOOKIE_EOF); 825 826 MPASS(de->td_cookie >= TMPFS_DIRCOOKIE_MIN); 827 828 return (de->td_cookie); 829 } 830 831 static __inline boolean_t 832 tmpfs_dirent_dup(struct tmpfs_dirent *de) 833 { 834 return ((de->td_cookie & TMPFS_DIRCOOKIE_DUP) != 0); 835 } 836 837 static __inline boolean_t 838 tmpfs_dirent_duphead(struct tmpfs_dirent *de) 839 { 840 return ((de->td_cookie & TMPFS_DIRCOOKIE_DUPHEAD) != 0); 841 } 842 843 void 844 tmpfs_dirent_init(struct tmpfs_dirent *de, const char *name, u_int namelen) 845 { 846 de->td_hash = de->td_cookie = tmpfs_dirent_hash(name, namelen); 847 memcpy(de->ud.td_name, name, namelen); 848 de->td_namelen = namelen; 849 } 850 851 /* 852 * Allocates a new directory entry for the node node with a name of name. 853 * The new directory entry is returned in *de. 854 * 855 * The link count of node is increased by one to reflect the new object 856 * referencing it. 857 * 858 * Returns zero on success or an appropriate error code on failure. 
859 */ 860 int 861 tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node, 862 const char *name, u_int len, struct tmpfs_dirent **de) 863 { 864 struct tmpfs_dirent *nde; 865 866 nde = malloc(sizeof(*nde), M_TMPFSDIR, M_WAITOK); 867 nde->td_node = node; 868 if (name != NULL) { 869 nde->ud.td_name = malloc(len, M_TMPFSNAME, M_WAITOK); 870 tmpfs_dirent_init(nde, name, len); 871 } else 872 nde->td_namelen = 0; 873 if (node != NULL) 874 node->tn_links++; 875 876 *de = nde; 877 878 return (0); 879 } 880 881 /* 882 * Frees a directory entry. It is the caller's responsibility to destroy 883 * the node referenced by it if needed. 884 * 885 * The link count of node is decreased by one to reflect the removal of an 886 * object that referenced it. This only happens if 'node_exists' is true; 887 * otherwise the function will not access the node referred to by the 888 * directory entry, as it may already have been released from the outside. 889 */ 890 void 891 tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de) 892 { 893 struct tmpfs_node *node; 894 895 node = de->td_node; 896 if (node != NULL) { 897 MPASS(node->tn_links > 0); 898 node->tn_links--; 899 } 900 if (!tmpfs_dirent_duphead(de) && de->ud.td_name != NULL) 901 free(de->ud.td_name, M_TMPFSNAME); 902 free(de, M_TMPFSDIR); 903 } 904 905 void 906 tmpfs_destroy_vobject(struct vnode *vp, vm_object_t obj) 907 { 908 bool want_vrele; 909 910 ASSERT_VOP_ELOCKED(vp, "tmpfs_destroy_vobject"); 911 if (vp->v_type != VREG || obj == NULL) 912 return; 913 914 VM_OBJECT_WLOCK(obj); 915 VI_LOCK(vp); 916 /* 917 * May be going through forced unmount. 
918 */ 919 want_vrele = false; 920 if ((obj->flags & OBJ_TMPFS_VREF) != 0) { 921 vm_object_clear_flag(obj, OBJ_TMPFS_VREF); 922 want_vrele = true; 923 } 924 925 if (vp->v_writecount < 0) 926 vp->v_writecount = 0; 927 VI_UNLOCK(vp); 928 VM_OBJECT_WUNLOCK(obj); 929 if (want_vrele) { 930 vrele(vp); 931 } 932 } 933 934 /* 935 * Allocates a new vnode for the node node or returns a new reference to 936 * an existing one if the node had already a vnode referencing it. The 937 * resulting locked vnode is returned in *vpp. 938 * 939 * Returns zero on success or an appropriate error code on failure. 940 */ 941 int 942 tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, int lkflag, 943 struct vnode **vpp) 944 { 945 struct vnode *vp; 946 enum vgetstate vs; 947 struct tmpfs_mount *tm; 948 vm_object_t object; 949 int error; 950 951 error = 0; 952 tm = VFS_TO_TMPFS(mp); 953 TMPFS_NODE_LOCK(node); 954 tmpfs_ref_node(node); 955 loop: 956 TMPFS_NODE_ASSERT_LOCKED(node); 957 if ((vp = node->tn_vnode) != NULL) { 958 MPASS((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0); 959 if ((node->tn_type == VDIR && node->tn_dir.tn_parent == NULL) || 960 (VN_IS_DOOMED(vp) && 961 (lkflag & LK_NOWAIT) != 0)) { 962 TMPFS_NODE_UNLOCK(node); 963 error = ENOENT; 964 vp = NULL; 965 goto out; 966 } 967 if (VN_IS_DOOMED(vp)) { 968 node->tn_vpstate |= TMPFS_VNODE_WRECLAIM; 969 while ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) { 970 msleep(&node->tn_vnode, TMPFS_NODE_MTX(node), 971 0, "tmpfsE", 0); 972 } 973 goto loop; 974 } 975 vs = vget_prep(vp); 976 TMPFS_NODE_UNLOCK(node); 977 error = vget_finish(vp, lkflag, vs); 978 if (error == ENOENT) { 979 TMPFS_NODE_LOCK(node); 980 goto loop; 981 } 982 if (error != 0) { 983 vp = NULL; 984 goto out; 985 } 986 987 /* 988 * Make sure the vnode is still there after 989 * getting the interlock to avoid racing a free. 
990 */ 991 if (node->tn_vnode != vp) { 992 vput(vp); 993 TMPFS_NODE_LOCK(node); 994 goto loop; 995 } 996 997 goto out; 998 } 999 1000 if ((node->tn_vpstate & TMPFS_VNODE_DOOMED) || 1001 (node->tn_type == VDIR && node->tn_dir.tn_parent == NULL)) { 1002 TMPFS_NODE_UNLOCK(node); 1003 error = ENOENT; 1004 vp = NULL; 1005 goto out; 1006 } 1007 1008 /* 1009 * otherwise lock the vp list while we call getnewvnode 1010 * since that can block. 1011 */ 1012 if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) { 1013 node->tn_vpstate |= TMPFS_VNODE_WANT; 1014 error = msleep((caddr_t) &node->tn_vpstate, 1015 TMPFS_NODE_MTX(node), 0, "tmpfs_alloc_vp", 0); 1016 if (error != 0) 1017 goto out; 1018 goto loop; 1019 } else 1020 node->tn_vpstate |= TMPFS_VNODE_ALLOCATING; 1021 1022 TMPFS_NODE_UNLOCK(node); 1023 1024 /* Get a new vnode and associate it with our node. */ 1025 error = getnewvnode("tmpfs", mp, VFS_TO_TMPFS(mp)->tm_nonc ? 1026 &tmpfs_vnodeop_nonc_entries : &tmpfs_vnodeop_entries, &vp); 1027 if (error != 0) 1028 goto unlock; 1029 MPASS(vp != NULL); 1030 1031 /* lkflag is ignored, the lock is exclusive */ 1032 (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1033 1034 vp->v_data = node; 1035 vp->v_type = node->tn_type; 1036 1037 /* Type-specific initialization. 
*/ 1038 switch (node->tn_type) { 1039 case VBLK: 1040 /* FALLTHROUGH */ 1041 case VCHR: 1042 /* FALLTHROUGH */ 1043 case VLNK: 1044 /* FALLTHROUGH */ 1045 case VSOCK: 1046 break; 1047 case VFIFO: 1048 vp->v_op = &tmpfs_fifoop_entries; 1049 break; 1050 case VREG: 1051 object = node->tn_reg.tn_aobj; 1052 VM_OBJECT_WLOCK(object); 1053 KASSERT((object->flags & OBJ_TMPFS_VREF) == 0, 1054 ("%s: object %p with OBJ_TMPFS_VREF but without vnode", 1055 __func__, object)); 1056 KASSERT(object->un_pager.swp.writemappings == 0, 1057 ("%s: object %p has writemappings", 1058 __func__, object)); 1059 VI_LOCK(vp); 1060 KASSERT(vp->v_object == NULL, ("Not NULL v_object in tmpfs")); 1061 vp->v_object = object; 1062 vn_irflag_set_locked(vp, (tm->tm_pgread ? VIRF_PGREAD : 0) | 1063 VIRF_TEXT_REF); 1064 VI_UNLOCK(vp); 1065 VM_OBJECT_WUNLOCK(object); 1066 break; 1067 case VDIR: 1068 MPASS(node->tn_dir.tn_parent != NULL); 1069 if (node->tn_dir.tn_parent == node) 1070 vp->v_vflag |= VV_ROOT; 1071 break; 1072 1073 default: 1074 panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type); 1075 } 1076 if (vp->v_type != VFIFO) 1077 VN_LOCK_ASHARE(vp); 1078 1079 error = insmntque1(vp, mp); 1080 if (error != 0) { 1081 /* Need to clear v_object for insmntque failure. 
*/ 1082 tmpfs_destroy_vobject(vp, vp->v_object); 1083 vp->v_object = NULL; 1084 vp->v_data = NULL; 1085 vp->v_op = &dead_vnodeops; 1086 vgone(vp); 1087 vput(vp); 1088 vp = NULL; 1089 } else { 1090 vn_set_state(vp, VSTATE_CONSTRUCTED); 1091 } 1092 1093 unlock: 1094 TMPFS_NODE_LOCK(node); 1095 1096 MPASS(node->tn_vpstate & TMPFS_VNODE_ALLOCATING); 1097 node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING; 1098 node->tn_vnode = vp; 1099 1100 if (node->tn_vpstate & TMPFS_VNODE_WANT) { 1101 node->tn_vpstate &= ~TMPFS_VNODE_WANT; 1102 TMPFS_NODE_UNLOCK(node); 1103 wakeup((caddr_t) &node->tn_vpstate); 1104 } else 1105 TMPFS_NODE_UNLOCK(node); 1106 1107 out: 1108 if (error == 0) { 1109 *vpp = vp; 1110 1111 #ifdef INVARIANTS 1112 MPASS(*vpp != NULL); 1113 ASSERT_VOP_LOCKED(*vpp, __func__); 1114 TMPFS_NODE_LOCK(node); 1115 MPASS(*vpp == node->tn_vnode); 1116 TMPFS_NODE_UNLOCK(node); 1117 #endif 1118 } 1119 tmpfs_free_node(tm, node); 1120 1121 return (error); 1122 } 1123 1124 /* 1125 * Destroys the association between the vnode vp and the node it 1126 * references. 1127 */ 1128 void 1129 tmpfs_free_vp(struct vnode *vp) 1130 { 1131 struct tmpfs_node *node; 1132 1133 node = VP_TO_TMPFS_NODE(vp); 1134 1135 TMPFS_NODE_ASSERT_LOCKED(node); 1136 node->tn_vnode = NULL; 1137 if ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) 1138 wakeup(&node->tn_vnode); 1139 node->tn_vpstate &= ~TMPFS_VNODE_WRECLAIM; 1140 vp->v_data = NULL; 1141 } 1142 1143 /* 1144 * Allocates a new file of type 'type' and adds it to the parent directory 1145 * 'dvp'; this addition is done using the component name given in 'cnp'. 1146 * The ownership of the new file is automatically assigned based on the 1147 * credentials of the caller (through 'cnp'), the group is set based on 1148 * the parent directory and the mode is determined from the 'vap' argument. 1149 * If successful, *vpp holds a vnode to the newly created file and zero 1150 * is returned. 
Otherwise *vpp is NULL and the function returns an 1151 * appropriate error code. 1152 */ 1153 int 1154 tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap, 1155 struct componentname *cnp, const char *target) 1156 { 1157 int error; 1158 struct tmpfs_dirent *de; 1159 struct tmpfs_mount *tmp; 1160 struct tmpfs_node *dnode; 1161 struct tmpfs_node *node; 1162 struct tmpfs_node *parent; 1163 1164 ASSERT_VOP_ELOCKED(dvp, "tmpfs_alloc_file"); 1165 1166 tmp = VFS_TO_TMPFS(dvp->v_mount); 1167 dnode = VP_TO_TMPFS_DIR(dvp); 1168 *vpp = NULL; 1169 1170 /* If the entry we are creating is a directory, we cannot overflow 1171 * the number of links of its parent, because it will get a new 1172 * link. */ 1173 if (vap->va_type == VDIR) { 1174 /* Ensure that we do not overflow the maximum number of links 1175 * imposed by the system. */ 1176 MPASS(dnode->tn_links <= TMPFS_LINK_MAX); 1177 if (dnode->tn_links == TMPFS_LINK_MAX) { 1178 return (EMLINK); 1179 } 1180 1181 parent = dnode; 1182 MPASS(parent != NULL); 1183 } else 1184 parent = NULL; 1185 1186 /* Allocate a node that represents the new file. */ 1187 error = tmpfs_alloc_node(dvp->v_mount, tmp, vap->va_type, 1188 cnp->cn_cred->cr_uid, dnode->tn_gid, vap->va_mode, parent, 1189 target, vap->va_rdev, &node); 1190 if (error != 0) 1191 return (error); 1192 1193 /* Allocate a directory entry that points to the new file. */ 1194 error = tmpfs_alloc_dirent(tmp, node, cnp->cn_nameptr, cnp->cn_namelen, 1195 &de); 1196 if (error != 0) { 1197 tmpfs_free_node(tmp, node); 1198 return (error); 1199 } 1200 1201 /* Allocate a vnode for the new file. */ 1202 error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp); 1203 if (error != 0) { 1204 tmpfs_free_dirent(tmp, de); 1205 tmpfs_free_node(tmp, node); 1206 return (error); 1207 } 1208 1209 /* Now that all required items are allocated, we can proceed to 1210 * insert the new node into the directory, an operation that 1211 * cannot fail. 
*/ 1212 if (cnp->cn_flags & ISWHITEOUT) 1213 tmpfs_dir_whiteout_remove(dvp, cnp); 1214 tmpfs_dir_attach(dvp, de); 1215 return (0); 1216 } 1217 1218 struct tmpfs_dirent * 1219 tmpfs_dir_first(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) 1220 { 1221 struct tmpfs_dirent *de; 1222 1223 de = RB_MIN(tmpfs_dir, &dnode->tn_dir.tn_dirhead); 1224 dc->tdc_tree = de; 1225 if (de != NULL && tmpfs_dirent_duphead(de)) 1226 de = LIST_FIRST(&de->ud.td_duphead); 1227 dc->tdc_current = de; 1228 1229 return (dc->tdc_current); 1230 } 1231 1232 struct tmpfs_dirent * 1233 tmpfs_dir_next(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) 1234 { 1235 struct tmpfs_dirent *de; 1236 1237 MPASS(dc->tdc_tree != NULL); 1238 if (tmpfs_dirent_dup(dc->tdc_current)) { 1239 dc->tdc_current = LIST_NEXT(dc->tdc_current, uh.td_dup.entries); 1240 if (dc->tdc_current != NULL) 1241 return (dc->tdc_current); 1242 } 1243 dc->tdc_tree = dc->tdc_current = RB_NEXT(tmpfs_dir, 1244 &dnode->tn_dir.tn_dirhead, dc->tdc_tree); 1245 if ((de = dc->tdc_current) != NULL && tmpfs_dirent_duphead(de)) { 1246 dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); 1247 MPASS(dc->tdc_current != NULL); 1248 } 1249 1250 return (dc->tdc_current); 1251 } 1252 1253 /* Lookup directory entry in RB-Tree. Function may return duphead entry. */ 1254 static struct tmpfs_dirent * 1255 tmpfs_dir_xlookup_hash(struct tmpfs_node *dnode, uint32_t hash) 1256 { 1257 struct tmpfs_dirent *de, dekey; 1258 1259 dekey.td_hash = hash; 1260 de = RB_FIND(tmpfs_dir, &dnode->tn_dir.tn_dirhead, &dekey); 1261 return (de); 1262 } 1263 1264 /* Lookup directory entry by cookie, initialize directory cursor accordingly. 
*/
static struct tmpfs_dirent *
tmpfs_dir_lookup_cookie(struct tmpfs_node *node, off_t cookie,
    struct tmpfs_dir_cursor *dc)
{
	struct tmpfs_dir *dirhead = &node->tn_dir.tn_dirhead;
	struct tmpfs_dirent *de, dekey;

	MPASS(cookie >= TMPFS_DIRCOOKIE_MIN);

	/*
	 * Try the entry cached by the previous readdir.  The cached pair
	 * may be updated with only a shared vnode lock held, so the cookie
	 * is re-checked against the entry itself before it is trusted.
	 */
	if (cookie == node->tn_dir.tn_readdir_lastn) {
		de = node->tn_dir.tn_readdir_lastp;
		if (de != NULL && cookie == tmpfs_dirent_cookie(de))
			goto by_entry;
	}

	/* Cookies carrying the DUP bit are resolved through the dup index. */
	if ((cookie & TMPFS_DIRCOOKIE_DUP) != 0) {
		LIST_FOREACH(de, &node->tn_dir.tn_dupindex,
		    uh.td_dup.index_entries) {
			MPASS(tmpfs_dirent_dup(de));
			if (de->td_cookie == cookie)
				goto by_entry;
			/* The index is sorted; a smaller cookie ends the search. */
			if (de->td_cookie < cookie) {
				de = NULL;
				goto by_entry;
			}
		}
		MPASS(de == NULL);
		goto by_entry;
	}

	/*
	 * Plain hash cookie.  RB_NFIND is used so that a lookup still
	 * recovers when the entry for the cookie has been removed.
	 */
	if ((cookie & TMPFS_DIRCOOKIE_MASK) == cookie) {
		dekey.td_hash = cookie;
		de = RB_NFIND(tmpfs_dir, dirhead, &dekey);
	} else
		de = NULL;
	dc->tdc_tree = de;
	dc->tdc_current = de;
	if (de != NULL && tmpfs_dirent_duphead(de)) {
		dc->tdc_current = LIST_FIRST(&de->ud.td_duphead);
		MPASS(dc->tdc_current != NULL);
	}
	return (dc->tdc_current);

by_entry:
	/*
	 * 'de' is the entry itself (or NULL).  For a duplicate entry the
	 * tree position is looked up through its hash.
	 */
	dc->tdc_tree = de;
	dc->tdc_current = de;
	if (de != NULL && tmpfs_dirent_dup(de))
		dc->tdc_tree = tmpfs_dir_xlookup_hash(node,
		    de->td_hash);
	return (dc->tdc_current);
}

/*
 * Looks for a directory entry in the directory represented by node.
 * 'cnp' describes the name of the entry to look for.  Note that the .
 * and .. components are not allowed as they do not physically exist
 * within directories.
 *
 * Returns a pointer to the entry when found, otherwise NULL.
1329 */ 1330 struct tmpfs_dirent * 1331 tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, 1332 struct componentname *cnp) 1333 { 1334 struct tmpfs_dir_duphead *duphead; 1335 struct tmpfs_dirent *de; 1336 uint32_t hash; 1337 1338 MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.')); 1339 MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' && 1340 cnp->cn_nameptr[1] == '.'))); 1341 TMPFS_VALIDATE_DIR(node); 1342 1343 hash = tmpfs_dirent_hash(cnp->cn_nameptr, cnp->cn_namelen); 1344 de = tmpfs_dir_xlookup_hash(node, hash); 1345 if (de != NULL && tmpfs_dirent_duphead(de)) { 1346 duphead = &de->ud.td_duphead; 1347 LIST_FOREACH(de, duphead, uh.td_dup.entries) { 1348 if (TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, 1349 cnp->cn_namelen)) 1350 break; 1351 } 1352 } else if (de != NULL) { 1353 if (!TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, 1354 cnp->cn_namelen)) 1355 de = NULL; 1356 } 1357 if (de != NULL && f != NULL && de->td_node != f) 1358 de = NULL; 1359 1360 return (de); 1361 } 1362 1363 /* 1364 * Attach duplicate-cookie directory entry nde to dnode and insert to dupindex 1365 * list, allocate new cookie value. 1366 */ 1367 static void 1368 tmpfs_dir_attach_dup(struct tmpfs_node *dnode, 1369 struct tmpfs_dir_duphead *duphead, struct tmpfs_dirent *nde) 1370 { 1371 struct tmpfs_dir_duphead *dupindex; 1372 struct tmpfs_dirent *de, *pde; 1373 1374 dupindex = &dnode->tn_dir.tn_dupindex; 1375 de = LIST_FIRST(dupindex); 1376 if (de == NULL || de->td_cookie < TMPFS_DIRCOOKIE_DUP_MAX) { 1377 if (de == NULL) 1378 nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; 1379 else 1380 nde->td_cookie = de->td_cookie + 1; 1381 MPASS(tmpfs_dirent_dup(nde)); 1382 LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries); 1383 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1384 return; 1385 } 1386 1387 /* 1388 * Cookie numbers are near exhaustion. Scan dupindex list for unused 1389 * numbers. dupindex list is sorted in descending order. 
Keep it so 1390 * after inserting nde. 1391 */ 1392 while (1) { 1393 pde = de; 1394 de = LIST_NEXT(de, uh.td_dup.index_entries); 1395 if (de == NULL && pde->td_cookie != TMPFS_DIRCOOKIE_DUP_MIN) { 1396 /* 1397 * Last element of the index doesn't have minimal cookie 1398 * value, use it. 1399 */ 1400 nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; 1401 LIST_INSERT_AFTER(pde, nde, uh.td_dup.index_entries); 1402 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1403 return; 1404 } else if (de == NULL) { 1405 /* 1406 * We are so lucky have 2^30 hash duplicates in single 1407 * directory :) Return largest possible cookie value. 1408 * It should be fine except possible issues with 1409 * VOP_READDIR restart. 1410 */ 1411 nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MAX; 1412 LIST_INSERT_HEAD(dupindex, nde, 1413 uh.td_dup.index_entries); 1414 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1415 return; 1416 } 1417 if (de->td_cookie + 1 == pde->td_cookie || 1418 de->td_cookie >= TMPFS_DIRCOOKIE_DUP_MAX) 1419 continue; /* No hole or invalid cookie. */ 1420 nde->td_cookie = de->td_cookie + 1; 1421 MPASS(tmpfs_dirent_dup(nde)); 1422 MPASS(pde->td_cookie > nde->td_cookie); 1423 MPASS(nde->td_cookie > de->td_cookie); 1424 LIST_INSERT_BEFORE(de, nde, uh.td_dup.index_entries); 1425 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1426 return; 1427 } 1428 } 1429 1430 /* 1431 * Attaches the directory entry de to the directory represented by vp. 1432 * Note that this does not change the link count of the node pointed by 1433 * the directory entry, as this is done by tmpfs_alloc_dirent. 
1434 */ 1435 void 1436 tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de) 1437 { 1438 struct tmpfs_node *dnode; 1439 struct tmpfs_dirent *xde, *nde; 1440 1441 ASSERT_VOP_ELOCKED(vp, __func__); 1442 MPASS(de->td_namelen > 0); 1443 MPASS(de->td_hash >= TMPFS_DIRCOOKIE_MIN); 1444 MPASS(de->td_cookie == de->td_hash); 1445 1446 dnode = VP_TO_TMPFS_DIR(vp); 1447 dnode->tn_dir.tn_readdir_lastn = 0; 1448 dnode->tn_dir.tn_readdir_lastp = NULL; 1449 1450 MPASS(!tmpfs_dirent_dup(de)); 1451 xde = RB_INSERT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); 1452 if (xde != NULL && tmpfs_dirent_duphead(xde)) 1453 tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); 1454 else if (xde != NULL) { 1455 /* 1456 * Allocate new duphead. Swap xde with duphead to avoid 1457 * adding/removing elements with the same hash. 1458 */ 1459 MPASS(!tmpfs_dirent_dup(xde)); 1460 tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), NULL, NULL, 0, 1461 &nde); 1462 /* *nde = *xde; XXX gcc 4.2.1 may generate invalid code. */ 1463 memcpy(nde, xde, sizeof(*xde)); 1464 xde->td_cookie |= TMPFS_DIRCOOKIE_DUPHEAD; 1465 LIST_INIT(&xde->ud.td_duphead); 1466 xde->td_namelen = 0; 1467 xde->td_node = NULL; 1468 tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, nde); 1469 tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); 1470 } 1471 dnode->tn_size += sizeof(struct tmpfs_dirent); 1472 dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; 1473 dnode->tn_accessed = true; 1474 tmpfs_update(vp); 1475 } 1476 1477 /* 1478 * Detaches the directory entry de from the directory represented by vp. 1479 * Note that this does not change the link count of the node pointed by 1480 * the directory entry, as this is done by tmpfs_free_dirent. 
1481 */ 1482 void 1483 tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de) 1484 { 1485 struct tmpfs_mount *tmp; 1486 struct tmpfs_dir *head; 1487 struct tmpfs_node *dnode; 1488 struct tmpfs_dirent *xde; 1489 1490 ASSERT_VOP_ELOCKED(vp, __func__); 1491 1492 dnode = VP_TO_TMPFS_DIR(vp); 1493 head = &dnode->tn_dir.tn_dirhead; 1494 dnode->tn_dir.tn_readdir_lastn = 0; 1495 dnode->tn_dir.tn_readdir_lastp = NULL; 1496 1497 if (tmpfs_dirent_dup(de)) { 1498 /* Remove duphead if de was last entry. */ 1499 if (LIST_NEXT(de, uh.td_dup.entries) == NULL) { 1500 xde = tmpfs_dir_xlookup_hash(dnode, de->td_hash); 1501 MPASS(tmpfs_dirent_duphead(xde)); 1502 } else 1503 xde = NULL; 1504 LIST_REMOVE(de, uh.td_dup.entries); 1505 LIST_REMOVE(de, uh.td_dup.index_entries); 1506 if (xde != NULL) { 1507 if (LIST_EMPTY(&xde->ud.td_duphead)) { 1508 RB_REMOVE(tmpfs_dir, head, xde); 1509 tmp = VFS_TO_TMPFS(vp->v_mount); 1510 MPASS(xde->td_node == NULL); 1511 tmpfs_free_dirent(tmp, xde); 1512 } 1513 } 1514 de->td_cookie = de->td_hash; 1515 } else 1516 RB_REMOVE(tmpfs_dir, head, de); 1517 1518 dnode->tn_size -= sizeof(struct tmpfs_dirent); 1519 dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; 1520 dnode->tn_accessed = true; 1521 tmpfs_update(vp); 1522 } 1523 1524 void 1525 tmpfs_dir_destroy(struct tmpfs_mount *tmp, struct tmpfs_node *dnode) 1526 { 1527 struct tmpfs_dirent *de, *dde, *nde; 1528 1529 RB_FOREACH_SAFE(de, tmpfs_dir, &dnode->tn_dir.tn_dirhead, nde) { 1530 RB_REMOVE(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); 1531 /* Node may already be destroyed. */ 1532 de->td_node = NULL; 1533 if (tmpfs_dirent_duphead(de)) { 1534 while ((dde = LIST_FIRST(&de->ud.td_duphead)) != NULL) { 1535 LIST_REMOVE(dde, uh.td_dup.entries); 1536 dde->td_node = NULL; 1537 tmpfs_free_dirent(tmp, dde); 1538 } 1539 } 1540 tmpfs_free_dirent(tmp, de); 1541 } 1542 } 1543 1544 /* 1545 * Helper function for tmpfs_readdir. Creates a '.' 
entry for the given 1546 * directory and returns it in the uio space. The function returns 0 1547 * on success, -1 if there was not enough space in the uio structure to 1548 * hold the directory entry or an appropriate error code if another 1549 * error happens. 1550 */ 1551 static int 1552 tmpfs_dir_getdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node, 1553 struct uio *uio) 1554 { 1555 int error; 1556 struct dirent dent; 1557 1558 TMPFS_VALIDATE_DIR(node); 1559 MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOT); 1560 1561 dent.d_fileno = node->tn_id; 1562 dent.d_off = TMPFS_DIRCOOKIE_DOTDOT; 1563 dent.d_type = DT_DIR; 1564 dent.d_namlen = 1; 1565 dent.d_name[0] = '.'; 1566 dent.d_reclen = GENERIC_DIRSIZ(&dent); 1567 dirent_terminate(&dent); 1568 1569 if (dent.d_reclen > uio->uio_resid) 1570 error = EJUSTRETURN; 1571 else 1572 error = uiomove(&dent, dent.d_reclen, uio); 1573 1574 tmpfs_set_accessed(tm, node); 1575 1576 return (error); 1577 } 1578 1579 /* 1580 * Helper function for tmpfs_readdir. Creates a '..' entry for the given 1581 * directory and returns it in the uio space. The function returns 0 1582 * on success, -1 if there was not enough space in the uio structure to 1583 * hold the directory entry or an appropriate error code if another 1584 * error happens. 1585 */ 1586 static int 1587 tmpfs_dir_getdotdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node, 1588 struct uio *uio, off_t next) 1589 { 1590 struct tmpfs_node *parent; 1591 struct dirent dent; 1592 int error; 1593 1594 TMPFS_VALIDATE_DIR(node); 1595 MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT); 1596 1597 /* 1598 * Return ENOENT if the current node is already removed. 
1599 */ 1600 TMPFS_ASSERT_LOCKED(node); 1601 parent = node->tn_dir.tn_parent; 1602 if (parent == NULL) 1603 return (ENOENT); 1604 1605 dent.d_fileno = parent->tn_id; 1606 dent.d_off = next; 1607 dent.d_type = DT_DIR; 1608 dent.d_namlen = 2; 1609 dent.d_name[0] = '.'; 1610 dent.d_name[1] = '.'; 1611 dent.d_reclen = GENERIC_DIRSIZ(&dent); 1612 dirent_terminate(&dent); 1613 1614 if (dent.d_reclen > uio->uio_resid) 1615 error = EJUSTRETURN; 1616 else 1617 error = uiomove(&dent, dent.d_reclen, uio); 1618 1619 tmpfs_set_accessed(tm, node); 1620 1621 return (error); 1622 } 1623 1624 /* 1625 * Helper function for tmpfs_readdir. Returns as much directory entries 1626 * as can fit in the uio space. The read starts at uio->uio_offset. 1627 * The function returns 0 on success, -1 if there was not enough space 1628 * in the uio structure to hold the directory entry or an appropriate 1629 * error code if another error happens. 1630 */ 1631 int 1632 tmpfs_dir_getdents(struct tmpfs_mount *tm, struct tmpfs_node *node, 1633 struct uio *uio, int maxcookies, uint64_t *cookies, int *ncookies) 1634 { 1635 struct tmpfs_dir_cursor dc; 1636 struct tmpfs_dirent *de, *nde; 1637 off_t off; 1638 int error; 1639 1640 TMPFS_VALIDATE_DIR(node); 1641 1642 off = 0; 1643 1644 /* 1645 * Lookup the node from the current offset. The starting offset of 1646 * 0 will lookup both '.' and '..', and then the first real entry, 1647 * or EOF if there are none. Then find all entries for the dir that 1648 * fit into the buffer. Once no more entries are found (de == NULL), 1649 * the offset is set to TMPFS_DIRCOOKIE_EOF, which will cause the next 1650 * call to return 0. 
1651 */ 1652 switch (uio->uio_offset) { 1653 case TMPFS_DIRCOOKIE_DOT: 1654 error = tmpfs_dir_getdotdent(tm, node, uio); 1655 if (error != 0) 1656 return (error); 1657 uio->uio_offset = off = TMPFS_DIRCOOKIE_DOTDOT; 1658 if (cookies != NULL) 1659 cookies[(*ncookies)++] = off; 1660 /* FALLTHROUGH */ 1661 case TMPFS_DIRCOOKIE_DOTDOT: 1662 de = tmpfs_dir_first(node, &dc); 1663 off = tmpfs_dirent_cookie(de); 1664 error = tmpfs_dir_getdotdotdent(tm, node, uio, off); 1665 if (error != 0) 1666 return (error); 1667 uio->uio_offset = off; 1668 if (cookies != NULL) 1669 cookies[(*ncookies)++] = off; 1670 /* EOF. */ 1671 if (de == NULL) 1672 return (0); 1673 break; 1674 case TMPFS_DIRCOOKIE_EOF: 1675 return (0); 1676 default: 1677 de = tmpfs_dir_lookup_cookie(node, uio->uio_offset, &dc); 1678 if (de == NULL) 1679 return (EINVAL); 1680 if (cookies != NULL) 1681 off = tmpfs_dirent_cookie(de); 1682 } 1683 1684 /* 1685 * Read as much entries as possible; i.e., until we reach the end of the 1686 * directory or we exhaust uio space. 1687 */ 1688 do { 1689 struct dirent d; 1690 1691 /* 1692 * Create a dirent structure representing the current tmpfs_node 1693 * and fill it. 
1694 */ 1695 if (de->td_node == NULL) { 1696 d.d_fileno = 1; 1697 d.d_type = DT_WHT; 1698 } else { 1699 d.d_fileno = de->td_node->tn_id; 1700 switch (de->td_node->tn_type) { 1701 case VBLK: 1702 d.d_type = DT_BLK; 1703 break; 1704 1705 case VCHR: 1706 d.d_type = DT_CHR; 1707 break; 1708 1709 case VDIR: 1710 d.d_type = DT_DIR; 1711 break; 1712 1713 case VFIFO: 1714 d.d_type = DT_FIFO; 1715 break; 1716 1717 case VLNK: 1718 d.d_type = DT_LNK; 1719 break; 1720 1721 case VREG: 1722 d.d_type = DT_REG; 1723 break; 1724 1725 case VSOCK: 1726 d.d_type = DT_SOCK; 1727 break; 1728 1729 default: 1730 panic("tmpfs_dir_getdents: type %p %d", 1731 de->td_node, (int)de->td_node->tn_type); 1732 } 1733 } 1734 d.d_namlen = de->td_namelen; 1735 MPASS(de->td_namelen < sizeof(d.d_name)); 1736 (void)memcpy(d.d_name, de->ud.td_name, de->td_namelen); 1737 d.d_reclen = GENERIC_DIRSIZ(&d); 1738 1739 /* 1740 * Stop reading if the directory entry we are treating is bigger 1741 * than the amount of data that can be returned. 1742 */ 1743 if (d.d_reclen > uio->uio_resid) { 1744 error = EJUSTRETURN; 1745 break; 1746 } 1747 1748 nde = tmpfs_dir_next(node, &dc); 1749 d.d_off = tmpfs_dirent_cookie(nde); 1750 dirent_terminate(&d); 1751 1752 /* 1753 * Copy the new dirent structure into the output buffer and 1754 * advance pointers. 1755 */ 1756 error = uiomove(&d, d.d_reclen, uio); 1757 if (error == 0) { 1758 de = nde; 1759 if (cookies != NULL) { 1760 off = tmpfs_dirent_cookie(de); 1761 MPASS(*ncookies < maxcookies); 1762 cookies[(*ncookies)++] = off; 1763 } 1764 } 1765 } while (error == 0 && uio->uio_resid > 0 && de != NULL); 1766 1767 /* Skip setting off when using cookies as it is already done above. */ 1768 if (cookies == NULL) 1769 off = tmpfs_dirent_cookie(de); 1770 1771 /* Update the offset and cache. 
*/ 1772 uio->uio_offset = off; 1773 node->tn_dir.tn_readdir_lastn = off; 1774 node->tn_dir.tn_readdir_lastp = de; 1775 1776 tmpfs_set_accessed(tm, node); 1777 return (error); 1778 } 1779 1780 int 1781 tmpfs_dir_whiteout_add(struct vnode *dvp, struct componentname *cnp) 1782 { 1783 struct tmpfs_dirent *de; 1784 int error; 1785 1786 error = tmpfs_alloc_dirent(VFS_TO_TMPFS(dvp->v_mount), NULL, 1787 cnp->cn_nameptr, cnp->cn_namelen, &de); 1788 if (error != 0) 1789 return (error); 1790 tmpfs_dir_attach(dvp, de); 1791 return (0); 1792 } 1793 1794 void 1795 tmpfs_dir_whiteout_remove(struct vnode *dvp, struct componentname *cnp) 1796 { 1797 struct tmpfs_dirent *de; 1798 1799 de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp); 1800 MPASS(de != NULL && de->td_node == NULL); 1801 tmpfs_dir_detach(dvp, de); 1802 tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de); 1803 } 1804 1805 /* 1806 * Resizes the aobj associated with the regular file pointed to by 'vp' to the 1807 * size 'newsize'. 'vp' must point to a vnode that represents a regular file. 1808 * 'newsize' must be positive. 1809 * 1810 * Returns zero on success or an appropriate error code on failure. 1811 */ 1812 int 1813 tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr) 1814 { 1815 struct tmpfs_node *node; 1816 vm_object_t uobj; 1817 vm_pindex_t idx, newpages, oldpages; 1818 off_t oldsize; 1819 int base, error; 1820 1821 MPASS(vp->v_type == VREG); 1822 MPASS(newsize >= 0); 1823 1824 node = VP_TO_TMPFS_NODE(vp); 1825 uobj = node->tn_reg.tn_aobj; 1826 1827 /* 1828 * Convert the old and new sizes to the number of pages needed to 1829 * store them. It may happen that we do not need to do anything 1830 * because the last allocated page can accommodate the change on 1831 * its own. 
1832 */ 1833 oldsize = node->tn_size; 1834 oldpages = OFF_TO_IDX(oldsize + PAGE_MASK); 1835 MPASS(oldpages == uobj->size); 1836 newpages = OFF_TO_IDX(newsize + PAGE_MASK); 1837 1838 if (__predict_true(newpages == oldpages && newsize >= oldsize)) { 1839 node->tn_size = newsize; 1840 return (0); 1841 } 1842 1843 VM_OBJECT_WLOCK(uobj); 1844 if (newsize < oldsize) { 1845 /* 1846 * Zero the truncated part of the last page. 1847 */ 1848 base = newsize & PAGE_MASK; 1849 if (base != 0) { 1850 idx = OFF_TO_IDX(newsize); 1851 error = tmpfs_partial_page_invalidate(uobj, idx, base, 1852 PAGE_SIZE, ignerr); 1853 if (error != 0) { 1854 VM_OBJECT_WUNLOCK(uobj); 1855 return (error); 1856 } 1857 } 1858 1859 /* 1860 * Release any swap space and free any whole pages. 1861 */ 1862 if (newpages < oldpages) 1863 vm_object_page_remove(uobj, newpages, 0, 0); 1864 } 1865 uobj->size = newpages; 1866 VM_OBJECT_WUNLOCK(uobj); 1867 1868 node->tn_size = newsize; 1869 return (0); 1870 } 1871 1872 /* 1873 * Punch hole in the aobj associated with the regular file pointed to by 'vp'. 1874 * Requests completely beyond the end-of-file are converted to no-op. 1875 * 1876 * Returns 0 on success or error code from tmpfs_partial_page_invalidate() on 1877 * failure. 
1878 */ 1879 int 1880 tmpfs_reg_punch_hole(struct vnode *vp, off_t *offset, off_t *length) 1881 { 1882 struct tmpfs_node *node; 1883 vm_object_t object; 1884 vm_pindex_t pistart, pi, piend; 1885 int startofs, endofs, end; 1886 off_t off, len; 1887 int error; 1888 1889 KASSERT(*length <= OFF_MAX - *offset, ("%s: offset + length overflows", 1890 __func__)); 1891 node = VP_TO_TMPFS_NODE(vp); 1892 KASSERT(node->tn_type == VREG, ("%s: node is not regular file", 1893 __func__)); 1894 object = node->tn_reg.tn_aobj; 1895 off = *offset; 1896 len = omin(node->tn_size - off, *length); 1897 startofs = off & PAGE_MASK; 1898 endofs = (off + len) & PAGE_MASK; 1899 pistart = OFF_TO_IDX(off); 1900 piend = OFF_TO_IDX(off + len); 1901 pi = OFF_TO_IDX((vm_ooffset_t)off + PAGE_MASK); 1902 error = 0; 1903 1904 /* Handle the case when offset is on or beyond file size. */ 1905 if (len <= 0) { 1906 *length = 0; 1907 return (0); 1908 } 1909 1910 VM_OBJECT_WLOCK(object); 1911 1912 /* 1913 * If there is a partial page at the beginning of the hole-punching 1914 * request, fill the partial page with zeroes. 1915 */ 1916 if (startofs != 0) { 1917 end = pistart != piend ? PAGE_SIZE : endofs; 1918 error = tmpfs_partial_page_invalidate(object, pistart, startofs, 1919 end, FALSE); 1920 if (error != 0) 1921 goto out; 1922 off += end - startofs; 1923 len -= end - startofs; 1924 } 1925 1926 /* 1927 * Toss away the full pages in the affected area. 1928 */ 1929 if (pi < piend) { 1930 vm_object_page_remove(object, pi, piend, 0); 1931 off += IDX_TO_OFF(piend - pi); 1932 len -= IDX_TO_OFF(piend - pi); 1933 } 1934 1935 /* 1936 * If there is a partial page at the end of the hole-punching request, 1937 * fill the partial page with zeroes. 
1938 */ 1939 if (endofs != 0 && pistart != piend) { 1940 error = tmpfs_partial_page_invalidate(object, piend, 0, endofs, 1941 FALSE); 1942 if (error != 0) 1943 goto out; 1944 off += endofs; 1945 len -= endofs; 1946 } 1947 1948 out: 1949 VM_OBJECT_WUNLOCK(object); 1950 *offset = off; 1951 *length = len; 1952 return (error); 1953 } 1954 1955 void 1956 tmpfs_check_mtime(struct vnode *vp) 1957 { 1958 struct tmpfs_node *node; 1959 struct vm_object *obj; 1960 1961 ASSERT_VOP_ELOCKED(vp, "check_mtime"); 1962 if (vp->v_type != VREG) 1963 return; 1964 obj = vp->v_object; 1965 KASSERT(obj->type == tmpfs_pager_type && 1966 (obj->flags & (OBJ_SWAP | OBJ_TMPFS)) == 1967 (OBJ_SWAP | OBJ_TMPFS), ("non-tmpfs obj")); 1968 /* unlocked read */ 1969 if (obj->generation != obj->cleangeneration) { 1970 VM_OBJECT_WLOCK(obj); 1971 if (obj->generation != obj->cleangeneration) { 1972 obj->cleangeneration = obj->generation; 1973 node = VP_TO_TMPFS_NODE(vp); 1974 node->tn_status |= TMPFS_NODE_MODIFIED | 1975 TMPFS_NODE_CHANGED; 1976 } 1977 VM_OBJECT_WUNLOCK(obj); 1978 } 1979 } 1980 1981 /* 1982 * Change flags of the given vnode. 1983 * Caller should execute tmpfs_update on vp after a successful execution. 1984 * The vnode must be locked on entry and remain locked on exit. 1985 */ 1986 int 1987 tmpfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred, 1988 struct thread *td) 1989 { 1990 int error; 1991 struct tmpfs_node *node; 1992 1993 ASSERT_VOP_ELOCKED(vp, "chflags"); 1994 1995 node = VP_TO_TMPFS_NODE(vp); 1996 1997 if ((flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK | 1998 UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | 1999 UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE | 2000 UF_SPARSE | UF_SYSTEM)) != 0) 2001 return (EOPNOTSUPP); 2002 2003 /* Disallow this operation if the file system is mounted read-only. 
*/ 2004 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2005 return (EROFS); 2006 2007 /* 2008 * Callers may only modify the file flags on objects they 2009 * have VADMIN rights for. 2010 */ 2011 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2012 return (error); 2013 /* 2014 * Unprivileged processes are not permitted to unset system 2015 * flags, or modify flags if any system flags are set. 2016 */ 2017 if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) { 2018 if (node->tn_flags & 2019 (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { 2020 error = securelevel_gt(cred, 0); 2021 if (error) 2022 return (error); 2023 } 2024 } else { 2025 if (node->tn_flags & 2026 (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || 2027 ((flags ^ node->tn_flags) & SF_SETTABLE)) 2028 return (EPERM); 2029 } 2030 node->tn_flags = flags; 2031 node->tn_status |= TMPFS_NODE_CHANGED; 2032 2033 ASSERT_VOP_ELOCKED(vp, "chflags2"); 2034 2035 return (0); 2036 } 2037 2038 /* 2039 * Change access mode on the given vnode. 2040 * Caller should execute tmpfs_update on vp after a successful execution. 2041 * The vnode must be locked on entry and remain locked on exit. 2042 */ 2043 int 2044 tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, 2045 struct thread *td) 2046 { 2047 int error; 2048 struct tmpfs_node *node; 2049 mode_t newmode; 2050 2051 ASSERT_VOP_ELOCKED(vp, "chmod"); 2052 ASSERT_VOP_IN_SEQC(vp); 2053 2054 node = VP_TO_TMPFS_NODE(vp); 2055 2056 /* Disallow this operation if the file system is mounted read-only. */ 2057 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2058 return (EROFS); 2059 2060 /* Immutable or append-only files cannot be modified, either. */ 2061 if (node->tn_flags & (IMMUTABLE | APPEND)) 2062 return (EPERM); 2063 2064 /* 2065 * To modify the permissions on a file, must possess VADMIN 2066 * for that file. 
2067 */ 2068 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2069 return (error); 2070 2071 /* 2072 * Privileged processes may set the sticky bit on non-directories, 2073 * as well as set the setgid bit on a file with a group that the 2074 * process is not a member of. 2075 */ 2076 if (vp->v_type != VDIR && (mode & S_ISTXT)) { 2077 if (priv_check_cred(cred, PRIV_VFS_STICKYFILE)) 2078 return (EFTYPE); 2079 } 2080 if (!groupmember(node->tn_gid, cred) && (mode & S_ISGID)) { 2081 error = priv_check_cred(cred, PRIV_VFS_SETGID); 2082 if (error) 2083 return (error); 2084 } 2085 2086 newmode = node->tn_mode & ~ALLPERMS; 2087 newmode |= mode & ALLPERMS; 2088 atomic_store_short(&node->tn_mode, newmode); 2089 2090 node->tn_status |= TMPFS_NODE_CHANGED; 2091 2092 ASSERT_VOP_ELOCKED(vp, "chmod2"); 2093 2094 return (0); 2095 } 2096 2097 /* 2098 * Change ownership of the given vnode. At least one of uid or gid must 2099 * be different than VNOVAL. If one is set to that value, the attribute 2100 * is unchanged. 2101 * Caller should execute tmpfs_update on vp after a successful execution. 2102 * The vnode must be locked on entry and remain locked on exit. 2103 */ 2104 int 2105 tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, 2106 struct thread *td) 2107 { 2108 int error; 2109 struct tmpfs_node *node; 2110 uid_t ouid; 2111 gid_t ogid; 2112 mode_t newmode; 2113 2114 ASSERT_VOP_ELOCKED(vp, "chown"); 2115 ASSERT_VOP_IN_SEQC(vp); 2116 2117 node = VP_TO_TMPFS_NODE(vp); 2118 2119 /* Assign default values if they are unknown. */ 2120 MPASS(uid != VNOVAL || gid != VNOVAL); 2121 if (uid == VNOVAL) 2122 uid = node->tn_uid; 2123 if (gid == VNOVAL) 2124 gid = node->tn_gid; 2125 MPASS(uid != VNOVAL && gid != VNOVAL); 2126 2127 /* Disallow this operation if the file system is mounted read-only. */ 2128 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2129 return (EROFS); 2130 2131 /* Immutable or append-only files cannot be modified, either. 
*/ 2132 if (node->tn_flags & (IMMUTABLE | APPEND)) 2133 return (EPERM); 2134 2135 /* 2136 * To modify the ownership of a file, must possess VADMIN for that 2137 * file. 2138 */ 2139 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2140 return (error); 2141 2142 /* 2143 * To change the owner of a file, or change the group of a file to a 2144 * group of which we are not a member, the caller must have 2145 * privilege. 2146 */ 2147 if ((uid != node->tn_uid || 2148 (gid != node->tn_gid && !groupmember(gid, cred))) && 2149 (error = priv_check_cred(cred, PRIV_VFS_CHOWN))) 2150 return (error); 2151 2152 ogid = node->tn_gid; 2153 ouid = node->tn_uid; 2154 2155 node->tn_uid = uid; 2156 node->tn_gid = gid; 2157 2158 node->tn_status |= TMPFS_NODE_CHANGED; 2159 2160 if ((node->tn_mode & (S_ISUID | S_ISGID)) != 0 && 2161 (ouid != uid || ogid != gid)) { 2162 if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) { 2163 newmode = node->tn_mode & ~(S_ISUID | S_ISGID); 2164 atomic_store_short(&node->tn_mode, newmode); 2165 } 2166 } 2167 2168 ASSERT_VOP_ELOCKED(vp, "chown2"); 2169 2170 return (0); 2171 } 2172 2173 /* 2174 * Change size of the given vnode. 2175 * Caller should execute tmpfs_update on vp after a successful execution. 2176 * The vnode must be locked on entry and remain locked on exit. 2177 */ 2178 int 2179 tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred, 2180 struct thread *td) 2181 { 2182 int error; 2183 struct tmpfs_node *node; 2184 2185 ASSERT_VOP_ELOCKED(vp, "chsize"); 2186 2187 node = VP_TO_TMPFS_NODE(vp); 2188 2189 /* Decide whether this is a valid operation based on the file type. 
*/ 2190 error = 0; 2191 switch (vp->v_type) { 2192 case VDIR: 2193 return (EISDIR); 2194 2195 case VREG: 2196 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2197 return (EROFS); 2198 break; 2199 2200 case VBLK: 2201 /* FALLTHROUGH */ 2202 case VCHR: 2203 /* FALLTHROUGH */ 2204 case VFIFO: 2205 /* 2206 * Allow modifications of special files even if in the file 2207 * system is mounted read-only (we are not modifying the 2208 * files themselves, but the objects they represent). 2209 */ 2210 return (0); 2211 2212 default: 2213 /* Anything else is unsupported. */ 2214 return (EOPNOTSUPP); 2215 } 2216 2217 /* Immutable or append-only files cannot be modified, either. */ 2218 if (node->tn_flags & (IMMUTABLE | APPEND)) 2219 return (EPERM); 2220 2221 error = vn_rlimit_trunc(size, td); 2222 if (error != 0) 2223 return (error); 2224 2225 error = tmpfs_truncate(vp, size); 2226 /* 2227 * tmpfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents 2228 * for us, as will update tn_status; no need to do that here. 2229 */ 2230 2231 ASSERT_VOP_ELOCKED(vp, "chsize2"); 2232 2233 return (error); 2234 } 2235 2236 /* 2237 * Change access and modification times of the given vnode. 2238 * Caller should execute tmpfs_update on vp after a successful execution. 2239 * The vnode must be locked on entry and remain locked on exit. 2240 */ 2241 int 2242 tmpfs_chtimes(struct vnode *vp, struct vattr *vap, 2243 struct ucred *cred, struct thread *td) 2244 { 2245 int error; 2246 struct tmpfs_node *node; 2247 2248 ASSERT_VOP_ELOCKED(vp, "chtimes"); 2249 2250 node = VP_TO_TMPFS_NODE(vp); 2251 2252 /* Disallow this operation if the file system is mounted read-only. */ 2253 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2254 return (EROFS); 2255 2256 /* Immutable or append-only files cannot be modified, either. 
*/ 2257 if (node->tn_flags & (IMMUTABLE | APPEND)) 2258 return (EPERM); 2259 2260 error = vn_utimes_perm(vp, vap, cred, td); 2261 if (error != 0) 2262 return (error); 2263 2264 if (vap->va_atime.tv_sec != VNOVAL) 2265 node->tn_accessed = true; 2266 if (vap->va_mtime.tv_sec != VNOVAL) 2267 node->tn_status |= TMPFS_NODE_MODIFIED; 2268 if (vap->va_birthtime.tv_sec != VNOVAL) 2269 node->tn_status |= TMPFS_NODE_MODIFIED; 2270 tmpfs_itimes(vp, &vap->va_atime, &vap->va_mtime); 2271 if (vap->va_birthtime.tv_sec != VNOVAL) 2272 node->tn_birthtime = vap->va_birthtime; 2273 ASSERT_VOP_ELOCKED(vp, "chtimes2"); 2274 2275 return (0); 2276 } 2277 2278 void 2279 tmpfs_set_status(struct tmpfs_mount *tm, struct tmpfs_node *node, int status) 2280 { 2281 2282 if ((node->tn_status & status) == status || tm->tm_ronly) 2283 return; 2284 TMPFS_NODE_LOCK(node); 2285 node->tn_status |= status; 2286 TMPFS_NODE_UNLOCK(node); 2287 } 2288 2289 void 2290 tmpfs_set_accessed(struct tmpfs_mount *tm, struct tmpfs_node *node) 2291 { 2292 if (node->tn_accessed || tm->tm_ronly) 2293 return; 2294 atomic_store_8(&node->tn_accessed, true); 2295 } 2296 2297 /* Sync timestamps */ 2298 void 2299 tmpfs_itimes(struct vnode *vp, const struct timespec *acc, 2300 const struct timespec *mod) 2301 { 2302 struct tmpfs_node *node; 2303 struct timespec now; 2304 2305 ASSERT_VOP_LOCKED(vp, "tmpfs_itimes"); 2306 node = VP_TO_TMPFS_NODE(vp); 2307 2308 if (!node->tn_accessed && 2309 (node->tn_status & (TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED)) == 0) 2310 return; 2311 2312 vfs_timestamp(&now); 2313 TMPFS_NODE_LOCK(node); 2314 if (node->tn_accessed) { 2315 if (acc == NULL) 2316 acc = &now; 2317 node->tn_atime = *acc; 2318 } 2319 if (node->tn_status & TMPFS_NODE_MODIFIED) { 2320 if (mod == NULL) 2321 mod = &now; 2322 node->tn_mtime = *mod; 2323 } 2324 if (node->tn_status & TMPFS_NODE_CHANGED) 2325 node->tn_ctime = now; 2326 node->tn_status &= ~(TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED); 2327 node->tn_accessed = false; 2328 
TMPFS_NODE_UNLOCK(node); 2329 2330 /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ 2331 random_harvest_queue(node, sizeof(*node), RANDOM_FS_ATIME); 2332 } 2333 2334 int 2335 tmpfs_truncate(struct vnode *vp, off_t length) 2336 { 2337 struct tmpfs_node *node; 2338 int error; 2339 2340 if (length < 0) 2341 return (EINVAL); 2342 if (length > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) 2343 return (EFBIG); 2344 2345 node = VP_TO_TMPFS_NODE(vp); 2346 error = node->tn_size == length ? 0 : tmpfs_reg_resize(vp, length, 2347 FALSE); 2348 if (error == 0) 2349 node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; 2350 tmpfs_update(vp); 2351 2352 return (error); 2353 } 2354 2355 static __inline int 2356 tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b) 2357 { 2358 if (a->td_hash > b->td_hash) 2359 return (1); 2360 else if (a->td_hash < b->td_hash) 2361 return (-1); 2362 return (0); 2363 } 2364 2365 RB_GENERATE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); 2366