/* $NetBSD: tmpfs_subr.c,v 1.35 2007/07/09 21:10:50 ad Exp $ */

/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2005 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
 * 2005 program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Efficient memory file system supporting functions.
37 */ 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/dirent.h> 42 #include <sys/fnv_hash.h> 43 #include <sys/lock.h> 44 #include <sys/limits.h> 45 #include <sys/mount.h> 46 #include <sys/namei.h> 47 #include <sys/priv.h> 48 #include <sys/proc.h> 49 #include <sys/random.h> 50 #include <sys/refcount.h> 51 #include <sys/rwlock.h> 52 #include <sys/smr.h> 53 #include <sys/stat.h> 54 #include <sys/sysctl.h> 55 #include <sys/user.h> 56 #include <sys/vnode.h> 57 #include <sys/vmmeter.h> 58 59 #include <vm/vm.h> 60 #include <vm/vm_param.h> 61 #include <vm/vm_object.h> 62 #include <vm/vm_page.h> 63 #include <vm/vm_pageout.h> 64 #include <vm/vm_pager.h> 65 #include <vm/vm_extern.h> 66 #include <vm/swap_pager.h> 67 #include <vm/uma.h> 68 69 #include <fs/tmpfs/tmpfs.h> 70 #include <fs/tmpfs/tmpfs_fifoops.h> 71 #include <fs/tmpfs/tmpfs_vnops.h> 72 73 SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 74 "tmpfs file system"); 75 76 static long tmpfs_pages_reserved = TMPFS_PAGES_MINRESERVED; 77 static long tmpfs_pages_avail_init; 78 static int tmpfs_mem_percent = TMPFS_MEM_PERCENT; 79 static void tmpfs_set_reserve_from_percent(void); 80 81 MALLOC_DEFINE(M_TMPFSDIR, "tmpfs dir", "tmpfs dirent structure"); 82 static uma_zone_t tmpfs_node_pool; 83 VFS_SMR_DECLARE; 84 85 int tmpfs_pager_type = -1; 86 87 static vm_object_t 88 tmpfs_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, 89 vm_ooffset_t offset, struct ucred *cred) 90 { 91 vm_object_t object; 92 93 MPASS(handle == NULL); 94 MPASS(offset == 0); 95 object = vm_object_allocate_dyn(tmpfs_pager_type, size, 96 OBJ_COLORED | OBJ_SWAP); 97 if (!swap_pager_init_object(object, NULL, NULL, size, 0)) { 98 vm_object_deallocate(object); 99 object = NULL; 100 } 101 return (object); 102 } 103 104 /* 105 * Make sure tmpfs vnodes with writable mappings can be found on the lazy list. 106 * 107 * This allows for periodic mtime updates while only scanning vnodes which are 108 * plausibly dirty, see tmpfs_update_mtime_lazy. 109 */ 110 static void 111 tmpfs_pager_writecount_recalc(vm_object_t object, vm_offset_t old, 112 vm_offset_t new) 113 { 114 struct vnode *vp; 115 116 VM_OBJECT_ASSERT_WLOCKED(object); 117 118 vp = VM_TO_TMPFS_VP(object); 119 120 /* 121 * Forced unmount? 
122 */ 123 if (vp == NULL || vp->v_object == NULL) { 124 KASSERT((object->flags & OBJ_TMPFS_VREF) == 0, 125 ("object %p with OBJ_TMPFS_VREF but without vnode", 126 object)); 127 VM_OBJECT_WUNLOCK(object); 128 return; 129 } 130 131 if (old == 0) { 132 VNASSERT((object->flags & OBJ_TMPFS_VREF) == 0, vp, 133 ("object without writable mappings has a reference")); 134 VNPASS(vp->v_usecount > 0, vp); 135 } else { 136 VNASSERT((object->flags & OBJ_TMPFS_VREF) != 0, vp, 137 ("object with writable mappings does not " 138 "have a reference")); 139 } 140 141 if (old == new) { 142 VM_OBJECT_WUNLOCK(object); 143 return; 144 } 145 146 if (new == 0) { 147 vm_object_clear_flag(object, OBJ_TMPFS_VREF); 148 VM_OBJECT_WUNLOCK(object); 149 vrele(vp); 150 } else { 151 if ((object->flags & OBJ_TMPFS_VREF) == 0) { 152 vref(vp); 153 vlazy(vp); 154 vm_object_set_flag(object, OBJ_TMPFS_VREF); 155 } 156 VM_OBJECT_WUNLOCK(object); 157 } 158 } 159 160 static void 161 tmpfs_pager_update_writecount(vm_object_t object, vm_offset_t start, 162 vm_offset_t end) 163 { 164 vm_offset_t new, old; 165 166 VM_OBJECT_WLOCK(object); 167 KASSERT((object->flags & OBJ_ANON) == 0, 168 ("%s: object %p with OBJ_ANON", __func__, object)); 169 old = object->un_pager.swp.writemappings; 170 object->un_pager.swp.writemappings += (vm_ooffset_t)end - start; 171 new = object->un_pager.swp.writemappings; 172 tmpfs_pager_writecount_recalc(object, old, new); 173 VM_OBJECT_ASSERT_UNLOCKED(object); 174 } 175 176 static void 177 tmpfs_pager_release_writecount(vm_object_t object, vm_offset_t start, 178 vm_offset_t end) 179 { 180 vm_offset_t new, old; 181 182 VM_OBJECT_WLOCK(object); 183 KASSERT((object->flags & OBJ_ANON) == 0, 184 ("%s: object %p with OBJ_ANON", __func__, object)); 185 old = object->un_pager.swp.writemappings; 186 KASSERT(old >= (vm_ooffset_t)end - start, 187 ("tmpfs obj %p writecount %jx dec %jx", object, (uintmax_t)old, 188 (uintmax_t)((vm_ooffset_t)end - start))); 189 object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start; 190 new = object->un_pager.swp.writemappings; 191 tmpfs_pager_writecount_recalc(object, old, new); 192 VM_OBJECT_ASSERT_UNLOCKED(object); 193 } 194 195 static void 196 tmpfs_pager_getvp(vm_object_t object, struct vnode **vpp, bool *vp_heldp) 197 { 198 struct vnode *vp; 199 200 /* 201 * Tmpfs VREG node, which was reclaimed, has tmpfs_pager_type 202 * type. In this case there is no v_writecount to adjust. 
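	 *
	 * As a sketch of the write-mapping accounting implemented above
	 * (illustrative only, not a code path in this function): when the
	 * first writable mapping of a tmpfs-backed object is created,
	 * tmpfs_pager_update_writecount() raises writemappings from zero and
	 * tmpfs_pager_writecount_recalc() then effectively does
	 *
	 *	vref(vp); vlazy(vp);
	 *	vm_object_set_flag(object, OBJ_TMPFS_VREF);
	 *
	 * while tmpfs_pager_release_writecount() dropping the last writable
	 * mapping reverses it:
	 *
	 *	vm_object_clear_flag(object, OBJ_TMPFS_VREF);
	 *	vrele(vp);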
203 */ 204 if (vp_heldp != NULL) 205 VM_OBJECT_RLOCK(object); 206 else 207 VM_OBJECT_ASSERT_LOCKED(object); 208 if ((object->flags & OBJ_TMPFS) != 0) { 209 vp = VM_TO_TMPFS_VP(object); 210 if (vp != NULL) { 211 *vpp = vp; 212 if (vp_heldp != NULL) { 213 vhold(vp); 214 *vp_heldp = true; 215 } 216 } 217 } 218 if (vp_heldp != NULL) 219 VM_OBJECT_RUNLOCK(object); 220 } 221 222 static void 223 tmpfs_pager_freespace(vm_object_t obj, vm_pindex_t start, vm_size_t size) 224 { 225 struct tmpfs_node *node; 226 struct tmpfs_mount *tm; 227 vm_size_t c; 228 229 swap_pager_freespace(obj, start, size, &c); 230 if ((obj->flags & OBJ_TMPFS) == 0 || c == 0) 231 return; 232 233 node = obj->un_pager.swp.swp_priv; 234 MPASS(node->tn_type == VREG); 235 tm = node->tn_reg.tn_tmp; 236 237 KASSERT(tm->tm_pages_used >= c, 238 ("tmpfs tm %p pages %jd free %jd", tm, 239 (uintmax_t)tm->tm_pages_used, (uintmax_t)c)); 240 atomic_add_long(&tm->tm_pages_used, -c); 241 KASSERT(node->tn_reg.tn_pages >= c, 242 ("tmpfs node %p pages %jd free %jd", node, 243 (uintmax_t)node->tn_reg.tn_pages, (uintmax_t)c)); 244 node->tn_reg.tn_pages -= c; 245 } 246 247 static void 248 tmpfs_page_inserted(vm_object_t obj, vm_page_t m) 249 { 250 struct tmpfs_node *node; 251 struct tmpfs_mount *tm; 252 253 if ((obj->flags & OBJ_TMPFS) == 0) 254 return; 255 256 node = obj->un_pager.swp.swp_priv; 257 MPASS(node->tn_type == VREG); 258 tm = node->tn_reg.tn_tmp; 259 260 if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) { 261 atomic_add_long(&tm->tm_pages_used, 1); 262 node->tn_reg.tn_pages += 1; 263 } 264 } 265 266 static void 267 tmpfs_page_removed(vm_object_t obj, vm_page_t m) 268 { 269 struct tmpfs_node *node; 270 struct tmpfs_mount *tm; 271 272 if ((obj->flags & OBJ_TMPFS) == 0) 273 return; 274 275 node = obj->un_pager.swp.swp_priv; 276 MPASS(node->tn_type == VREG); 277 tm = node->tn_reg.tn_tmp; 278 279 if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) { 280 KASSERT(tm->tm_pages_used >= 1, 281 ("tmpfs tm %p pages %jd free 1", tm, 282 (uintmax_t)tm->tm_pages_used)); 283 atomic_add_long(&tm->tm_pages_used, -1); 284 KASSERT(node->tn_reg.tn_pages >= 1, 285 ("tmpfs node %p pages %jd free 1", node, 286 (uintmax_t)node->tn_reg.tn_pages)); 287 node->tn_reg.tn_pages -= 1; 288 } 289 } 290 291 static boolean_t 292 tmpfs_can_alloc_page(vm_object_t obj, vm_pindex_t pindex) 293 { 294 struct tmpfs_mount *tm; 295 296 tm = VM_TO_TMPFS_MP(obj); 297 if (tm == NULL || vm_pager_has_page(obj, pindex, NULL, NULL) || 298 tm->tm_pages_max == 0) 299 return (true); 300 if (tm->tm_pages_max == ULONG_MAX) 301 return (tmpfs_mem_avail() >= 1); 302 return (tm->tm_pages_max > atomic_load_long(&tm->tm_pages_used)); 303 } 304 305 struct pagerops tmpfs_pager_ops = { 306 .pgo_kvme_type = KVME_TYPE_VNODE, 307 .pgo_alloc = tmpfs_pager_alloc, 308 .pgo_set_writeable_dirty = vm_object_set_writeable_dirty_, 309 .pgo_update_writecount = tmpfs_pager_update_writecount, 310 .pgo_release_writecount = tmpfs_pager_release_writecount, 311 .pgo_mightbedirty = vm_object_mightbedirty_, 312 .pgo_getvp = tmpfs_pager_getvp, 313 .pgo_freespace = tmpfs_pager_freespace, 314 .pgo_page_inserted = tmpfs_page_inserted, 315 .pgo_page_removed = tmpfs_page_removed, 316 .pgo_can_alloc_page = tmpfs_can_alloc_page, 317 }; 318 319 static int 320 tmpfs_node_ctor(void *mem, int size, void *arg, int flags) 321 { 322 struct tmpfs_node *node; 323 324 node = mem; 325 node->tn_gen++; 326 node->tn_size = 0; 327 node->tn_status = 0; 328 node->tn_accessed = false; 329 node->tn_flags = 0; 330 node->tn_links = 0; 331 
node->tn_vnode = NULL; 332 node->tn_vpstate = 0; 333 return (0); 334 } 335 336 static void 337 tmpfs_node_dtor(void *mem, int size, void *arg) 338 { 339 struct tmpfs_node *node; 340 341 node = mem; 342 node->tn_type = VNON; 343 } 344 345 static int 346 tmpfs_node_init(void *mem, int size, int flags) 347 { 348 struct tmpfs_node *node; 349 350 node = mem; 351 node->tn_id = 0; 352 mtx_init(&node->tn_interlock, "tmpfsni", NULL, MTX_DEF | MTX_NEW); 353 node->tn_gen = arc4random(); 354 return (0); 355 } 356 357 static void 358 tmpfs_node_fini(void *mem, int size) 359 { 360 struct tmpfs_node *node; 361 362 node = mem; 363 mtx_destroy(&node->tn_interlock); 364 } 365 366 int 367 tmpfs_subr_init(void) 368 { 369 tmpfs_pager_type = vm_pager_alloc_dyn_type(&tmpfs_pager_ops, 370 OBJT_SWAP); 371 if (tmpfs_pager_type == -1) 372 return (EINVAL); 373 tmpfs_node_pool = uma_zcreate("TMPFS node", 374 sizeof(struct tmpfs_node), tmpfs_node_ctor, tmpfs_node_dtor, 375 tmpfs_node_init, tmpfs_node_fini, UMA_ALIGN_PTR, 0); 376 VFS_SMR_ZONE_SET(tmpfs_node_pool); 377 378 tmpfs_pages_avail_init = tmpfs_mem_avail(); 379 tmpfs_set_reserve_from_percent(); 380 return (0); 381 } 382 383 void 384 tmpfs_subr_uninit(void) 385 { 386 if (tmpfs_pager_type != -1) 387 vm_pager_free_dyn_type(tmpfs_pager_type); 388 tmpfs_pager_type = -1; 389 uma_zdestroy(tmpfs_node_pool); 390 } 391 392 static int 393 sysctl_mem_reserved(SYSCTL_HANDLER_ARGS) 394 { 395 int error; 396 long pages, bytes; 397 398 pages = *(long *)arg1; 399 bytes = pages * PAGE_SIZE; 400 401 error = sysctl_handle_long(oidp, &bytes, 0, req); 402 if (error || !req->newptr) 403 return (error); 404 405 pages = bytes / PAGE_SIZE; 406 if (pages < TMPFS_PAGES_MINRESERVED) 407 return (EINVAL); 408 409 *(long *)arg1 = pages; 410 return (0); 411 } 412 413 SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_reserved, 414 CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &tmpfs_pages_reserved, 0, 415 sysctl_mem_reserved, "L", 416 "Amount of available memory and swap below which tmpfs growth stops"); 417 418 static int 419 sysctl_mem_percent(SYSCTL_HANDLER_ARGS) 420 { 421 int error, percent; 422 423 percent = *(int *)arg1; 424 error = sysctl_handle_int(oidp, &percent, 0, req); 425 if (error || !req->newptr) 426 return (error); 427 428 if ((unsigned) percent > 100) 429 return (EINVAL); 430 431 *(int *)arg1 = percent; 432 tmpfs_set_reserve_from_percent(); 433 return (0); 434 } 435 436 static void 437 tmpfs_set_reserve_from_percent(void) 438 { 439 size_t reserved; 440 441 reserved = tmpfs_pages_avail_init * (100 - tmpfs_mem_percent) / 100; 442 tmpfs_pages_reserved = max(reserved, TMPFS_PAGES_MINRESERVED); 443 } 444 445 SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_percent, 446 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, &tmpfs_mem_percent, 0, 447 sysctl_mem_percent, "I", 448 "Percent of available memory that can be used if no size limit"); 449 450 static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a, 451 struct tmpfs_dirent *b); 452 RB_PROTOTYPE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); 453 454 size_t 455 tmpfs_mem_avail(void) 456 { 457 size_t avail; 458 long reserved; 459 460 avail = swap_pager_avail + vm_free_count(); 461 reserved = atomic_load_long(&tmpfs_pages_reserved); 462 if (__predict_false(avail < reserved)) 463 return (0); 464 return (avail - reserved); 465 } 466 467 size_t 468 tmpfs_pages_used(struct tmpfs_mount *tmp) 469 { 470 const size_t node_size = sizeof(struct tmpfs_node) + 471 sizeof(struct tmpfs_dirent); 472 size_t meta_pages; 473 474 meta_pages = 
	    howmany((uintmax_t)tmp->tm_nodes_inuse * node_size, PAGE_SIZE);
	return (meta_pages + tmp->tm_pages_used);
}

bool
tmpfs_pages_check_avail(struct tmpfs_mount *tmp, size_t req_pages)
{
	if (tmpfs_mem_avail() < req_pages)
		return (false);

	if (tmp->tm_pages_max != ULONG_MAX &&
	    tmp->tm_pages_max < req_pages + tmpfs_pages_used(tmp))
		return (false);

	return (true);
}

static int
tmpfs_partial_page_invalidate(vm_object_t object, vm_pindex_t idx, int base,
    int end, boolean_t ignerr)
{
	int error;

	error = vm_page_grab_zero_partial(object, idx, base, end);
	if (ignerr)
		error = 0;
	return (error);
}

void
tmpfs_ref_node(struct tmpfs_node *node)
{
#ifdef INVARIANTS
	u_int old;

	old =
#endif
	refcount_acquire(&node->tn_refcount);
#ifdef INVARIANTS
	KASSERT(old > 0, ("node %p zero refcount", node));
#endif
}

/*
 * Allocates a new node of type 'type' inside the 'tmp' mount point, with
 * its owner set to 'uid', its group to 'gid' and its mode set to 'mode',
 * using the credentials of the process 'p'.
 *
 * If the node type is set to 'VDIR', then the parent parameter must point
 * to the parent directory of the node being created. It may only be NULL
 * while allocating the root node.
 *
 * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter
 * specifies the device the node represents.
 *
 * If the node type is set to 'VLNK', then the parameter target specifies
 * the file name of the target file for the symbolic link that is being
 * created.
 *
 * Note that new nodes are retrieved from the available list if it has
 * items or, if it is empty, from the node pool as long as there is enough
 * space to create them.
 *
 * Returns zero on success or an appropriate error code on failure.
 */
int
tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, __enum_uint8(vtype) type,
    uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent,
    const char *target, dev_t rdev, struct tmpfs_node **node)
{
	struct tmpfs_node *nnode;
	char *symlink;
	char symlink_smr;

	/* If the root directory of the 'tmp' file system is not yet
	 * allocated, this must be the request to do it. */
	MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR));

	MPASS((type == VLNK) ^ (target == NULL));
	MPASS((type == VBLK || type == VCHR) ^ (rdev == VNOVAL));

	if (tmp->tm_nodes_inuse >= tmp->tm_nodes_max)
		return (ENOSPC);
	if (!tmpfs_pages_check_avail(tmp, 1))
		return (ENOSPC);

	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
		/*
		 * When a new tmpfs node is created for a fully
		 * constructed mount point, there must be a parent
		 * node, whose vnode is locked exclusively.  As a
		 * consequence, if the unmount is executing in
		 * parallel, vflush() cannot reclaim the parent vnode.
		 * Due to this, the check for the MNTK_UNMOUNT flag is
		 * not racy: if we did not see the MNTK_UNMOUNT flag,
		 * then tmp cannot be destroyed until node construction
		 * is finished and the parent vnode unlocked.
		 *
		 * Tmpfs does not need to instantiate new nodes during
		 * unmount.
		 */
		return (EBUSY);
	}
	if ((mp->mnt_kern_flag & MNT_RDONLY) != 0)
		return (EROFS);

	nnode = uma_zalloc_smr(tmpfs_node_pool, M_WAITOK);

	/* Generic initialization.
*/ 584 nnode->tn_type = type; 585 vfs_timestamp(&nnode->tn_atime); 586 nnode->tn_birthtime = nnode->tn_ctime = nnode->tn_mtime = 587 nnode->tn_atime; 588 nnode->tn_uid = uid; 589 nnode->tn_gid = gid; 590 nnode->tn_mode = mode; 591 nnode->tn_id = alloc_unr64(&tmp->tm_ino_unr); 592 nnode->tn_refcount = 1; 593 LIST_INIT(&nnode->tn_extattrs); 594 595 /* Type-specific initialization. */ 596 switch (nnode->tn_type) { 597 case VBLK: 598 case VCHR: 599 nnode->tn_rdev = rdev; 600 break; 601 602 case VDIR: 603 RB_INIT(&nnode->tn_dir.tn_dirhead); 604 LIST_INIT(&nnode->tn_dir.tn_dupindex); 605 MPASS(parent != nnode); 606 MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL)); 607 nnode->tn_dir.tn_parent = (parent == NULL) ? nnode : parent; 608 nnode->tn_dir.tn_readdir_lastn = 0; 609 nnode->tn_dir.tn_readdir_lastp = NULL; 610 nnode->tn_dir.tn_wht_size = 0; 611 nnode->tn_links++; 612 TMPFS_NODE_LOCK(nnode->tn_dir.tn_parent); 613 nnode->tn_dir.tn_parent->tn_links++; 614 TMPFS_NODE_UNLOCK(nnode->tn_dir.tn_parent); 615 break; 616 617 case VFIFO: 618 /* FALLTHROUGH */ 619 case VSOCK: 620 break; 621 622 case VLNK: 623 MPASS(strlen(target) < MAXPATHLEN); 624 nnode->tn_size = strlen(target); 625 626 symlink = NULL; 627 if (!tmp->tm_nonc) { 628 symlink = cache_symlink_alloc(nnode->tn_size + 1, 629 M_WAITOK); 630 symlink_smr = true; 631 } 632 if (symlink == NULL) { 633 symlink = malloc(nnode->tn_size + 1, M_TMPFSNAME, 634 M_WAITOK); 635 symlink_smr = false; 636 } 637 memcpy(symlink, target, nnode->tn_size + 1); 638 639 /* 640 * Allow safe symlink resolving for lockless lookup. 641 * tmpfs_fplookup_symlink references this comment. 642 * 643 * 1. nnode is not yet visible to the world 644 * 2. both tn_link_target and tn_link_smr get populated 645 * 3. release fence publishes their content 646 * 4. tn_link_target content is immutable until node 647 * destruction, where the pointer gets set to NULL 648 * 5. tn_link_smr is never changed once set 649 * 650 * As a result it is sufficient to issue load consume 651 * on the node pointer to also get the above content 652 * in a stable manner. Worst case tn_link_smr flag 653 * may be set to true despite being stale, while the 654 * target buffer is already cleared out. 655 */ 656 atomic_store_ptr(&nnode->tn_link_target, symlink); 657 atomic_store_char((char *)&nnode->tn_link_smr, symlink_smr); 658 atomic_thread_fence_rel(); 659 break; 660 661 case VREG: 662 nnode->tn_reg.tn_aobj = 663 vm_pager_allocate(tmpfs_pager_type, NULL, 0, 664 VM_PROT_DEFAULT, 0, 665 NULL /* XXXKIB - tmpfs needs swap reservation */); 666 nnode->tn_reg.tn_aobj->un_pager.swp.swp_priv = nnode; 667 vm_object_set_flag(nnode->tn_reg.tn_aobj, OBJ_TMPFS); 668 nnode->tn_reg.tn_tmp = tmp; 669 nnode->tn_reg.tn_pages = 0; 670 break; 671 672 default: 673 panic("tmpfs_alloc_node: type %p %d", nnode, 674 (int)nnode->tn_type); 675 } 676 677 TMPFS_LOCK(tmp); 678 LIST_INSERT_HEAD(&tmp->tm_nodes_used, nnode, tn_entries); 679 nnode->tn_attached = true; 680 tmp->tm_nodes_inuse++; 681 tmp->tm_refcount++; 682 TMPFS_UNLOCK(tmp); 683 684 *node = nnode; 685 return (0); 686 } 687 688 /* 689 * Destroys the node pointed to by node from the file system 'tmp'. 690 * If the node references a directory, no entries are allowed. 
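 *
 * An illustrative sketch of how this pairs with tmpfs_alloc_node() (the
 * local variable names are hypothetical; tmpfs_alloc_file() below is the
 * canonical caller).  A node starts with tn_refcount == 1:
 *
 *	error = tmpfs_alloc_node(mp, tmp, VREG, uid, gid, mode, NULL,
 *	    NULL, VNOVAL, &node);
 *	if (error == 0)
 *		error = tmpfs_alloc_vp(mp, node, LK_EXCLUSIVE, &vp);
 *	if (error != 0)
 *		tmpfs_free_node(tmp, node);
 *
 * tmpfs_free_node() only tears the node down once tn_refcount drops to
 * zero; earlier calls merely release a reference.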
691 */ 692 void 693 tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node) 694 { 695 if (refcount_release_if_not_last(&node->tn_refcount)) 696 return; 697 698 TMPFS_LOCK(tmp); 699 TMPFS_NODE_LOCK(node); 700 if (!tmpfs_free_node_locked(tmp, node, false)) { 701 TMPFS_NODE_UNLOCK(node); 702 TMPFS_UNLOCK(tmp); 703 } 704 } 705 706 bool 707 tmpfs_free_node_locked(struct tmpfs_mount *tmp, struct tmpfs_node *node, 708 bool detach) 709 { 710 struct tmpfs_extattr *ea; 711 vm_object_t uobj; 712 char *symlink; 713 bool last; 714 715 TMPFS_MP_ASSERT_LOCKED(tmp); 716 TMPFS_NODE_ASSERT_LOCKED(node); 717 718 last = refcount_release(&node->tn_refcount); 719 if (node->tn_attached && (detach || last)) { 720 MPASS(tmp->tm_nodes_inuse > 0); 721 tmp->tm_nodes_inuse--; 722 LIST_REMOVE(node, tn_entries); 723 node->tn_attached = false; 724 } 725 if (!last) 726 return (false); 727 728 TMPFS_NODE_UNLOCK(node); 729 730 #ifdef INVARIANTS 731 MPASS(node->tn_vnode == NULL); 732 MPASS((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0); 733 734 /* 735 * Make sure this is a node type we can deal with. Everything 736 * is explicitly enumerated without the 'default' clause so 737 * the compiler can throw an error in case a new type is 738 * added. 739 */ 740 switch (node->tn_type) { 741 case VBLK: 742 case VCHR: 743 case VDIR: 744 case VFIFO: 745 case VSOCK: 746 case VLNK: 747 case VREG: 748 break; 749 case VNON: 750 case VBAD: 751 case VMARKER: 752 panic("%s: bad type %d for node %p", __func__, 753 (int)node->tn_type, node); 754 } 755 #endif 756 757 while ((ea = LIST_FIRST(&node->tn_extattrs)) != NULL) { 758 LIST_REMOVE(ea, ea_extattrs); 759 tmpfs_extattr_free(ea); 760 } 761 762 switch (node->tn_type) { 763 case VREG: 764 uobj = node->tn_reg.tn_aobj; 765 node->tn_reg.tn_aobj = NULL; 766 if (uobj != NULL) { 767 VM_OBJECT_WLOCK(uobj); 768 KASSERT((uobj->flags & OBJ_TMPFS) != 0, 769 ("tmpfs node %p uobj %p not tmpfs", node, uobj)); 770 vm_object_clear_flag(uobj, OBJ_TMPFS); 771 KASSERT(tmp->tm_pages_used >= node->tn_reg.tn_pages, 772 ("tmpfs tmp %p node %p pages %jd free %jd", tmp, 773 node, (uintmax_t)tmp->tm_pages_used, 774 (uintmax_t)node->tn_reg.tn_pages)); 775 atomic_add_long(&tmp->tm_pages_used, 776 -node->tn_reg.tn_pages); 777 VM_OBJECT_WUNLOCK(uobj); 778 } 779 tmpfs_free_tmp(tmp); 780 781 /* 782 * vm_object_deallocate() must not be called while 783 * owning tm_allnode_lock, because deallocate might 784 * sleep. Call it after tmpfs_free_tmp() does the 785 * unlock. 
786 */ 787 if (uobj != NULL) 788 vm_object_deallocate(uobj); 789 790 break; 791 case VLNK: 792 tmpfs_free_tmp(tmp); 793 794 symlink = node->tn_link_target; 795 atomic_store_ptr(&node->tn_link_target, NULL); 796 if (atomic_load_char(&node->tn_link_smr)) { 797 cache_symlink_free(symlink, node->tn_size + 1); 798 } else { 799 free(symlink, M_TMPFSNAME); 800 } 801 break; 802 default: 803 tmpfs_free_tmp(tmp); 804 break; 805 } 806 807 uma_zfree_smr(tmpfs_node_pool, node); 808 return (true); 809 } 810 811 static __inline uint32_t 812 tmpfs_dirent_hash(const char *name, u_int len) 813 { 814 uint32_t hash; 815 816 hash = fnv_32_buf(name, len, FNV1_32_INIT + len) & TMPFS_DIRCOOKIE_MASK; 817 #ifdef TMPFS_DEBUG_DIRCOOKIE_DUP 818 hash &= 0xf; 819 #endif 820 if (hash < TMPFS_DIRCOOKIE_MIN) 821 hash += TMPFS_DIRCOOKIE_MIN; 822 823 return (hash); 824 } 825 826 static __inline off_t 827 tmpfs_dirent_cookie(struct tmpfs_dirent *de) 828 { 829 if (de == NULL) 830 return (TMPFS_DIRCOOKIE_EOF); 831 832 MPASS(de->td_cookie >= TMPFS_DIRCOOKIE_MIN); 833 834 return (de->td_cookie); 835 } 836 837 static __inline boolean_t 838 tmpfs_dirent_dup(struct tmpfs_dirent *de) 839 { 840 return ((de->td_cookie & TMPFS_DIRCOOKIE_DUP) != 0); 841 } 842 843 static __inline boolean_t 844 tmpfs_dirent_duphead(struct tmpfs_dirent *de) 845 { 846 return ((de->td_cookie & TMPFS_DIRCOOKIE_DUPHEAD) != 0); 847 } 848 849 void 850 tmpfs_dirent_init(struct tmpfs_dirent *de, const char *name, u_int namelen) 851 { 852 de->td_hash = de->td_cookie = tmpfs_dirent_hash(name, namelen); 853 memcpy(de->ud.td_name, name, namelen); 854 de->td_namelen = namelen; 855 } 856 857 /* 858 * Allocates a new directory entry for the node node with a name of name. 859 * The new directory entry is returned in *de. 860 * 861 * The link count of node is increased by one to reflect the new object 862 * referencing it. 863 * 864 * Returns zero on success or an appropriate error code on failure. 865 */ 866 int 867 tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node, 868 const char *name, u_int len, struct tmpfs_dirent **de) 869 { 870 struct tmpfs_dirent *nde; 871 872 nde = malloc(sizeof(*nde), M_TMPFSDIR, M_WAITOK); 873 nde->td_node = node; 874 if (name != NULL) { 875 nde->ud.td_name = malloc(len, M_TMPFSNAME, M_WAITOK); 876 tmpfs_dirent_init(nde, name, len); 877 } else 878 nde->td_namelen = 0; 879 if (node != NULL) 880 node->tn_links++; 881 882 *de = nde; 883 884 return (0); 885 } 886 887 /* 888 * Frees a directory entry. It is the caller's responsibility to destroy 889 * the node referenced by it if needed. 890 * 891 * The link count of node is decreased by one to reflect the removal of an 892 * object that referenced it. This only happens if 'node_exists' is true; 893 * otherwise the function will not access the node referred to by the 894 * directory entry, as it may already have been released from the outside. 
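 *
 * For example (illustrative; the hard-link path itself lives outside this
 * file): two dirents pointing at the same freshly created non-directory
 * node,
 *
 *	tmpfs_alloc_dirent(tmp, node, "a", 1, &de1);
 *	tmpfs_alloc_dirent(tmp, node, "b", 1, &de2);
 *
 * leave the node with tn_links == 2, and each later tmpfs_free_dirent()
 * call on one of them drops a single link again.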
895 */ 896 void 897 tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de) 898 { 899 struct tmpfs_node *node; 900 901 node = de->td_node; 902 if (node != NULL) { 903 MPASS(node->tn_links > 0); 904 node->tn_links--; 905 } 906 if (!tmpfs_dirent_duphead(de) && de->ud.td_name != NULL) 907 free(de->ud.td_name, M_TMPFSNAME); 908 free(de, M_TMPFSDIR); 909 } 910 911 void 912 tmpfs_destroy_vobject(struct vnode *vp, vm_object_t obj) 913 { 914 bool want_vrele; 915 916 ASSERT_VOP_ELOCKED(vp, "tmpfs_destroy_vobject"); 917 if (vp->v_type != VREG || obj == NULL) 918 return; 919 920 VM_OBJECT_WLOCK(obj); 921 VI_LOCK(vp); 922 vp->v_object = NULL; 923 924 /* 925 * May be going through forced unmount. 926 */ 927 want_vrele = false; 928 if ((obj->flags & OBJ_TMPFS_VREF) != 0) { 929 vm_object_clear_flag(obj, OBJ_TMPFS_VREF); 930 want_vrele = true; 931 } 932 933 if (vp->v_writecount < 0) 934 vp->v_writecount = 0; 935 VI_UNLOCK(vp); 936 VM_OBJECT_WUNLOCK(obj); 937 if (want_vrele) { 938 vrele(vp); 939 } 940 } 941 942 /* 943 * Allocates a new vnode for the node node or returns a new reference to 944 * an existing one if the node had already a vnode referencing it. The 945 * resulting locked vnode is returned in *vpp. 946 * 947 * Returns zero on success or an appropriate error code on failure. 948 */ 949 int 950 tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, int lkflag, 951 struct vnode **vpp) 952 { 953 struct vnode *vp; 954 enum vgetstate vs; 955 struct tmpfs_mount *tm; 956 vm_object_t object; 957 int error; 958 959 error = 0; 960 tm = VFS_TO_TMPFS(mp); 961 TMPFS_NODE_LOCK(node); 962 tmpfs_ref_node(node); 963 loop: 964 TMPFS_NODE_ASSERT_LOCKED(node); 965 if ((vp = node->tn_vnode) != NULL) { 966 MPASS((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0); 967 if ((node->tn_type == VDIR && node->tn_dir.tn_parent == NULL) || 968 (VN_IS_DOOMED(vp) && 969 (lkflag & LK_NOWAIT) != 0)) { 970 TMPFS_NODE_UNLOCK(node); 971 error = ENOENT; 972 vp = NULL; 973 goto out; 974 } 975 if (VN_IS_DOOMED(vp)) { 976 node->tn_vpstate |= TMPFS_VNODE_WRECLAIM; 977 while ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) { 978 msleep(&node->tn_vnode, TMPFS_NODE_MTX(node), 979 0, "tmpfsE", 0); 980 } 981 goto loop; 982 } 983 vs = vget_prep(vp); 984 TMPFS_NODE_UNLOCK(node); 985 error = vget_finish(vp, lkflag, vs); 986 if (error == ENOENT) { 987 TMPFS_NODE_LOCK(node); 988 goto loop; 989 } 990 if (error != 0) { 991 vp = NULL; 992 goto out; 993 } 994 995 /* 996 * Make sure the vnode is still there after 997 * getting the interlock to avoid racing a free. 998 */ 999 if (node->tn_vnode != vp) { 1000 vput(vp); 1001 TMPFS_NODE_LOCK(node); 1002 goto loop; 1003 } 1004 1005 goto out; 1006 } 1007 1008 if ((node->tn_vpstate & TMPFS_VNODE_DOOMED) || 1009 (node->tn_type == VDIR && node->tn_dir.tn_parent == NULL)) { 1010 TMPFS_NODE_UNLOCK(node); 1011 error = ENOENT; 1012 vp = NULL; 1013 goto out; 1014 } 1015 1016 /* 1017 * otherwise lock the vp list while we call getnewvnode 1018 * since that can block. 1019 */ 1020 if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) { 1021 node->tn_vpstate |= TMPFS_VNODE_WANT; 1022 error = msleep((caddr_t) &node->tn_vpstate, 1023 TMPFS_NODE_MTX(node), 0, "tmpfs_alloc_vp", 0); 1024 if (error != 0) 1025 goto out; 1026 goto loop; 1027 } else 1028 node->tn_vpstate |= TMPFS_VNODE_ALLOCATING; 1029 1030 TMPFS_NODE_UNLOCK(node); 1031 1032 /* Get a new vnode and associate it with our node. */ 1033 error = getnewvnode("tmpfs", mp, VFS_TO_TMPFS(mp)->tm_nonc ? 
1034 &tmpfs_vnodeop_nonc_entries : &tmpfs_vnodeop_entries, &vp); 1035 if (error != 0) 1036 goto unlock; 1037 MPASS(vp != NULL); 1038 1039 /* lkflag is ignored, the lock is exclusive */ 1040 (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1041 1042 vp->v_data = node; 1043 vp->v_type = node->tn_type; 1044 1045 /* Type-specific initialization. */ 1046 switch (node->tn_type) { 1047 case VBLK: 1048 /* FALLTHROUGH */ 1049 case VCHR: 1050 /* FALLTHROUGH */ 1051 case VLNK: 1052 /* FALLTHROUGH */ 1053 case VSOCK: 1054 break; 1055 case VFIFO: 1056 vp->v_op = &tmpfs_fifoop_entries; 1057 break; 1058 case VREG: 1059 object = node->tn_reg.tn_aobj; 1060 VM_OBJECT_WLOCK(object); 1061 KASSERT((object->flags & OBJ_TMPFS_VREF) == 0, 1062 ("%s: object %p with OBJ_TMPFS_VREF but without vnode", 1063 __func__, object)); 1064 VI_LOCK(vp); 1065 KASSERT(vp->v_object == NULL, ("Not NULL v_object in tmpfs")); 1066 vp->v_object = object; 1067 vn_irflag_set_locked(vp, (tm->tm_pgread ? VIRF_PGREAD : 0) | 1068 VIRF_TEXT_REF); 1069 VI_UNLOCK(vp); 1070 VNASSERT((object->flags & OBJ_TMPFS_VREF) == 0, vp, 1071 ("leaked OBJ_TMPFS_VREF")); 1072 if (object->un_pager.swp.writemappings > 0) { 1073 vrefact(vp); 1074 vlazy(vp); 1075 vm_object_set_flag(object, OBJ_TMPFS_VREF); 1076 } 1077 VM_OBJECT_WUNLOCK(object); 1078 break; 1079 case VDIR: 1080 MPASS(node->tn_dir.tn_parent != NULL); 1081 if (node->tn_dir.tn_parent == node) 1082 vp->v_vflag |= VV_ROOT; 1083 break; 1084 1085 default: 1086 panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type); 1087 } 1088 if (vp->v_type != VFIFO) 1089 VN_LOCK_ASHARE(vp); 1090 1091 error = insmntque1(vp, mp); 1092 if (error != 0) { 1093 /* Need to clear v_object for insmntque failure. */ 1094 tmpfs_destroy_vobject(vp, vp->v_object); 1095 vp->v_object = NULL; 1096 vp->v_data = NULL; 1097 vp->v_op = &dead_vnodeops; 1098 vgone(vp); 1099 vput(vp); 1100 vp = NULL; 1101 } else { 1102 vn_set_state(vp, VSTATE_CONSTRUCTED); 1103 } 1104 1105 unlock: 1106 TMPFS_NODE_LOCK(node); 1107 1108 MPASS(node->tn_vpstate & TMPFS_VNODE_ALLOCATING); 1109 node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING; 1110 node->tn_vnode = vp; 1111 1112 if (node->tn_vpstate & TMPFS_VNODE_WANT) { 1113 node->tn_vpstate &= ~TMPFS_VNODE_WANT; 1114 TMPFS_NODE_UNLOCK(node); 1115 wakeup((caddr_t) &node->tn_vpstate); 1116 } else 1117 TMPFS_NODE_UNLOCK(node); 1118 1119 out: 1120 if (error == 0) { 1121 *vpp = vp; 1122 1123 #ifdef INVARIANTS 1124 MPASS(*vpp != NULL); 1125 ASSERT_VOP_LOCKED(*vpp, __func__); 1126 TMPFS_NODE_LOCK(node); 1127 MPASS(*vpp == node->tn_vnode); 1128 TMPFS_NODE_UNLOCK(node); 1129 #endif 1130 } 1131 tmpfs_free_node(tm, node); 1132 1133 return (error); 1134 } 1135 1136 /* 1137 * Destroys the association between the vnode vp and the node it 1138 * references. 1139 */ 1140 void 1141 tmpfs_free_vp(struct vnode *vp) 1142 { 1143 struct tmpfs_node *node; 1144 1145 node = VP_TO_TMPFS_NODE(vp); 1146 1147 TMPFS_NODE_ASSERT_LOCKED(node); 1148 node->tn_vnode = NULL; 1149 if ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) 1150 wakeup(&node->tn_vnode); 1151 node->tn_vpstate &= ~TMPFS_VNODE_WRECLAIM; 1152 vp->v_data = NULL; 1153 } 1154 1155 /* 1156 * Allocates a new file of type 'type' and adds it to the parent directory 1157 * 'dvp'; this addition is done using the component name given in 'cnp'. 1158 * The ownership of the new file is automatically assigned based on the 1159 * credentials of the caller (through 'cnp'), the group is set based on 1160 * the parent directory and the mode is determined from the 'vap' argument. 
1161 * If successful, *vpp holds a vnode to the newly created file and zero 1162 * is returned. Otherwise *vpp is NULL and the function returns an 1163 * appropriate error code. 1164 */ 1165 int 1166 tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap, 1167 struct componentname *cnp, const char *target) 1168 { 1169 int error; 1170 struct tmpfs_dirent *de; 1171 struct tmpfs_mount *tmp; 1172 struct tmpfs_node *dnode; 1173 struct tmpfs_node *node; 1174 struct tmpfs_node *parent; 1175 1176 ASSERT_VOP_ELOCKED(dvp, "tmpfs_alloc_file"); 1177 1178 tmp = VFS_TO_TMPFS(dvp->v_mount); 1179 dnode = VP_TO_TMPFS_DIR(dvp); 1180 *vpp = NULL; 1181 1182 /* If the entry we are creating is a directory, we cannot overflow 1183 * the number of links of its parent, because it will get a new 1184 * link. */ 1185 if (vap->va_type == VDIR) { 1186 /* Ensure that we do not overflow the maximum number of links 1187 * imposed by the system. */ 1188 MPASS(dnode->tn_links <= TMPFS_LINK_MAX); 1189 if (dnode->tn_links == TMPFS_LINK_MAX) { 1190 return (EMLINK); 1191 } 1192 1193 parent = dnode; 1194 MPASS(parent != NULL); 1195 } else 1196 parent = NULL; 1197 1198 /* Allocate a node that represents the new file. */ 1199 error = tmpfs_alloc_node(dvp->v_mount, tmp, vap->va_type, 1200 cnp->cn_cred->cr_uid, dnode->tn_gid, vap->va_mode, parent, 1201 target, vap->va_rdev, &node); 1202 if (error != 0) 1203 return (error); 1204 1205 /* Allocate a directory entry that points to the new file. */ 1206 error = tmpfs_alloc_dirent(tmp, node, cnp->cn_nameptr, cnp->cn_namelen, 1207 &de); 1208 if (error != 0) { 1209 tmpfs_free_node(tmp, node); 1210 return (error); 1211 } 1212 1213 /* Allocate a vnode for the new file. */ 1214 error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp); 1215 if (error != 0) { 1216 tmpfs_free_dirent(tmp, de); 1217 tmpfs_free_node(tmp, node); 1218 return (error); 1219 } 1220 1221 /* Now that all required items are allocated, we can proceed to 1222 * insert the new node into the directory, an operation that 1223 * cannot fail. */ 1224 if (cnp->cn_flags & ISWHITEOUT) 1225 tmpfs_dir_whiteout_remove(dvp, cnp); 1226 tmpfs_dir_attach(dvp, de); 1227 return (0); 1228 } 1229 1230 struct tmpfs_dirent * 1231 tmpfs_dir_first(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) 1232 { 1233 struct tmpfs_dirent *de; 1234 1235 de = RB_MIN(tmpfs_dir, &dnode->tn_dir.tn_dirhead); 1236 dc->tdc_tree = de; 1237 if (de != NULL && tmpfs_dirent_duphead(de)) 1238 de = LIST_FIRST(&de->ud.td_duphead); 1239 dc->tdc_current = de; 1240 1241 return (dc->tdc_current); 1242 } 1243 1244 struct tmpfs_dirent * 1245 tmpfs_dir_next(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) 1246 { 1247 struct tmpfs_dirent *de; 1248 1249 MPASS(dc->tdc_tree != NULL); 1250 if (tmpfs_dirent_dup(dc->tdc_current)) { 1251 dc->tdc_current = LIST_NEXT(dc->tdc_current, uh.td_dup.entries); 1252 if (dc->tdc_current != NULL) 1253 return (dc->tdc_current); 1254 } 1255 dc->tdc_tree = dc->tdc_current = RB_NEXT(tmpfs_dir, 1256 &dnode->tn_dir.tn_dirhead, dc->tdc_tree); 1257 if ((de = dc->tdc_current) != NULL && tmpfs_dirent_duphead(de)) { 1258 dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); 1259 MPASS(dc->tdc_current != NULL); 1260 } 1261 1262 return (dc->tdc_current); 1263 } 1264 1265 /* Lookup directory entry in RB-Tree. Function may return duphead entry. 
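 *
 * The cursor pair above is a plain forward iterator over a directory,
 * descending into duplicate-hash buckets as needed.  An illustrative
 * sketch (the directory vnode is assumed to be locked):
 *
 *	struct tmpfs_dir_cursor dc;
 *	struct tmpfs_dirent *de;
 *
 *	for (de = tmpfs_dir_first(dnode, &dc); de != NULL;
 *	    de = tmpfs_dir_next(dnode, &dc))
 *		... inspect de->ud.td_name and de->td_node ...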
*/ 1266 static struct tmpfs_dirent * 1267 tmpfs_dir_xlookup_hash(struct tmpfs_node *dnode, uint32_t hash) 1268 { 1269 struct tmpfs_dirent *de, dekey; 1270 1271 dekey.td_hash = hash; 1272 de = RB_FIND(tmpfs_dir, &dnode->tn_dir.tn_dirhead, &dekey); 1273 return (de); 1274 } 1275 1276 /* Lookup directory entry by cookie, initialize directory cursor accordingly. */ 1277 static struct tmpfs_dirent * 1278 tmpfs_dir_lookup_cookie(struct tmpfs_node *node, off_t cookie, 1279 struct tmpfs_dir_cursor *dc) 1280 { 1281 struct tmpfs_dir *dirhead = &node->tn_dir.tn_dirhead; 1282 struct tmpfs_dirent *de, dekey; 1283 1284 MPASS(cookie >= TMPFS_DIRCOOKIE_MIN); 1285 1286 if (cookie == node->tn_dir.tn_readdir_lastn && 1287 (de = node->tn_dir.tn_readdir_lastp) != NULL) { 1288 /* Protect against possible race, tn_readdir_last[pn] 1289 * may be updated with only shared vnode lock held. */ 1290 if (cookie == tmpfs_dirent_cookie(de)) 1291 goto out; 1292 } 1293 1294 if ((cookie & TMPFS_DIRCOOKIE_DUP) != 0) { 1295 LIST_FOREACH(de, &node->tn_dir.tn_dupindex, 1296 uh.td_dup.index_entries) { 1297 MPASS(tmpfs_dirent_dup(de)); 1298 if (de->td_cookie == cookie) 1299 goto out; 1300 /* dupindex list is sorted. */ 1301 if (de->td_cookie < cookie) { 1302 de = NULL; 1303 goto out; 1304 } 1305 } 1306 MPASS(de == NULL); 1307 goto out; 1308 } 1309 1310 if ((cookie & TMPFS_DIRCOOKIE_MASK) != cookie) { 1311 de = NULL; 1312 } else { 1313 dekey.td_hash = cookie; 1314 /* Recover if direntry for cookie was removed */ 1315 de = RB_NFIND(tmpfs_dir, dirhead, &dekey); 1316 } 1317 dc->tdc_tree = de; 1318 dc->tdc_current = de; 1319 if (de != NULL && tmpfs_dirent_duphead(de)) { 1320 dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); 1321 MPASS(dc->tdc_current != NULL); 1322 } 1323 return (dc->tdc_current); 1324 1325 out: 1326 dc->tdc_tree = de; 1327 dc->tdc_current = de; 1328 if (de != NULL && tmpfs_dirent_dup(de)) 1329 dc->tdc_tree = tmpfs_dir_xlookup_hash(node, 1330 de->td_hash); 1331 return (dc->tdc_current); 1332 } 1333 1334 /* 1335 * Looks for a directory entry in the directory represented by node. 1336 * 'cnp' describes the name of the entry to look for. Note that the . 1337 * and .. components are not allowed as they do not physically exist 1338 * within directories. 1339 * 1340 * Returns a pointer to the entry when found, otherwise NULL. 1341 */ 1342 struct tmpfs_dirent * 1343 tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, 1344 struct componentname *cnp) 1345 { 1346 struct tmpfs_dir_duphead *duphead; 1347 struct tmpfs_dirent *de; 1348 uint32_t hash; 1349 1350 MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.')); 1351 MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' && 1352 cnp->cn_nameptr[1] == '.'))); 1353 TMPFS_VALIDATE_DIR(node); 1354 1355 hash = tmpfs_dirent_hash(cnp->cn_nameptr, cnp->cn_namelen); 1356 de = tmpfs_dir_xlookup_hash(node, hash); 1357 if (de != NULL && tmpfs_dirent_duphead(de)) { 1358 duphead = &de->ud.td_duphead; 1359 LIST_FOREACH(de, duphead, uh.td_dup.entries) { 1360 if (TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, 1361 cnp->cn_namelen)) 1362 break; 1363 } 1364 } else if (de != NULL) { 1365 if (!TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, 1366 cnp->cn_namelen)) 1367 de = NULL; 1368 } 1369 if (de != NULL && f != NULL && de->td_node != f) 1370 de = NULL; 1371 1372 return (de); 1373 } 1374 1375 /* 1376 * Attach duplicate-cookie directory entry nde to dnode and insert to dupindex 1377 * list, allocate new cookie value. 
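 *
 * For example (illustrative): if "name1" is present and a new entry
 * "name2" hashes to the same value, tmpfs_dir_attach() below converts the
 * tree entry into a duphead and re-attaches both entries here; the entry
 * attached first receives the cookie TMPFS_DIRCOOKIE_DUP_MIN, the next
 * one TMPFS_DIRCOOKIE_DUP_MIN + 1, and the dupindex list stays sorted in
 * descending cookie order so that holes can be found once the duplicate
 * cookie space is nearly exhausted.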
 */
static void
tmpfs_dir_attach_dup(struct tmpfs_node *dnode,
    struct tmpfs_dir_duphead *duphead, struct tmpfs_dirent *nde)
{
	struct tmpfs_dir_duphead *dupindex;
	struct tmpfs_dirent *de, *pde;

	dupindex = &dnode->tn_dir.tn_dupindex;
	de = LIST_FIRST(dupindex);
	if (de == NULL || de->td_cookie < TMPFS_DIRCOOKIE_DUP_MAX) {
		if (de == NULL)
			nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN;
		else
			nde->td_cookie = de->td_cookie + 1;
		MPASS(tmpfs_dirent_dup(nde));
		LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries);
		LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
		return;
	}

	/*
	 * Cookie numbers are near exhaustion. Scan dupindex list for unused
	 * numbers. dupindex list is sorted in descending order. Keep it so
	 * after inserting nde.
	 */
	while (1) {
		pde = de;
		de = LIST_NEXT(de, uh.td_dup.index_entries);
		if (de == NULL && pde->td_cookie != TMPFS_DIRCOOKIE_DUP_MIN) {
			/*
			 * Last element of the index doesn't have minimal cookie
			 * value, use it.
			 */
			nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN;
			LIST_INSERT_AFTER(pde, nde, uh.td_dup.index_entries);
			LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
			return;
		} else if (de == NULL) {
			/*
			 * We are so lucky to have 2^30 hash duplicates in a
			 * single directory :)  Return the largest possible
			 * cookie value.  It should be fine except for
			 * possible issues with VOP_READDIR restart.
			 */
			nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MAX;
			LIST_INSERT_HEAD(dupindex, nde,
			    uh.td_dup.index_entries);
			LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
			return;
		}
		if (de->td_cookie + 1 == pde->td_cookie ||
		    de->td_cookie >= TMPFS_DIRCOOKIE_DUP_MAX)
			continue;	/* No hole or invalid cookie. */
		nde->td_cookie = de->td_cookie + 1;
		MPASS(tmpfs_dirent_dup(nde));
		MPASS(pde->td_cookie > nde->td_cookie);
		MPASS(nde->td_cookie > de->td_cookie);
		LIST_INSERT_BEFORE(de, nde, uh.td_dup.index_entries);
		LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
		return;
	}
}

/*
 * Attaches the directory entry de to the directory represented by vp.
 * Note that this does not change the link count of the node pointed by
 * the directory entry, as this is done by tmpfs_alloc_dirent.
 */
void
tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de)
{
	struct tmpfs_node *dnode;
	struct tmpfs_dirent *xde, *nde;

	ASSERT_VOP_ELOCKED(vp, __func__);
	MPASS(de->td_namelen > 0);
	MPASS(de->td_hash >= TMPFS_DIRCOOKIE_MIN);
	MPASS(de->td_cookie == de->td_hash);

	dnode = VP_TO_TMPFS_DIR(vp);
	dnode->tn_dir.tn_readdir_lastn = 0;
	dnode->tn_dir.tn_readdir_lastp = NULL;

	MPASS(!tmpfs_dirent_dup(de));
	xde = RB_INSERT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de);
	if (xde != NULL && tmpfs_dirent_duphead(xde))
		tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de);
	else if (xde != NULL) {
		/*
		 * Allocate new duphead. Swap xde with duphead to avoid
		 * adding/removing elements with the same hash.
		 */
		MPASS(!tmpfs_dirent_dup(xde));
		tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), NULL, NULL, 0,
		    &nde);
		/* *nde = *xde; XXX gcc 4.2.1 may generate invalid code.
		 */
		memcpy(nde, xde, sizeof(*xde));
		xde->td_cookie |= TMPFS_DIRCOOKIE_DUPHEAD;
		LIST_INIT(&xde->ud.td_duphead);
		xde->td_namelen = 0;
		xde->td_node = NULL;
		tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, nde);
		tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de);
	}
	dnode->tn_size += sizeof(struct tmpfs_dirent);
	dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED;
	dnode->tn_accessed = true;
	tmpfs_update(vp);
}

/*
 * Detaches the directory entry de from the directory represented by vp.
 * Note that this does not change the link count of the node pointed by
 * the directory entry, as this is done by tmpfs_free_dirent.
 */
void
tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de)
{
	struct tmpfs_mount *tmp;
	struct tmpfs_dir *head;
	struct tmpfs_node *dnode;
	struct tmpfs_dirent *xde;

	ASSERT_VOP_ELOCKED(vp, __func__);

	dnode = VP_TO_TMPFS_DIR(vp);
	head = &dnode->tn_dir.tn_dirhead;
	dnode->tn_dir.tn_readdir_lastn = 0;
	dnode->tn_dir.tn_readdir_lastp = NULL;

	if (tmpfs_dirent_dup(de)) {
		/* Remove duphead if de was last entry. */
		if (LIST_NEXT(de, uh.td_dup.entries) == NULL) {
			xde = tmpfs_dir_xlookup_hash(dnode, de->td_hash);
			MPASS(tmpfs_dirent_duphead(xde));
		} else
			xde = NULL;
		LIST_REMOVE(de, uh.td_dup.entries);
		LIST_REMOVE(de, uh.td_dup.index_entries);
		if (xde != NULL) {
			if (LIST_EMPTY(&xde->ud.td_duphead)) {
				RB_REMOVE(tmpfs_dir, head, xde);
				tmp = VFS_TO_TMPFS(vp->v_mount);
				MPASS(xde->td_node == NULL);
				tmpfs_free_dirent(tmp, xde);
			}
		}
		de->td_cookie = de->td_hash;
	} else
		RB_REMOVE(tmpfs_dir, head, de);

	dnode->tn_size -= sizeof(struct tmpfs_dirent);
	dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED;
	dnode->tn_accessed = true;
	tmpfs_update(vp);
}

void
tmpfs_dir_destroy(struct tmpfs_mount *tmp, struct tmpfs_node *dnode)
{
	struct tmpfs_dirent *de, *dde, *nde;

	RB_FOREACH_SAFE(de, tmpfs_dir, &dnode->tn_dir.tn_dirhead, nde) {
		RB_REMOVE(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de);
		/* Node may already be destroyed. */
		de->td_node = NULL;
		if (tmpfs_dirent_duphead(de)) {
			while ((dde = LIST_FIRST(&de->ud.td_duphead)) != NULL) {
				LIST_REMOVE(dde, uh.td_dup.entries);
				dde->td_node = NULL;
				tmpfs_free_dirent(tmp, dde);
			}
		}
		tmpfs_free_dirent(tmp, de);
	}
}

/*
 * Helper function for tmpfs_readdir. Creates a '.' entry for the given
 * directory and returns it in the uio space. The function returns 0
 * on success, EJUSTRETURN if there was not enough space in the uio
 * structure to hold the directory entry or an appropriate error code if
 * another error happens.
 */
static int
tmpfs_dir_getdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node,
    struct uio *uio)
{
	int error;
	struct dirent dent;

	TMPFS_VALIDATE_DIR(node);
	MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOT);

	dent.d_fileno = node->tn_id;
	dent.d_off = TMPFS_DIRCOOKIE_DOTDOT;
	dent.d_type = DT_DIR;
	dent.d_namlen = 1;
	dent.d_name[0] = '.';
	dent.d_reclen = GENERIC_DIRSIZ(&dent);
	dirent_terminate(&dent);

	if (dent.d_reclen > uio->uio_resid)
		error = EJUSTRETURN;
	else
		error = uiomove(&dent, dent.d_reclen, uio);

	tmpfs_set_accessed(tm, node);

	return (error);
}

/*
 * Helper function for tmpfs_readdir. Creates a '..' entry for the given
 * directory and returns it in the uio space. The function returns 0
 * on success, EJUSTRETURN if there was not enough space in the uio
 * structure to hold the directory entry or an appropriate error code if
 * another error happens.
 */
static int
tmpfs_dir_getdotdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node,
    struct uio *uio, off_t next)
{
	struct tmpfs_node *parent;
	struct dirent dent;
	int error;

	TMPFS_VALIDATE_DIR(node);
	MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT);

	/*
	 * Return ENOENT if the current node is already removed.
	 */
	TMPFS_ASSERT_LOCKED(node);
	parent = node->tn_dir.tn_parent;
	if (parent == NULL)
		return (ENOENT);

	dent.d_fileno = parent->tn_id;
	dent.d_off = next;
	dent.d_type = DT_DIR;
	dent.d_namlen = 2;
	dent.d_name[0] = '.';
	dent.d_name[1] = '.';
	dent.d_reclen = GENERIC_DIRSIZ(&dent);
	dirent_terminate(&dent);

	if (dent.d_reclen > uio->uio_resid)
		error = EJUSTRETURN;
	else
		error = uiomove(&dent, dent.d_reclen, uio);

	tmpfs_set_accessed(tm, node);

	return (error);
}

/*
 * Helper function for tmpfs_readdir. Returns as many directory entries
 * as fit in the uio space. The read starts at uio->uio_offset.
 * The function returns 0 on success, EJUSTRETURN if there was not enough
 * space in the uio structure to hold the directory entry or an
 * appropriate error code if another error happens.
 */
int
tmpfs_dir_getdents(struct tmpfs_mount *tm, struct tmpfs_node *node,
    struct uio *uio, int maxcookies, uint64_t *cookies, int *ncookies)
{
	struct tmpfs_dir_cursor dc;
	struct tmpfs_dirent *de, *nde;
	off_t off;
	int error;

	TMPFS_VALIDATE_DIR(node);

	off = 0;

	/*
	 * Lookup the node from the current offset. The starting offset of
	 * 0 will lookup both '.' and '..', and then the first real entry,
	 * or EOF if there are none. Then find all entries for the dir that
	 * fit into the buffer. Once no more entries are found (de == NULL),
	 * the offset is set to TMPFS_DIRCOOKIE_EOF, which will cause the next
	 * call to return 0.
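	 *
	 * A full traversal therefore sees uio_offset/d_off progress roughly
	 * as follows (illustrative):
	 *
	 *	TMPFS_DIRCOOKIE_DOT	-> "." emitted
	 *	TMPFS_DIRCOOKIE_DOTDOT	-> ".." emitted
	 *	cookie(entry1)		-> entry1 emitted, d_off = cookie(entry2)
	 *	...
	 *	cookie(entryN)		-> entryN emitted, d_off = TMPFS_DIRCOOKIE_EOF
	 *	TMPFS_DIRCOOKIE_EOF	-> 0 returned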
	 */
	switch (uio->uio_offset) {
	case TMPFS_DIRCOOKIE_DOT:
		error = tmpfs_dir_getdotdent(tm, node, uio);
		if (error != 0)
			return (error);
		uio->uio_offset = off = TMPFS_DIRCOOKIE_DOTDOT;
		if (cookies != NULL)
			cookies[(*ncookies)++] = off;
		/* FALLTHROUGH */
	case TMPFS_DIRCOOKIE_DOTDOT:
		de = tmpfs_dir_first(node, &dc);
		off = tmpfs_dirent_cookie(de);
		error = tmpfs_dir_getdotdotdent(tm, node, uio, off);
		if (error != 0)
			return (error);
		uio->uio_offset = off;
		if (cookies != NULL)
			cookies[(*ncookies)++] = off;
		/* EOF. */
		if (de == NULL)
			return (0);
		break;
	case TMPFS_DIRCOOKIE_EOF:
		return (0);
	default:
		de = tmpfs_dir_lookup_cookie(node, uio->uio_offset, &dc);
		if (de == NULL)
			return (EINVAL);
		if (cookies != NULL)
			off = tmpfs_dirent_cookie(de);
	}

	/*
	 * Read as many entries as possible; i.e., until we reach the end of
	 * the directory or we exhaust uio space.
	 */
	do {
		struct dirent d;

		/*
		 * Create a dirent structure representing the current
		 * tmpfs_node and fill it.
		 */
		if (de->td_node == NULL) {
			d.d_fileno = 1;
			d.d_type = DT_WHT;
		} else {
			d.d_fileno = de->td_node->tn_id;
			switch (de->td_node->tn_type) {
			case VBLK:
				d.d_type = DT_BLK;
				break;

			case VCHR:
				d.d_type = DT_CHR;
				break;

			case VDIR:
				d.d_type = DT_DIR;
				break;

			case VFIFO:
				d.d_type = DT_FIFO;
				break;

			case VLNK:
				d.d_type = DT_LNK;
				break;

			case VREG:
				d.d_type = DT_REG;
				break;

			case VSOCK:
				d.d_type = DT_SOCK;
				break;

			default:
				panic("tmpfs_dir_getdents: type %p %d",
				    de->td_node, (int)de->td_node->tn_type);
			}
		}
		d.d_namlen = de->td_namelen;
		MPASS(de->td_namelen < sizeof(d.d_name));
		(void)memcpy(d.d_name, de->ud.td_name, de->td_namelen);
		d.d_reclen = GENERIC_DIRSIZ(&d);

		/*
		 * Stop reading if the directory entry we are treating is
		 * bigger than the amount of data that can be returned.
		 */
		if (d.d_reclen > uio->uio_resid) {
			error = EJUSTRETURN;
			break;
		}

		nde = tmpfs_dir_next(node, &dc);
		d.d_off = tmpfs_dirent_cookie(nde);
		dirent_terminate(&d);

		/*
		 * Copy the new dirent structure into the output buffer and
		 * advance pointers.
		 */
		error = uiomove(&d, d.d_reclen, uio);
		if (error == 0) {
			de = nde;
			if (cookies != NULL) {
				off = tmpfs_dirent_cookie(de);
				MPASS(*ncookies < maxcookies);
				cookies[(*ncookies)++] = off;
			}
		}
	} while (error == 0 && uio->uio_resid > 0 && de != NULL);

	/* Skip setting off when using cookies as it is already done above. */
	if (cookies == NULL)
		off = tmpfs_dirent_cookie(de);

	/* Update the offset and cache.
*/ 1784 uio->uio_offset = off; 1785 node->tn_dir.tn_readdir_lastn = off; 1786 node->tn_dir.tn_readdir_lastp = de; 1787 1788 tmpfs_set_accessed(tm, node); 1789 return (error); 1790 } 1791 1792 int 1793 tmpfs_dir_whiteout_add(struct vnode *dvp, struct componentname *cnp) 1794 { 1795 struct tmpfs_dirent *de; 1796 struct tmpfs_node *dnode; 1797 int error; 1798 1799 error = tmpfs_alloc_dirent(VFS_TO_TMPFS(dvp->v_mount), NULL, 1800 cnp->cn_nameptr, cnp->cn_namelen, &de); 1801 if (error != 0) 1802 return (error); 1803 dnode = VP_TO_TMPFS_DIR(dvp); 1804 tmpfs_dir_attach(dvp, de); 1805 dnode->tn_dir.tn_wht_size += sizeof(*de); 1806 return (0); 1807 } 1808 1809 void 1810 tmpfs_dir_whiteout_remove(struct vnode *dvp, struct componentname *cnp) 1811 { 1812 struct tmpfs_dirent *de; 1813 struct tmpfs_node *dnode; 1814 1815 dnode = VP_TO_TMPFS_DIR(dvp); 1816 de = tmpfs_dir_lookup(dnode, NULL, cnp); 1817 MPASS(de != NULL && de->td_node == NULL); 1818 MPASS(dnode->tn_dir.tn_wht_size >= sizeof(*de)); 1819 dnode->tn_dir.tn_wht_size -= sizeof(*de); 1820 tmpfs_dir_detach(dvp, de); 1821 tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de); 1822 } 1823 1824 /* 1825 * Frees any dirents still associated with the directory represented 1826 * by dvp in preparation for the removal of the directory. This is 1827 * required when removing a directory which contains only whiteout 1828 * entries. 1829 */ 1830 void 1831 tmpfs_dir_clear_whiteouts(struct vnode *dvp) 1832 { 1833 struct tmpfs_dir_cursor dc; 1834 struct tmpfs_dirent *de; 1835 struct tmpfs_node *dnode; 1836 1837 dnode = VP_TO_TMPFS_DIR(dvp); 1838 1839 while ((de = tmpfs_dir_first(dnode, &dc)) != NULL) { 1840 KASSERT(de->td_node == NULL, ("%s: non-whiteout dirent %p", 1841 __func__, de)); 1842 dnode->tn_dir.tn_wht_size -= sizeof(*de); 1843 tmpfs_dir_detach(dvp, de); 1844 tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de); 1845 } 1846 MPASS(dnode->tn_size == 0); 1847 MPASS(dnode->tn_dir.tn_wht_size == 0); 1848 } 1849 1850 /* 1851 * Resizes the aobj associated with the regular file pointed to by 'vp' to the 1852 * size 'newsize'. 'vp' must point to a vnode that represents a regular file. 1853 * 'newsize' must be positive. 1854 * 1855 * Returns zero on success or an appropriate error code on failure. 1856 */ 1857 int 1858 tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr) 1859 { 1860 struct tmpfs_node *node; 1861 vm_object_t uobj; 1862 vm_pindex_t idx, newpages, oldpages; 1863 off_t oldsize; 1864 int base, error; 1865 1866 MPASS(vp->v_type == VREG); 1867 MPASS(newsize >= 0); 1868 1869 node = VP_TO_TMPFS_NODE(vp); 1870 uobj = node->tn_reg.tn_aobj; 1871 1872 /* 1873 * Convert the old and new sizes to the number of pages needed to 1874 * store them. It may happen that we do not need to do anything 1875 * because the last allocated page can accommodate the change on 1876 * its own. 1877 */ 1878 oldsize = node->tn_size; 1879 oldpages = OFF_TO_IDX(oldsize + PAGE_MASK); 1880 MPASS(oldpages == uobj->size); 1881 newpages = OFF_TO_IDX(newsize + PAGE_MASK); 1882 1883 if (__predict_true(newpages == oldpages && newsize >= oldsize)) { 1884 node->tn_size = newsize; 1885 return (0); 1886 } 1887 1888 VM_OBJECT_WLOCK(uobj); 1889 if (newsize < oldsize) { 1890 /* 1891 * Zero the truncated part of the last page. 
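		 *
		 * For example (assuming 4 KB pages; PAGE_SIZE is
		 * architecture-dependent): shrinking a 10000-byte file to
		 * 6000 bytes gives oldpages = 3 and newpages = 2; base =
		 * 6000 & PAGE_MASK = 1904, so bytes 1904..4095 of page
		 * index 1 are zeroed below, and vm_object_page_remove()
		 * then discards everything from page index 2 onwards.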
1892 */ 1893 base = newsize & PAGE_MASK; 1894 if (base != 0) { 1895 idx = OFF_TO_IDX(newsize); 1896 error = tmpfs_partial_page_invalidate(uobj, idx, base, 1897 PAGE_SIZE, ignerr); 1898 if (error != 0) { 1899 VM_OBJECT_WUNLOCK(uobj); 1900 return (error); 1901 } 1902 } 1903 1904 /* 1905 * Release any swap space and free any whole pages. 1906 */ 1907 if (newpages < oldpages) 1908 vm_object_page_remove(uobj, newpages, 0, 0); 1909 } 1910 uobj->size = newpages; 1911 VM_OBJECT_WUNLOCK(uobj); 1912 1913 node->tn_size = newsize; 1914 return (0); 1915 } 1916 1917 /* 1918 * Punch hole in the aobj associated with the regular file pointed to by 'vp'. 1919 * Requests completely beyond the end-of-file are converted to no-op. 1920 * 1921 * Returns 0 on success or error code from tmpfs_partial_page_invalidate() on 1922 * failure. 1923 */ 1924 int 1925 tmpfs_reg_punch_hole(struct vnode *vp, off_t *offset, off_t *length) 1926 { 1927 struct tmpfs_node *node; 1928 vm_object_t object; 1929 vm_pindex_t pistart, pi, piend; 1930 int startofs, endofs, end; 1931 off_t off, len; 1932 int error; 1933 1934 KASSERT(*length <= OFF_MAX - *offset, ("%s: offset + length overflows", 1935 __func__)); 1936 node = VP_TO_TMPFS_NODE(vp); 1937 KASSERT(node->tn_type == VREG, ("%s: node is not regular file", 1938 __func__)); 1939 object = node->tn_reg.tn_aobj; 1940 off = *offset; 1941 len = omin(node->tn_size - off, *length); 1942 startofs = off & PAGE_MASK; 1943 endofs = (off + len) & PAGE_MASK; 1944 pistart = OFF_TO_IDX(off); 1945 piend = OFF_TO_IDX(off + len); 1946 pi = OFF_TO_IDX((vm_ooffset_t)off + PAGE_MASK); 1947 error = 0; 1948 1949 /* Handle the case when offset is on or beyond file size. */ 1950 if (len <= 0) { 1951 *length = 0; 1952 return (0); 1953 } 1954 1955 VM_OBJECT_WLOCK(object); 1956 1957 /* 1958 * If there is a partial page at the beginning of the hole-punching 1959 * request, fill the partial page with zeroes. 1960 */ 1961 if (startofs != 0) { 1962 end = pistart != piend ? PAGE_SIZE : endofs; 1963 error = tmpfs_partial_page_invalidate(object, pistart, startofs, 1964 end, FALSE); 1965 if (error != 0) 1966 goto out; 1967 off += end - startofs; 1968 len -= end - startofs; 1969 } 1970 1971 /* 1972 * Toss away the full pages in the affected area. 1973 */ 1974 if (pi < piend) { 1975 vm_object_page_remove(object, pi, piend, 0); 1976 off += IDX_TO_OFF(piend - pi); 1977 len -= IDX_TO_OFF(piend - pi); 1978 } 1979 1980 /* 1981 * If there is a partial page at the end of the hole-punching request, 1982 * fill the partial page with zeroes. 
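	 *
	 * As a worked example (assuming 4 KB pages and a file large enough
	 * to cover the request): *offset = 5000, *length = 10000 gives
	 * startofs = 904, endofs = 2712, pistart = 1, pi = 2 and piend = 3.
	 * Bytes 904..4095 of page 1 were zeroed above, page 2 was removed
	 * whole, and bytes 0..2711 of page 3 are zeroed below, leaving
	 * *offset = 15000 and *length = 0 on return.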
/*
 * Punch hole in the aobj associated with the regular file pointed to by 'vp'.
 * Requests completely beyond the end-of-file are converted to a no-op.
 *
 * Returns 0 on success or the error code from tmpfs_partial_page_invalidate()
 * on failure.
 */
int
tmpfs_reg_punch_hole(struct vnode *vp, off_t *offset, off_t *length)
{
	struct tmpfs_node *node;
	vm_object_t object;
	vm_pindex_t pistart, pi, piend;
	int startofs, endofs, end;
	off_t off, len;
	int error;

	KASSERT(*length <= OFF_MAX - *offset, ("%s: offset + length overflows",
	    __func__));
	node = VP_TO_TMPFS_NODE(vp);
	KASSERT(node->tn_type == VREG, ("%s: node is not regular file",
	    __func__));
	object = node->tn_reg.tn_aobj;
	off = *offset;
	len = omin(node->tn_size - off, *length);
	startofs = off & PAGE_MASK;
	endofs = (off + len) & PAGE_MASK;
	pistart = OFF_TO_IDX(off);
	piend = OFF_TO_IDX(off + len);
	pi = OFF_TO_IDX((vm_ooffset_t)off + PAGE_MASK);
	error = 0;

	/* Handle the case when offset is on or beyond file size. */
	if (len <= 0) {
		*length = 0;
		return (0);
	}

	VM_OBJECT_WLOCK(object);

	/*
	 * If there is a partial page at the beginning of the hole-punching
	 * request, fill the partial page with zeroes.
	 */
	if (startofs != 0) {
		end = pistart != piend ? PAGE_SIZE : endofs;
		error = tmpfs_partial_page_invalidate(object, pistart, startofs,
		    end, FALSE);
		if (error != 0)
			goto out;
		off += end - startofs;
		len -= end - startofs;
	}

	/*
	 * Toss away the full pages in the affected area.
	 */
	if (pi < piend) {
		vm_object_page_remove(object, pi, piend, 0);
		off += IDX_TO_OFF(piend - pi);
		len -= IDX_TO_OFF(piend - pi);
	}

	/*
	 * If there is a partial page at the end of the hole-punching request,
	 * fill the partial page with zeroes.
	 */
	if (endofs != 0 && pistart != piend) {
		error = tmpfs_partial_page_invalidate(object, piend, 0, endofs,
		    FALSE);
		if (error != 0)
			goto out;
		off += endofs;
		len -= endofs;
	}

out:
	VM_OBJECT_WUNLOCK(object);
	*offset = off;
	*length = len;
	return (error);
}
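
/*
 * Editorial illustration (not part of the original source): from userland,
 * hole punching is normally requested with fspacectl(2) and SPACECTL_DEALLOC,
 * which is presumably routed to this function through the VOP_DEALLOCATE
 * path.  A minimal sketch, assuming the file lives on a tmpfs mount; the
 * path name below is hypothetical:
 *
 *	#include <sys/types.h>
 *	#include <err.h>
 *	#include <fcntl.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		static char buf[64 * 1024];
 *		struct spacectl_range rqsr, rmsr;
 *		int fd;
 *
 *		fd = open("/tmp/sparse.dat", O_RDWR | O_CREAT, 0644);
 *		if (fd < 0)
 *			err(1, "open");
 *		memset(buf, 0xff, sizeof(buf));
 *		if (pwrite(fd, buf, sizeof(buf), 0) != (ssize_t)sizeof(buf))
 *			err(1, "pwrite");
 *
 *		rqsr.r_offset = 4096;
 *		rqsr.r_len = 32 * 1024;
 *		if (fspacectl(fd, SPACECTL_DEALLOC, &rqsr, 0, &rmsr) != 0)
 *			err(1, "fspacectl");
 *		printf("unprocessed: offset %jd len %jd\n",
 *		    (intmax_t)rmsr.r_offset, (intmax_t)rmsr.r_len);
 *		close(fd);
 *		return (0);
 *	}
 */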
void
tmpfs_check_mtime(struct vnode *vp)
{
	struct tmpfs_node *node;
	struct vm_object *obj;

	ASSERT_VOP_ELOCKED(vp, "check_mtime");
	if (vp->v_type != VREG)
		return;
	obj = vp->v_object;
	KASSERT(obj->type == tmpfs_pager_type &&
	    (obj->flags & (OBJ_SWAP | OBJ_TMPFS)) ==
	    (OBJ_SWAP | OBJ_TMPFS), ("non-tmpfs obj"));
	/* unlocked read */
	if (obj->generation != obj->cleangeneration) {
		VM_OBJECT_WLOCK(obj);
		if (obj->generation != obj->cleangeneration) {
			obj->cleangeneration = obj->generation;
			node = VP_TO_TMPFS_NODE(vp);
			node->tn_status |= TMPFS_NODE_MODIFIED |
			    TMPFS_NODE_CHANGED;
		}
		VM_OBJECT_WUNLOCK(obj);
	}
}

/*
 * Change flags of the given vnode.
 * Caller should execute tmpfs_update on vp after a successful execution.
 * The vnode must be locked on entry and remain locked on exit.
 */
int
tmpfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred,
    struct thread *td)
{
	int error;
	struct tmpfs_node *node;

	ASSERT_VOP_ELOCKED(vp, "chflags");

	node = VP_TO_TMPFS_NODE(vp);

	if ((flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK |
	    UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP |
	    UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE |
	    UF_SPARSE | UF_SYSTEM)) != 0)
		return (EOPNOTSUPP);

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	/*
	 * Callers may only modify the file flags on objects they
	 * have VADMIN rights for.
	 */
	if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
		return (error);
	/*
	 * Unprivileged processes are not permitted to unset system
	 * flags, or modify flags if any system flags are set.
	 */
	if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) {
		if (node->tn_flags &
		    (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) {
			error = securelevel_gt(cred, 0);
			if (error)
				return (error);
		}
	} else {
		if (node->tn_flags &
		    (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) ||
		    ((flags ^ node->tn_flags) & SF_SETTABLE))
			return (EPERM);
	}
	node->tn_flags = flags;
	node->tn_status |= TMPFS_NODE_CHANGED;

	ASSERT_VOP_ELOCKED(vp, "chflags2");

	return (0);
}

/*
 * Change access mode on the given vnode.
 * Caller should execute tmpfs_update on vp after a successful execution.
 * The vnode must be locked on entry and remain locked on exit.
 */
int
tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred,
    struct thread *td)
{
	int error;
	struct tmpfs_node *node;
	mode_t newmode;

	ASSERT_VOP_ELOCKED(vp, "chmod");
	ASSERT_VOP_IN_SEQC(vp);

	node = VP_TO_TMPFS_NODE(vp);

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	/* Immutable or append-only files cannot be modified, either. */
	if (node->tn_flags & (IMMUTABLE | APPEND))
		return (EPERM);

	/*
	 * To modify the permissions on a file, the caller must possess
	 * VADMIN for that file.
	 */
	if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
		return (error);

	/*
	 * Privileged processes may set the sticky bit on non-directories,
	 * as well as set the setgid bit on a file with a group that the
	 * process is not a member of.
	 */
	if (vp->v_type != VDIR && (mode & S_ISTXT)) {
		if (priv_check_cred(cred, PRIV_VFS_STICKYFILE))
			return (EFTYPE);
	}
	if (!groupmember(node->tn_gid, cred) && (mode & S_ISGID)) {
		error = priv_check_cred(cred, PRIV_VFS_SETGID);
		if (error)
			return (error);
	}

	newmode = node->tn_mode & ~ALLPERMS;
	newmode |= mode & ALLPERMS;
	atomic_store_short(&node->tn_mode, newmode);

	node->tn_status |= TMPFS_NODE_CHANGED;

	ASSERT_VOP_ELOCKED(vp, "chmod2");

	return (0);
}
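
/*
 * Editorial illustration (not part of the original source): the sticky-bit
 * check above is what makes an unprivileged chmod(2)/fchmod(2) on a regular
 * file fail with EFTYPE when S_ISTXT is requested, assuming the call reaches
 * tmpfs through the ordinary VOP_SETATTR path.  A hypothetical sketch, run
 * as a non-root user on a tmpfs mount (the path name is made up):
 *
 *	#include <sys/stat.h>
 *	#include <err.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fd;
 *
 *		fd = open("/tmp/demo", O_CREAT | O_RDWR, 0644);
 *		if (fd < 0)
 *			err(1, "open");
 *		if (fchmod(fd, S_ISTXT | 0644) != 0)
 *			warn("fchmod");
 *		close(fd);
 *		return (0);
 *	}
 *
 * For an unprivileged caller the fchmod() is expected to fail with EFTYPE;
 * a privileged caller passes the PRIV_VFS_STICKYFILE check and succeeds.
 */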
/*
 * Change ownership of the given vnode.  At least one of uid or gid must
 * be different from VNOVAL.  If one is set to that value, the attribute
 * is unchanged.
 * Caller should execute tmpfs_update on vp after a successful execution.
 * The vnode must be locked on entry and remain locked on exit.
 */
int
tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred,
    struct thread *td)
{
	int error;
	struct tmpfs_node *node;
	uid_t ouid;
	gid_t ogid;
	mode_t newmode;

	ASSERT_VOP_ELOCKED(vp, "chown");
	ASSERT_VOP_IN_SEQC(vp);

	node = VP_TO_TMPFS_NODE(vp);

	/* Assign default values if they are unknown. */
	MPASS(uid != VNOVAL || gid != VNOVAL);
	if (uid == VNOVAL)
		uid = node->tn_uid;
	if (gid == VNOVAL)
		gid = node->tn_gid;
	MPASS(uid != VNOVAL && gid != VNOVAL);

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	/* Immutable or append-only files cannot be modified, either. */
	if (node->tn_flags & (IMMUTABLE | APPEND))
		return (EPERM);

	/*
	 * To modify the ownership of a file, the caller must possess VADMIN
	 * for that file.
	 */
	if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
		return (error);

	/*
	 * To change the owner of a file, or change the group of a file to a
	 * group of which we are not a member, the caller must have
	 * privilege.
	 */
	if ((uid != node->tn_uid ||
	    (gid != node->tn_gid && !groupmember(gid, cred))) &&
	    (error = priv_check_cred(cred, PRIV_VFS_CHOWN)))
		return (error);

	ogid = node->tn_gid;
	ouid = node->tn_uid;

	node->tn_uid = uid;
	node->tn_gid = gid;

	node->tn_status |= TMPFS_NODE_CHANGED;

	if ((node->tn_mode & (S_ISUID | S_ISGID)) != 0 &&
	    (ouid != uid || ogid != gid)) {
		if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) {
			newmode = node->tn_mode & ~(S_ISUID | S_ISGID);
			atomic_store_short(&node->tn_mode, newmode);
		}
	}

	ASSERT_VOP_ELOCKED(vp, "chown2");

	return (0);
}

/*
 * Change size of the given vnode.
 * Caller should execute tmpfs_update on vp after a successful execution.
 * The vnode must be locked on entry and remain locked on exit.
 */
int
tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred,
    struct thread *td)
{
	int error;
	struct tmpfs_node *node;

	ASSERT_VOP_ELOCKED(vp, "chsize");

	node = VP_TO_TMPFS_NODE(vp);

	/* Decide whether this is a valid operation based on the file type. */
	error = 0;
	switch (vp->v_type) {
	case VDIR:
		return (EISDIR);

	case VREG:
		if (vp->v_mount->mnt_flag & MNT_RDONLY)
			return (EROFS);
		break;

	case VBLK:
		/* FALLTHROUGH */
	case VCHR:
		/* FALLTHROUGH */
	case VFIFO:
		/*
		 * Allow modifications of special files even if the file
		 * system is mounted read-only (we are not modifying the
		 * files themselves, but the objects they represent).
		 */
		return (0);

	default:
		/* Anything else is unsupported. */
		return (EOPNOTSUPP);
	}

	/* Immutable or append-only files cannot be modified, either. */
	if (node->tn_flags & (IMMUTABLE | APPEND))
		return (EPERM);

	error = vn_rlimit_trunc(size, td);
	if (error != 0)
		return (error);

	error = tmpfs_truncate(vp, size);
	/*
	 * tmpfs_truncate() will raise the NOTE_EXTEND and NOTE_ATTRIB kevents
	 * for us, and it will also update tn_status; no need to do that here.
	 */

	ASSERT_VOP_ELOCKED(vp, "chsize2");

	return (error);
}
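
/*
 * Editorial note (not part of the original source): the block near the end
 * of tmpfs_chown() is what clears the setuid/setgid bits when ownership
 * changes and the caller lacks PRIV_VFS_RETAINSUGID.  A hypothetical
 * userland sequence, assuming an unprivileged caller who owns the file and
 * is a member of both groups involved (so the group change is permitted
 * without PRIV_VFS_CHOWN); names such as other_gid are made up:
 *
 *	chmod("/tmp/demo", 02755);
 *	chown("/tmp/demo", (uid_t)-1, other_gid);
 *	stat("/tmp/demo", &sb);
 *	assert((sb.st_mode & S_ISGID) == 0);
 *
 * The (uid_t)-1 owner argument is passed down as VNOVAL, so tn_uid is left
 * untouched, while ogid != gid causes the setgid bit to be cleared for the
 * unprivileged caller.
 */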
/*
 * Change access and modification times of the given vnode.
 * Caller should execute tmpfs_update on vp after a successful execution.
 * The vnode must be locked on entry and remain locked on exit.
 */
int
tmpfs_chtimes(struct vnode *vp, struct vattr *vap,
    struct ucred *cred, struct thread *td)
{
	int error;
	struct tmpfs_node *node;

	ASSERT_VOP_ELOCKED(vp, "chtimes");

	node = VP_TO_TMPFS_NODE(vp);

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	/* Immutable or append-only files cannot be modified, either. */
	if (node->tn_flags & (IMMUTABLE | APPEND))
		return (EPERM);

	error = vn_utimes_perm(vp, vap, cred, td);
	if (error != 0)
		return (error);

	if (vap->va_atime.tv_sec != VNOVAL)
		node->tn_accessed = true;
	if (vap->va_mtime.tv_sec != VNOVAL)
		node->tn_status |= TMPFS_NODE_MODIFIED;
	if (vap->va_birthtime.tv_sec != VNOVAL)
		node->tn_status |= TMPFS_NODE_MODIFIED;
	tmpfs_itimes(vp, &vap->va_atime, &vap->va_mtime);
	if (vap->va_birthtime.tv_sec != VNOVAL)
		node->tn_birthtime = vap->va_birthtime;
	ASSERT_VOP_ELOCKED(vp, "chtimes2");

	return (0);
}

void
tmpfs_set_status(struct tmpfs_mount *tm, struct tmpfs_node *node, int status)
{

	if ((node->tn_status & status) == status || tm->tm_ronly)
		return;
	TMPFS_NODE_LOCK(node);
	node->tn_status |= status;
	TMPFS_NODE_UNLOCK(node);
}

void
tmpfs_set_accessed(struct tmpfs_mount *tm, struct tmpfs_node *node)
{
	if (node->tn_accessed || tm->tm_ronly)
		return;
	atomic_store_8(&node->tn_accessed, true);
}

/* Sync timestamps */
void
tmpfs_itimes(struct vnode *vp, const struct timespec *acc,
    const struct timespec *mod)
{
	struct tmpfs_node *node;
	struct timespec now;

	ASSERT_VOP_LOCKED(vp, "tmpfs_itimes");
	node = VP_TO_TMPFS_NODE(vp);

	if (!node->tn_accessed &&
	    (node->tn_status & (TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED)) == 0)
		return;

	vfs_timestamp(&now);
	TMPFS_NODE_LOCK(node);
	if (node->tn_accessed) {
		if (acc == NULL)
			acc = &now;
		node->tn_atime = *acc;
	}
	if (node->tn_status & TMPFS_NODE_MODIFIED) {
		if (mod == NULL)
			mod = &now;
		node->tn_mtime = *mod;
	}
	if (node->tn_status & TMPFS_NODE_CHANGED)
		node->tn_ctime = now;
	node->tn_status &= ~(TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED);
	node->tn_accessed = false;
	TMPFS_NODE_UNLOCK(node);

	/*
	 * XXX: FIX?  The entropy here is desirable, but the harvesting may
	 * be expensive.
	 */
	random_harvest_queue(node, sizeof(*node), RANDOM_FS_ATIME);
}

int
tmpfs_truncate(struct vnode *vp, off_t length)
{
	struct tmpfs_node *node;
	int error;

	if (length < 0)
		return (EINVAL);
	if (length > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize)
		return (EFBIG);

	node = VP_TO_TMPFS_NODE(vp);
	error = node->tn_size == length ? 0 : tmpfs_reg_resize(vp, length,
	    FALSE);
	if (error == 0)
		node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED;
	tmpfs_update(vp);

	return (error);
}

static __inline int
tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b)
{
	if (a->td_hash > b->td_hash)
		return (1);
	else if (a->td_hash < b->td_hash)
		return (-1);
	return (0);
}

RB_GENERATE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp);
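
/*
 * Editorial illustration (not part of the original source): the comparator
 * above follows the <sys/tree.h> convention of returning a negative, zero,
 * or positive value, so the red-black tree orders directory entries by
 * their precomputed name hash.  A minimal, self-contained userland example
 * of the same pattern; the types and names below are hypothetical:
 *
 *	#include <sys/tree.h>
 *	#include <stdio.h>
 *
 *	struct dent {
 *		unsigned long	hash;
 *		RB_ENTRY(dent)	entry;
 *	};
 *
 *	static int
 *	dent_cmp(struct dent *a, struct dent *b)
 *	{
 *		return ((a->hash > b->hash) - (a->hash < b->hash));
 *	}
 *
 *	RB_HEAD(dtree, dent);
 *	RB_GENERATE_STATIC(dtree, dent, entry, dent_cmp);
 *
 *	int
 *	main(void)
 *	{
 *		struct dtree head = RB_INITIALIZER(&head);
 *		struct dent a = { .hash = 42 }, b = { .hash = 7 }, *d;
 *
 *		RB_INSERT(dtree, &head, &a);
 *		RB_INSERT(dtree, &head, &b);
 *		RB_FOREACH(d, dtree, &head)
 *			printf("%lu\n", d->hash);
 *		return (0);
 *	}
 *
 * RB_FOREACH visits the entries in increasing hash order, so this prints 7
 * and then 42.
 */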