/* $NetBSD: tmpfs_subr.c,v 1.35 2007/07/09 21:10:50 ad Exp $ */

/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2005 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
 * 2005 program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Efficient memory file system supporting functions.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/dirent.h>
#include <sys/fnv_hash.h>
#include <sys/lock.h>
#include <sys/limits.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/random.h>
#include <sys/refcount.h>
#include <sys/rwlock.h>
#include <sys/smr.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/swap_pager.h>

#include <fs/tmpfs/tmpfs.h>
#include <fs/tmpfs/tmpfs_fifoops.h>
#include <fs/tmpfs/tmpfs_vnops.h>

SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "tmpfs file system");

static long tmpfs_pages_reserved = TMPFS_PAGES_MINRESERVED;
static long tmpfs_pages_avail_init;
static int tmpfs_mem_percent = TMPFS_MEM_PERCENT;
static void tmpfs_set_reserve_from_percent(void);

MALLOC_DEFINE(M_TMPFSDIR, "tmpfs dir", "tmpfs dirent structure");
static uma_zone_t tmpfs_node_pool;
VFS_SMR_DECLARE;

int tmpfs_pager_type = -1;

static vm_object_t
tmpfs_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
    vm_ooffset_t offset, struct ucred *cred)
{
	vm_object_t object;

	MPASS(handle == NULL);
	MPASS(offset == 0);
	object = vm_object_allocate_dyn(tmpfs_pager_type, size,
	    OBJ_COLORED | OBJ_SWAP);
	if (!swap_pager_init_object(object, NULL, NULL, size, 0)) {
		vm_object_deallocate(object);
		object = NULL;
	}
	return (object);
}

/*
 * Make sure tmpfs vnodes with writable mappings can be found on the lazy list.
 *
 * This allows for periodic mtime updates while only scanning vnodes which are
 * plausibly dirty, see tmpfs_update_mtime_lazy.
 */
static void
tmpfs_pager_writecount_recalc(vm_object_t object, vm_offset_t old,
    vm_offset_t new)
{
	struct vnode *vp;

	VM_OBJECT_ASSERT_WLOCKED(object);

	vp = VM_TO_TMPFS_VP(object);

	/*
	 * Forced unmount?
	 */
	if (vp == NULL) {
		KASSERT((object->flags & OBJ_TMPFS_VREF) == 0,
		    ("object %p with OBJ_TMPFS_VREF but without vnode",
		    object));
		VM_OBJECT_WUNLOCK(object);
		return;
	}

	if (old == 0) {
		VNASSERT((object->flags & OBJ_TMPFS_VREF) == 0, vp,
		    ("object without writable mappings has a reference"));
		VNPASS(vp->v_usecount > 0, vp);
	} else {
		VNASSERT((object->flags & OBJ_TMPFS_VREF) != 0, vp,
		    ("object with writable mappings does not "
		    "have a reference"));
	}

	if (old == new) {
		VM_OBJECT_WUNLOCK(object);
		return;
	}

	if (new == 0) {
		vm_object_clear_flag(object, OBJ_TMPFS_VREF);
		VM_OBJECT_WUNLOCK(object);
		vrele(vp);
	} else {
		if ((object->flags & OBJ_TMPFS_VREF) == 0) {
			vref(vp);
			vlazy(vp);
			vm_object_set_flag(object, OBJ_TMPFS_VREF);
		}
		VM_OBJECT_WUNLOCK(object);
	}
}

static void
tmpfs_pager_update_writecount(vm_object_t object, vm_offset_t start,
    vm_offset_t end)
{
	vm_offset_t new, old;

	VM_OBJECT_WLOCK(object);
	KASSERT((object->flags & OBJ_ANON) == 0,
	    ("%s: object %p with OBJ_ANON", __func__, object));
	old = object->un_pager.swp.writemappings;
	object->un_pager.swp.writemappings += (vm_ooffset_t)end - start;
	new = object->un_pager.swp.writemappings;
	tmpfs_pager_writecount_recalc(object, old, new);
	VM_OBJECT_ASSERT_UNLOCKED(object);
}

static void
tmpfs_pager_release_writecount(vm_object_t object, vm_offset_t start,
    vm_offset_t end)
{
	vm_offset_t new, old;

	VM_OBJECT_WLOCK(object);
	KASSERT((object->flags & OBJ_ANON) == 0,
	    ("%s: object %p with OBJ_ANON", __func__, object));
	old = object->un_pager.swp.writemappings;
	object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start;
	new = object->un_pager.swp.writemappings;
	tmpfs_pager_writecount_recalc(object, old, new);
	VM_OBJECT_ASSERT_UNLOCKED(object);
}
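
/*
 * Illustrative sketch (comment only, not part of the interface): the
 * VM layer calls the two hooks above as writable mappings of a
 * tmpfs-backed object come and go, roughly:
 *
 *	tmpfs_pager_update_writecount(obj, start, end);	  on mmap(PROT_WRITE)
 *	...
 *	tmpfs_pager_release_writecount(obj, start, end);  on munmap()
 *
 * The first transition of writemappings away from zero vrefs the vnode
 * and puts it on the lazy list; the transition back to zero drops that
 * reference, see tmpfs_pager_writecount_recalc() above.
 */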

static void
tmpfs_pager_getvp(vm_object_t object, struct vnode **vpp, bool *vp_heldp)
{
	struct vnode *vp;

	/*
	 * Tmpfs VREG node, which was reclaimed, has tmpfs_pager_type
	 * type.  In this case there is no v_writecount to adjust.
	 */
	if (vp_heldp != NULL)
		VM_OBJECT_RLOCK(object);
	else
		VM_OBJECT_ASSERT_LOCKED(object);
	if ((object->flags & OBJ_TMPFS) != 0) {
		vp = VM_TO_TMPFS_VP(object);
		if (vp != NULL) {
			*vpp = vp;
			if (vp_heldp != NULL) {
				vhold(vp);
				*vp_heldp = true;
			}
		}
	}
	if (vp_heldp != NULL)
		VM_OBJECT_RUNLOCK(object);
}

static void
tmpfs_pager_freespace(vm_object_t obj, vm_pindex_t start, vm_size_t size)
{
	struct tmpfs_node *node;
	struct tmpfs_mount *tm;
	vm_size_t c;

	swap_pager_freespace(obj, start, size, &c);
	if ((obj->flags & OBJ_TMPFS) == 0 || c == 0)
		return;

	node = obj->un_pager.swp.swp_priv;
	MPASS(node->tn_type == VREG);
	tm = node->tn_reg.tn_tmp;

	KASSERT(tm->tm_pages_used >= c,
	    ("tmpfs tm %p pages %jd free %jd", tm,
	    (uintmax_t)tm->tm_pages_used, (uintmax_t)c));
	atomic_add_long(&tm->tm_pages_used, -c);
	KASSERT(node->tn_reg.tn_pages >= c,
	    ("tmpfs node %p pages %jd free %jd", node,
	    (uintmax_t)node->tn_reg.tn_pages, (uintmax_t)c));
	node->tn_reg.tn_pages -= c;
}

static void
tmpfs_page_inserted(vm_object_t obj, vm_page_t m)
{
	struct tmpfs_node *node;
	struct tmpfs_mount *tm;

	if ((obj->flags & OBJ_TMPFS) == 0)
		return;

	node = obj->un_pager.swp.swp_priv;
	MPASS(node->tn_type == VREG);
	tm = node->tn_reg.tn_tmp;

	if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) {
		atomic_add_long(&tm->tm_pages_used, 1);
		node->tn_reg.tn_pages += 1;
	}
}

static void
tmpfs_page_removed(vm_object_t obj, vm_page_t m)
{
	struct tmpfs_node *node;
	struct tmpfs_mount *tm;

	if ((obj->flags & OBJ_TMPFS) == 0)
		return;

	node = obj->un_pager.swp.swp_priv;
	MPASS(node->tn_type == VREG);
	tm = node->tn_reg.tn_tmp;

	if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) {
		KASSERT(tm->tm_pages_used >= 1,
		    ("tmpfs tm %p pages %jd free 1", tm,
		    (uintmax_t)tm->tm_pages_used));
		atomic_add_long(&tm->tm_pages_used, -1);
		KASSERT(node->tn_reg.tn_pages >= 1,
		    ("tmpfs node %p pages %jd free 1", node,
		    (uintmax_t)node->tn_reg.tn_pages));
		node->tn_reg.tn_pages -= 1;
	}
}

static boolean_t
tmpfs_can_alloc_page(vm_object_t obj, vm_pindex_t pindex)
{
	struct tmpfs_mount *tm;

	tm = VM_TO_TMPFS_MP(obj);
	if (tm == NULL || vm_pager_has_page(obj, pindex, NULL, NULL) ||
	    tm->tm_pages_max == 0)
		return (true);
	if (tm->tm_pages_max == ULONG_MAX)
		return (tmpfs_mem_avail() >= 1);
	return (tm->tm_pages_max > atomic_load_long(&tm->tm_pages_used));
}

struct pagerops tmpfs_pager_ops = {
	.pgo_kvme_type = KVME_TYPE_VNODE,
	.pgo_alloc = tmpfs_pager_alloc,
	.pgo_set_writeable_dirty = vm_object_set_writeable_dirty_,
	.pgo_update_writecount = tmpfs_pager_update_writecount,
	.pgo_release_writecount = tmpfs_pager_release_writecount,
	.pgo_mightbedirty = vm_object_mightbedirty_,
	.pgo_getvp = tmpfs_pager_getvp,
	.pgo_freespace = tmpfs_pager_freespace,
	.pgo_page_inserted = tmpfs_page_inserted,
	.pgo_page_removed = tmpfs_page_removed,
	.pgo_can_alloc_page = tmpfs_can_alloc_page,
};

static int
tmpfs_node_ctor(void *mem, int size, void *arg, int flags)
{
	struct tmpfs_node *node;

	node = mem;
	node->tn_gen++;
	node->tn_size = 0;
	node->tn_status = 0;
	node->tn_accessed = false;
	node->tn_flags = 0;
	node->tn_links = 0;
	node->tn_vnode = NULL;
	node->tn_vpstate = 0;
	return (0);
}
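
/*
 * Note on the UMA lifecycle split: tmpfs_node_ctor() above runs on
 * every allocation from the zone, while tmpfs_node_init() below runs
 * only when a fresh item is added to it.  Bumping tn_gen in the ctor
 * therefore gives a reused node a generation number distinct from its
 * previous incarnation, while the interlock mutex and the random
 * generation seed are set up once per item.
 */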

static void
tmpfs_node_dtor(void *mem, int size, void *arg)
{
	struct tmpfs_node *node;

	node = mem;
	node->tn_type = VNON;
}

static int
tmpfs_node_init(void *mem, int size, int flags)
{
	struct tmpfs_node *node;

	node = mem;
	node->tn_id = 0;
	mtx_init(&node->tn_interlock, "tmpfsni", NULL, MTX_DEF);
	node->tn_gen = arc4random();
	return (0);
}

static void
tmpfs_node_fini(void *mem, int size)
{
	struct tmpfs_node *node;

	node = mem;
	mtx_destroy(&node->tn_interlock);
}

int
tmpfs_subr_init(void)
{
	tmpfs_pager_type = vm_pager_alloc_dyn_type(&tmpfs_pager_ops,
	    OBJT_SWAP);
	if (tmpfs_pager_type == -1)
		return (EINVAL);
	tmpfs_node_pool = uma_zcreate("TMPFS node",
	    sizeof(struct tmpfs_node), tmpfs_node_ctor, tmpfs_node_dtor,
	    tmpfs_node_init, tmpfs_node_fini, UMA_ALIGN_PTR, 0);
	VFS_SMR_ZONE_SET(tmpfs_node_pool);

	tmpfs_pages_avail_init = tmpfs_mem_avail();
	tmpfs_set_reserve_from_percent();
	return (0);
}

void
tmpfs_subr_uninit(void)
{
	if (tmpfs_pager_type != -1)
		vm_pager_free_dyn_type(tmpfs_pager_type);
	tmpfs_pager_type = -1;
	uma_zdestroy(tmpfs_node_pool);
}

static int
sysctl_mem_reserved(SYSCTL_HANDLER_ARGS)
{
	int error;
	long pages, bytes;

	pages = *(long *)arg1;
	bytes = pages * PAGE_SIZE;

	error = sysctl_handle_long(oidp, &bytes, 0, req);
	if (error || !req->newptr)
		return (error);

	pages = bytes / PAGE_SIZE;
	if (pages < TMPFS_PAGES_MINRESERVED)
		return (EINVAL);

	*(long *)arg1 = pages;
	return (0);
}

SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_reserved,
    CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &tmpfs_pages_reserved, 0,
    sysctl_mem_reserved, "L",
    "Amount of available memory and swap below which tmpfs growth stops");

static int
sysctl_mem_percent(SYSCTL_HANDLER_ARGS)
{
	int error, percent;

	percent = *(int *)arg1;
	error = sysctl_handle_int(oidp, &percent, 0, req);
	if (error || !req->newptr)
		return (error);

	if ((unsigned) percent > 100)
		return (EINVAL);

	*(int *)arg1 = percent;
	tmpfs_set_reserve_from_percent();
	return (0);
}

static void
tmpfs_set_reserve_from_percent(void)
{
	size_t reserved;

	reserved = tmpfs_pages_avail_init * (100 - tmpfs_mem_percent) / 100;
	tmpfs_pages_reserved = max(reserved, TMPFS_PAGES_MINRESERVED);
}

SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_percent,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &tmpfs_mem_percent, 0,
    sysctl_mem_percent, "I",
    "Percent of available memory that can be used if no size limit");

static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a,
    struct tmpfs_dirent *b);
RB_PROTOTYPE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp);

size_t
tmpfs_mem_avail(void)
{
	size_t avail;
	long reserved;

	avail = swap_pager_avail + vm_free_count();
	reserved = atomic_load_long(&tmpfs_pages_reserved);
	if (__predict_false(avail < reserved))
		return (0);
	return (avail - reserved);
}
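
/*
 * Worked example of the sizing logic above and below (assuming 4 KiB
 * pages): with tmpfs_mem_percent at 95 and 1,000,000 pages available
 * at boot, tmpfs_set_reserve_from_percent() reserves 50,000 pages, and
 * tmpfs_mem_avail() then reports at most 950,000 pages for tmpfs
 * growth.  tmpfs_pages_used() below adds a metadata estimate (one node
 * plus one dirent per file, rounded up to whole pages) to the data
 * pages charged to the mount.
 */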

size_t
tmpfs_pages_used(struct tmpfs_mount *tmp)
{
	const size_t node_size = sizeof(struct tmpfs_node) +
	    sizeof(struct tmpfs_dirent);
	size_t meta_pages;

	meta_pages = howmany((uintmax_t)tmp->tm_nodes_inuse * node_size,
	    PAGE_SIZE);
	return (meta_pages + tmp->tm_pages_used);
}

bool
tmpfs_pages_check_avail(struct tmpfs_mount *tmp, size_t req_pages)
{
	if (tmpfs_mem_avail() < req_pages)
		return (false);

	if (tmp->tm_pages_max != ULONG_MAX &&
	    tmp->tm_pages_max < req_pages + tmpfs_pages_used(tmp))
		return (false);

	return (true);
}

static int
tmpfs_partial_page_invalidate(vm_object_t object, vm_pindex_t idx, int base,
    int end, boolean_t ignerr)
{
	vm_page_t m;
	int rv, error;

	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(base >= 0, ("%s: base %d", __func__, base));
	KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base,
	    end));
	error = 0;

retry:
	m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT);
	if (m != NULL) {
		MPASS(vm_page_all_valid(m));
	} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
		m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL |
		    VM_ALLOC_WAITFAIL);
		if (m == NULL)
			goto retry;
		vm_object_pip_add(object, 1);
		VM_OBJECT_WUNLOCK(object);
		rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
		VM_OBJECT_WLOCK(object);
		vm_object_pip_wakeup(object);
		if (rv == VM_PAGER_OK) {
			/*
			 * Since the page was not resident, and therefore not
			 * recently accessed, immediately enqueue it for
			 * asynchronous laundering.  The current operation is
			 * not regarded as an access.
			 */
			vm_page_launder(m);
		} else {
			vm_page_free(m);
			m = NULL;
			if (!ignerr)
				error = EIO;
		}
	}
	if (m != NULL) {
		pmap_zero_page_area(m, base, end - base);
		vm_page_set_dirty(m);
		vm_page_xunbusy(m);
	}

	return (error);
}

void
tmpfs_ref_node(struct tmpfs_node *node)
{
#ifdef INVARIANTS
	u_int old;

	old =
#endif
	refcount_acquire(&node->tn_refcount);
#ifdef INVARIANTS
	KASSERT(old > 0, ("node %p zero refcount", node));
#endif
}

/*
 * Allocates a new node of type 'type' inside the 'tmp' mount point, with
 * its owner set to 'uid', its group to 'gid' and its mode set to 'mode',
 * using the credentials of the process 'p'.
 *
 * If the node type is set to 'VDIR', then the parent parameter must point
 * to the parent directory of the node being created.  It may only be NULL
 * while allocating the root node.
 *
 * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter
 * specifies the device the node represents.
 *
 * If the node type is set to 'VLNK', then the parameter target specifies
 * the file name of the target file for the symbolic link that is being
 * created.
 *
 * Note that new nodes are retrieved from the available list if it has
 * items or, if it is empty, from the node pool as long as there is enough
 * space to create them.
 *
 * Returns zero on success or an appropriate error code on failure.
 */
int
tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, __enum_uint8(vtype) type,
    uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent,
    const char *target, dev_t rdev, struct tmpfs_node **node)
{
	struct tmpfs_node *nnode;
	char *symlink;
	char symlink_smr;

	/* If the root directory of the 'tmp' file system is not yet
	 * allocated, this must be the request to do it. */
	MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR));

	MPASS((type == VLNK) ^ (target == NULL));
	MPASS((type == VBLK || type == VCHR) ^ (rdev == VNOVAL));

	if (tmp->tm_nodes_inuse >= tmp->tm_nodes_max)
		return (ENOSPC);
	if (!tmpfs_pages_check_avail(tmp, 1))
		return (ENOSPC);

	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
		/*
		 * When a new tmpfs node is created for a fully
		 * constructed mount point, there must be a parent
		 * node, whose vnode is locked exclusively.  As a
		 * consequence, if the unmount is executing in
		 * parallel, vflush() cannot reclaim the parent vnode.
		 * Due to this, the check for the MNTK_UNMOUNT flag is
		 * not racy: if we did not see MNTK_UNMOUNT, then tmp
		 * cannot be destroyed until node construction is
		 * finished and the parent vnode unlocked.
		 *
		 * Tmpfs does not need to instantiate new nodes during
		 * unmount.
		 */
		return (EBUSY);
	}
	if ((mp->mnt_flag & MNT_RDONLY) != 0)
		return (EROFS);

	nnode = uma_zalloc_smr(tmpfs_node_pool, M_WAITOK);

	/* Generic initialization. */
	nnode->tn_type = type;
	vfs_timestamp(&nnode->tn_atime);
	nnode->tn_birthtime = nnode->tn_ctime = nnode->tn_mtime =
	    nnode->tn_atime;
	nnode->tn_uid = uid;
	nnode->tn_gid = gid;
	nnode->tn_mode = mode;
	nnode->tn_id = alloc_unr64(&tmp->tm_ino_unr);
	nnode->tn_refcount = 1;
	LIST_INIT(&nnode->tn_extattrs);

	/* Type-specific initialization. */
	switch (nnode->tn_type) {
	case VBLK:
	case VCHR:
		nnode->tn_rdev = rdev;
		break;

	case VDIR:
		RB_INIT(&nnode->tn_dir.tn_dirhead);
		LIST_INIT(&nnode->tn_dir.tn_dupindex);
		MPASS(parent != nnode);
		MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL));
		nnode->tn_dir.tn_parent = (parent == NULL) ? nnode : parent;
		nnode->tn_dir.tn_readdir_lastn = 0;
		nnode->tn_dir.tn_readdir_lastp = NULL;
		nnode->tn_links++;
		TMPFS_NODE_LOCK(nnode->tn_dir.tn_parent);
		nnode->tn_dir.tn_parent->tn_links++;
		TMPFS_NODE_UNLOCK(nnode->tn_dir.tn_parent);
		break;

	case VFIFO:
		/* FALLTHROUGH */
	case VSOCK:
		break;

	case VLNK:
		MPASS(strlen(target) < MAXPATHLEN);
		nnode->tn_size = strlen(target);

		symlink = NULL;
		if (!tmp->tm_nonc) {
			symlink = cache_symlink_alloc(nnode->tn_size + 1,
			    M_WAITOK);
			symlink_smr = true;
		}
		if (symlink == NULL) {
			symlink = malloc(nnode->tn_size + 1, M_TMPFSNAME,
			    M_WAITOK);
			symlink_smr = false;
		}
		memcpy(symlink, target, nnode->tn_size + 1);

		/*
		 * Allow safe symlink resolving for lockless lookup.
		 * tmpfs_fplookup_symlink references this comment.
		 *
		 * 1. nnode is not yet visible to the world
		 * 2. both tn_link_target and tn_link_smr get populated
		 * 3. release fence publishes their content
		 * 4. tn_link_target content is immutable until node
		 *    destruction, where the pointer gets set to NULL
		 * 5. tn_link_smr is never changed once set
		 *
		 * As a result it is sufficient to issue load consume
		 * on the node pointer to also get the above content
		 * in a stable manner.  Worst case tn_link_smr flag
		 * may be set to true despite being stale, while the
		 * target buffer is already cleared out.
		 */
		atomic_store_ptr(&nnode->tn_link_target, symlink);
		atomic_store_char((char *)&nnode->tn_link_smr, symlink_smr);
		atomic_thread_fence_rel();
		break;

	case VREG:
		nnode->tn_reg.tn_aobj =
		    vm_pager_allocate(tmpfs_pager_type, NULL, 0,
			VM_PROT_DEFAULT, 0,
			NULL /* XXXKIB - tmpfs needs swap reservation */);
		nnode->tn_reg.tn_aobj->un_pager.swp.swp_priv = nnode;
		vm_object_set_flag(nnode->tn_reg.tn_aobj, OBJ_TMPFS);
		nnode->tn_reg.tn_tmp = tmp;
		nnode->tn_reg.tn_pages = 0;
		break;

	default:
		panic("tmpfs_alloc_node: type %p %d", nnode,
		    (int)nnode->tn_type);
	}

	TMPFS_LOCK(tmp);
	LIST_INSERT_HEAD(&tmp->tm_nodes_used, nnode, tn_entries);
	nnode->tn_attached = true;
	tmp->tm_nodes_inuse++;
	tmp->tm_refcount++;
	TMPFS_UNLOCK(tmp);

	*node = nnode;
	return (0);
}

/*
 * Destroys the node pointed to by node from the file system 'tmp'.
 * If the node references a directory, no entries are allowed.
 */
void
tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node)
{
	if (refcount_release_if_not_last(&node->tn_refcount))
		return;

	TMPFS_LOCK(tmp);
	TMPFS_NODE_LOCK(node);
	if (!tmpfs_free_node_locked(tmp, node, false)) {
		TMPFS_NODE_UNLOCK(node);
		TMPFS_UNLOCK(tmp);
	}
}

bool
tmpfs_free_node_locked(struct tmpfs_mount *tmp, struct tmpfs_node *node,
    bool detach)
{
	struct tmpfs_extattr *ea;
	vm_object_t uobj;
	char *symlink;
	bool last;

	TMPFS_MP_ASSERT_LOCKED(tmp);
	TMPFS_NODE_ASSERT_LOCKED(node);

	last = refcount_release(&node->tn_refcount);
	if (node->tn_attached && (detach || last)) {
		MPASS(tmp->tm_nodes_inuse > 0);
		tmp->tm_nodes_inuse--;
		LIST_REMOVE(node, tn_entries);
		node->tn_attached = false;
	}
	if (!last)
		return (false);

	TMPFS_NODE_UNLOCK(node);

#ifdef INVARIANTS
	MPASS(node->tn_vnode == NULL);
	MPASS((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0);

	/*
	 * Make sure this is a node type we can deal with.  Everything
	 * is explicitly enumerated without the 'default' clause so
	 * the compiler can throw an error in case a new type is
	 * added.
	 */
	switch (node->tn_type) {
	case VBLK:
	case VCHR:
	case VDIR:
	case VFIFO:
	case VSOCK:
	case VLNK:
	case VREG:
		break;
	case VNON:
	case VBAD:
	case VMARKER:
		panic("%s: bad type %d for node %p", __func__,
		    (int)node->tn_type, node);
	}
#endif

	while ((ea = LIST_FIRST(&node->tn_extattrs)) != NULL) {
		LIST_REMOVE(ea, ea_extattrs);
		tmpfs_extattr_free(ea);
	}

	switch (node->tn_type) {
	case VREG:
		uobj = node->tn_reg.tn_aobj;
		node->tn_reg.tn_aobj = NULL;
		if (uobj != NULL) {
			VM_OBJECT_WLOCK(uobj);
			KASSERT((uobj->flags & OBJ_TMPFS) != 0,
			    ("tmpfs node %p uobj %p not tmpfs", node, uobj));
			vm_object_clear_flag(uobj, OBJ_TMPFS);
			KASSERT(tmp->tm_pages_used >= node->tn_reg.tn_pages,
			    ("tmpfs tmp %p node %p pages %jd free %jd", tmp,
			    node, (uintmax_t)tmp->tm_pages_used,
			    (uintmax_t)node->tn_reg.tn_pages));
			atomic_add_long(&tmp->tm_pages_used,
			    -node->tn_reg.tn_pages);
			VM_OBJECT_WUNLOCK(uobj);
		}
		tmpfs_free_tmp(tmp);

		/*
		 * vm_object_deallocate() must not be called while
		 * owning tm_allnode_lock, because deallocate might
		 * sleep.  Call it after tmpfs_free_tmp() does the
		 * unlock.
		 */
		if (uobj != NULL)
			vm_object_deallocate(uobj);

		break;
	case VLNK:
		tmpfs_free_tmp(tmp);

		symlink = node->tn_link_target;
		atomic_store_ptr(&node->tn_link_target, NULL);
		if (atomic_load_char(&node->tn_link_smr)) {
			cache_symlink_free(symlink, node->tn_size + 1);
		} else {
			free(symlink, M_TMPFSNAME);
		}
		break;
	default:
		tmpfs_free_tmp(tmp);
		break;
	}

	uma_zfree_smr(tmpfs_node_pool, node);
	return (true);
}

static __inline uint32_t
tmpfs_dirent_hash(const char *name, u_int len)
{
	uint32_t hash;

	hash = fnv_32_buf(name, len, FNV1_32_INIT + len) & TMPFS_DIRCOOKIE_MASK;
#ifdef TMPFS_DEBUG_DIRCOOKIE_DUP
	hash &= 0xf;
#endif
	if (hash < TMPFS_DIRCOOKIE_MIN)
		hash += TMPFS_DIRCOOKIE_MIN;

	return (hash);
}

static __inline off_t
tmpfs_dirent_cookie(struct tmpfs_dirent *de)
{
	if (de == NULL)
		return (TMPFS_DIRCOOKIE_EOF);

	MPASS(de->td_cookie >= TMPFS_DIRCOOKIE_MIN);

	return (de->td_cookie);
}

static __inline boolean_t
tmpfs_dirent_dup(struct tmpfs_dirent *de)
{
	return ((de->td_cookie & TMPFS_DIRCOOKIE_DUP) != 0);
}

static __inline boolean_t
tmpfs_dirent_duphead(struct tmpfs_dirent *de)
{
	return ((de->td_cookie & TMPFS_DIRCOOKIE_DUPHEAD) != 0);
}

void
tmpfs_dirent_init(struct tmpfs_dirent *de, const char *name, u_int namelen)
{
	de->td_hash = de->td_cookie = tmpfs_dirent_hash(name, namelen);
	memcpy(de->ud.td_name, name, namelen);
	de->td_namelen = namelen;
}

/*
 * Allocates a new directory entry for the node 'node' with the name
 * 'name'.  The new directory entry is returned in *de.
 *
 * The link count of node is increased by one to reflect the new object
 * referencing it.
 *
 * Returns zero on success or an appropriate error code on failure.
 */
int
tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node,
    const char *name, u_int len, struct tmpfs_dirent **de)
{
	struct tmpfs_dirent *nde;

	nde = malloc(sizeof(*nde), M_TMPFSDIR, M_WAITOK);
	nde->td_node = node;
	if (name != NULL) {
		nde->ud.td_name = malloc(len, M_TMPFSNAME, M_WAITOK);
		tmpfs_dirent_init(nde, name, len);
	} else
		nde->td_namelen = 0;
	if (node != NULL)
		node->tn_links++;

	*de = nde;

	return (0);
}
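
/*
 * Typical call sequence (sketch, mirroring tmpfs_alloc_file() below):
 * a create operation allocates the entry and then attaches it to the
 * parent directory, which takes over ownership:
 *
 *	error = tmpfs_alloc_dirent(tmp, node, name, len, &de);
 *	if (error != 0)
 *		return (error);
 *	tmpfs_dir_attach(dvp, de);
 *
 * A later removal detaches the entry with tmpfs_dir_detach() and then
 * frees it with tmpfs_free_dirent().
 */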

/*
 * Frees a directory entry.  It is the caller's responsibility to destroy
 * the node referenced by it if needed.
 *
 * The link count of node is decreased by one to reflect the removal of an
 * object that referenced it.  This only happens if the entry points to a
 * node; otherwise the function will not access the node referred to by
 * the directory entry, as it may already have been released from the
 * outside.
 */
void
tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de)
{
	struct tmpfs_node *node;

	node = de->td_node;
	if (node != NULL) {
		MPASS(node->tn_links > 0);
		node->tn_links--;
	}
	if (!tmpfs_dirent_duphead(de) && de->ud.td_name != NULL)
		free(de->ud.td_name, M_TMPFSNAME);
	free(de, M_TMPFSDIR);
}

void
tmpfs_destroy_vobject(struct vnode *vp, vm_object_t obj)
{
	bool want_vrele;

	ASSERT_VOP_ELOCKED(vp, "tmpfs_destroy_vobject");
	if (vp->v_type != VREG || obj == NULL)
		return;

	VM_OBJECT_WLOCK(obj);
	VI_LOCK(vp);
	/*
	 * May be going through forced unmount.
	 */
	want_vrele = false;
	if ((obj->flags & OBJ_TMPFS_VREF) != 0) {
		vm_object_clear_flag(obj, OBJ_TMPFS_VREF);
		want_vrele = true;
	}

	if (vp->v_writecount < 0)
		vp->v_writecount = 0;
	VI_UNLOCK(vp);
	VM_OBJECT_WUNLOCK(obj);
	if (want_vrele) {
		vrele(vp);
	}
}

/*
 * Allocates a new vnode for the node 'node' or returns a new reference to
 * an existing one if the node already had a vnode referencing it.  The
 * resulting locked vnode is returned in *vpp.
 *
 * Returns zero on success or an appropriate error code on failure.
 */
int
tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, int lkflag,
    struct vnode **vpp)
{
	struct vnode *vp;
	enum vgetstate vs;
	struct tmpfs_mount *tm;
	vm_object_t object;
	int error;

	error = 0;
	tm = VFS_TO_TMPFS(mp);
	TMPFS_NODE_LOCK(node);
	tmpfs_ref_node(node);
loop:
	TMPFS_NODE_ASSERT_LOCKED(node);
	if ((vp = node->tn_vnode) != NULL) {
		MPASS((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0);
		if ((node->tn_type == VDIR && node->tn_dir.tn_parent == NULL) ||
		    (VN_IS_DOOMED(vp) &&
		    (lkflag & LK_NOWAIT) != 0)) {
			TMPFS_NODE_UNLOCK(node);
			error = ENOENT;
			vp = NULL;
			goto out;
		}
		if (VN_IS_DOOMED(vp)) {
			node->tn_vpstate |= TMPFS_VNODE_WRECLAIM;
			while ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) {
				msleep(&node->tn_vnode, TMPFS_NODE_MTX(node),
				    0, "tmpfsE", 0);
			}
			goto loop;
		}
		vs = vget_prep(vp);
		TMPFS_NODE_UNLOCK(node);
		error = vget_finish(vp, lkflag, vs);
		if (error == ENOENT) {
			TMPFS_NODE_LOCK(node);
			goto loop;
		}
		if (error != 0) {
			vp = NULL;
			goto out;
		}

		/*
		 * Make sure the vnode is still there after
		 * getting the interlock to avoid racing a free.
		 */
		if (node->tn_vnode != vp) {
			vput(vp);
			TMPFS_NODE_LOCK(node);
			goto loop;
		}

		goto out;
	}

	if ((node->tn_vpstate & TMPFS_VNODE_DOOMED) ||
	    (node->tn_type == VDIR && node->tn_dir.tn_parent == NULL)) {
		TMPFS_NODE_UNLOCK(node);
		error = ENOENT;
		vp = NULL;
		goto out;
	}

	/*
	 * Otherwise lock the vp list while we call getnewvnode()
	 * since that can block.
	 */
	if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) {
		node->tn_vpstate |= TMPFS_VNODE_WANT;
		error = msleep((caddr_t) &node->tn_vpstate,
		    TMPFS_NODE_MTX(node), 0, "tmpfs_alloc_vp", 0);
		if (error != 0)
			goto out;
		goto loop;
	} else
		node->tn_vpstate |= TMPFS_VNODE_ALLOCATING;

	TMPFS_NODE_UNLOCK(node);

	/* Get a new vnode and associate it with our node. */
	error = getnewvnode("tmpfs", mp, VFS_TO_TMPFS(mp)->tm_nonc ?
	    &tmpfs_vnodeop_nonc_entries : &tmpfs_vnodeop_entries, &vp);
	if (error != 0)
		goto unlock;
	MPASS(vp != NULL);

	/* lkflag is ignored, the lock is exclusive */
	(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	vp->v_data = node;
	vp->v_type = node->tn_type;

	/* Type-specific initialization. */
	switch (node->tn_type) {
	case VBLK:
		/* FALLTHROUGH */
	case VCHR:
		/* FALLTHROUGH */
	case VLNK:
		/* FALLTHROUGH */
	case VSOCK:
		break;
	case VFIFO:
		vp->v_op = &tmpfs_fifoop_entries;
		break;
	case VREG:
		object = node->tn_reg.tn_aobj;
		VM_OBJECT_WLOCK(object);
		KASSERT((object->flags & OBJ_TMPFS_VREF) == 0,
		    ("%s: object %p with OBJ_TMPFS_VREF but without vnode",
		    __func__, object));
		KASSERT(object->un_pager.swp.writemappings == 0,
		    ("%s: object %p has writemappings",
		    __func__, object));
		VI_LOCK(vp);
		KASSERT(vp->v_object == NULL, ("Not NULL v_object in tmpfs"));
		vp->v_object = object;
		vn_irflag_set_locked(vp, (tm->tm_pgread ? VIRF_PGREAD : 0) |
		    VIRF_TEXT_REF);
		VI_UNLOCK(vp);
		VM_OBJECT_WUNLOCK(object);
		break;
	case VDIR:
		MPASS(node->tn_dir.tn_parent != NULL);
		if (node->tn_dir.tn_parent == node)
			vp->v_vflag |= VV_ROOT;
		break;

	default:
		panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type);
	}
	if (vp->v_type != VFIFO)
		VN_LOCK_ASHARE(vp);

	error = insmntque1(vp, mp);
	if (error != 0) {
		/* Need to clear v_object for insmntque failure. */
		tmpfs_destroy_vobject(vp, vp->v_object);
		vp->v_object = NULL;
		vp->v_data = NULL;
		vp->v_op = &dead_vnodeops;
		vgone(vp);
		vput(vp);
		vp = NULL;
	} else {
		vn_set_state(vp, VSTATE_CONSTRUCTED);
	}

unlock:
	TMPFS_NODE_LOCK(node);

	MPASS(node->tn_vpstate & TMPFS_VNODE_ALLOCATING);
	node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING;
	node->tn_vnode = vp;

	if (node->tn_vpstate & TMPFS_VNODE_WANT) {
		node->tn_vpstate &= ~TMPFS_VNODE_WANT;
		TMPFS_NODE_UNLOCK(node);
		wakeup((caddr_t) &node->tn_vpstate);
	} else
		TMPFS_NODE_UNLOCK(node);

out:
	if (error == 0) {
		*vpp = vp;

#ifdef INVARIANTS
		MPASS(*vpp != NULL);
		ASSERT_VOP_LOCKED(*vpp, __func__);
		TMPFS_NODE_LOCK(node);
		MPASS(*vpp == node->tn_vnode);
		TMPFS_NODE_UNLOCK(node);
#endif
	}
	tmpfs_free_node(tm, node);

	return (error);
}

/*
 * Destroys the association between the vnode vp and the node it
 * references.
 */
void
tmpfs_free_vp(struct vnode *vp)
{
	struct tmpfs_node *node;

	node = VP_TO_TMPFS_NODE(vp);

	TMPFS_NODE_ASSERT_LOCKED(node);
	node->tn_vnode = NULL;
	if ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0)
		wakeup(&node->tn_vnode);
	node->tn_vpstate &= ~TMPFS_VNODE_WRECLAIM;
	vp->v_data = NULL;
}
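
/*
 * The association torn down by tmpfs_free_vp() is the one created by
 * tmpfs_alloc_vp() above.  Reclaiming a vnode leaves the tmpfs node
 * itself intact; a later lookup simply allocates a fresh vnode for the
 * same node, which is why node identity (tn_id, tn_gen) lives in the
 * node rather than in the vnode.
 */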

/*
 * Allocates a new file of type 'type' and adds it to the parent directory
 * 'dvp'; this addition is done using the component name given in 'cnp'.
 * The ownership of the new file is automatically assigned based on the
 * credentials of the caller (through 'cnp'), the group is set based on
 * the parent directory and the mode is determined from the 'vap' argument.
 * If successful, *vpp holds a vnode to the newly created file and zero
 * is returned.  Otherwise *vpp is NULL and the function returns an
 * appropriate error code.
 */
int
tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap,
    struct componentname *cnp, const char *target)
{
	int error;
	struct tmpfs_dirent *de;
	struct tmpfs_mount *tmp;
	struct tmpfs_node *dnode;
	struct tmpfs_node *node;
	struct tmpfs_node *parent;

	ASSERT_VOP_ELOCKED(dvp, "tmpfs_alloc_file");

	tmp = VFS_TO_TMPFS(dvp->v_mount);
	dnode = VP_TO_TMPFS_DIR(dvp);
	*vpp = NULL;

	/* If the entry we are creating is a directory, we cannot overflow
	 * the number of links of its parent, because it will get a new
	 * link. */
	if (vap->va_type == VDIR) {
		/* Ensure that we do not overflow the maximum number of links
		 * imposed by the system. */
		MPASS(dnode->tn_links <= TMPFS_LINK_MAX);
		if (dnode->tn_links == TMPFS_LINK_MAX) {
			return (EMLINK);
		}

		parent = dnode;
		MPASS(parent != NULL);
	} else
		parent = NULL;

	/* Allocate a node that represents the new file. */
	error = tmpfs_alloc_node(dvp->v_mount, tmp, vap->va_type,
	    cnp->cn_cred->cr_uid, dnode->tn_gid, vap->va_mode, parent,
	    target, vap->va_rdev, &node);
	if (error != 0)
		return (error);

	/* Allocate a directory entry that points to the new file. */
	error = tmpfs_alloc_dirent(tmp, node, cnp->cn_nameptr, cnp->cn_namelen,
	    &de);
	if (error != 0) {
		tmpfs_free_node(tmp, node);
		return (error);
	}

	/* Allocate a vnode for the new file. */
	error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp);
	if (error != 0) {
		tmpfs_free_dirent(tmp, de);
		tmpfs_free_node(tmp, node);
		return (error);
	}

	/* Now that all required items are allocated, we can proceed to
	 * insert the new node into the directory, an operation that
	 * cannot fail. */
	if (cnp->cn_flags & ISWHITEOUT)
		tmpfs_dir_whiteout_remove(dvp, cnp);
	tmpfs_dir_attach(dvp, de);
	return (0);
}

struct tmpfs_dirent *
tmpfs_dir_first(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc)
{
	struct tmpfs_dirent *de;

	de = RB_MIN(tmpfs_dir, &dnode->tn_dir.tn_dirhead);
	dc->tdc_tree = de;
	if (de != NULL && tmpfs_dirent_duphead(de))
		de = LIST_FIRST(&de->ud.td_duphead);
	dc->tdc_current = de;

	return (dc->tdc_current);
}

struct tmpfs_dirent *
tmpfs_dir_next(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc)
{
	struct tmpfs_dirent *de;

	MPASS(dc->tdc_tree != NULL);
	if (tmpfs_dirent_dup(dc->tdc_current)) {
		dc->tdc_current = LIST_NEXT(dc->tdc_current, uh.td_dup.entries);
		if (dc->tdc_current != NULL)
			return (dc->tdc_current);
	}
	dc->tdc_tree = dc->tdc_current = RB_NEXT(tmpfs_dir,
	    &dnode->tn_dir.tn_dirhead, dc->tdc_tree);
	if ((de = dc->tdc_current) != NULL && tmpfs_dirent_duphead(de)) {
		dc->tdc_current = LIST_FIRST(&de->ud.td_duphead);
		MPASS(dc->tdc_current != NULL);
	}

	return (dc->tdc_current);
}
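
/*
 * Directory iteration sketch using the cursor pair above; this is the
 * pattern the readdir code follows when starting from the beginning of
 * a directory:
 *
 *	struct tmpfs_dir_cursor dc;
 *	struct tmpfs_dirent *de;
 *
 *	for (de = tmpfs_dir_first(dnode, &dc); de != NULL;
 *	    de = tmpfs_dir_next(dnode, &dc))
 *		...
 *
 * The cursor transparently descends into duphead chains, so callers
 * see every real entry exactly once and never the synthetic dupheads.
 */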

/* Lookup directory entry in RB-Tree. Function may return duphead entry. */
static struct tmpfs_dirent *
tmpfs_dir_xlookup_hash(struct tmpfs_node *dnode, uint32_t hash)
{
	struct tmpfs_dirent *de, dekey;

	dekey.td_hash = hash;
	de = RB_FIND(tmpfs_dir, &dnode->tn_dir.tn_dirhead, &dekey);
	return (de);
}

/* Lookup directory entry by cookie, initialize directory cursor accordingly. */
static struct tmpfs_dirent *
tmpfs_dir_lookup_cookie(struct tmpfs_node *node, off_t cookie,
    struct tmpfs_dir_cursor *dc)
{
	struct tmpfs_dir *dirhead = &node->tn_dir.tn_dirhead;
	struct tmpfs_dirent *de, dekey;

	MPASS(cookie >= TMPFS_DIRCOOKIE_MIN);

	if (cookie == node->tn_dir.tn_readdir_lastn &&
	    (de = node->tn_dir.tn_readdir_lastp) != NULL) {
		/* Protect against possible race, tn_readdir_last[pn]
		 * may be updated with only shared vnode lock held. */
		if (cookie == tmpfs_dirent_cookie(de))
			goto out;
	}

	if ((cookie & TMPFS_DIRCOOKIE_DUP) != 0) {
		LIST_FOREACH(de, &node->tn_dir.tn_dupindex,
		    uh.td_dup.index_entries) {
			MPASS(tmpfs_dirent_dup(de));
			if (de->td_cookie == cookie)
				goto out;
			/* dupindex list is sorted. */
			if (de->td_cookie < cookie) {
				de = NULL;
				goto out;
			}
		}
		MPASS(de == NULL);
		goto out;
	}

	if ((cookie & TMPFS_DIRCOOKIE_MASK) != cookie) {
		de = NULL;
	} else {
		dekey.td_hash = cookie;
		/* Recover if direntry for cookie was removed */
		de = RB_NFIND(tmpfs_dir, dirhead, &dekey);
	}
	dc->tdc_tree = de;
	dc->tdc_current = de;
	if (de != NULL && tmpfs_dirent_duphead(de)) {
		dc->tdc_current = LIST_FIRST(&de->ud.td_duphead);
		MPASS(dc->tdc_current != NULL);
	}
	return (dc->tdc_current);

out:
	dc->tdc_tree = de;
	dc->tdc_current = de;
	if (de != NULL && tmpfs_dirent_dup(de))
		dc->tdc_tree = tmpfs_dir_xlookup_hash(node,
		    de->td_hash);
	return (dc->tdc_current);
}

/*
 * Looks for a directory entry in the directory represented by node.
 * 'cnp' describes the name of the entry to look for.  Note that the .
 * and .. components are not allowed as they do not physically exist
 * within directories.
 *
 * Returns a pointer to the entry when found, otherwise NULL.
 */
struct tmpfs_dirent *
tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f,
    struct componentname *cnp)
{
	struct tmpfs_dir_duphead *duphead;
	struct tmpfs_dirent *de;
	uint32_t hash;

	MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.'));
	MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' &&
	    cnp->cn_nameptr[1] == '.')));
	TMPFS_VALIDATE_DIR(node);

	hash = tmpfs_dirent_hash(cnp->cn_nameptr, cnp->cn_namelen);
	de = tmpfs_dir_xlookup_hash(node, hash);
	if (de != NULL && tmpfs_dirent_duphead(de)) {
		duphead = &de->ud.td_duphead;
		LIST_FOREACH(de, duphead, uh.td_dup.entries) {
			if (TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr,
			    cnp->cn_namelen))
				break;
		}
	} else if (de != NULL) {
		if (!TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr,
		    cnp->cn_namelen))
			de = NULL;
	}
	if (de != NULL && f != NULL && de->td_node != f)
		de = NULL;

	return (de);
}
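
/*
 * Cookie scheme recap: a name hash doubles as the directory cookie, so
 * two names hashing to the same value cannot both use the hash as
 * their cookie.  Colliding entries are chained under a synthetic
 * duphead entry and receive per-directory cookies from the dedicated
 * TMPFS_DIRCOOKIE_DUP range; tmpfs_dir_attach_dup() below hands those
 * out and keeps the dupindex list sorted.
 */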

/*
 * Attach duplicate-cookie directory entry nde to dnode and insert to dupindex
 * list, allocate new cookie value.
 */
static void
tmpfs_dir_attach_dup(struct tmpfs_node *dnode,
    struct tmpfs_dir_duphead *duphead, struct tmpfs_dirent *nde)
{
	struct tmpfs_dir_duphead *dupindex;
	struct tmpfs_dirent *de, *pde;

	dupindex = &dnode->tn_dir.tn_dupindex;
	de = LIST_FIRST(dupindex);
	if (de == NULL || de->td_cookie < TMPFS_DIRCOOKIE_DUP_MAX) {
		if (de == NULL)
			nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN;
		else
			nde->td_cookie = de->td_cookie + 1;
		MPASS(tmpfs_dirent_dup(nde));
		LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries);
		LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
		return;
	}

	/*
	 * Cookie numbers are near exhaustion.  Scan dupindex list for unused
	 * numbers.  dupindex list is sorted in descending order.  Keep it so
	 * after inserting nde.
	 */
	while (1) {
		pde = de;
		de = LIST_NEXT(de, uh.td_dup.index_entries);
		if (de == NULL && pde->td_cookie != TMPFS_DIRCOOKIE_DUP_MIN) {
			/*
			 * Last element of the index doesn't have minimal cookie
			 * value, use it.
			 */
			nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN;
			LIST_INSERT_AFTER(pde, nde, uh.td_dup.index_entries);
			LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
			return;
		} else if (de == NULL) {
			/*
			 * We are so lucky to have 2^30 hash duplicates in a
			 * single directory :) Return largest possible cookie
			 * value.  It should be fine except for possible issues
			 * with VOP_READDIR restart.
			 */
			nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MAX;
			LIST_INSERT_HEAD(dupindex, nde,
			    uh.td_dup.index_entries);
			LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
			return;
		}
		if (de->td_cookie + 1 == pde->td_cookie ||
		    de->td_cookie >= TMPFS_DIRCOOKIE_DUP_MAX)
			continue;	/* No hole or invalid cookie. */
		nde->td_cookie = de->td_cookie + 1;
		MPASS(tmpfs_dirent_dup(nde));
		MPASS(pde->td_cookie > nde->td_cookie);
		MPASS(nde->td_cookie > de->td_cookie);
		LIST_INSERT_BEFORE(de, nde, uh.td_dup.index_entries);
		LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
		return;
	}
}

/*
 * Attaches the directory entry de to the directory represented by vp.
 * Note that this does not change the link count of the node pointed by
 * the directory entry, as this is done by tmpfs_alloc_dirent.
 */
void
tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de)
{
	struct tmpfs_node *dnode;
	struct tmpfs_dirent *xde, *nde;

	ASSERT_VOP_ELOCKED(vp, __func__);
	MPASS(de->td_namelen > 0);
	MPASS(de->td_hash >= TMPFS_DIRCOOKIE_MIN);
	MPASS(de->td_cookie == de->td_hash);

	dnode = VP_TO_TMPFS_DIR(vp);
	dnode->tn_dir.tn_readdir_lastn = 0;
	dnode->tn_dir.tn_readdir_lastp = NULL;

	MPASS(!tmpfs_dirent_dup(de));
	xde = RB_INSERT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de);
	if (xde != NULL && tmpfs_dirent_duphead(xde))
		tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de);
	else if (xde != NULL) {
		/*
		 * Allocate new duphead.  Swap xde with duphead to avoid
		 * adding/removing elements with the same hash.
		 */
		MPASS(!tmpfs_dirent_dup(xde));
		tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), NULL, NULL, 0,
		    &nde);
		/* *nde = *xde; XXX gcc 4.2.1 may generate invalid code. */
		memcpy(nde, xde, sizeof(*xde));
		xde->td_cookie |= TMPFS_DIRCOOKIE_DUPHEAD;
		LIST_INIT(&xde->ud.td_duphead);
		xde->td_namelen = 0;
		xde->td_node = NULL;
		tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, nde);
		tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de);
	}
	dnode->tn_size += sizeof(struct tmpfs_dirent);
	dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED;
	dnode->tn_accessed = true;
	tmpfs_update(vp);
}

/*
 * Detaches the directory entry de from the directory represented by vp.
 * Note that this does not change the link count of the node pointed by
 * the directory entry, as this is done by tmpfs_free_dirent.
 */
void
tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de)
{
	struct tmpfs_mount *tmp;
	struct tmpfs_dir *head;
	struct tmpfs_node *dnode;
	struct tmpfs_dirent *xde;

	ASSERT_VOP_ELOCKED(vp, __func__);

	dnode = VP_TO_TMPFS_DIR(vp);
	head = &dnode->tn_dir.tn_dirhead;
	dnode->tn_dir.tn_readdir_lastn = 0;
	dnode->tn_dir.tn_readdir_lastp = NULL;

	if (tmpfs_dirent_dup(de)) {
		/* Remove duphead if de was last entry. */
		if (LIST_NEXT(de, uh.td_dup.entries) == NULL) {
			xde = tmpfs_dir_xlookup_hash(dnode, de->td_hash);
			MPASS(tmpfs_dirent_duphead(xde));
		} else
			xde = NULL;
		LIST_REMOVE(de, uh.td_dup.entries);
		LIST_REMOVE(de, uh.td_dup.index_entries);
		if (xde != NULL) {
			if (LIST_EMPTY(&xde->ud.td_duphead)) {
				RB_REMOVE(tmpfs_dir, head, xde);
				tmp = VFS_TO_TMPFS(vp->v_mount);
				MPASS(xde->td_node == NULL);
				tmpfs_free_dirent(tmp, xde);
			}
		}
		de->td_cookie = de->td_hash;
	} else
		RB_REMOVE(tmpfs_dir, head, de);

	dnode->tn_size -= sizeof(struct tmpfs_dirent);
	dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED;
	dnode->tn_accessed = true;
	tmpfs_update(vp);
}

void
tmpfs_dir_destroy(struct tmpfs_mount *tmp, struct tmpfs_node *dnode)
{
	struct tmpfs_dirent *de, *dde, *nde;

	RB_FOREACH_SAFE(de, tmpfs_dir, &dnode->tn_dir.tn_dirhead, nde) {
		RB_REMOVE(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de);
		/* Node may already be destroyed. */
		de->td_node = NULL;
		if (tmpfs_dirent_duphead(de)) {
			while ((dde = LIST_FIRST(&de->ud.td_duphead)) != NULL) {
				LIST_REMOVE(dde, uh.td_dup.entries);
				dde->td_node = NULL;
				tmpfs_free_dirent(tmp, dde);
			}
		}
		tmpfs_free_dirent(tmp, de);
	}
}
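
/*
 * The readdir helpers below emit a directory in three stages: the
 * synthetic '.' and '..' entries at the fixed cookies
 * TMPFS_DIRCOOKIE_DOT and TMPFS_DIRCOOKIE_DOTDOT, then the real
 * entries in cookie order, ending at TMPFS_DIRCOOKIE_EOF.
 */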

/*
 * Helper function for tmpfs_readdir.  Creates a '.' entry for the given
 * directory and returns it in the uio space.  The function returns 0
 * on success, EJUSTRETURN if there was not enough space in the uio
 * structure to hold the directory entry or an appropriate error code
 * if another error happens.
 */
static int
tmpfs_dir_getdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node,
    struct uio *uio)
{
	int error;
	struct dirent dent;

	TMPFS_VALIDATE_DIR(node);
	MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOT);

	dent.d_fileno = node->tn_id;
	dent.d_off = TMPFS_DIRCOOKIE_DOTDOT;
	dent.d_type = DT_DIR;
	dent.d_namlen = 1;
	dent.d_name[0] = '.';
	dent.d_reclen = GENERIC_DIRSIZ(&dent);
	dirent_terminate(&dent);

	if (dent.d_reclen > uio->uio_resid)
		error = EJUSTRETURN;
	else
		error = uiomove(&dent, dent.d_reclen, uio);

	tmpfs_set_accessed(tm, node);

	return (error);
}

/*
 * Helper function for tmpfs_readdir.  Creates a '..' entry for the given
 * directory and returns it in the uio space.  The function returns 0
 * on success, EJUSTRETURN if there was not enough space in the uio
 * structure to hold the directory entry or an appropriate error code
 * if another error happens.
 */
static int
tmpfs_dir_getdotdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node,
    struct uio *uio, off_t next)
{
	struct tmpfs_node *parent;
	struct dirent dent;
	int error;

	TMPFS_VALIDATE_DIR(node);
	MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT);

	/*
	 * Return ENOENT if the current node is already removed.
	 */
	TMPFS_ASSERT_LOCKED(node);
	parent = node->tn_dir.tn_parent;
	if (parent == NULL)
		return (ENOENT);

	dent.d_fileno = parent->tn_id;
	dent.d_off = next;
	dent.d_type = DT_DIR;
	dent.d_namlen = 2;
	dent.d_name[0] = '.';
	dent.d_name[1] = '.';
	dent.d_reclen = GENERIC_DIRSIZ(&dent);
	dirent_terminate(&dent);

	if (dent.d_reclen > uio->uio_resid)
		error = EJUSTRETURN;
	else
		error = uiomove(&dent, dent.d_reclen, uio);

	tmpfs_set_accessed(tm, node);

	return (error);
}

/*
 * Helper function for tmpfs_readdir.  Returns as many directory entries
 * as can fit in the uio space.  The read starts at uio->uio_offset.
 * The function returns 0 on success, EJUSTRETURN if there was not
 * enough space in the uio structure to hold the directory entry or an
 * appropriate error code if another error happens.
 */
int
tmpfs_dir_getdents(struct tmpfs_mount *tm, struct tmpfs_node *node,
    struct uio *uio, int maxcookies, uint64_t *cookies, int *ncookies)
{
	struct tmpfs_dir_cursor dc;
	struct tmpfs_dirent *de, *nde;
	off_t off;
	int error;

	TMPFS_VALIDATE_DIR(node);

	off = 0;

	/*
	 * Lookup the node from the current offset.  The starting offset of
	 * 0 will lookup both '.' and '..', and then the first real entry,
	 * or EOF if there are none.  Then find all entries for the dir that
	 * fit into the buffer.  Once no more entries are found (de == NULL),
	 * the offset is set to TMPFS_DIRCOOKIE_EOF, which will cause the next
	 * call to return 0.
	 */
	switch (uio->uio_offset) {
	case TMPFS_DIRCOOKIE_DOT:
		error = tmpfs_dir_getdotdent(tm, node, uio);
		if (error != 0)
			return (error);
		uio->uio_offset = off = TMPFS_DIRCOOKIE_DOTDOT;
		if (cookies != NULL)
			cookies[(*ncookies)++] = off;
		/* FALLTHROUGH */
	case TMPFS_DIRCOOKIE_DOTDOT:
		de = tmpfs_dir_first(node, &dc);
		off = tmpfs_dirent_cookie(de);
		error = tmpfs_dir_getdotdotdent(tm, node, uio, off);
		if (error != 0)
			return (error);
		uio->uio_offset = off;
		if (cookies != NULL)
			cookies[(*ncookies)++] = off;
		/* EOF. */
		if (de == NULL)
			return (0);
		break;
	case TMPFS_DIRCOOKIE_EOF:
		return (0);
	default:
		de = tmpfs_dir_lookup_cookie(node, uio->uio_offset, &dc);
		if (de == NULL)
			return (EINVAL);
		if (cookies != NULL)
			off = tmpfs_dirent_cookie(de);
	}

	/*
	 * Read as many entries as possible; i.e., until we reach the end of
	 * the directory or we exhaust uio space.
	 */
	do {
		struct dirent d;

		/*
		 * Create a dirent structure representing the current
		 * tmpfs_node and fill it.
		 */
		if (de->td_node == NULL) {
			d.d_fileno = 1;
			d.d_type = DT_WHT;
		} else {
			d.d_fileno = de->td_node->tn_id;
			switch (de->td_node->tn_type) {
			case VBLK:
				d.d_type = DT_BLK;
				break;

			case VCHR:
				d.d_type = DT_CHR;
				break;

			case VDIR:
				d.d_type = DT_DIR;
				break;

			case VFIFO:
				d.d_type = DT_FIFO;
				break;

			case VLNK:
				d.d_type = DT_LNK;
				break;

			case VREG:
				d.d_type = DT_REG;
				break;

			case VSOCK:
				d.d_type = DT_SOCK;
				break;

			default:
				panic("tmpfs_dir_getdents: type %p %d",
				    de->td_node, (int)de->td_node->tn_type);
			}
		}
		d.d_namlen = de->td_namelen;
		MPASS(de->td_namelen < sizeof(d.d_name));
		(void)memcpy(d.d_name, de->ud.td_name, de->td_namelen);
		d.d_reclen = GENERIC_DIRSIZ(&d);

		/*
		 * Stop reading if the directory entry we are processing is
		 * bigger than the amount of data that can be returned.
		 */
		if (d.d_reclen > uio->uio_resid) {
			error = EJUSTRETURN;
			break;
		}

		nde = tmpfs_dir_next(node, &dc);
		d.d_off = tmpfs_dirent_cookie(nde);
		dirent_terminate(&d);

		/*
		 * Copy the new dirent structure into the output buffer and
		 * advance pointers.
		 */
		error = uiomove(&d, d.d_reclen, uio);
		if (error == 0) {
			de = nde;
			if (cookies != NULL) {
				off = tmpfs_dirent_cookie(de);
				MPASS(*ncookies < maxcookies);
				cookies[(*ncookies)++] = off;
			}
		}
	} while (error == 0 && uio->uio_resid > 0 && de != NULL);

	/* Skip setting off when using cookies as it is already done above. */
	if (cookies == NULL)
		off = tmpfs_dirent_cookie(de);

	/* Update the offset and cache. */
	uio->uio_offset = off;
	node->tn_dir.tn_readdir_lastn = off;
	node->tn_dir.tn_readdir_lastp = de;

	tmpfs_set_accessed(tm, node);
	return (error);
}

int
tmpfs_dir_whiteout_add(struct vnode *dvp, struct componentname *cnp)
{
	struct tmpfs_dirent *de;
	int error;

	error = tmpfs_alloc_dirent(VFS_TO_TMPFS(dvp->v_mount), NULL,
	    cnp->cn_nameptr, cnp->cn_namelen, &de);
	if (error != 0)
		return (error);
	tmpfs_dir_attach(dvp, de);
	return (0);
}

void
tmpfs_dir_whiteout_remove(struct vnode *dvp, struct componentname *cnp)
{
	struct tmpfs_dirent *de;

	de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp);
	MPASS(de != NULL && de->td_node == NULL);
	tmpfs_dir_detach(dvp, de);
	tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de);
}

/*
 * Resizes the aobj associated with the regular file pointed to by 'vp' to the
 * size 'newsize'.  'vp' must point to a vnode that represents a regular file.
 * 'newsize' must be non-negative.
 *
 * Returns zero on success or an appropriate error code on failure.
 */
int
tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr)
{
	struct tmpfs_node *node;
	vm_object_t uobj;
	vm_pindex_t idx, newpages, oldpages;
	off_t oldsize;
	int base, error;

	MPASS(vp->v_type == VREG);
	MPASS(newsize >= 0);

	node = VP_TO_TMPFS_NODE(vp);
	uobj = node->tn_reg.tn_aobj;

	/*
	 * Convert the old and new sizes to the number of pages needed to
	 * store them.  It may happen that we do not need to do anything
	 * because the last allocated page can accommodate the change on
	 * its own.
	 */
	oldsize = node->tn_size;
	oldpages = OFF_TO_IDX(oldsize + PAGE_MASK);
	MPASS(oldpages == uobj->size);
	newpages = OFF_TO_IDX(newsize + PAGE_MASK);

	if (__predict_true(newpages == oldpages && newsize >= oldsize)) {
		node->tn_size = newsize;
		return (0);
	}

	VM_OBJECT_WLOCK(uobj);
	if (newsize < oldsize) {
		/*
		 * Zero the truncated part of the last page.
		 */
		base = newsize & PAGE_MASK;
		if (base != 0) {
			idx = OFF_TO_IDX(newsize);
			error = tmpfs_partial_page_invalidate(uobj, idx, base,
			    PAGE_SIZE, ignerr);
			if (error != 0) {
				VM_OBJECT_WUNLOCK(uobj);
				return (error);
			}
		}

		/*
		 * Release any swap space and free any whole pages.
		 */
		if (newpages < oldpages)
			vm_object_page_remove(uobj, newpages, 0, 0);
	}
	uobj->size = newpages;
	VM_OBJECT_WUNLOCK(uobj);

	node->tn_size = newsize;
	return (0);
}
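
/*
 * Example of the page math above (assuming 4 KiB pages): truncating a
 * 10000-byte file (3 pages) to 6000 bytes leaves newpages = 2;
 * tmpfs_partial_page_invalidate() zeroes bytes 1904..4095 of the
 * second page (file offsets 6000..8191) and vm_object_page_remove()
 * discards everything from page index 2 onwards.
 */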

/*
 * Punch hole in the aobj associated with the regular file pointed to by 'vp'.
 * Requests completely beyond the end-of-file are converted to no-op.
 *
 * Returns 0 on success or error code from tmpfs_partial_page_invalidate() on
 * failure.
 */
int
tmpfs_reg_punch_hole(struct vnode *vp, off_t *offset, off_t *length)
{
	struct tmpfs_node *node;
	vm_object_t object;
	vm_pindex_t pistart, pi, piend;
	int startofs, endofs, end;
	off_t off, len;
	int error;

	KASSERT(*length <= OFF_MAX - *offset, ("%s: offset + length overflows",
	    __func__));
	node = VP_TO_TMPFS_NODE(vp);
	KASSERT(node->tn_type == VREG, ("%s: node is not regular file",
	    __func__));
	object = node->tn_reg.tn_aobj;
	off = *offset;
	len = omin(node->tn_size - off, *length);
	startofs = off & PAGE_MASK;
	endofs = (off + len) & PAGE_MASK;
	pistart = OFF_TO_IDX(off);
	piend = OFF_TO_IDX(off + len);
	pi = OFF_TO_IDX((vm_ooffset_t)off + PAGE_MASK);
	error = 0;

	/* Handle the case when offset is on or beyond file size. */
	if (len <= 0) {
		*length = 0;
		return (0);
	}

	VM_OBJECT_WLOCK(object);

	/*
	 * If there is a partial page at the beginning of the hole-punching
	 * request, fill the partial page with zeroes.
	 */
	if (startofs != 0) {
		end = pistart != piend ? PAGE_SIZE : endofs;
		error = tmpfs_partial_page_invalidate(object, pistart, startofs,
		    end, FALSE);
		if (error != 0)
			goto out;
		off += end - startofs;
		len -= end - startofs;
	}

	/*
	 * Toss away the full pages in the affected area.
	 */
	if (pi < piend) {
		vm_object_page_remove(object, pi, piend, 0);
		off += IDX_TO_OFF(piend - pi);
		len -= IDX_TO_OFF(piend - pi);
	}

	/*
	 * If there is a partial page at the end of the hole-punching request,
	 * fill the partial page with zeroes.
	 */
	if (endofs != 0 && pistart != piend) {
		error = tmpfs_partial_page_invalidate(object, piend, 0, endofs,
		    FALSE);
		if (error != 0)
			goto out;
		off += endofs;
		len -= endofs;
	}

out:
	VM_OBJECT_WUNLOCK(object);
	*offset = off;
	*length = len;
	return (error);
}

void
tmpfs_check_mtime(struct vnode *vp)
{
	struct tmpfs_node *node;
	struct vm_object *obj;

	ASSERT_VOP_ELOCKED(vp, "check_mtime");
	if (vp->v_type != VREG)
		return;
	obj = vp->v_object;
	KASSERT(obj->type == tmpfs_pager_type &&
	    (obj->flags & (OBJ_SWAP | OBJ_TMPFS)) ==
	    (OBJ_SWAP | OBJ_TMPFS), ("non-tmpfs obj"));
	/* unlocked read */
	if (obj->generation != obj->cleangeneration) {
		VM_OBJECT_WLOCK(obj);
		if (obj->generation != obj->cleangeneration) {
			obj->cleangeneration = obj->generation;
			node = VP_TO_TMPFS_NODE(vp);
			node->tn_status |= TMPFS_NODE_MODIFIED |
			    TMPFS_NODE_CHANGED;
		}
		VM_OBJECT_WUNLOCK(obj);
	}
}
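
/*
 * The tmpfs_ch*() helpers below back VOP_SETATTR.  They share a common
 * shape: validate against the mount and the node flags, check the
 * caller's privilege, apply the change, and mark TMPFS_NODE_CHANGED;
 * the actual timestamp update is left to a subsequent tmpfs_update()
 * issued by the caller.
 */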
2025 */ 2026 int 2027 tmpfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred, 2028 struct thread *td) 2029 { 2030 int error; 2031 struct tmpfs_node *node; 2032 2033 ASSERT_VOP_ELOCKED(vp, "chflags"); 2034 2035 node = VP_TO_TMPFS_NODE(vp); 2036 2037 if ((flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK | 2038 UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | 2039 UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE | 2040 UF_SPARSE | UF_SYSTEM)) != 0) 2041 return (EOPNOTSUPP); 2042 2043 /* Disallow this operation if the file system is mounted read-only. */ 2044 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2045 return (EROFS); 2046 2047 /* 2048 * Callers may only modify the file flags on objects they 2049 * have VADMIN rights for. 2050 */ 2051 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2052 return (error); 2053 /* 2054 * Unprivileged processes are not permitted to unset system 2055 * flags, or modify flags if any system flags are set. 2056 */ 2057 if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) { 2058 if (node->tn_flags & 2059 (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { 2060 error = securelevel_gt(cred, 0); 2061 if (error) 2062 return (error); 2063 } 2064 } else { 2065 if (node->tn_flags & 2066 (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || 2067 ((flags ^ node->tn_flags) & SF_SETTABLE)) 2068 return (EPERM); 2069 } 2070 node->tn_flags = flags; 2071 node->tn_status |= TMPFS_NODE_CHANGED; 2072 2073 ASSERT_VOP_ELOCKED(vp, "chflags2"); 2074 2075 return (0); 2076 } 2077 2078 /* 2079 * Change access mode on the given vnode. 2080 * Caller should execute tmpfs_update on vp after a successful execution. 2081 * The vnode must be locked on entry and remain locked on exit. 2082 */ 2083 int 2084 tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, 2085 struct thread *td) 2086 { 2087 int error; 2088 struct tmpfs_node *node; 2089 mode_t newmode; 2090 2091 ASSERT_VOP_ELOCKED(vp, "chmod"); 2092 ASSERT_VOP_IN_SEQC(vp); 2093 2094 node = VP_TO_TMPFS_NODE(vp); 2095 2096 /* Disallow this operation if the file system is mounted read-only. */ 2097 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2098 return (EROFS); 2099 2100 /* Immutable or append-only files cannot be modified, either. */ 2101 if (node->tn_flags & (IMMUTABLE | APPEND)) 2102 return (EPERM); 2103 2104 /* 2105 * To modify the permissions on a file, must possess VADMIN 2106 * for that file. 2107 */ 2108 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2109 return (error); 2110 2111 /* 2112 * Privileged processes may set the sticky bit on non-directories, 2113 * as well as set the setgid bit on a file with a group that the 2114 * process is not a member of. 2115 */ 2116 if (vp->v_type != VDIR && (mode & S_ISTXT)) { 2117 if (priv_check_cred(cred, PRIV_VFS_STICKYFILE)) 2118 return (EFTYPE); 2119 } 2120 if (!groupmember(node->tn_gid, cred) && (mode & S_ISGID)) { 2121 error = priv_check_cred(cred, PRIV_VFS_SETGID); 2122 if (error) 2123 return (error); 2124 } 2125 2126 newmode = node->tn_mode & ~ALLPERMS; 2127 newmode |= mode & ALLPERMS; 2128 atomic_store_short(&node->tn_mode, newmode); 2129 2130 node->tn_status |= TMPFS_NODE_CHANGED; 2131 2132 ASSERT_VOP_ELOCKED(vp, "chmod2"); 2133 2134 return (0); 2135 } 2136 2137 /* 2138 * Change ownership of the given vnode. At least one of uid or gid must 2139 * be different than VNOVAL. If one is set to that value, the attribute 2140 * is unchanged. 2141 * Caller should execute tmpfs_update on vp after a successful execution. 
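 *
 * For example (hypothetical values): tmpfs_chown(vp, VNOVAL, 20, cred, td)
 * leaves tn_uid untouched and changes only the group; this needs no
 * PRIV_VFS_CHOWN as long as the caller is a member of group 20. If the
 * node has S_ISUID or S_ISGID set and the ownership actually changes,
 * those bits are cleared unless the caller holds PRIV_VFS_RETAINSUGID.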
* The vnode must be locked on entry and remain locked on exit.
2143 */
2144 int
2145 tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred,
2146 struct thread *td)
2147 {
2148 int error;
2149 struct tmpfs_node *node;
2150 uid_t ouid;
2151 gid_t ogid;
2152 mode_t newmode;
2153
2154 ASSERT_VOP_ELOCKED(vp, "chown");
2155 ASSERT_VOP_IN_SEQC(vp);
2156
2157 node = VP_TO_TMPFS_NODE(vp);
2158
2159 /* Assign default values if they are unknown. */
2160 MPASS(uid != VNOVAL || gid != VNOVAL);
2161 if (uid == VNOVAL)
2162 uid = node->tn_uid;
2163 if (gid == VNOVAL)
2164 gid = node->tn_gid;
2165 MPASS(uid != VNOVAL && gid != VNOVAL);
2166
2167 /* Disallow this operation if the file system is mounted read-only. */
2168 if (vp->v_mount->mnt_flag & MNT_RDONLY)
2169 return (EROFS);
2170
2171 /* Immutable or append-only files cannot be modified, either. */
2172 if (node->tn_flags & (IMMUTABLE | APPEND))
2173 return (EPERM);
2174
2175 /*
2176 * To modify the ownership of a file, must possess VADMIN for that
2177 * file.
2178 */
2179 if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
2180 return (error);
2181
2182 /*
2183 * To change the owner of a file, or change the group of a file to a
2184 * group of which we are not a member, the caller must have
2185 * privilege.
2186 */
2187 if ((uid != node->tn_uid ||
2188 (gid != node->tn_gid && !groupmember(gid, cred))) &&
2189 (error = priv_check_cred(cred, PRIV_VFS_CHOWN)))
2190 return (error);
2191
2192 ogid = node->tn_gid;
2193 ouid = node->tn_uid;
2194
2195 node->tn_uid = uid;
2196 node->tn_gid = gid;
2197
2198 node->tn_status |= TMPFS_NODE_CHANGED;
2199
2200 if ((node->tn_mode & (S_ISUID | S_ISGID)) != 0 &&
2201 (ouid != uid || ogid != gid)) {
2202 if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) {
2203 newmode = node->tn_mode & ~(S_ISUID | S_ISGID);
2204 atomic_store_short(&node->tn_mode, newmode);
2205 }
2206 }
2207
2208 ASSERT_VOP_ELOCKED(vp, "chown2");
2209
2210 return (0);
2211 }
2212
2213 /*
2214 * Change size of the given vnode.
2215 * Caller should execute tmpfs_update on vp after a successful execution.
2216 * The vnode must be locked on entry and remain locked on exit.
2217 */
2218 int
2219 tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred,
2220 struct thread *td)
2221 {
2222 int error;
2223 struct tmpfs_node *node;
2224
2225 ASSERT_VOP_ELOCKED(vp, "chsize");
2226
2227 node = VP_TO_TMPFS_NODE(vp);
2228
2229 /* Decide whether this is a valid operation based on the file type. */
2230 error = 0;
2231 switch (vp->v_type) {
2232 case VDIR:
2233 return (EISDIR);
2234
2235 case VREG:
2236 if (vp->v_mount->mnt_flag & MNT_RDONLY)
2237 return (EROFS);
2238 break;
2239
2240 case VBLK:
2241 /* FALLTHROUGH */
2242 case VCHR:
2243 /* FALLTHROUGH */
2244 case VFIFO:
2245 /*
2246 * Allow modifications of special files even if the file
2247 * system is mounted read-only (we are not modifying the
2248 * files themselves, but the objects they represent).
2249 */
2250 return (0);
2251
2252 default:
2253 /* Anything else is unsupported. */
2254 return (EOPNOTSUPP);
2255 }
2256
2257 /* Immutable or append-only files cannot be modified, either. */
2258 if (node->tn_flags & (IMMUTABLE | APPEND))
2259 return (EPERM);
2260
2261 error = vn_rlimit_trunc(size, td);
2262 if (error != 0)
2263 return (error);
2264
2265 error = tmpfs_truncate(vp, size);
2266 /*
2267 * tmpfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents
2268 * for us, as well as update tn_status; no need to do that here.
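 *
 * (Specifically, tmpfs_truncate() below sets TMPFS_NODE_CHANGED and
 * TMPFS_NODE_MODIFIED and calls tmpfs_update() itself, so no extra
 * bookkeeping is needed in this function.)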
2269 */ 2270 2271 ASSERT_VOP_ELOCKED(vp, "chsize2"); 2272 2273 return (error); 2274 } 2275 2276 /* 2277 * Change access and modification times of the given vnode. 2278 * Caller should execute tmpfs_update on vp after a successful execution. 2279 * The vnode must be locked on entry and remain locked on exit. 2280 */ 2281 int 2282 tmpfs_chtimes(struct vnode *vp, struct vattr *vap, 2283 struct ucred *cred, struct thread *td) 2284 { 2285 int error; 2286 struct tmpfs_node *node; 2287 2288 ASSERT_VOP_ELOCKED(vp, "chtimes"); 2289 2290 node = VP_TO_TMPFS_NODE(vp); 2291 2292 /* Disallow this operation if the file system is mounted read-only. */ 2293 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2294 return (EROFS); 2295 2296 /* Immutable or append-only files cannot be modified, either. */ 2297 if (node->tn_flags & (IMMUTABLE | APPEND)) 2298 return (EPERM); 2299 2300 error = vn_utimes_perm(vp, vap, cred, td); 2301 if (error != 0) 2302 return (error); 2303 2304 if (vap->va_atime.tv_sec != VNOVAL) 2305 node->tn_accessed = true; 2306 if (vap->va_mtime.tv_sec != VNOVAL) 2307 node->tn_status |= TMPFS_NODE_MODIFIED; 2308 if (vap->va_birthtime.tv_sec != VNOVAL) 2309 node->tn_status |= TMPFS_NODE_MODIFIED; 2310 tmpfs_itimes(vp, &vap->va_atime, &vap->va_mtime); 2311 if (vap->va_birthtime.tv_sec != VNOVAL) 2312 node->tn_birthtime = vap->va_birthtime; 2313 ASSERT_VOP_ELOCKED(vp, "chtimes2"); 2314 2315 return (0); 2316 } 2317 2318 void 2319 tmpfs_set_status(struct tmpfs_mount *tm, struct tmpfs_node *node, int status) 2320 { 2321 2322 if ((node->tn_status & status) == status || tm->tm_ronly) 2323 return; 2324 TMPFS_NODE_LOCK(node); 2325 node->tn_status |= status; 2326 TMPFS_NODE_UNLOCK(node); 2327 } 2328 2329 void 2330 tmpfs_set_accessed(struct tmpfs_mount *tm, struct tmpfs_node *node) 2331 { 2332 if (node->tn_accessed || tm->tm_ronly) 2333 return; 2334 atomic_store_8(&node->tn_accessed, true); 2335 } 2336 2337 /* Sync timestamps */ 2338 void 2339 tmpfs_itimes(struct vnode *vp, const struct timespec *acc, 2340 const struct timespec *mod) 2341 { 2342 struct tmpfs_node *node; 2343 struct timespec now; 2344 2345 ASSERT_VOP_LOCKED(vp, "tmpfs_itimes"); 2346 node = VP_TO_TMPFS_NODE(vp); 2347 2348 if (!node->tn_accessed && 2349 (node->tn_status & (TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED)) == 0) 2350 return; 2351 2352 vfs_timestamp(&now); 2353 TMPFS_NODE_LOCK(node); 2354 if (node->tn_accessed) { 2355 if (acc == NULL) 2356 acc = &now; 2357 node->tn_atime = *acc; 2358 } 2359 if (node->tn_status & TMPFS_NODE_MODIFIED) { 2360 if (mod == NULL) 2361 mod = &now; 2362 node->tn_mtime = *mod; 2363 } 2364 if (node->tn_status & TMPFS_NODE_CHANGED) 2365 node->tn_ctime = now; 2366 node->tn_status &= ~(TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED); 2367 node->tn_accessed = false; 2368 TMPFS_NODE_UNLOCK(node); 2369 2370 /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ 2371 random_harvest_queue(node, sizeof(*node), RANDOM_FS_ATIME); 2372 } 2373 2374 int 2375 tmpfs_truncate(struct vnode *vp, off_t length) 2376 { 2377 struct tmpfs_node *node; 2378 int error; 2379 2380 if (length < 0) 2381 return (EINVAL); 2382 if (length > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) 2383 return (EFBIG); 2384 2385 node = VP_TO_TMPFS_NODE(vp); 2386 error = node->tn_size == length ? 
0 : tmpfs_reg_resize(vp, length, 2387 FALSE); 2388 if (error == 0) 2389 node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; 2390 tmpfs_update(vp); 2391 2392 return (error); 2393 } 2394 2395 static __inline int 2396 tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b) 2397 { 2398 if (a->td_hash > b->td_hash) 2399 return (1); 2400 else if (a->td_hash < b->td_hash) 2401 return (-1); 2402 return (0); 2403 } 2404 2405 RB_GENERATE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); 2406
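/*
 * Note on the directory tree above: entries are keyed solely by the 32-bit
 * td_hash of the component name, so tmpfs_dirtree_cmp() returns 0 for
 * distinct names whose hashes collide; colliding entries are kept on a
 * duplicate list and disambiguated by an explicit name comparison at
 * lookup time. A minimal sketch of the comparator contract (hypothetical
 * standalone snippet, not part of the build):
 *
 *	struct tmpfs_dirent a, b;
 *	a.td_hash = 42;
 *	b.td_hash = 42;
 *	MPASS(tmpfs_dirtree_cmp(&a, &b) == 0);	 // equal keys, even if the
 *						 // names differ
 */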