1 /* $NetBSD: tmpfs_subr.c,v 1.35 2007/07/09 21:10:50 ad Exp $ */ 2 3 /*- 4 * SPDX-License-Identifier: BSD-2-Clause 5 * 6 * Copyright (c) 2005 The NetBSD Foundation, Inc. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to The NetBSD Foundation 10 * by Julio M. Merino Vidal, developed as part of Google's Summer of Code 11 * 2005 program. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 24 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 26 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 * POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 /* 36 * Efficient memory file system supporting functions. 
37 */ 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/dirent.h> 42 #include <sys/fnv_hash.h> 43 #include <sys/lock.h> 44 #include <sys/limits.h> 45 #include <sys/mount.h> 46 #include <sys/namei.h> 47 #include <sys/priv.h> 48 #include <sys/proc.h> 49 #include <sys/random.h> 50 #include <sys/refcount.h> 51 #include <sys/rwlock.h> 52 #include <sys/smr.h> 53 #include <sys/stat.h> 54 #include <sys/sysctl.h> 55 #include <sys/user.h> 56 #include <sys/vnode.h> 57 #include <sys/vmmeter.h> 58 59 #include <vm/vm.h> 60 #include <vm/vm_param.h> 61 #include <vm/vm_object.h> 62 #include <vm/vm_page.h> 63 #include <vm/vm_pageout.h> 64 #include <vm/vm_pager.h> 65 #include <vm/vm_extern.h> 66 #include <vm/swap_pager.h> 67 #include <vm/uma.h> 68 69 #include <fs/tmpfs/tmpfs.h> 70 #include <fs/tmpfs/tmpfs_fifoops.h> 71 #include <fs/tmpfs/tmpfs_vnops.h> 72 73 SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 74 "tmpfs file system"); 75 76 static long tmpfs_pages_reserved = TMPFS_PAGES_MINRESERVED; 77 static long tmpfs_pages_avail_init; 78 static int tmpfs_mem_percent = TMPFS_MEM_PERCENT; 79 static void tmpfs_set_reserve_from_percent(void); 80 81 MALLOC_DEFINE(M_TMPFSDIR, "tmpfs dir", "tmpfs dirent structure"); 82 static uma_zone_t tmpfs_node_pool; 83 VFS_SMR_DECLARE; 84 85 int tmpfs_pager_type = -1; 86 87 static vm_object_t 88 tmpfs_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, 89 vm_ooffset_t offset, struct ucred *cred) 90 { 91 vm_object_t object; 92 93 MPASS(handle == NULL); 94 MPASS(offset == 0); 95 object = vm_object_allocate_dyn(tmpfs_pager_type, size, 96 OBJ_COLORED | OBJ_SWAP); 97 if (!swap_pager_init_object(object, NULL, NULL, size, 0)) { 98 vm_object_deallocate(object); 99 object = NULL; 100 } 101 return (object); 102 } 103 104 /* 105 * Make sure tmpfs vnodes with writable mappings can be found on the lazy list. 
106 * 107 * This allows for periodic mtime updates while only scanning vnodes which are 108 * plausibly dirty, see tmpfs_update_mtime_lazy. 109 */ 110 static void 111 tmpfs_pager_writecount_recalc(vm_object_t object, vm_offset_t old, 112 vm_offset_t new) 113 { 114 struct vnode *vp; 115 116 VM_OBJECT_ASSERT_WLOCKED(object); 117 118 vp = VM_TO_TMPFS_VP(object); 119 120 /* 121 * Forced unmount? 122 */ 123 if (vp == NULL) { 124 KASSERT((object->flags & OBJ_TMPFS_VREF) == 0, 125 ("object %p with OBJ_TMPFS_VREF but without vnode", 126 object)); 127 VM_OBJECT_WUNLOCK(object); 128 return; 129 } 130 131 if (old == 0) { 132 VNASSERT((object->flags & OBJ_TMPFS_VREF) == 0, vp, 133 ("object without writable mappings has a reference")); 134 VNPASS(vp->v_usecount > 0, vp); 135 } else { 136 VNASSERT((object->flags & OBJ_TMPFS_VREF) != 0, vp, 137 ("object with writable mappings does not " 138 "have a reference")); 139 } 140 141 if (old == new) { 142 VM_OBJECT_WUNLOCK(object); 143 return; 144 } 145 146 if (new == 0) { 147 vm_object_clear_flag(object, OBJ_TMPFS_VREF); 148 VM_OBJECT_WUNLOCK(object); 149 vrele(vp); 150 } else { 151 if ((object->flags & OBJ_TMPFS_VREF) == 0) { 152 vref(vp); 153 vlazy(vp); 154 vm_object_set_flag(object, OBJ_TMPFS_VREF); 155 } 156 VM_OBJECT_WUNLOCK(object); 157 } 158 } 159 160 static void 161 tmpfs_pager_update_writecount(vm_object_t object, vm_offset_t start, 162 vm_offset_t end) 163 { 164 vm_offset_t new, old; 165 166 VM_OBJECT_WLOCK(object); 167 KASSERT((object->flags & OBJ_ANON) == 0, 168 ("%s: object %p with OBJ_ANON", __func__, object)); 169 old = object->un_pager.swp.writemappings; 170 object->un_pager.swp.writemappings += (vm_ooffset_t)end - start; 171 new = object->un_pager.swp.writemappings; 172 tmpfs_pager_writecount_recalc(object, old, new); 173 VM_OBJECT_ASSERT_UNLOCKED(object); 174 } 175 176 static void 177 tmpfs_pager_release_writecount(vm_object_t object, vm_offset_t start, 178 vm_offset_t end) 179 { 180 vm_offset_t new, old; 181 182 
VM_OBJECT_WLOCK(object); 183 KASSERT((object->flags & OBJ_ANON) == 0, 184 ("%s: object %p with OBJ_ANON", __func__, object)); 185 old = object->un_pager.swp.writemappings; 186 object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start; 187 new = object->un_pager.swp.writemappings; 188 tmpfs_pager_writecount_recalc(object, old, new); 189 VM_OBJECT_ASSERT_UNLOCKED(object); 190 } 191 192 static void 193 tmpfs_pager_getvp(vm_object_t object, struct vnode **vpp, bool *vp_heldp) 194 { 195 struct vnode *vp; 196 197 /* 198 * Tmpfs VREG node, which was reclaimed, has tmpfs_pager_type 199 * type. In this case there is no v_writecount to adjust. 200 */ 201 if (vp_heldp != NULL) 202 VM_OBJECT_RLOCK(object); 203 else 204 VM_OBJECT_ASSERT_LOCKED(object); 205 if ((object->flags & OBJ_TMPFS) != 0) { 206 vp = VM_TO_TMPFS_VP(object); 207 if (vp != NULL) { 208 *vpp = vp; 209 if (vp_heldp != NULL) { 210 vhold(vp); 211 *vp_heldp = true; 212 } 213 } 214 } 215 if (vp_heldp != NULL) 216 VM_OBJECT_RUNLOCK(object); 217 } 218 219 static void 220 tmpfs_pager_freespace(vm_object_t obj, vm_pindex_t start, vm_size_t size) 221 { 222 struct tmpfs_node *node; 223 struct tmpfs_mount *tm; 224 vm_size_t c; 225 226 swap_pager_freespace(obj, start, size, &c); 227 if ((obj->flags & OBJ_TMPFS) == 0 || c == 0) 228 return; 229 230 node = obj->un_pager.swp.swp_priv; 231 MPASS(node->tn_type == VREG); 232 tm = node->tn_reg.tn_tmp; 233 234 KASSERT(tm->tm_pages_used >= c, 235 ("tmpfs tm %p pages %jd free %jd", tm, 236 (uintmax_t)tm->tm_pages_used, (uintmax_t)c)); 237 atomic_add_long(&tm->tm_pages_used, -c); 238 KASSERT(node->tn_reg.tn_pages >= c, 239 ("tmpfs node %p pages %jd free %jd", node, 240 (uintmax_t)node->tn_reg.tn_pages, (uintmax_t)c)); 241 node->tn_reg.tn_pages -= c; 242 } 243 244 static void 245 tmpfs_page_inserted(vm_object_t obj, vm_page_t m) 246 { 247 struct tmpfs_node *node; 248 struct tmpfs_mount *tm; 249 250 if ((obj->flags & OBJ_TMPFS) == 0) 251 return; 252 253 node = 
obj->un_pager.swp.swp_priv; 254 MPASS(node->tn_type == VREG); 255 tm = node->tn_reg.tn_tmp; 256 257 if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) { 258 atomic_add_long(&tm->tm_pages_used, 1); 259 node->tn_reg.tn_pages += 1; 260 } 261 } 262 263 static void 264 tmpfs_page_removed(vm_object_t obj, vm_page_t m) 265 { 266 struct tmpfs_node *node; 267 struct tmpfs_mount *tm; 268 269 if ((obj->flags & OBJ_TMPFS) == 0) 270 return; 271 272 node = obj->un_pager.swp.swp_priv; 273 MPASS(node->tn_type == VREG); 274 tm = node->tn_reg.tn_tmp; 275 276 if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) { 277 KASSERT(tm->tm_pages_used >= 1, 278 ("tmpfs tm %p pages %jd free 1", tm, 279 (uintmax_t)tm->tm_pages_used)); 280 atomic_add_long(&tm->tm_pages_used, -1); 281 KASSERT(node->tn_reg.tn_pages >= 1, 282 ("tmpfs node %p pages %jd free 1", node, 283 (uintmax_t)node->tn_reg.tn_pages)); 284 node->tn_reg.tn_pages -= 1; 285 } 286 } 287 288 static boolean_t 289 tmpfs_can_alloc_page(vm_object_t obj, vm_pindex_t pindex) 290 { 291 struct tmpfs_mount *tm; 292 293 tm = VM_TO_TMPFS_MP(obj); 294 if (tm == NULL || vm_pager_has_page(obj, pindex, NULL, NULL) || 295 tm->tm_pages_max == 0) 296 return (true); 297 if (tm->tm_pages_max == ULONG_MAX) 298 return (tmpfs_mem_avail() >= 1); 299 return (tm->tm_pages_max > atomic_load_long(&tm->tm_pages_used)); 300 } 301 302 struct pagerops tmpfs_pager_ops = { 303 .pgo_kvme_type = KVME_TYPE_VNODE, 304 .pgo_alloc = tmpfs_pager_alloc, 305 .pgo_set_writeable_dirty = vm_object_set_writeable_dirty_, 306 .pgo_update_writecount = tmpfs_pager_update_writecount, 307 .pgo_release_writecount = tmpfs_pager_release_writecount, 308 .pgo_mightbedirty = vm_object_mightbedirty_, 309 .pgo_getvp = tmpfs_pager_getvp, 310 .pgo_freespace = tmpfs_pager_freespace, 311 .pgo_page_inserted = tmpfs_page_inserted, 312 .pgo_page_removed = tmpfs_page_removed, 313 .pgo_can_alloc_page = tmpfs_can_alloc_page, 314 }; 315 316 static int 317 tmpfs_node_ctor(void *mem, int size, void *arg, 
int flags) 318 { 319 struct tmpfs_node *node; 320 321 node = mem; 322 node->tn_gen++; 323 node->tn_size = 0; 324 node->tn_status = 0; 325 node->tn_accessed = false; 326 node->tn_flags = 0; 327 node->tn_links = 0; 328 node->tn_vnode = NULL; 329 node->tn_vpstate = 0; 330 return (0); 331 } 332 333 static void 334 tmpfs_node_dtor(void *mem, int size, void *arg) 335 { 336 struct tmpfs_node *node; 337 338 node = mem; 339 node->tn_type = VNON; 340 } 341 342 static int 343 tmpfs_node_init(void *mem, int size, int flags) 344 { 345 struct tmpfs_node *node; 346 347 node = mem; 348 node->tn_id = 0; 349 mtx_init(&node->tn_interlock, "tmpfsni", NULL, MTX_DEF); 350 node->tn_gen = arc4random(); 351 return (0); 352 } 353 354 static void 355 tmpfs_node_fini(void *mem, int size) 356 { 357 struct tmpfs_node *node; 358 359 node = mem; 360 mtx_destroy(&node->tn_interlock); 361 } 362 363 int 364 tmpfs_subr_init(void) 365 { 366 tmpfs_pager_type = vm_pager_alloc_dyn_type(&tmpfs_pager_ops, 367 OBJT_SWAP); 368 if (tmpfs_pager_type == -1) 369 return (EINVAL); 370 tmpfs_node_pool = uma_zcreate("TMPFS node", 371 sizeof(struct tmpfs_node), tmpfs_node_ctor, tmpfs_node_dtor, 372 tmpfs_node_init, tmpfs_node_fini, UMA_ALIGN_PTR, 0); 373 VFS_SMR_ZONE_SET(tmpfs_node_pool); 374 375 tmpfs_pages_avail_init = tmpfs_mem_avail(); 376 tmpfs_set_reserve_from_percent(); 377 return (0); 378 } 379 380 void 381 tmpfs_subr_uninit(void) 382 { 383 if (tmpfs_pager_type != -1) 384 vm_pager_free_dyn_type(tmpfs_pager_type); 385 tmpfs_pager_type = -1; 386 uma_zdestroy(tmpfs_node_pool); 387 } 388 389 static int 390 sysctl_mem_reserved(SYSCTL_HANDLER_ARGS) 391 { 392 int error; 393 long pages, bytes; 394 395 pages = *(long *)arg1; 396 bytes = pages * PAGE_SIZE; 397 398 error = sysctl_handle_long(oidp, &bytes, 0, req); 399 if (error || !req->newptr) 400 return (error); 401 402 pages = bytes / PAGE_SIZE; 403 if (pages < TMPFS_PAGES_MINRESERVED) 404 return (EINVAL); 405 406 *(long *)arg1 = pages; 407 return (0); 408 } 409 410 
SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_reserved, 411 CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &tmpfs_pages_reserved, 0, 412 sysctl_mem_reserved, "L", 413 "Amount of available memory and swap below which tmpfs growth stops"); 414 415 static int 416 sysctl_mem_percent(SYSCTL_HANDLER_ARGS) 417 { 418 int error, percent; 419 420 percent = *(int *)arg1; 421 error = sysctl_handle_int(oidp, &percent, 0, req); 422 if (error || !req->newptr) 423 return (error); 424 425 if ((unsigned) percent > 100) 426 return (EINVAL); 427 428 *(long *)arg1 = percent; 429 tmpfs_set_reserve_from_percent(); 430 return (0); 431 } 432 433 static void 434 tmpfs_set_reserve_from_percent(void) 435 { 436 size_t reserved; 437 438 reserved = tmpfs_pages_avail_init * (100 - tmpfs_mem_percent) / 100; 439 tmpfs_pages_reserved = max(reserved, TMPFS_PAGES_MINRESERVED); 440 } 441 442 SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_percent, 443 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &tmpfs_mem_percent, 0, 444 sysctl_mem_percent, "I", 445 "Percent of available memory that can be used if no size limit"); 446 447 static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a, 448 struct tmpfs_dirent *b); 449 RB_PROTOTYPE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); 450 451 size_t 452 tmpfs_mem_avail(void) 453 { 454 size_t avail; 455 long reserved; 456 457 avail = swap_pager_avail + vm_free_count(); 458 reserved = atomic_load_long(&tmpfs_pages_reserved); 459 if (__predict_false(avail < reserved)) 460 return (0); 461 return (avail - reserved); 462 } 463 464 size_t 465 tmpfs_pages_used(struct tmpfs_mount *tmp) 466 { 467 const size_t node_size = sizeof(struct tmpfs_node) + 468 sizeof(struct tmpfs_dirent); 469 size_t meta_pages; 470 471 meta_pages = howmany((uintmax_t)tmp->tm_nodes_inuse * node_size, 472 PAGE_SIZE); 473 return (meta_pages + tmp->tm_pages_used); 474 } 475 476 bool 477 tmpfs_pages_check_avail(struct tmpfs_mount *tmp, size_t req_pages) 478 { 479 if (tmpfs_mem_avail() < req_pages) 
480 return (false); 481 482 if (tmp->tm_pages_max != ULONG_MAX && 483 tmp->tm_pages_max < req_pages + tmpfs_pages_used(tmp)) 484 return (false); 485 486 return (true); 487 } 488 489 static int 490 tmpfs_partial_page_invalidate(vm_object_t object, vm_pindex_t idx, int base, 491 int end, boolean_t ignerr) 492 { 493 vm_page_t m; 494 int rv, error; 495 496 VM_OBJECT_ASSERT_WLOCKED(object); 497 KASSERT(base >= 0, ("%s: base %d", __func__, base)); 498 KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base, 499 end)); 500 error = 0; 501 502 retry: 503 m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT); 504 if (m != NULL) { 505 MPASS(vm_page_all_valid(m)); 506 } else if (vm_pager_has_page(object, idx, NULL, NULL)) { 507 m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL | 508 VM_ALLOC_WAITFAIL); 509 if (m == NULL) 510 goto retry; 511 vm_object_pip_add(object, 1); 512 VM_OBJECT_WUNLOCK(object); 513 rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); 514 VM_OBJECT_WLOCK(object); 515 vm_object_pip_wakeup(object); 516 if (rv == VM_PAGER_OK) { 517 /* 518 * Since the page was not resident, and therefore not 519 * recently accessed, immediately enqueue it for 520 * asynchronous laundering. The current operation is 521 * not regarded as an access. 
522 */ 523 vm_page_launder(m); 524 } else { 525 vm_page_free(m); 526 m = NULL; 527 if (!ignerr) 528 error = EIO; 529 } 530 } 531 if (m != NULL) { 532 pmap_zero_page_area(m, base, end - base); 533 vm_page_set_dirty(m); 534 vm_page_xunbusy(m); 535 } 536 537 return (error); 538 } 539 540 void 541 tmpfs_ref_node(struct tmpfs_node *node) 542 { 543 #ifdef INVARIANTS 544 u_int old; 545 546 old = 547 #endif 548 refcount_acquire(&node->tn_refcount); 549 #ifdef INVARIANTS 550 KASSERT(old > 0, ("node %p zero refcount", node)); 551 #endif 552 } 553 554 /* 555 * Allocates a new node of type 'type' inside the 'tmp' mount point, with 556 * its owner set to 'uid', its group to 'gid' and its mode set to 'mode', 557 * using the credentials of the process 'p'. 558 * 559 * If the node type is set to 'VDIR', then the parent parameter must point 560 * to the parent directory of the node being created. It may only be NULL 561 * while allocating the root node. 562 * 563 * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter 564 * specifies the device the node represents. 565 * 566 * If the node type is set to 'VLNK', then the parameter target specifies 567 * the file name of the target file for the symbolic link that is being 568 * created. 569 * 570 * Note that new nodes are retrieved from the available list if it has 571 * items or, if it is empty, from the node pool as long as there is enough 572 * space to create them. 573 * 574 * Returns zero on success or an appropriate error code on failure. 575 */ 576 int 577 tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, __enum_uint8(vtype) type, 578 uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent, 579 const char *target, dev_t rdev, struct tmpfs_node **node) 580 { 581 struct tmpfs_node *nnode; 582 char *symlink; 583 char symlink_smr; 584 585 /* If the root directory of the 'tmp' file system is not yet 586 * allocated, this must be the request to do it. 
*/ 587 MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR)); 588 589 MPASS((type == VLNK) ^ (target == NULL)); 590 MPASS((type == VBLK || type == VCHR) ^ (rdev == VNOVAL)); 591 592 if (tmp->tm_nodes_inuse >= tmp->tm_nodes_max) 593 return (ENOSPC); 594 if (!tmpfs_pages_check_avail(tmp, 1)) 595 return (ENOSPC); 596 597 if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { 598 /* 599 * When a new tmpfs node is created for fully 600 * constructed mount point, there must be a parent 601 * node, which vnode is locked exclusively. As 602 * consequence, if the unmount is executing in 603 * parallel, vflush() cannot reclaim the parent vnode. 604 * Due to this, the check for MNTK_UNMOUNT flag is not 605 * racy: if we did not see MNTK_UNMOUNT flag, then tmp 606 * cannot be destroyed until node construction is 607 * finished and the parent vnode unlocked. 608 * 609 * Tmpfs does not need to instantiate new nodes during 610 * unmount. 611 */ 612 return (EBUSY); 613 } 614 if ((mp->mnt_kern_flag & MNT_RDONLY) != 0) 615 return (EROFS); 616 617 nnode = uma_zalloc_smr(tmpfs_node_pool, M_WAITOK); 618 619 /* Generic initialization. */ 620 nnode->tn_type = type; 621 vfs_timestamp(&nnode->tn_atime); 622 nnode->tn_birthtime = nnode->tn_ctime = nnode->tn_mtime = 623 nnode->tn_atime; 624 nnode->tn_uid = uid; 625 nnode->tn_gid = gid; 626 nnode->tn_mode = mode; 627 nnode->tn_id = alloc_unr64(&tmp->tm_ino_unr); 628 nnode->tn_refcount = 1; 629 LIST_INIT(&nnode->tn_extattrs); 630 631 /* Type-specific initialization. */ 632 switch (nnode->tn_type) { 633 case VBLK: 634 case VCHR: 635 nnode->tn_rdev = rdev; 636 break; 637 638 case VDIR: 639 RB_INIT(&nnode->tn_dir.tn_dirhead); 640 LIST_INIT(&nnode->tn_dir.tn_dupindex); 641 MPASS(parent != nnode); 642 MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL)); 643 nnode->tn_dir.tn_parent = (parent == NULL) ? 
nnode : parent; 644 nnode->tn_dir.tn_readdir_lastn = 0; 645 nnode->tn_dir.tn_readdir_lastp = NULL; 646 nnode->tn_links++; 647 TMPFS_NODE_LOCK(nnode->tn_dir.tn_parent); 648 nnode->tn_dir.tn_parent->tn_links++; 649 TMPFS_NODE_UNLOCK(nnode->tn_dir.tn_parent); 650 break; 651 652 case VFIFO: 653 /* FALLTHROUGH */ 654 case VSOCK: 655 break; 656 657 case VLNK: 658 MPASS(strlen(target) < MAXPATHLEN); 659 nnode->tn_size = strlen(target); 660 661 symlink = NULL; 662 if (!tmp->tm_nonc) { 663 symlink = cache_symlink_alloc(nnode->tn_size + 1, 664 M_WAITOK); 665 symlink_smr = true; 666 } 667 if (symlink == NULL) { 668 symlink = malloc(nnode->tn_size + 1, M_TMPFSNAME, 669 M_WAITOK); 670 symlink_smr = false; 671 } 672 memcpy(symlink, target, nnode->tn_size + 1); 673 674 /* 675 * Allow safe symlink resolving for lockless lookup. 676 * tmpfs_fplookup_symlink references this comment. 677 * 678 * 1. nnode is not yet visible to the world 679 * 2. both tn_link_target and tn_link_smr get populated 680 * 3. release fence publishes their content 681 * 4. tn_link_target content is immutable until node 682 * destruction, where the pointer gets set to NULL 683 * 5. tn_link_smr is never changed once set 684 * 685 * As a result it is sufficient to issue load consume 686 * on the node pointer to also get the above content 687 * in a stable manner. Worst case tn_link_smr flag 688 * may be set to true despite being stale, while the 689 * target buffer is already cleared out. 
690 */ 691 atomic_store_ptr(&nnode->tn_link_target, symlink); 692 atomic_store_char((char *)&nnode->tn_link_smr, symlink_smr); 693 atomic_thread_fence_rel(); 694 break; 695 696 case VREG: 697 nnode->tn_reg.tn_aobj = 698 vm_pager_allocate(tmpfs_pager_type, NULL, 0, 699 VM_PROT_DEFAULT, 0, 700 NULL /* XXXKIB - tmpfs needs swap reservation */); 701 nnode->tn_reg.tn_aobj->un_pager.swp.swp_priv = nnode; 702 vm_object_set_flag(nnode->tn_reg.tn_aobj, OBJ_TMPFS); 703 nnode->tn_reg.tn_tmp = tmp; 704 nnode->tn_reg.tn_pages = 0; 705 break; 706 707 default: 708 panic("tmpfs_alloc_node: type %p %d", nnode, 709 (int)nnode->tn_type); 710 } 711 712 TMPFS_LOCK(tmp); 713 LIST_INSERT_HEAD(&tmp->tm_nodes_used, nnode, tn_entries); 714 nnode->tn_attached = true; 715 tmp->tm_nodes_inuse++; 716 tmp->tm_refcount++; 717 TMPFS_UNLOCK(tmp); 718 719 *node = nnode; 720 return (0); 721 } 722 723 /* 724 * Destroys the node pointed to by node from the file system 'tmp'. 725 * If the node references a directory, no entries are allowed. 
726 */ 727 void 728 tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node) 729 { 730 if (refcount_release_if_not_last(&node->tn_refcount)) 731 return; 732 733 TMPFS_LOCK(tmp); 734 TMPFS_NODE_LOCK(node); 735 if (!tmpfs_free_node_locked(tmp, node, false)) { 736 TMPFS_NODE_UNLOCK(node); 737 TMPFS_UNLOCK(tmp); 738 } 739 } 740 741 bool 742 tmpfs_free_node_locked(struct tmpfs_mount *tmp, struct tmpfs_node *node, 743 bool detach) 744 { 745 struct tmpfs_extattr *ea; 746 vm_object_t uobj; 747 char *symlink; 748 bool last; 749 750 TMPFS_MP_ASSERT_LOCKED(tmp); 751 TMPFS_NODE_ASSERT_LOCKED(node); 752 753 last = refcount_release(&node->tn_refcount); 754 if (node->tn_attached && (detach || last)) { 755 MPASS(tmp->tm_nodes_inuse > 0); 756 tmp->tm_nodes_inuse--; 757 LIST_REMOVE(node, tn_entries); 758 node->tn_attached = false; 759 } 760 if (!last) 761 return (false); 762 763 TMPFS_NODE_UNLOCK(node); 764 765 #ifdef INVARIANTS 766 MPASS(node->tn_vnode == NULL); 767 MPASS((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0); 768 769 /* 770 * Make sure this is a node type we can deal with. Everything 771 * is explicitly enumerated without the 'default' clause so 772 * the compiler can throw an error in case a new type is 773 * added. 
774 */ 775 switch (node->tn_type) { 776 case VBLK: 777 case VCHR: 778 case VDIR: 779 case VFIFO: 780 case VSOCK: 781 case VLNK: 782 case VREG: 783 break; 784 case VNON: 785 case VBAD: 786 case VMARKER: 787 panic("%s: bad type %d for node %p", __func__, 788 (int)node->tn_type, node); 789 } 790 #endif 791 792 while ((ea = LIST_FIRST(&node->tn_extattrs)) != NULL) { 793 LIST_REMOVE(ea, ea_extattrs); 794 tmpfs_extattr_free(ea); 795 } 796 797 switch (node->tn_type) { 798 case VREG: 799 uobj = node->tn_reg.tn_aobj; 800 node->tn_reg.tn_aobj = NULL; 801 if (uobj != NULL) { 802 VM_OBJECT_WLOCK(uobj); 803 KASSERT((uobj->flags & OBJ_TMPFS) != 0, 804 ("tmpfs node %p uobj %p not tmpfs", node, uobj)); 805 vm_object_clear_flag(uobj, OBJ_TMPFS); 806 KASSERT(tmp->tm_pages_used >= node->tn_reg.tn_pages, 807 ("tmpfs tmp %p node %p pages %jd free %jd", tmp, 808 node, (uintmax_t)tmp->tm_pages_used, 809 (uintmax_t)node->tn_reg.tn_pages)); 810 atomic_add_long(&tmp->tm_pages_used, 811 -node->tn_reg.tn_pages); 812 VM_OBJECT_WUNLOCK(uobj); 813 } 814 tmpfs_free_tmp(tmp); 815 816 /* 817 * vm_object_deallocate() must not be called while 818 * owning tm_allnode_lock, because deallocate might 819 * sleep. Call it after tmpfs_free_tmp() does the 820 * unlock. 
821 */ 822 if (uobj != NULL) 823 vm_object_deallocate(uobj); 824 825 break; 826 case VLNK: 827 tmpfs_free_tmp(tmp); 828 829 symlink = node->tn_link_target; 830 atomic_store_ptr(&node->tn_link_target, NULL); 831 if (atomic_load_char(&node->tn_link_smr)) { 832 cache_symlink_free(symlink, node->tn_size + 1); 833 } else { 834 free(symlink, M_TMPFSNAME); 835 } 836 break; 837 default: 838 tmpfs_free_tmp(tmp); 839 break; 840 } 841 842 uma_zfree_smr(tmpfs_node_pool, node); 843 return (true); 844 } 845 846 static __inline uint32_t 847 tmpfs_dirent_hash(const char *name, u_int len) 848 { 849 uint32_t hash; 850 851 hash = fnv_32_buf(name, len, FNV1_32_INIT + len) & TMPFS_DIRCOOKIE_MASK; 852 #ifdef TMPFS_DEBUG_DIRCOOKIE_DUP 853 hash &= 0xf; 854 #endif 855 if (hash < TMPFS_DIRCOOKIE_MIN) 856 hash += TMPFS_DIRCOOKIE_MIN; 857 858 return (hash); 859 } 860 861 static __inline off_t 862 tmpfs_dirent_cookie(struct tmpfs_dirent *de) 863 { 864 if (de == NULL) 865 return (TMPFS_DIRCOOKIE_EOF); 866 867 MPASS(de->td_cookie >= TMPFS_DIRCOOKIE_MIN); 868 869 return (de->td_cookie); 870 } 871 872 static __inline boolean_t 873 tmpfs_dirent_dup(struct tmpfs_dirent *de) 874 { 875 return ((de->td_cookie & TMPFS_DIRCOOKIE_DUP) != 0); 876 } 877 878 static __inline boolean_t 879 tmpfs_dirent_duphead(struct tmpfs_dirent *de) 880 { 881 return ((de->td_cookie & TMPFS_DIRCOOKIE_DUPHEAD) != 0); 882 } 883 884 void 885 tmpfs_dirent_init(struct tmpfs_dirent *de, const char *name, u_int namelen) 886 { 887 de->td_hash = de->td_cookie = tmpfs_dirent_hash(name, namelen); 888 memcpy(de->ud.td_name, name, namelen); 889 de->td_namelen = namelen; 890 } 891 892 /* 893 * Allocates a new directory entry for the node node with a name of name. 894 * The new directory entry is returned in *de. 895 * 896 * The link count of node is increased by one to reflect the new object 897 * referencing it. 898 * 899 * Returns zero on success or an appropriate error code on failure. 
900 */ 901 int 902 tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node, 903 const char *name, u_int len, struct tmpfs_dirent **de) 904 { 905 struct tmpfs_dirent *nde; 906 907 nde = malloc(sizeof(*nde), M_TMPFSDIR, M_WAITOK); 908 nde->td_node = node; 909 if (name != NULL) { 910 nde->ud.td_name = malloc(len, M_TMPFSNAME, M_WAITOK); 911 tmpfs_dirent_init(nde, name, len); 912 } else 913 nde->td_namelen = 0; 914 if (node != NULL) 915 node->tn_links++; 916 917 *de = nde; 918 919 return (0); 920 } 921 922 /* 923 * Frees a directory entry. It is the caller's responsibility to destroy 924 * the node referenced by it if needed. 925 * 926 * The link count of node is decreased by one to reflect the removal of an 927 * object that referenced it. This only happens if 'node_exists' is true; 928 * otherwise the function will not access the node referred to by the 929 * directory entry, as it may already have been released from the outside. 930 */ 931 void 932 tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de) 933 { 934 struct tmpfs_node *node; 935 936 node = de->td_node; 937 if (node != NULL) { 938 MPASS(node->tn_links > 0); 939 node->tn_links--; 940 } 941 if (!tmpfs_dirent_duphead(de) && de->ud.td_name != NULL) 942 free(de->ud.td_name, M_TMPFSNAME); 943 free(de, M_TMPFSDIR); 944 } 945 946 void 947 tmpfs_destroy_vobject(struct vnode *vp, vm_object_t obj) 948 { 949 bool want_vrele; 950 951 ASSERT_VOP_ELOCKED(vp, "tmpfs_destroy_vobject"); 952 if (vp->v_type != VREG || obj == NULL) 953 return; 954 955 VM_OBJECT_WLOCK(obj); 956 VI_LOCK(vp); 957 /* 958 * May be going through forced unmount. 
959 */ 960 want_vrele = false; 961 if ((obj->flags & OBJ_TMPFS_VREF) != 0) { 962 vm_object_clear_flag(obj, OBJ_TMPFS_VREF); 963 want_vrele = true; 964 } 965 966 if (vp->v_writecount < 0) 967 vp->v_writecount = 0; 968 VI_UNLOCK(vp); 969 VM_OBJECT_WUNLOCK(obj); 970 if (want_vrele) { 971 vrele(vp); 972 } 973 } 974 975 /* 976 * Allocates a new vnode for the node node or returns a new reference to 977 * an existing one if the node had already a vnode referencing it. The 978 * resulting locked vnode is returned in *vpp. 979 * 980 * Returns zero on success or an appropriate error code on failure. 981 */ 982 int 983 tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, int lkflag, 984 struct vnode **vpp) 985 { 986 struct vnode *vp; 987 enum vgetstate vs; 988 struct tmpfs_mount *tm; 989 vm_object_t object; 990 int error; 991 992 error = 0; 993 tm = VFS_TO_TMPFS(mp); 994 TMPFS_NODE_LOCK(node); 995 tmpfs_ref_node(node); 996 loop: 997 TMPFS_NODE_ASSERT_LOCKED(node); 998 if ((vp = node->tn_vnode) != NULL) { 999 MPASS((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0); 1000 if ((node->tn_type == VDIR && node->tn_dir.tn_parent == NULL) || 1001 (VN_IS_DOOMED(vp) && 1002 (lkflag & LK_NOWAIT) != 0)) { 1003 TMPFS_NODE_UNLOCK(node); 1004 error = ENOENT; 1005 vp = NULL; 1006 goto out; 1007 } 1008 if (VN_IS_DOOMED(vp)) { 1009 node->tn_vpstate |= TMPFS_VNODE_WRECLAIM; 1010 while ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) { 1011 msleep(&node->tn_vnode, TMPFS_NODE_MTX(node), 1012 0, "tmpfsE", 0); 1013 } 1014 goto loop; 1015 } 1016 vs = vget_prep(vp); 1017 TMPFS_NODE_UNLOCK(node); 1018 error = vget_finish(vp, lkflag, vs); 1019 if (error == ENOENT) { 1020 TMPFS_NODE_LOCK(node); 1021 goto loop; 1022 } 1023 if (error != 0) { 1024 vp = NULL; 1025 goto out; 1026 } 1027 1028 /* 1029 * Make sure the vnode is still there after 1030 * getting the interlock to avoid racing a free. 
1031 */ 1032 if (node->tn_vnode != vp) { 1033 vput(vp); 1034 TMPFS_NODE_LOCK(node); 1035 goto loop; 1036 } 1037 1038 goto out; 1039 } 1040 1041 if ((node->tn_vpstate & TMPFS_VNODE_DOOMED) || 1042 (node->tn_type == VDIR && node->tn_dir.tn_parent == NULL)) { 1043 TMPFS_NODE_UNLOCK(node); 1044 error = ENOENT; 1045 vp = NULL; 1046 goto out; 1047 } 1048 1049 /* 1050 * otherwise lock the vp list while we call getnewvnode 1051 * since that can block. 1052 */ 1053 if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) { 1054 node->tn_vpstate |= TMPFS_VNODE_WANT; 1055 error = msleep((caddr_t) &node->tn_vpstate, 1056 TMPFS_NODE_MTX(node), 0, "tmpfs_alloc_vp", 0); 1057 if (error != 0) 1058 goto out; 1059 goto loop; 1060 } else 1061 node->tn_vpstate |= TMPFS_VNODE_ALLOCATING; 1062 1063 TMPFS_NODE_UNLOCK(node); 1064 1065 /* Get a new vnode and associate it with our node. */ 1066 error = getnewvnode("tmpfs", mp, VFS_TO_TMPFS(mp)->tm_nonc ? 1067 &tmpfs_vnodeop_nonc_entries : &tmpfs_vnodeop_entries, &vp); 1068 if (error != 0) 1069 goto unlock; 1070 MPASS(vp != NULL); 1071 1072 /* lkflag is ignored, the lock is exclusive */ 1073 (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1074 1075 vp->v_data = node; 1076 vp->v_type = node->tn_type; 1077 1078 /* Type-specific initialization. 
*/ 1079 switch (node->tn_type) { 1080 case VBLK: 1081 /* FALLTHROUGH */ 1082 case VCHR: 1083 /* FALLTHROUGH */ 1084 case VLNK: 1085 /* FALLTHROUGH */ 1086 case VSOCK: 1087 break; 1088 case VFIFO: 1089 vp->v_op = &tmpfs_fifoop_entries; 1090 break; 1091 case VREG: 1092 object = node->tn_reg.tn_aobj; 1093 VM_OBJECT_WLOCK(object); 1094 KASSERT((object->flags & OBJ_TMPFS_VREF) == 0, 1095 ("%s: object %p with OBJ_TMPFS_VREF but without vnode", 1096 __func__, object)); 1097 KASSERT(object->un_pager.swp.writemappings == 0, 1098 ("%s: object %p has writemappings", 1099 __func__, object)); 1100 VI_LOCK(vp); 1101 KASSERT(vp->v_object == NULL, ("Not NULL v_object in tmpfs")); 1102 vp->v_object = object; 1103 vn_irflag_set_locked(vp, (tm->tm_pgread ? VIRF_PGREAD : 0) | 1104 VIRF_TEXT_REF); 1105 VI_UNLOCK(vp); 1106 VM_OBJECT_WUNLOCK(object); 1107 break; 1108 case VDIR: 1109 MPASS(node->tn_dir.tn_parent != NULL); 1110 if (node->tn_dir.tn_parent == node) 1111 vp->v_vflag |= VV_ROOT; 1112 break; 1113 1114 default: 1115 panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type); 1116 } 1117 if (vp->v_type != VFIFO) 1118 VN_LOCK_ASHARE(vp); 1119 1120 error = insmntque1(vp, mp); 1121 if (error != 0) { 1122 /* Need to clear v_object for insmntque failure. 
*/ 1123 tmpfs_destroy_vobject(vp, vp->v_object); 1124 vp->v_object = NULL; 1125 vp->v_data = NULL; 1126 vp->v_op = &dead_vnodeops; 1127 vgone(vp); 1128 vput(vp); 1129 vp = NULL; 1130 } else { 1131 vn_set_state(vp, VSTATE_CONSTRUCTED); 1132 } 1133 1134 unlock: 1135 TMPFS_NODE_LOCK(node); 1136 1137 MPASS(node->tn_vpstate & TMPFS_VNODE_ALLOCATING); 1138 node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING; 1139 node->tn_vnode = vp; 1140 1141 if (node->tn_vpstate & TMPFS_VNODE_WANT) { 1142 node->tn_vpstate &= ~TMPFS_VNODE_WANT; 1143 TMPFS_NODE_UNLOCK(node); 1144 wakeup((caddr_t) &node->tn_vpstate); 1145 } else 1146 TMPFS_NODE_UNLOCK(node); 1147 1148 out: 1149 if (error == 0) { 1150 *vpp = vp; 1151 1152 #ifdef INVARIANTS 1153 MPASS(*vpp != NULL); 1154 ASSERT_VOP_LOCKED(*vpp, __func__); 1155 TMPFS_NODE_LOCK(node); 1156 MPASS(*vpp == node->tn_vnode); 1157 TMPFS_NODE_UNLOCK(node); 1158 #endif 1159 } 1160 tmpfs_free_node(tm, node); 1161 1162 return (error); 1163 } 1164 1165 /* 1166 * Destroys the association between the vnode vp and the node it 1167 * references. 1168 */ 1169 void 1170 tmpfs_free_vp(struct vnode *vp) 1171 { 1172 struct tmpfs_node *node; 1173 1174 node = VP_TO_TMPFS_NODE(vp); 1175 1176 TMPFS_NODE_ASSERT_LOCKED(node); 1177 node->tn_vnode = NULL; 1178 if ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) 1179 wakeup(&node->tn_vnode); 1180 node->tn_vpstate &= ~TMPFS_VNODE_WRECLAIM; 1181 vp->v_data = NULL; 1182 } 1183 1184 /* 1185 * Allocates a new file of type 'type' and adds it to the parent directory 1186 * 'dvp'; this addition is done using the component name given in 'cnp'. 1187 * The ownership of the new file is automatically assigned based on the 1188 * credentials of the caller (through 'cnp'), the group is set based on 1189 * the parent directory and the mode is determined from the 'vap' argument. 1190 * If successful, *vpp holds a vnode to the newly created file and zero 1191 * is returned. 
Otherwise *vpp is NULL and the function returns an 1192 * appropriate error code. 1193 */ 1194 int 1195 tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap, 1196 struct componentname *cnp, const char *target) 1197 { 1198 int error; 1199 struct tmpfs_dirent *de; 1200 struct tmpfs_mount *tmp; 1201 struct tmpfs_node *dnode; 1202 struct tmpfs_node *node; 1203 struct tmpfs_node *parent; 1204 1205 ASSERT_VOP_ELOCKED(dvp, "tmpfs_alloc_file"); 1206 1207 tmp = VFS_TO_TMPFS(dvp->v_mount); 1208 dnode = VP_TO_TMPFS_DIR(dvp); 1209 *vpp = NULL; 1210 1211 /* If the entry we are creating is a directory, we cannot overflow 1212 * the number of links of its parent, because it will get a new 1213 * link. */ 1214 if (vap->va_type == VDIR) { 1215 /* Ensure that we do not overflow the maximum number of links 1216 * imposed by the system. */ 1217 MPASS(dnode->tn_links <= TMPFS_LINK_MAX); 1218 if (dnode->tn_links == TMPFS_LINK_MAX) { 1219 return (EMLINK); 1220 } 1221 1222 parent = dnode; 1223 MPASS(parent != NULL); 1224 } else 1225 parent = NULL; 1226 1227 /* Allocate a node that represents the new file. */ 1228 error = tmpfs_alloc_node(dvp->v_mount, tmp, vap->va_type, 1229 cnp->cn_cred->cr_uid, dnode->tn_gid, vap->va_mode, parent, 1230 target, vap->va_rdev, &node); 1231 if (error != 0) 1232 return (error); 1233 1234 /* Allocate a directory entry that points to the new file. */ 1235 error = tmpfs_alloc_dirent(tmp, node, cnp->cn_nameptr, cnp->cn_namelen, 1236 &de); 1237 if (error != 0) { 1238 tmpfs_free_node(tmp, node); 1239 return (error); 1240 } 1241 1242 /* Allocate a vnode for the new file. */ 1243 error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp); 1244 if (error != 0) { 1245 tmpfs_free_dirent(tmp, de); 1246 tmpfs_free_node(tmp, node); 1247 return (error); 1248 } 1249 1250 /* Now that all required items are allocated, we can proceed to 1251 * insert the new node into the directory, an operation that 1252 * cannot fail. 
*/ 1253 if (cnp->cn_flags & ISWHITEOUT) 1254 tmpfs_dir_whiteout_remove(dvp, cnp); 1255 tmpfs_dir_attach(dvp, de); 1256 return (0); 1257 } 1258 1259 struct tmpfs_dirent * 1260 tmpfs_dir_first(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) 1261 { 1262 struct tmpfs_dirent *de; 1263 1264 de = RB_MIN(tmpfs_dir, &dnode->tn_dir.tn_dirhead); 1265 dc->tdc_tree = de; 1266 if (de != NULL && tmpfs_dirent_duphead(de)) 1267 de = LIST_FIRST(&de->ud.td_duphead); 1268 dc->tdc_current = de; 1269 1270 return (dc->tdc_current); 1271 } 1272 1273 struct tmpfs_dirent * 1274 tmpfs_dir_next(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) 1275 { 1276 struct tmpfs_dirent *de; 1277 1278 MPASS(dc->tdc_tree != NULL); 1279 if (tmpfs_dirent_dup(dc->tdc_current)) { 1280 dc->tdc_current = LIST_NEXT(dc->tdc_current, uh.td_dup.entries); 1281 if (dc->tdc_current != NULL) 1282 return (dc->tdc_current); 1283 } 1284 dc->tdc_tree = dc->tdc_current = RB_NEXT(tmpfs_dir, 1285 &dnode->tn_dir.tn_dirhead, dc->tdc_tree); 1286 if ((de = dc->tdc_current) != NULL && tmpfs_dirent_duphead(de)) { 1287 dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); 1288 MPASS(dc->tdc_current != NULL); 1289 } 1290 1291 return (dc->tdc_current); 1292 } 1293 1294 /* Lookup directory entry in RB-Tree. Function may return duphead entry. */ 1295 static struct tmpfs_dirent * 1296 tmpfs_dir_xlookup_hash(struct tmpfs_node *dnode, uint32_t hash) 1297 { 1298 struct tmpfs_dirent *de, dekey; 1299 1300 dekey.td_hash = hash; 1301 de = RB_FIND(tmpfs_dir, &dnode->tn_dir.tn_dirhead, &dekey); 1302 return (de); 1303 } 1304 1305 /* Lookup directory entry by cookie, initialize directory cursor accordingly. 
*/
/*
 * Three strategies are tried in turn:
 *  - the entry cached by the last readdir, when its cookie still matches;
 *  - for cookies carrying TMPFS_DIRCOOKIE_DUP, a scan of the directory's
 *    dupindex list;
 *  - otherwise an RB_NFIND() in the tree, which also recovers when the
 *    entry the cookie referred to has been removed.
 * Returns the entry the cursor now points at, or NULL.
 */
static struct tmpfs_dirent *
tmpfs_dir_lookup_cookie(struct tmpfs_node *node, off_t cookie,
    struct tmpfs_dir_cursor *dc)
{
	struct tmpfs_dir *dirhead;
	struct tmpfs_dirent *de, dekey;

	dirhead = &node->tn_dir.tn_dirhead;

	MPASS(cookie >= TMPFS_DIRCOOKIE_MIN);

	if (cookie == node->tn_dir.tn_readdir_lastn) {
		de = node->tn_dir.tn_readdir_lastp;
		/*
		 * Protect against possible race, tn_readdir_last[pn]
		 * may be updated with only shared vnode lock held.
		 */
		if (de != NULL && cookie == tmpfs_dirent_cookie(de))
			goto out;
	}

	if ((cookie & TMPFS_DIRCOOKIE_DUP) != 0) {
		/*
		 * dupindex list is sorted, so the scan can stop at the
		 * first entry whose cookie is not above the wanted one.
		 */
		LIST_FOREACH(de, &node->tn_dir.tn_dupindex,
		    uh.td_dup.index_entries) {
			MPASS(tmpfs_dirent_dup(de));
			if (de->td_cookie <= cookie)
				break;
		}
		if (de != NULL && de->td_cookie != cookie)
			de = NULL;
		goto out;
	}

	de = NULL;
	if ((cookie & TMPFS_DIRCOOKIE_MASK) == cookie) {
		dekey.td_hash = cookie;
		/* Recover if direntry for cookie was removed */
		de = RB_NFIND(tmpfs_dir, dirhead, &dekey);
	}
	dc->tdc_tree = de;
	dc->tdc_current = de;
	if (de != NULL && tmpfs_dirent_duphead(de)) {
		dc->tdc_current = LIST_FIRST(&de->ud.td_duphead);
		MPASS(dc->tdc_current != NULL);
	}
	return (dc->tdc_current);

out:
	/*
	 * 'de' is an exact hit (or NULL).  For a duplicate entry the tree
	 * position is the element found under the entry's hash.
	 */
	dc->tdc_tree = de;
	dc->tdc_current = de;
	if (de != NULL && tmpfs_dirent_dup(de))
		dc->tdc_tree = tmpfs_dir_xlookup_hash(node,
		    de->td_hash);
	return (dc->tdc_current);
}

/*
 * Looks for a directory entry in the directory represented by node.
 * 'cnp' describes the name of the entry to look for.  Note that the .
 * and .. components are not allowed as they do not physically exist
 * within directories.
 *
 * Returns a pointer to the entry when found, otherwise NULL.
1370 */ 1371 struct tmpfs_dirent * 1372 tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, 1373 struct componentname *cnp) 1374 { 1375 struct tmpfs_dir_duphead *duphead; 1376 struct tmpfs_dirent *de; 1377 uint32_t hash; 1378 1379 MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.')); 1380 MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' && 1381 cnp->cn_nameptr[1] == '.'))); 1382 TMPFS_VALIDATE_DIR(node); 1383 1384 hash = tmpfs_dirent_hash(cnp->cn_nameptr, cnp->cn_namelen); 1385 de = tmpfs_dir_xlookup_hash(node, hash); 1386 if (de != NULL && tmpfs_dirent_duphead(de)) { 1387 duphead = &de->ud.td_duphead; 1388 LIST_FOREACH(de, duphead, uh.td_dup.entries) { 1389 if (TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, 1390 cnp->cn_namelen)) 1391 break; 1392 } 1393 } else if (de != NULL) { 1394 if (!TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, 1395 cnp->cn_namelen)) 1396 de = NULL; 1397 } 1398 if (de != NULL && f != NULL && de->td_node != f) 1399 de = NULL; 1400 1401 return (de); 1402 } 1403 1404 /* 1405 * Attach duplicate-cookie directory entry nde to dnode and insert to dupindex 1406 * list, allocate new cookie value. 1407 */ 1408 static void 1409 tmpfs_dir_attach_dup(struct tmpfs_node *dnode, 1410 struct tmpfs_dir_duphead *duphead, struct tmpfs_dirent *nde) 1411 { 1412 struct tmpfs_dir_duphead *dupindex; 1413 struct tmpfs_dirent *de, *pde; 1414 1415 dupindex = &dnode->tn_dir.tn_dupindex; 1416 de = LIST_FIRST(dupindex); 1417 if (de == NULL || de->td_cookie < TMPFS_DIRCOOKIE_DUP_MAX) { 1418 if (de == NULL) 1419 nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; 1420 else 1421 nde->td_cookie = de->td_cookie + 1; 1422 MPASS(tmpfs_dirent_dup(nde)); 1423 LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries); 1424 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1425 return; 1426 } 1427 1428 /* 1429 * Cookie numbers are near exhaustion. Scan dupindex list for unused 1430 * numbers. dupindex list is sorted in descending order. 
Keep it so 1431 * after inserting nde. 1432 */ 1433 while (1) { 1434 pde = de; 1435 de = LIST_NEXT(de, uh.td_dup.index_entries); 1436 if (de == NULL && pde->td_cookie != TMPFS_DIRCOOKIE_DUP_MIN) { 1437 /* 1438 * Last element of the index doesn't have minimal cookie 1439 * value, use it. 1440 */ 1441 nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; 1442 LIST_INSERT_AFTER(pde, nde, uh.td_dup.index_entries); 1443 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1444 return; 1445 } else if (de == NULL) { 1446 /* 1447 * We are so lucky have 2^30 hash duplicates in single 1448 * directory :) Return largest possible cookie value. 1449 * It should be fine except possible issues with 1450 * VOP_READDIR restart. 1451 */ 1452 nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MAX; 1453 LIST_INSERT_HEAD(dupindex, nde, 1454 uh.td_dup.index_entries); 1455 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1456 return; 1457 } 1458 if (de->td_cookie + 1 == pde->td_cookie || 1459 de->td_cookie >= TMPFS_DIRCOOKIE_DUP_MAX) 1460 continue; /* No hole or invalid cookie. */ 1461 nde->td_cookie = de->td_cookie + 1; 1462 MPASS(tmpfs_dirent_dup(nde)); 1463 MPASS(pde->td_cookie > nde->td_cookie); 1464 MPASS(nde->td_cookie > de->td_cookie); 1465 LIST_INSERT_BEFORE(de, nde, uh.td_dup.index_entries); 1466 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1467 return; 1468 } 1469 } 1470 1471 /* 1472 * Attaches the directory entry de to the directory represented by vp. 1473 * Note that this does not change the link count of the node pointed by 1474 * the directory entry, as this is done by tmpfs_alloc_dirent. 
1475 */ 1476 void 1477 tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de) 1478 { 1479 struct tmpfs_node *dnode; 1480 struct tmpfs_dirent *xde, *nde; 1481 1482 ASSERT_VOP_ELOCKED(vp, __func__); 1483 MPASS(de->td_namelen > 0); 1484 MPASS(de->td_hash >= TMPFS_DIRCOOKIE_MIN); 1485 MPASS(de->td_cookie == de->td_hash); 1486 1487 dnode = VP_TO_TMPFS_DIR(vp); 1488 dnode->tn_dir.tn_readdir_lastn = 0; 1489 dnode->tn_dir.tn_readdir_lastp = NULL; 1490 1491 MPASS(!tmpfs_dirent_dup(de)); 1492 xde = RB_INSERT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); 1493 if (xde != NULL && tmpfs_dirent_duphead(xde)) 1494 tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); 1495 else if (xde != NULL) { 1496 /* 1497 * Allocate new duphead. Swap xde with duphead to avoid 1498 * adding/removing elements with the same hash. 1499 */ 1500 MPASS(!tmpfs_dirent_dup(xde)); 1501 tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), NULL, NULL, 0, 1502 &nde); 1503 /* *nde = *xde; XXX gcc 4.2.1 may generate invalid code. */ 1504 memcpy(nde, xde, sizeof(*xde)); 1505 xde->td_cookie |= TMPFS_DIRCOOKIE_DUPHEAD; 1506 LIST_INIT(&xde->ud.td_duphead); 1507 xde->td_namelen = 0; 1508 xde->td_node = NULL; 1509 tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, nde); 1510 tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); 1511 } 1512 dnode->tn_size += sizeof(struct tmpfs_dirent); 1513 dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; 1514 dnode->tn_accessed = true; 1515 tmpfs_update(vp); 1516 } 1517 1518 /* 1519 * Detaches the directory entry de from the directory represented by vp. 1520 * Note that this does not change the link count of the node pointed by 1521 * the directory entry, as this is done by tmpfs_free_dirent. 
1522 */ 1523 void 1524 tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de) 1525 { 1526 struct tmpfs_mount *tmp; 1527 struct tmpfs_dir *head; 1528 struct tmpfs_node *dnode; 1529 struct tmpfs_dirent *xde; 1530 1531 ASSERT_VOP_ELOCKED(vp, __func__); 1532 1533 dnode = VP_TO_TMPFS_DIR(vp); 1534 head = &dnode->tn_dir.tn_dirhead; 1535 dnode->tn_dir.tn_readdir_lastn = 0; 1536 dnode->tn_dir.tn_readdir_lastp = NULL; 1537 1538 if (tmpfs_dirent_dup(de)) { 1539 /* Remove duphead if de was last entry. */ 1540 if (LIST_NEXT(de, uh.td_dup.entries) == NULL) { 1541 xde = tmpfs_dir_xlookup_hash(dnode, de->td_hash); 1542 MPASS(tmpfs_dirent_duphead(xde)); 1543 } else 1544 xde = NULL; 1545 LIST_REMOVE(de, uh.td_dup.entries); 1546 LIST_REMOVE(de, uh.td_dup.index_entries); 1547 if (xde != NULL) { 1548 if (LIST_EMPTY(&xde->ud.td_duphead)) { 1549 RB_REMOVE(tmpfs_dir, head, xde); 1550 tmp = VFS_TO_TMPFS(vp->v_mount); 1551 MPASS(xde->td_node == NULL); 1552 tmpfs_free_dirent(tmp, xde); 1553 } 1554 } 1555 de->td_cookie = de->td_hash; 1556 } else 1557 RB_REMOVE(tmpfs_dir, head, de); 1558 1559 dnode->tn_size -= sizeof(struct tmpfs_dirent); 1560 dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; 1561 dnode->tn_accessed = true; 1562 tmpfs_update(vp); 1563 } 1564 1565 void 1566 tmpfs_dir_destroy(struct tmpfs_mount *tmp, struct tmpfs_node *dnode) 1567 { 1568 struct tmpfs_dirent *de, *dde, *nde; 1569 1570 RB_FOREACH_SAFE(de, tmpfs_dir, &dnode->tn_dir.tn_dirhead, nde) { 1571 RB_REMOVE(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); 1572 /* Node may already be destroyed. */ 1573 de->td_node = NULL; 1574 if (tmpfs_dirent_duphead(de)) { 1575 while ((dde = LIST_FIRST(&de->ud.td_duphead)) != NULL) { 1576 LIST_REMOVE(dde, uh.td_dup.entries); 1577 dde->td_node = NULL; 1578 tmpfs_free_dirent(tmp, dde); 1579 } 1580 } 1581 tmpfs_free_dirent(tmp, de); 1582 } 1583 } 1584 1585 /* 1586 * Helper function for tmpfs_readdir. Creates a '.' 
entry for the given 1587 * directory and returns it in the uio space. The function returns 0 1588 * on success, -1 if there was not enough space in the uio structure to 1589 * hold the directory entry or an appropriate error code if another 1590 * error happens. 1591 */ 1592 static int 1593 tmpfs_dir_getdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node, 1594 struct uio *uio) 1595 { 1596 int error; 1597 struct dirent dent; 1598 1599 TMPFS_VALIDATE_DIR(node); 1600 MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOT); 1601 1602 dent.d_fileno = node->tn_id; 1603 dent.d_off = TMPFS_DIRCOOKIE_DOTDOT; 1604 dent.d_type = DT_DIR; 1605 dent.d_namlen = 1; 1606 dent.d_name[0] = '.'; 1607 dent.d_reclen = GENERIC_DIRSIZ(&dent); 1608 dirent_terminate(&dent); 1609 1610 if (dent.d_reclen > uio->uio_resid) 1611 error = EJUSTRETURN; 1612 else 1613 error = uiomove(&dent, dent.d_reclen, uio); 1614 1615 tmpfs_set_accessed(tm, node); 1616 1617 return (error); 1618 } 1619 1620 /* 1621 * Helper function for tmpfs_readdir. Creates a '..' entry for the given 1622 * directory and returns it in the uio space. The function returns 0 1623 * on success, -1 if there was not enough space in the uio structure to 1624 * hold the directory entry or an appropriate error code if another 1625 * error happens. 1626 */ 1627 static int 1628 tmpfs_dir_getdotdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node, 1629 struct uio *uio, off_t next) 1630 { 1631 struct tmpfs_node *parent; 1632 struct dirent dent; 1633 int error; 1634 1635 TMPFS_VALIDATE_DIR(node); 1636 MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT); 1637 1638 /* 1639 * Return ENOENT if the current node is already removed. 
1640 */ 1641 TMPFS_ASSERT_LOCKED(node); 1642 parent = node->tn_dir.tn_parent; 1643 if (parent == NULL) 1644 return (ENOENT); 1645 1646 dent.d_fileno = parent->tn_id; 1647 dent.d_off = next; 1648 dent.d_type = DT_DIR; 1649 dent.d_namlen = 2; 1650 dent.d_name[0] = '.'; 1651 dent.d_name[1] = '.'; 1652 dent.d_reclen = GENERIC_DIRSIZ(&dent); 1653 dirent_terminate(&dent); 1654 1655 if (dent.d_reclen > uio->uio_resid) 1656 error = EJUSTRETURN; 1657 else 1658 error = uiomove(&dent, dent.d_reclen, uio); 1659 1660 tmpfs_set_accessed(tm, node); 1661 1662 return (error); 1663 } 1664 1665 /* 1666 * Helper function for tmpfs_readdir. Returns as much directory entries 1667 * as can fit in the uio space. The read starts at uio->uio_offset. 1668 * The function returns 0 on success, -1 if there was not enough space 1669 * in the uio structure to hold the directory entry or an appropriate 1670 * error code if another error happens. 1671 */ 1672 int 1673 tmpfs_dir_getdents(struct tmpfs_mount *tm, struct tmpfs_node *node, 1674 struct uio *uio, int maxcookies, uint64_t *cookies, int *ncookies) 1675 { 1676 struct tmpfs_dir_cursor dc; 1677 struct tmpfs_dirent *de, *nde; 1678 off_t off; 1679 int error; 1680 1681 TMPFS_VALIDATE_DIR(node); 1682 1683 off = 0; 1684 1685 /* 1686 * Lookup the node from the current offset. The starting offset of 1687 * 0 will lookup both '.' and '..', and then the first real entry, 1688 * or EOF if there are none. Then find all entries for the dir that 1689 * fit into the buffer. Once no more entries are found (de == NULL), 1690 * the offset is set to TMPFS_DIRCOOKIE_EOF, which will cause the next 1691 * call to return 0. 
1692 */ 1693 switch (uio->uio_offset) { 1694 case TMPFS_DIRCOOKIE_DOT: 1695 error = tmpfs_dir_getdotdent(tm, node, uio); 1696 if (error != 0) 1697 return (error); 1698 uio->uio_offset = off = TMPFS_DIRCOOKIE_DOTDOT; 1699 if (cookies != NULL) 1700 cookies[(*ncookies)++] = off; 1701 /* FALLTHROUGH */ 1702 case TMPFS_DIRCOOKIE_DOTDOT: 1703 de = tmpfs_dir_first(node, &dc); 1704 off = tmpfs_dirent_cookie(de); 1705 error = tmpfs_dir_getdotdotdent(tm, node, uio, off); 1706 if (error != 0) 1707 return (error); 1708 uio->uio_offset = off; 1709 if (cookies != NULL) 1710 cookies[(*ncookies)++] = off; 1711 /* EOF. */ 1712 if (de == NULL) 1713 return (0); 1714 break; 1715 case TMPFS_DIRCOOKIE_EOF: 1716 return (0); 1717 default: 1718 de = tmpfs_dir_lookup_cookie(node, uio->uio_offset, &dc); 1719 if (de == NULL) 1720 return (EINVAL); 1721 if (cookies != NULL) 1722 off = tmpfs_dirent_cookie(de); 1723 } 1724 1725 /* 1726 * Read as much entries as possible; i.e., until we reach the end of the 1727 * directory or we exhaust uio space. 1728 */ 1729 do { 1730 struct dirent d; 1731 1732 /* 1733 * Create a dirent structure representing the current tmpfs_node 1734 * and fill it. 
1735 */ 1736 if (de->td_node == NULL) { 1737 d.d_fileno = 1; 1738 d.d_type = DT_WHT; 1739 } else { 1740 d.d_fileno = de->td_node->tn_id; 1741 switch (de->td_node->tn_type) { 1742 case VBLK: 1743 d.d_type = DT_BLK; 1744 break; 1745 1746 case VCHR: 1747 d.d_type = DT_CHR; 1748 break; 1749 1750 case VDIR: 1751 d.d_type = DT_DIR; 1752 break; 1753 1754 case VFIFO: 1755 d.d_type = DT_FIFO; 1756 break; 1757 1758 case VLNK: 1759 d.d_type = DT_LNK; 1760 break; 1761 1762 case VREG: 1763 d.d_type = DT_REG; 1764 break; 1765 1766 case VSOCK: 1767 d.d_type = DT_SOCK; 1768 break; 1769 1770 default: 1771 panic("tmpfs_dir_getdents: type %p %d", 1772 de->td_node, (int)de->td_node->tn_type); 1773 } 1774 } 1775 d.d_namlen = de->td_namelen; 1776 MPASS(de->td_namelen < sizeof(d.d_name)); 1777 (void)memcpy(d.d_name, de->ud.td_name, de->td_namelen); 1778 d.d_reclen = GENERIC_DIRSIZ(&d); 1779 1780 /* 1781 * Stop reading if the directory entry we are treating is bigger 1782 * than the amount of data that can be returned. 1783 */ 1784 if (d.d_reclen > uio->uio_resid) { 1785 error = EJUSTRETURN; 1786 break; 1787 } 1788 1789 nde = tmpfs_dir_next(node, &dc); 1790 d.d_off = tmpfs_dirent_cookie(nde); 1791 dirent_terminate(&d); 1792 1793 /* 1794 * Copy the new dirent structure into the output buffer and 1795 * advance pointers. 1796 */ 1797 error = uiomove(&d, d.d_reclen, uio); 1798 if (error == 0) { 1799 de = nde; 1800 if (cookies != NULL) { 1801 off = tmpfs_dirent_cookie(de); 1802 MPASS(*ncookies < maxcookies); 1803 cookies[(*ncookies)++] = off; 1804 } 1805 } 1806 } while (error == 0 && uio->uio_resid > 0 && de != NULL); 1807 1808 /* Skip setting off when using cookies as it is already done above. */ 1809 if (cookies == NULL) 1810 off = tmpfs_dirent_cookie(de); 1811 1812 /* Update the offset and cache. 
*/ 1813 uio->uio_offset = off; 1814 node->tn_dir.tn_readdir_lastn = off; 1815 node->tn_dir.tn_readdir_lastp = de; 1816 1817 tmpfs_set_accessed(tm, node); 1818 return (error); 1819 } 1820 1821 int 1822 tmpfs_dir_whiteout_add(struct vnode *dvp, struct componentname *cnp) 1823 { 1824 struct tmpfs_dirent *de; 1825 int error; 1826 1827 error = tmpfs_alloc_dirent(VFS_TO_TMPFS(dvp->v_mount), NULL, 1828 cnp->cn_nameptr, cnp->cn_namelen, &de); 1829 if (error != 0) 1830 return (error); 1831 tmpfs_dir_attach(dvp, de); 1832 return (0); 1833 } 1834 1835 void 1836 tmpfs_dir_whiteout_remove(struct vnode *dvp, struct componentname *cnp) 1837 { 1838 struct tmpfs_dirent *de; 1839 1840 de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp); 1841 MPASS(de != NULL && de->td_node == NULL); 1842 tmpfs_dir_detach(dvp, de); 1843 tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de); 1844 } 1845 1846 /* 1847 * Resizes the aobj associated with the regular file pointed to by 'vp' to the 1848 * size 'newsize'. 'vp' must point to a vnode that represents a regular file. 1849 * 'newsize' must be positive. 1850 * 1851 * Returns zero on success or an appropriate error code on failure. 1852 */ 1853 int 1854 tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr) 1855 { 1856 struct tmpfs_node *node; 1857 vm_object_t uobj; 1858 vm_pindex_t idx, newpages, oldpages; 1859 off_t oldsize; 1860 int base, error; 1861 1862 MPASS(vp->v_type == VREG); 1863 MPASS(newsize >= 0); 1864 1865 node = VP_TO_TMPFS_NODE(vp); 1866 uobj = node->tn_reg.tn_aobj; 1867 1868 /* 1869 * Convert the old and new sizes to the number of pages needed to 1870 * store them. It may happen that we do not need to do anything 1871 * because the last allocated page can accommodate the change on 1872 * its own. 
1873 */ 1874 oldsize = node->tn_size; 1875 oldpages = OFF_TO_IDX(oldsize + PAGE_MASK); 1876 MPASS(oldpages == uobj->size); 1877 newpages = OFF_TO_IDX(newsize + PAGE_MASK); 1878 1879 if (__predict_true(newpages == oldpages && newsize >= oldsize)) { 1880 node->tn_size = newsize; 1881 return (0); 1882 } 1883 1884 VM_OBJECT_WLOCK(uobj); 1885 if (newsize < oldsize) { 1886 /* 1887 * Zero the truncated part of the last page. 1888 */ 1889 base = newsize & PAGE_MASK; 1890 if (base != 0) { 1891 idx = OFF_TO_IDX(newsize); 1892 error = tmpfs_partial_page_invalidate(uobj, idx, base, 1893 PAGE_SIZE, ignerr); 1894 if (error != 0) { 1895 VM_OBJECT_WUNLOCK(uobj); 1896 return (error); 1897 } 1898 } 1899 1900 /* 1901 * Release any swap space and free any whole pages. 1902 */ 1903 if (newpages < oldpages) 1904 vm_object_page_remove(uobj, newpages, 0, 0); 1905 } 1906 uobj->size = newpages; 1907 VM_OBJECT_WUNLOCK(uobj); 1908 1909 node->tn_size = newsize; 1910 return (0); 1911 } 1912 1913 /* 1914 * Punch hole in the aobj associated with the regular file pointed to by 'vp'. 1915 * Requests completely beyond the end-of-file are converted to no-op. 1916 * 1917 * Returns 0 on success or error code from tmpfs_partial_page_invalidate() on 1918 * failure. 
1919 */ 1920 int 1921 tmpfs_reg_punch_hole(struct vnode *vp, off_t *offset, off_t *length) 1922 { 1923 struct tmpfs_node *node; 1924 vm_object_t object; 1925 vm_pindex_t pistart, pi, piend; 1926 int startofs, endofs, end; 1927 off_t off, len; 1928 int error; 1929 1930 KASSERT(*length <= OFF_MAX - *offset, ("%s: offset + length overflows", 1931 __func__)); 1932 node = VP_TO_TMPFS_NODE(vp); 1933 KASSERT(node->tn_type == VREG, ("%s: node is not regular file", 1934 __func__)); 1935 object = node->tn_reg.tn_aobj; 1936 off = *offset; 1937 len = omin(node->tn_size - off, *length); 1938 startofs = off & PAGE_MASK; 1939 endofs = (off + len) & PAGE_MASK; 1940 pistart = OFF_TO_IDX(off); 1941 piend = OFF_TO_IDX(off + len); 1942 pi = OFF_TO_IDX((vm_ooffset_t)off + PAGE_MASK); 1943 error = 0; 1944 1945 /* Handle the case when offset is on or beyond file size. */ 1946 if (len <= 0) { 1947 *length = 0; 1948 return (0); 1949 } 1950 1951 VM_OBJECT_WLOCK(object); 1952 1953 /* 1954 * If there is a partial page at the beginning of the hole-punching 1955 * request, fill the partial page with zeroes. 1956 */ 1957 if (startofs != 0) { 1958 end = pistart != piend ? PAGE_SIZE : endofs; 1959 error = tmpfs_partial_page_invalidate(object, pistart, startofs, 1960 end, FALSE); 1961 if (error != 0) 1962 goto out; 1963 off += end - startofs; 1964 len -= end - startofs; 1965 } 1966 1967 /* 1968 * Toss away the full pages in the affected area. 1969 */ 1970 if (pi < piend) { 1971 vm_object_page_remove(object, pi, piend, 0); 1972 off += IDX_TO_OFF(piend - pi); 1973 len -= IDX_TO_OFF(piend - pi); 1974 } 1975 1976 /* 1977 * If there is a partial page at the end of the hole-punching request, 1978 * fill the partial page with zeroes. 
1979 */ 1980 if (endofs != 0 && pistart != piend) { 1981 error = tmpfs_partial_page_invalidate(object, piend, 0, endofs, 1982 FALSE); 1983 if (error != 0) 1984 goto out; 1985 off += endofs; 1986 len -= endofs; 1987 } 1988 1989 out: 1990 VM_OBJECT_WUNLOCK(object); 1991 *offset = off; 1992 *length = len; 1993 return (error); 1994 } 1995 1996 void 1997 tmpfs_check_mtime(struct vnode *vp) 1998 { 1999 struct tmpfs_node *node; 2000 struct vm_object *obj; 2001 2002 ASSERT_VOP_ELOCKED(vp, "check_mtime"); 2003 if (vp->v_type != VREG) 2004 return; 2005 obj = vp->v_object; 2006 KASSERT(obj->type == tmpfs_pager_type && 2007 (obj->flags & (OBJ_SWAP | OBJ_TMPFS)) == 2008 (OBJ_SWAP | OBJ_TMPFS), ("non-tmpfs obj")); 2009 /* unlocked read */ 2010 if (obj->generation != obj->cleangeneration) { 2011 VM_OBJECT_WLOCK(obj); 2012 if (obj->generation != obj->cleangeneration) { 2013 obj->cleangeneration = obj->generation; 2014 node = VP_TO_TMPFS_NODE(vp); 2015 node->tn_status |= TMPFS_NODE_MODIFIED | 2016 TMPFS_NODE_CHANGED; 2017 } 2018 VM_OBJECT_WUNLOCK(obj); 2019 } 2020 } 2021 2022 /* 2023 * Change flags of the given vnode. 2024 * Caller should execute tmpfs_update on vp after a successful execution. 2025 * The vnode must be locked on entry and remain locked on exit. 2026 */ 2027 int 2028 tmpfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred, 2029 struct thread *td) 2030 { 2031 int error; 2032 struct tmpfs_node *node; 2033 2034 ASSERT_VOP_ELOCKED(vp, "chflags"); 2035 2036 node = VP_TO_TMPFS_NODE(vp); 2037 2038 if ((flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK | 2039 UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | 2040 UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE | 2041 UF_SPARSE | UF_SYSTEM)) != 0) 2042 return (EOPNOTSUPP); 2043 2044 /* Disallow this operation if the file system is mounted read-only. 
*/ 2045 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2046 return (EROFS); 2047 2048 /* 2049 * Callers may only modify the file flags on objects they 2050 * have VADMIN rights for. 2051 */ 2052 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2053 return (error); 2054 /* 2055 * Unprivileged processes are not permitted to unset system 2056 * flags, or modify flags if any system flags are set. 2057 */ 2058 if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) { 2059 if (node->tn_flags & 2060 (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { 2061 error = securelevel_gt(cred, 0); 2062 if (error) 2063 return (error); 2064 } 2065 } else { 2066 if (node->tn_flags & 2067 (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || 2068 ((flags ^ node->tn_flags) & SF_SETTABLE)) 2069 return (EPERM); 2070 } 2071 node->tn_flags = flags; 2072 node->tn_status |= TMPFS_NODE_CHANGED; 2073 2074 ASSERT_VOP_ELOCKED(vp, "chflags2"); 2075 2076 return (0); 2077 } 2078 2079 /* 2080 * Change access mode on the given vnode. 2081 * Caller should execute tmpfs_update on vp after a successful execution. 2082 * The vnode must be locked on entry and remain locked on exit. 2083 */ 2084 int 2085 tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, 2086 struct thread *td) 2087 { 2088 int error; 2089 struct tmpfs_node *node; 2090 mode_t newmode; 2091 2092 ASSERT_VOP_ELOCKED(vp, "chmod"); 2093 ASSERT_VOP_IN_SEQC(vp); 2094 2095 node = VP_TO_TMPFS_NODE(vp); 2096 2097 /* Disallow this operation if the file system is mounted read-only. */ 2098 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2099 return (EROFS); 2100 2101 /* Immutable or append-only files cannot be modified, either. */ 2102 if (node->tn_flags & (IMMUTABLE | APPEND)) 2103 return (EPERM); 2104 2105 /* 2106 * To modify the permissions on a file, must possess VADMIN 2107 * for that file. 
2108 */ 2109 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2110 return (error); 2111 2112 /* 2113 * Privileged processes may set the sticky bit on non-directories, 2114 * as well as set the setgid bit on a file with a group that the 2115 * process is not a member of. 2116 */ 2117 if (vp->v_type != VDIR && (mode & S_ISTXT)) { 2118 if (priv_check_cred(cred, PRIV_VFS_STICKYFILE)) 2119 return (EFTYPE); 2120 } 2121 if (!groupmember(node->tn_gid, cred) && (mode & S_ISGID)) { 2122 error = priv_check_cred(cred, PRIV_VFS_SETGID); 2123 if (error) 2124 return (error); 2125 } 2126 2127 newmode = node->tn_mode & ~ALLPERMS; 2128 newmode |= mode & ALLPERMS; 2129 atomic_store_short(&node->tn_mode, newmode); 2130 2131 node->tn_status |= TMPFS_NODE_CHANGED; 2132 2133 ASSERT_VOP_ELOCKED(vp, "chmod2"); 2134 2135 return (0); 2136 } 2137 2138 /* 2139 * Change ownership of the given vnode. At least one of uid or gid must 2140 * be different than VNOVAL. If one is set to that value, the attribute 2141 * is unchanged. 2142 * Caller should execute tmpfs_update on vp after a successful execution. 2143 * The vnode must be locked on entry and remain locked on exit. 2144 */ 2145 int 2146 tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, 2147 struct thread *td) 2148 { 2149 int error; 2150 struct tmpfs_node *node; 2151 uid_t ouid; 2152 gid_t ogid; 2153 mode_t newmode; 2154 2155 ASSERT_VOP_ELOCKED(vp, "chown"); 2156 ASSERT_VOP_IN_SEQC(vp); 2157 2158 node = VP_TO_TMPFS_NODE(vp); 2159 2160 /* Assign default values if they are unknown. */ 2161 MPASS(uid != VNOVAL || gid != VNOVAL); 2162 if (uid == VNOVAL) 2163 uid = node->tn_uid; 2164 if (gid == VNOVAL) 2165 gid = node->tn_gid; 2166 MPASS(uid != VNOVAL && gid != VNOVAL); 2167 2168 /* Disallow this operation if the file system is mounted read-only. */ 2169 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2170 return (EROFS); 2171 2172 /* Immutable or append-only files cannot be modified, either. 
*/ 2173 if (node->tn_flags & (IMMUTABLE | APPEND)) 2174 return (EPERM); 2175 2176 /* 2177 * To modify the ownership of a file, must possess VADMIN for that 2178 * file. 2179 */ 2180 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2181 return (error); 2182 2183 /* 2184 * To change the owner of a file, or change the group of a file to a 2185 * group of which we are not a member, the caller must have 2186 * privilege. 2187 */ 2188 if ((uid != node->tn_uid || 2189 (gid != node->tn_gid && !groupmember(gid, cred))) && 2190 (error = priv_check_cred(cred, PRIV_VFS_CHOWN))) 2191 return (error); 2192 2193 ogid = node->tn_gid; 2194 ouid = node->tn_uid; 2195 2196 node->tn_uid = uid; 2197 node->tn_gid = gid; 2198 2199 node->tn_status |= TMPFS_NODE_CHANGED; 2200 2201 if ((node->tn_mode & (S_ISUID | S_ISGID)) != 0 && 2202 (ouid != uid || ogid != gid)) { 2203 if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) { 2204 newmode = node->tn_mode & ~(S_ISUID | S_ISGID); 2205 atomic_store_short(&node->tn_mode, newmode); 2206 } 2207 } 2208 2209 ASSERT_VOP_ELOCKED(vp, "chown2"); 2210 2211 return (0); 2212 } 2213 2214 /* 2215 * Change size of the given vnode. 2216 * Caller should execute tmpfs_update on vp after a successful execution. 2217 * The vnode must be locked on entry and remain locked on exit. 2218 */ 2219 int 2220 tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred, 2221 struct thread *td) 2222 { 2223 int error; 2224 struct tmpfs_node *node; 2225 2226 ASSERT_VOP_ELOCKED(vp, "chsize"); 2227 2228 node = VP_TO_TMPFS_NODE(vp); 2229 2230 /* Decide whether this is a valid operation based on the file type. 
*/ 2231 error = 0; 2232 switch (vp->v_type) { 2233 case VDIR: 2234 return (EISDIR); 2235 2236 case VREG: 2237 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2238 return (EROFS); 2239 break; 2240 2241 case VBLK: 2242 /* FALLTHROUGH */ 2243 case VCHR: 2244 /* FALLTHROUGH */ 2245 case VFIFO: 2246 /* 2247 * Allow modifications of special files even if in the file 2248 * system is mounted read-only (we are not modifying the 2249 * files themselves, but the objects they represent). 2250 */ 2251 return (0); 2252 2253 default: 2254 /* Anything else is unsupported. */ 2255 return (EOPNOTSUPP); 2256 } 2257 2258 /* Immutable or append-only files cannot be modified, either. */ 2259 if (node->tn_flags & (IMMUTABLE | APPEND)) 2260 return (EPERM); 2261 2262 error = vn_rlimit_trunc(size, td); 2263 if (error != 0) 2264 return (error); 2265 2266 error = tmpfs_truncate(vp, size); 2267 /* 2268 * tmpfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents 2269 * for us, as will update tn_status; no need to do that here. 2270 */ 2271 2272 ASSERT_VOP_ELOCKED(vp, "chsize2"); 2273 2274 return (error); 2275 } 2276 2277 /* 2278 * Change access and modification times of the given vnode. 2279 * Caller should execute tmpfs_update on vp after a successful execution. 2280 * The vnode must be locked on entry and remain locked on exit. 2281 */ 2282 int 2283 tmpfs_chtimes(struct vnode *vp, struct vattr *vap, 2284 struct ucred *cred, struct thread *td) 2285 { 2286 int error; 2287 struct tmpfs_node *node; 2288 2289 ASSERT_VOP_ELOCKED(vp, "chtimes"); 2290 2291 node = VP_TO_TMPFS_NODE(vp); 2292 2293 /* Disallow this operation if the file system is mounted read-only. */ 2294 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2295 return (EROFS); 2296 2297 /* Immutable or append-only files cannot be modified, either. 
*/ 2298 if (node->tn_flags & (IMMUTABLE | APPEND)) 2299 return (EPERM); 2300 2301 error = vn_utimes_perm(vp, vap, cred, td); 2302 if (error != 0) 2303 return (error); 2304 2305 if (vap->va_atime.tv_sec != VNOVAL) 2306 node->tn_accessed = true; 2307 if (vap->va_mtime.tv_sec != VNOVAL) 2308 node->tn_status |= TMPFS_NODE_MODIFIED; 2309 if (vap->va_birthtime.tv_sec != VNOVAL) 2310 node->tn_status |= TMPFS_NODE_MODIFIED; 2311 tmpfs_itimes(vp, &vap->va_atime, &vap->va_mtime); 2312 if (vap->va_birthtime.tv_sec != VNOVAL) 2313 node->tn_birthtime = vap->va_birthtime; 2314 ASSERT_VOP_ELOCKED(vp, "chtimes2"); 2315 2316 return (0); 2317 } 2318 2319 void 2320 tmpfs_set_status(struct tmpfs_mount *tm, struct tmpfs_node *node, int status) 2321 { 2322 2323 if ((node->tn_status & status) == status || tm->tm_ronly) 2324 return; 2325 TMPFS_NODE_LOCK(node); 2326 node->tn_status |= status; 2327 TMPFS_NODE_UNLOCK(node); 2328 } 2329 2330 void 2331 tmpfs_set_accessed(struct tmpfs_mount *tm, struct tmpfs_node *node) 2332 { 2333 if (node->tn_accessed || tm->tm_ronly) 2334 return; 2335 atomic_store_8(&node->tn_accessed, true); 2336 } 2337 2338 /* Sync timestamps */ 2339 void 2340 tmpfs_itimes(struct vnode *vp, const struct timespec *acc, 2341 const struct timespec *mod) 2342 { 2343 struct tmpfs_node *node; 2344 struct timespec now; 2345 2346 ASSERT_VOP_LOCKED(vp, "tmpfs_itimes"); 2347 node = VP_TO_TMPFS_NODE(vp); 2348 2349 if (!node->tn_accessed && 2350 (node->tn_status & (TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED)) == 0) 2351 return; 2352 2353 vfs_timestamp(&now); 2354 TMPFS_NODE_LOCK(node); 2355 if (node->tn_accessed) { 2356 if (acc == NULL) 2357 acc = &now; 2358 node->tn_atime = *acc; 2359 } 2360 if (node->tn_status & TMPFS_NODE_MODIFIED) { 2361 if (mod == NULL) 2362 mod = &now; 2363 node->tn_mtime = *mod; 2364 } 2365 if (node->tn_status & TMPFS_NODE_CHANGED) 2366 node->tn_ctime = now; 2367 node->tn_status &= ~(TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED); 2368 node->tn_accessed = false; 2369 
TMPFS_NODE_UNLOCK(node); 2370 2371 /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ 2372 random_harvest_queue(node, sizeof(*node), RANDOM_FS_ATIME); 2373 } 2374 2375 int 2376 tmpfs_truncate(struct vnode *vp, off_t length) 2377 { 2378 struct tmpfs_node *node; 2379 int error; 2380 2381 if (length < 0) 2382 return (EINVAL); 2383 if (length > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) 2384 return (EFBIG); 2385 2386 node = VP_TO_TMPFS_NODE(vp); 2387 error = node->tn_size == length ? 0 : tmpfs_reg_resize(vp, length, 2388 FALSE); 2389 if (error == 0) 2390 node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; 2391 tmpfs_update(vp); 2392 2393 return (error); 2394 } 2395 2396 static __inline int 2397 tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b) 2398 { 2399 if (a->td_hash > b->td_hash) 2400 return (1); 2401 else if (a->td_hash < b->td_hash) 2402 return (-1); 2403 return (0); 2404 } 2405 2406 RB_GENERATE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); 2407