1 /* $NetBSD: tmpfs_subr.c,v 1.35 2007/07/09 21:10:50 ad Exp $ */ 2 3 /*- 4 * SPDX-License-Identifier: BSD-2-Clause 5 * 6 * Copyright (c) 2005 The NetBSD Foundation, Inc. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to The NetBSD Foundation 10 * by Julio M. Merino Vidal, developed as part of Google's Summer of Code 11 * 2005 program. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 24 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 26 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 * POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 /* 36 * Efficient memory file system supporting functions. 
37 */ 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/dirent.h> 42 #include <sys/fnv_hash.h> 43 #include <sys/lock.h> 44 #include <sys/limits.h> 45 #include <sys/mount.h> 46 #include <sys/namei.h> 47 #include <sys/priv.h> 48 #include <sys/proc.h> 49 #include <sys/random.h> 50 #include <sys/refcount.h> 51 #include <sys/rwlock.h> 52 #include <sys/smr.h> 53 #include <sys/stat.h> 54 #include <sys/sysctl.h> 55 #include <sys/user.h> 56 #include <sys/vnode.h> 57 #include <sys/vmmeter.h> 58 59 #include <vm/vm.h> 60 #include <vm/vm_param.h> 61 #include <vm/vm_object.h> 62 #include <vm/vm_page.h> 63 #include <vm/vm_pageout.h> 64 #include <vm/vm_pager.h> 65 #include <vm/vm_extern.h> 66 #include <vm/swap_pager.h> 67 #include <vm/uma.h> 68 69 #include <fs/tmpfs/tmpfs.h> 70 #include <fs/tmpfs/tmpfs_fifoops.h> 71 #include <fs/tmpfs/tmpfs_vnops.h> 72 73 SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 74 "tmpfs file system"); 75 76 static long tmpfs_pages_reserved = TMPFS_PAGES_MINRESERVED; 77 static long tmpfs_pages_avail_init; 78 static int tmpfs_mem_percent = TMPFS_MEM_PERCENT; 79 static void tmpfs_set_reserve_from_percent(void); 80 81 MALLOC_DEFINE(M_TMPFSDIR, "tmpfs dir", "tmpfs dirent structure"); 82 static uma_zone_t tmpfs_node_pool; 83 VFS_SMR_DECLARE; 84 85 int tmpfs_pager_type = -1; 86 87 static vm_object_t 88 tmpfs_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, 89 vm_ooffset_t offset, struct ucred *cred) 90 { 91 vm_object_t object; 92 93 MPASS(handle == NULL); 94 MPASS(offset == 0); 95 object = vm_object_allocate_dyn(tmpfs_pager_type, size, 96 OBJ_COLORED | OBJ_SWAP); 97 if (!swap_pager_init_object(object, NULL, NULL, size, 0)) { 98 vm_object_deallocate(object); 99 object = NULL; 100 } 101 return (object); 102 } 103 104 /* 105 * Make sure tmpfs vnodes with writable mappings can be found on the lazy list. 
106 * 107 * This allows for periodic mtime updates while only scanning vnodes which are 108 * plausibly dirty, see tmpfs_update_mtime_lazy. 109 */ 110 static void 111 tmpfs_pager_writecount_recalc(vm_object_t object, vm_offset_t old, 112 vm_offset_t new) 113 { 114 struct vnode *vp; 115 116 VM_OBJECT_ASSERT_WLOCKED(object); 117 118 vp = VM_TO_TMPFS_VP(object); 119 120 /* 121 * Forced unmount? 122 */ 123 if (vp == NULL || vp->v_object == NULL) { 124 KASSERT((object->flags & OBJ_TMPFS_VREF) == 0, 125 ("object %p with OBJ_TMPFS_VREF but without vnode", 126 object)); 127 VM_OBJECT_WUNLOCK(object); 128 return; 129 } 130 131 if (old == 0) { 132 VNASSERT((object->flags & OBJ_TMPFS_VREF) == 0, vp, 133 ("object without writable mappings has a reference")); 134 VNPASS(vp->v_usecount > 0, vp); 135 } else { 136 VNASSERT((object->flags & OBJ_TMPFS_VREF) != 0, vp, 137 ("object with writable mappings does not " 138 "have a reference")); 139 } 140 141 if (old == new) { 142 VM_OBJECT_WUNLOCK(object); 143 return; 144 } 145 146 if (new == 0) { 147 vm_object_clear_flag(object, OBJ_TMPFS_VREF); 148 VM_OBJECT_WUNLOCK(object); 149 vrele(vp); 150 } else { 151 if ((object->flags & OBJ_TMPFS_VREF) == 0) { 152 vref(vp); 153 vlazy(vp); 154 vm_object_set_flag(object, OBJ_TMPFS_VREF); 155 } 156 VM_OBJECT_WUNLOCK(object); 157 } 158 } 159 160 static void 161 tmpfs_pager_update_writecount(vm_object_t object, vm_offset_t start, 162 vm_offset_t end) 163 { 164 vm_offset_t new, old; 165 166 VM_OBJECT_WLOCK(object); 167 KASSERT((object->flags & OBJ_ANON) == 0, 168 ("%s: object %p with OBJ_ANON", __func__, object)); 169 old = object->un_pager.swp.writemappings; 170 object->un_pager.swp.writemappings += (vm_ooffset_t)end - start; 171 new = object->un_pager.swp.writemappings; 172 tmpfs_pager_writecount_recalc(object, old, new); 173 VM_OBJECT_ASSERT_UNLOCKED(object); 174 } 175 176 static void 177 tmpfs_pager_release_writecount(vm_object_t object, vm_offset_t start, 178 vm_offset_t end) 179 { 180 vm_offset_t 
new, old; 181 182 VM_OBJECT_WLOCK(object); 183 KASSERT((object->flags & OBJ_ANON) == 0, 184 ("%s: object %p with OBJ_ANON", __func__, object)); 185 old = object->un_pager.swp.writemappings; 186 KASSERT(old >= (vm_ooffset_t)end - start, 187 ("tmpfs obj %p writecount %jx dec %jx", object, (uintmax_t)old, 188 (uintmax_t)((vm_ooffset_t)end - start))); 189 object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start; 190 new = object->un_pager.swp.writemappings; 191 tmpfs_pager_writecount_recalc(object, old, new); 192 VM_OBJECT_ASSERT_UNLOCKED(object); 193 } 194 195 static void 196 tmpfs_pager_getvp(vm_object_t object, struct vnode **vpp, bool *vp_heldp) 197 { 198 struct vnode *vp; 199 200 /* 201 * Tmpfs VREG node, which was reclaimed, has tmpfs_pager_type 202 * type. In this case there is no v_writecount to adjust. 203 */ 204 if (vp_heldp != NULL) 205 VM_OBJECT_RLOCK(object); 206 else 207 VM_OBJECT_ASSERT_LOCKED(object); 208 if ((object->flags & OBJ_TMPFS) != 0) { 209 vp = VM_TO_TMPFS_VP(object); 210 if (vp != NULL) { 211 *vpp = vp; 212 if (vp_heldp != NULL) { 213 vhold(vp); 214 *vp_heldp = true; 215 } 216 } 217 } 218 if (vp_heldp != NULL) 219 VM_OBJECT_RUNLOCK(object); 220 } 221 222 static void 223 tmpfs_pager_freespace(vm_object_t obj, vm_pindex_t start, vm_size_t size) 224 { 225 struct tmpfs_node *node; 226 struct tmpfs_mount *tm; 227 vm_size_t c; 228 229 swap_pager_freespace(obj, start, size, &c); 230 if ((obj->flags & OBJ_TMPFS) == 0 || c == 0) 231 return; 232 233 node = obj->un_pager.swp.swp_priv; 234 MPASS(node->tn_type == VREG); 235 tm = node->tn_reg.tn_tmp; 236 237 KASSERT(tm->tm_pages_used >= c, 238 ("tmpfs tm %p pages %jd free %jd", tm, 239 (uintmax_t)tm->tm_pages_used, (uintmax_t)c)); 240 atomic_add_long(&tm->tm_pages_used, -c); 241 KASSERT(node->tn_reg.tn_pages >= c, 242 ("tmpfs node %p pages %jd free %jd", node, 243 (uintmax_t)node->tn_reg.tn_pages, (uintmax_t)c)); 244 node->tn_reg.tn_pages -= c; 245 } 246 247 static void 248 
tmpfs_page_inserted(vm_object_t obj, vm_page_t m) 249 { 250 struct tmpfs_node *node; 251 struct tmpfs_mount *tm; 252 253 if ((obj->flags & OBJ_TMPFS) == 0) 254 return; 255 256 node = obj->un_pager.swp.swp_priv; 257 MPASS(node->tn_type == VREG); 258 tm = node->tn_reg.tn_tmp; 259 260 if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) { 261 atomic_add_long(&tm->tm_pages_used, 1); 262 node->tn_reg.tn_pages += 1; 263 } 264 } 265 266 static void 267 tmpfs_page_removed(vm_object_t obj, vm_page_t m) 268 { 269 struct tmpfs_node *node; 270 struct tmpfs_mount *tm; 271 272 if ((obj->flags & OBJ_TMPFS) == 0) 273 return; 274 275 node = obj->un_pager.swp.swp_priv; 276 MPASS(node->tn_type == VREG); 277 tm = node->tn_reg.tn_tmp; 278 279 if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) { 280 KASSERT(tm->tm_pages_used >= 1, 281 ("tmpfs tm %p pages %jd free 1", tm, 282 (uintmax_t)tm->tm_pages_used)); 283 atomic_add_long(&tm->tm_pages_used, -1); 284 KASSERT(node->tn_reg.tn_pages >= 1, 285 ("tmpfs node %p pages %jd free 1", node, 286 (uintmax_t)node->tn_reg.tn_pages)); 287 node->tn_reg.tn_pages -= 1; 288 } 289 } 290 291 static boolean_t 292 tmpfs_can_alloc_page(vm_object_t obj, vm_pindex_t pindex) 293 { 294 struct tmpfs_mount *tm; 295 296 tm = VM_TO_TMPFS_MP(obj); 297 if (tm == NULL || vm_pager_has_page(obj, pindex, NULL, NULL) || 298 tm->tm_pages_max == 0) 299 return (true); 300 if (tm->tm_pages_max == ULONG_MAX) 301 return (tmpfs_mem_avail() >= 1); 302 return (tm->tm_pages_max > atomic_load_long(&tm->tm_pages_used)); 303 } 304 305 struct pagerops tmpfs_pager_ops = { 306 .pgo_kvme_type = KVME_TYPE_VNODE, 307 .pgo_alloc = tmpfs_pager_alloc, 308 .pgo_set_writeable_dirty = vm_object_set_writeable_dirty_, 309 .pgo_update_writecount = tmpfs_pager_update_writecount, 310 .pgo_release_writecount = tmpfs_pager_release_writecount, 311 .pgo_mightbedirty = vm_object_mightbedirty_, 312 .pgo_getvp = tmpfs_pager_getvp, 313 .pgo_freespace = tmpfs_pager_freespace, 314 .pgo_page_inserted = 
tmpfs_page_inserted, 315 .pgo_page_removed = tmpfs_page_removed, 316 .pgo_can_alloc_page = tmpfs_can_alloc_page, 317 }; 318 319 static int 320 tmpfs_node_ctor(void *mem, int size, void *arg, int flags) 321 { 322 struct tmpfs_node *node; 323 324 node = mem; 325 node->tn_gen++; 326 node->tn_size = 0; 327 node->tn_status = 0; 328 node->tn_accessed = false; 329 node->tn_flags = 0; 330 node->tn_links = 0; 331 node->tn_vnode = NULL; 332 node->tn_vpstate = 0; 333 return (0); 334 } 335 336 static void 337 tmpfs_node_dtor(void *mem, int size, void *arg) 338 { 339 struct tmpfs_node *node; 340 341 node = mem; 342 node->tn_type = VNON; 343 } 344 345 static int 346 tmpfs_node_init(void *mem, int size, int flags) 347 { 348 struct tmpfs_node *node; 349 350 node = mem; 351 node->tn_id = 0; 352 mtx_init(&node->tn_interlock, "tmpfsni", NULL, MTX_DEF | MTX_NEW); 353 node->tn_gen = arc4random(); 354 return (0); 355 } 356 357 static void 358 tmpfs_node_fini(void *mem, int size) 359 { 360 struct tmpfs_node *node; 361 362 node = mem; 363 mtx_destroy(&node->tn_interlock); 364 } 365 366 int 367 tmpfs_subr_init(void) 368 { 369 tmpfs_pager_type = vm_pager_alloc_dyn_type(&tmpfs_pager_ops, 370 OBJT_SWAP); 371 if (tmpfs_pager_type == -1) 372 return (EINVAL); 373 tmpfs_node_pool = uma_zcreate("TMPFS node", 374 sizeof(struct tmpfs_node), tmpfs_node_ctor, tmpfs_node_dtor, 375 tmpfs_node_init, tmpfs_node_fini, UMA_ALIGN_PTR, 0); 376 VFS_SMR_ZONE_SET(tmpfs_node_pool); 377 378 tmpfs_pages_avail_init = tmpfs_mem_avail(); 379 tmpfs_set_reserve_from_percent(); 380 return (0); 381 } 382 383 void 384 tmpfs_subr_uninit(void) 385 { 386 if (tmpfs_pager_type != -1) 387 vm_pager_free_dyn_type(tmpfs_pager_type); 388 tmpfs_pager_type = -1; 389 uma_zdestroy(tmpfs_node_pool); 390 } 391 392 static int 393 sysctl_mem_reserved(SYSCTL_HANDLER_ARGS) 394 { 395 int error; 396 long pages, bytes; 397 398 pages = *(long *)arg1; 399 bytes = pages * PAGE_SIZE; 400 401 error = sysctl_handle_long(oidp, &bytes, 0, req); 402 if 
(error || !req->newptr) 403 return (error); 404 405 pages = bytes / PAGE_SIZE; 406 if (pages < TMPFS_PAGES_MINRESERVED) 407 return (EINVAL); 408 409 *(long *)arg1 = pages; 410 return (0); 411 } 412 413 SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_reserved, 414 CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &tmpfs_pages_reserved, 0, 415 sysctl_mem_reserved, "L", 416 "Amount of available memory and swap below which tmpfs growth stops"); 417 418 static int 419 sysctl_mem_percent(SYSCTL_HANDLER_ARGS) 420 { 421 int error, percent; 422 423 percent = *(int *)arg1; 424 error = sysctl_handle_int(oidp, &percent, 0, req); 425 if (error || !req->newptr) 426 return (error); 427 428 if ((unsigned) percent > 100) 429 return (EINVAL); 430 431 *(int *)arg1 = percent; 432 tmpfs_set_reserve_from_percent(); 433 return (0); 434 } 435 436 static void 437 tmpfs_set_reserve_from_percent(void) 438 { 439 size_t reserved; 440 441 reserved = tmpfs_pages_avail_init * (100 - tmpfs_mem_percent) / 100; 442 tmpfs_pages_reserved = max(reserved, TMPFS_PAGES_MINRESERVED); 443 } 444 445 SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_percent, 446 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &tmpfs_mem_percent, 0, 447 sysctl_mem_percent, "I", 448 "Percent of available memory that can be used if no size limit"); 449 450 static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a, 451 struct tmpfs_dirent *b); 452 RB_PROTOTYPE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); 453 454 size_t 455 tmpfs_mem_avail(void) 456 { 457 size_t avail; 458 long reserved; 459 460 avail = swap_pager_avail + vm_free_count(); 461 reserved = atomic_load_long(&tmpfs_pages_reserved); 462 if (__predict_false(avail < reserved)) 463 return (0); 464 return (avail - reserved); 465 } 466 467 size_t 468 tmpfs_pages_used(struct tmpfs_mount *tmp) 469 { 470 const size_t node_size = sizeof(struct tmpfs_node) + 471 sizeof(struct tmpfs_dirent); 472 size_t meta_pages; 473 474 meta_pages = howmany((uintmax_t)tmp->tm_nodes_inuse * 
node_size, 475 PAGE_SIZE); 476 return (meta_pages + tmp->tm_pages_used); 477 } 478 479 bool 480 tmpfs_pages_check_avail(struct tmpfs_mount *tmp, size_t req_pages) 481 { 482 if (tmpfs_mem_avail() < req_pages) 483 return (false); 484 485 if (tmp->tm_pages_max != ULONG_MAX && 486 tmp->tm_pages_max < req_pages + tmpfs_pages_used(tmp)) 487 return (false); 488 489 return (true); 490 } 491 492 static int 493 tmpfs_partial_page_invalidate(vm_object_t object, vm_pindex_t idx, int base, 494 int end, boolean_t ignerr) 495 { 496 vm_page_t m; 497 int rv, error; 498 499 VM_OBJECT_ASSERT_WLOCKED(object); 500 KASSERT(base >= 0, ("%s: base %d", __func__, base)); 501 KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base, 502 end)); 503 error = 0; 504 505 retry: 506 m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT); 507 if (m != NULL) { 508 MPASS(vm_page_all_valid(m)); 509 } else if (vm_pager_has_page(object, idx, NULL, NULL)) { 510 m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL | 511 VM_ALLOC_WAITFAIL); 512 if (m == NULL) 513 goto retry; 514 vm_object_pip_add(object, 1); 515 VM_OBJECT_WUNLOCK(object); 516 rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); 517 VM_OBJECT_WLOCK(object); 518 vm_object_pip_wakeup(object); 519 if (rv == VM_PAGER_OK) { 520 /* 521 * Since the page was not resident, and therefore not 522 * recently accessed, immediately enqueue it for 523 * asynchronous laundering. The current operation is 524 * not regarded as an access. 
525 */ 526 vm_page_launder(m); 527 } else { 528 vm_page_free(m); 529 m = NULL; 530 if (!ignerr) 531 error = EIO; 532 } 533 } 534 if (m != NULL) { 535 pmap_zero_page_area(m, base, end - base); 536 vm_page_set_dirty(m); 537 vm_page_xunbusy(m); 538 } 539 540 return (error); 541 } 542 543 void 544 tmpfs_ref_node(struct tmpfs_node *node) 545 { 546 #ifdef INVARIANTS 547 u_int old; 548 549 old = 550 #endif 551 refcount_acquire(&node->tn_refcount); 552 #ifdef INVARIANTS 553 KASSERT(old > 0, ("node %p zero refcount", node)); 554 #endif 555 } 556 557 /* 558 * Allocates a new node of type 'type' inside the 'tmp' mount point, with 559 * its owner set to 'uid', its group to 'gid' and its mode set to 'mode', 560 * using the credentials of the process 'p'. 561 * 562 * If the node type is set to 'VDIR', then the parent parameter must point 563 * to the parent directory of the node being created. It may only be NULL 564 * while allocating the root node. 565 * 566 * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter 567 * specifies the device the node represents. 568 * 569 * If the node type is set to 'VLNK', then the parameter target specifies 570 * the file name of the target file for the symbolic link that is being 571 * created. 572 * 573 * Note that new nodes are retrieved from the available list if it has 574 * items or, if it is empty, from the node pool as long as there is enough 575 * space to create them. 576 * 577 * Returns zero on success or an appropriate error code on failure. 578 */ 579 int 580 tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, __enum_uint8(vtype) type, 581 uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent, 582 const char *target, dev_t rdev, struct tmpfs_node **node) 583 { 584 struct tmpfs_node *nnode; 585 char *symlink; 586 char symlink_smr; 587 588 /* If the root directory of the 'tmp' file system is not yet 589 * allocated, this must be the request to do it. 
*/ 590 MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR)); 591 592 MPASS((type == VLNK) ^ (target == NULL)); 593 MPASS((type == VBLK || type == VCHR) ^ (rdev == VNOVAL)); 594 595 if (tmp->tm_nodes_inuse >= tmp->tm_nodes_max) 596 return (ENOSPC); 597 if (!tmpfs_pages_check_avail(tmp, 1)) 598 return (ENOSPC); 599 600 if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { 601 /* 602 * When a new tmpfs node is created for fully 603 * constructed mount point, there must be a parent 604 * node, which vnode is locked exclusively. As 605 * consequence, if the unmount is executing in 606 * parallel, vflush() cannot reclaim the parent vnode. 607 * Due to this, the check for MNTK_UNMOUNT flag is not 608 * racy: if we did not see MNTK_UNMOUNT flag, then tmp 609 * cannot be destroyed until node construction is 610 * finished and the parent vnode unlocked. 611 * 612 * Tmpfs does not need to instantiate new nodes during 613 * unmount. 614 */ 615 return (EBUSY); 616 } 617 if ((mp->mnt_kern_flag & MNT_RDONLY) != 0) 618 return (EROFS); 619 620 nnode = uma_zalloc_smr(tmpfs_node_pool, M_WAITOK); 621 622 /* Generic initialization. */ 623 nnode->tn_type = type; 624 vfs_timestamp(&nnode->tn_atime); 625 nnode->tn_birthtime = nnode->tn_ctime = nnode->tn_mtime = 626 nnode->tn_atime; 627 nnode->tn_uid = uid; 628 nnode->tn_gid = gid; 629 nnode->tn_mode = mode; 630 nnode->tn_id = alloc_unr64(&tmp->tm_ino_unr); 631 nnode->tn_refcount = 1; 632 LIST_INIT(&nnode->tn_extattrs); 633 634 /* Type-specific initialization. */ 635 switch (nnode->tn_type) { 636 case VBLK: 637 case VCHR: 638 nnode->tn_rdev = rdev; 639 break; 640 641 case VDIR: 642 RB_INIT(&nnode->tn_dir.tn_dirhead); 643 LIST_INIT(&nnode->tn_dir.tn_dupindex); 644 MPASS(parent != nnode); 645 MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL)); 646 nnode->tn_dir.tn_parent = (parent == NULL) ? 
nnode : parent; 647 nnode->tn_dir.tn_readdir_lastn = 0; 648 nnode->tn_dir.tn_readdir_lastp = NULL; 649 nnode->tn_dir.tn_wht_size = 0; 650 nnode->tn_links++; 651 TMPFS_NODE_LOCK(nnode->tn_dir.tn_parent); 652 nnode->tn_dir.tn_parent->tn_links++; 653 TMPFS_NODE_UNLOCK(nnode->tn_dir.tn_parent); 654 break; 655 656 case VFIFO: 657 /* FALLTHROUGH */ 658 case VSOCK: 659 break; 660 661 case VLNK: 662 MPASS(strlen(target) < MAXPATHLEN); 663 nnode->tn_size = strlen(target); 664 665 symlink = NULL; 666 if (!tmp->tm_nonc) { 667 symlink = cache_symlink_alloc(nnode->tn_size + 1, 668 M_WAITOK); 669 symlink_smr = true; 670 } 671 if (symlink == NULL) { 672 symlink = malloc(nnode->tn_size + 1, M_TMPFSNAME, 673 M_WAITOK); 674 symlink_smr = false; 675 } 676 memcpy(symlink, target, nnode->tn_size + 1); 677 678 /* 679 * Allow safe symlink resolving for lockless lookup. 680 * tmpfs_fplookup_symlink references this comment. 681 * 682 * 1. nnode is not yet visible to the world 683 * 2. both tn_link_target and tn_link_smr get populated 684 * 3. release fence publishes their content 685 * 4. tn_link_target content is immutable until node 686 * destruction, where the pointer gets set to NULL 687 * 5. tn_link_smr is never changed once set 688 * 689 * As a result it is sufficient to issue load consume 690 * on the node pointer to also get the above content 691 * in a stable manner. Worst case tn_link_smr flag 692 * may be set to true despite being stale, while the 693 * target buffer is already cleared out. 
694 */ 695 atomic_store_ptr(&nnode->tn_link_target, symlink); 696 atomic_store_char((char *)&nnode->tn_link_smr, symlink_smr); 697 atomic_thread_fence_rel(); 698 break; 699 700 case VREG: 701 nnode->tn_reg.tn_aobj = 702 vm_pager_allocate(tmpfs_pager_type, NULL, 0, 703 VM_PROT_DEFAULT, 0, 704 NULL /* XXXKIB - tmpfs needs swap reservation */); 705 nnode->tn_reg.tn_aobj->un_pager.swp.swp_priv = nnode; 706 vm_object_set_flag(nnode->tn_reg.tn_aobj, OBJ_TMPFS); 707 nnode->tn_reg.tn_tmp = tmp; 708 nnode->tn_reg.tn_pages = 0; 709 break; 710 711 default: 712 panic("tmpfs_alloc_node: type %p %d", nnode, 713 (int)nnode->tn_type); 714 } 715 716 TMPFS_LOCK(tmp); 717 LIST_INSERT_HEAD(&tmp->tm_nodes_used, nnode, tn_entries); 718 nnode->tn_attached = true; 719 tmp->tm_nodes_inuse++; 720 tmp->tm_refcount++; 721 TMPFS_UNLOCK(tmp); 722 723 *node = nnode; 724 return (0); 725 } 726 727 /* 728 * Destroys the node pointed to by node from the file system 'tmp'. 729 * If the node references a directory, no entries are allowed. 
730 */ 731 void 732 tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node) 733 { 734 if (refcount_release_if_not_last(&node->tn_refcount)) 735 return; 736 737 TMPFS_LOCK(tmp); 738 TMPFS_NODE_LOCK(node); 739 if (!tmpfs_free_node_locked(tmp, node, false)) { 740 TMPFS_NODE_UNLOCK(node); 741 TMPFS_UNLOCK(tmp); 742 } 743 } 744 745 bool 746 tmpfs_free_node_locked(struct tmpfs_mount *tmp, struct tmpfs_node *node, 747 bool detach) 748 { 749 struct tmpfs_extattr *ea; 750 vm_object_t uobj; 751 char *symlink; 752 bool last; 753 754 TMPFS_MP_ASSERT_LOCKED(tmp); 755 TMPFS_NODE_ASSERT_LOCKED(node); 756 757 last = refcount_release(&node->tn_refcount); 758 if (node->tn_attached && (detach || last)) { 759 MPASS(tmp->tm_nodes_inuse > 0); 760 tmp->tm_nodes_inuse--; 761 LIST_REMOVE(node, tn_entries); 762 node->tn_attached = false; 763 } 764 if (!last) 765 return (false); 766 767 TMPFS_NODE_UNLOCK(node); 768 769 #ifdef INVARIANTS 770 MPASS(node->tn_vnode == NULL); 771 MPASS((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0); 772 773 /* 774 * Make sure this is a node type we can deal with. Everything 775 * is explicitly enumerated without the 'default' clause so 776 * the compiler can throw an error in case a new type is 777 * added. 
778 */ 779 switch (node->tn_type) { 780 case VBLK: 781 case VCHR: 782 case VDIR: 783 case VFIFO: 784 case VSOCK: 785 case VLNK: 786 case VREG: 787 break; 788 case VNON: 789 case VBAD: 790 case VMARKER: 791 panic("%s: bad type %d for node %p", __func__, 792 (int)node->tn_type, node); 793 } 794 #endif 795 796 while ((ea = LIST_FIRST(&node->tn_extattrs)) != NULL) { 797 LIST_REMOVE(ea, ea_extattrs); 798 tmpfs_extattr_free(ea); 799 } 800 801 switch (node->tn_type) { 802 case VREG: 803 uobj = node->tn_reg.tn_aobj; 804 node->tn_reg.tn_aobj = NULL; 805 if (uobj != NULL) { 806 VM_OBJECT_WLOCK(uobj); 807 KASSERT((uobj->flags & OBJ_TMPFS) != 0, 808 ("tmpfs node %p uobj %p not tmpfs", node, uobj)); 809 vm_object_clear_flag(uobj, OBJ_TMPFS); 810 KASSERT(tmp->tm_pages_used >= node->tn_reg.tn_pages, 811 ("tmpfs tmp %p node %p pages %jd free %jd", tmp, 812 node, (uintmax_t)tmp->tm_pages_used, 813 (uintmax_t)node->tn_reg.tn_pages)); 814 atomic_add_long(&tmp->tm_pages_used, 815 -node->tn_reg.tn_pages); 816 VM_OBJECT_WUNLOCK(uobj); 817 } 818 tmpfs_free_tmp(tmp); 819 820 /* 821 * vm_object_deallocate() must not be called while 822 * owning tm_allnode_lock, because deallocate might 823 * sleep. Call it after tmpfs_free_tmp() does the 824 * unlock. 
825 */ 826 if (uobj != NULL) 827 vm_object_deallocate(uobj); 828 829 break; 830 case VLNK: 831 tmpfs_free_tmp(tmp); 832 833 symlink = node->tn_link_target; 834 atomic_store_ptr(&node->tn_link_target, NULL); 835 if (atomic_load_char(&node->tn_link_smr)) { 836 cache_symlink_free(symlink, node->tn_size + 1); 837 } else { 838 free(symlink, M_TMPFSNAME); 839 } 840 break; 841 default: 842 tmpfs_free_tmp(tmp); 843 break; 844 } 845 846 uma_zfree_smr(tmpfs_node_pool, node); 847 return (true); 848 } 849 850 static __inline uint32_t 851 tmpfs_dirent_hash(const char *name, u_int len) 852 { 853 uint32_t hash; 854 855 hash = fnv_32_buf(name, len, FNV1_32_INIT + len) & TMPFS_DIRCOOKIE_MASK; 856 #ifdef TMPFS_DEBUG_DIRCOOKIE_DUP 857 hash &= 0xf; 858 #endif 859 if (hash < TMPFS_DIRCOOKIE_MIN) 860 hash += TMPFS_DIRCOOKIE_MIN; 861 862 return (hash); 863 } 864 865 static __inline off_t 866 tmpfs_dirent_cookie(struct tmpfs_dirent *de) 867 { 868 if (de == NULL) 869 return (TMPFS_DIRCOOKIE_EOF); 870 871 MPASS(de->td_cookie >= TMPFS_DIRCOOKIE_MIN); 872 873 return (de->td_cookie); 874 } 875 876 static __inline boolean_t 877 tmpfs_dirent_dup(struct tmpfs_dirent *de) 878 { 879 return ((de->td_cookie & TMPFS_DIRCOOKIE_DUP) != 0); 880 } 881 882 static __inline boolean_t 883 tmpfs_dirent_duphead(struct tmpfs_dirent *de) 884 { 885 return ((de->td_cookie & TMPFS_DIRCOOKIE_DUPHEAD) != 0); 886 } 887 888 void 889 tmpfs_dirent_init(struct tmpfs_dirent *de, const char *name, u_int namelen) 890 { 891 de->td_hash = de->td_cookie = tmpfs_dirent_hash(name, namelen); 892 memcpy(de->ud.td_name, name, namelen); 893 de->td_namelen = namelen; 894 } 895 896 /* 897 * Allocates a new directory entry for the node node with a name of name. 898 * The new directory entry is returned in *de. 899 * 900 * The link count of node is increased by one to reflect the new object 901 * referencing it. 902 * 903 * Returns zero on success or an appropriate error code on failure. 
 */
int
tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node,
    const char *name, u_int len, struct tmpfs_dirent **de)
{
	struct tmpfs_dirent *nde;

	nde = malloc(sizeof(*nde), M_TMPFSDIR, M_WAITOK);
	nde->td_node = node;
	if (name != NULL) {
		/* The name is stored without a terminating NUL. */
		nde->ud.td_name = malloc(len, M_TMPFSNAME, M_WAITOK);
		tmpfs_dirent_init(nde, name, len);
	} else
		nde->td_namelen = 0;
	/* A NULL node is accepted; no link count is touched then. */
	if (node != NULL)
		node->tn_links++;

	*de = nde;

	return (0);
}

/*
 * Frees a directory entry.  It is the caller's responsibility to destroy
 * the node referenced by it if needed.
 *
 * The link count of node is decreased by one to reflect the removal of an
 * object that referenced it.  This only happens if the entry references a
 * node (td_node is not NULL); otherwise no node is accessed.
 *
 * The name buffer is released unless the entry is a duplicate-list head
 * or has no name.
 */
void
tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de)
{
	struct tmpfs_node *node;

	node = de->td_node;
	if (node != NULL) {
		MPASS(node->tn_links > 0);
		node->tn_links--;
	}
	if (!tmpfs_dirent_duphead(de) && de->ud.td_name != NULL)
		free(de->ud.td_name, M_TMPFSNAME);
	free(de, M_TMPFSDIR);
}

/*
 * Detaches the VM object 'obj' from the regular-file vnode 'vp'.
 *
 * The vnode must be exclusively locked.  Nothing is done for non-VREG
 * vnodes or a NULL object.  Otherwise v_object is cleared, a negative
 * v_writecount is reset to zero and, if the object held a vnode reference
 * on behalf of writable mappings (OBJ_TMPFS_VREF), that flag is cleared and
 * the reference is released once the locks have been dropped.
 */
void
tmpfs_destroy_vobject(struct vnode *vp, vm_object_t obj)
{
	bool want_vrele;

	ASSERT_VOP_ELOCKED(vp, "tmpfs_destroy_vobject");
	if (vp->v_type != VREG || obj == NULL)
		return;

	VM_OBJECT_WLOCK(obj);
	VI_LOCK(vp);
	vp->v_object = NULL;

	/*
	 * May be going through forced unmount.
	 */
	want_vrele = false;
	if ((obj->flags & OBJ_TMPFS_VREF) != 0) {
		vm_object_clear_flag(obj, OBJ_TMPFS_VREF);
		want_vrele = true;
	}

	if (vp->v_writecount < 0)
		vp->v_writecount = 0;
	VI_UNLOCK(vp);
	VM_OBJECT_WUNLOCK(obj);
	/* vrele() is deferred until both locks above are released. */
	if (want_vrele) {
		vrele(vp);
	}
}

/*
 * Allocates a new vnode for the node node or returns a new reference to
 * an existing one if the node had already a vnode referencing it.  The
 * resulting locked vnode is returned in *vpp.
 *
 * A temporary node reference is taken for the duration of the call and
 * dropped again before returning.  ENOENT is returned for a doomed node,
 * for a directory without a parent, and for a doomed vnode when LK_NOWAIT
 * was requested.
 *
 * Returns zero on success or an appropriate error code on failure.
 */
int
tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, int lkflag,
    struct vnode **vpp)
{
	struct vnode *vp;
	enum vgetstate vs;
	struct tmpfs_mount *tm;
	vm_object_t object;
	int error;

	error = 0;
	tm = VFS_TO_TMPFS(mp);
	TMPFS_NODE_LOCK(node);
	tmpfs_ref_node(node);
loop:
	/* Every jump back to 'loop' arrives with the node lock held. */
	TMPFS_NODE_ASSERT_LOCKED(node);
	if ((vp = node->tn_vnode) != NULL) {
		MPASS((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0);
		if ((node->tn_type == VDIR && node->tn_dir.tn_parent == NULL) ||
		    (VN_IS_DOOMED(vp) &&
		     (lkflag & LK_NOWAIT) != 0)) {
			TMPFS_NODE_UNLOCK(node);
			error = ENOENT;
			vp = NULL;
			goto out;
		}
		if (VN_IS_DOOMED(vp)) {
			/*
			 * Wait for tmpfs_free_vp() to clear WRECLAIM and
			 * wake us, then start over.
			 */
			node->tn_vpstate |= TMPFS_VNODE_WRECLAIM;
			while ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) {
				msleep(&node->tn_vnode, TMPFS_NODE_MTX(node),
				    0, "tmpfsE", 0);
			}
			goto loop;
		}
		vs = vget_prep(vp);
		TMPFS_NODE_UNLOCK(node);
		error = vget_finish(vp, lkflag, vs);
		if (error == ENOENT) {
			TMPFS_NODE_LOCK(node);
			goto loop;
		}
		if (error != 0) {
			vp = NULL;
			goto out;
		}

		/*
		 * Make sure the vnode is still there after
		 * getting the interlock to avoid racing a free.
		 */
		if (node->tn_vnode != vp) {
			vput(vp);
			TMPFS_NODE_LOCK(node);
			goto loop;
		}

		goto out;
	}

	if ((node->tn_vpstate & TMPFS_VNODE_DOOMED) ||
	    (node->tn_type == VDIR && node->tn_dir.tn_parent == NULL)) {
		TMPFS_NODE_UNLOCK(node);
		error = ENOENT;
		vp = NULL;
		goto out;
	}

	/*
	 * otherwise lock the vp list while we call getnewvnode
	 * since that can block.
	 */
	if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) {
		node->tn_vpstate |= TMPFS_VNODE_WANT;
		error = msleep((caddr_t) &node->tn_vpstate,
		    TMPFS_NODE_MTX(node), 0, "tmpfs_alloc_vp", 0);
		/*
		 * NOTE(review): this jumps to 'out' without an explicit
		 * TMPFS_NODE_UNLOCK(); confirm msleep() cannot return an
		 * error here given that no PCATCH or timeout is passed.
		 */
		if (error != 0)
			goto out;
		goto loop;
	} else
		node->tn_vpstate |= TMPFS_VNODE_ALLOCATING;

	TMPFS_NODE_UNLOCK(node);

	/* Get a new vnode and associate it with our node. */
	error = getnewvnode("tmpfs", mp, VFS_TO_TMPFS(mp)->tm_nonc ?
	    &tmpfs_vnodeop_nonc_entries : &tmpfs_vnodeop_entries, &vp);
	if (error != 0)
		goto unlock;
	MPASS(vp != NULL);

	/* lkflag is ignored, the lock is exclusive */
	(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	vp->v_data = node;
	vp->v_type = node->tn_type;

	/* Type-specific initialization. */
	switch (node->tn_type) {
	case VBLK:
		/* FALLTHROUGH */
	case VCHR:
		/* FALLTHROUGH */
	case VLNK:
		/* FALLTHROUGH */
	case VSOCK:
		break;
	case VFIFO:
		vp->v_op = &tmpfs_fifoop_entries;
		break;
	case VREG:
		/* Attach the node's VM object to the fresh vnode. */
		object = node->tn_reg.tn_aobj;
		VM_OBJECT_WLOCK(object);
		KASSERT((object->flags & OBJ_TMPFS_VREF) == 0,
		    ("%s: object %p with OBJ_TMPFS_VREF but without vnode",
		    __func__, object));
		VI_LOCK(vp);
		KASSERT(vp->v_object == NULL, ("Not NULL v_object in tmpfs"));
		vp->v_object = object;
		vn_irflag_set_locked(vp, (tm->tm_pgread ? VIRF_PGREAD : 0) |
		    VIRF_TEXT_REF);
		VI_UNLOCK(vp);
		VNASSERT((object->flags & OBJ_TMPFS_VREF) == 0, vp,
		    ("leaked OBJ_TMPFS_VREF"));
		/*
		 * Writable mappings already exist on the object: take the
		 * vnode reference that goes with OBJ_TMPFS_VREF.
		 */
		if (object->un_pager.swp.writemappings > 0) {
			vrefact(vp);
			vlazy(vp);
			vm_object_set_flag(object, OBJ_TMPFS_VREF);
		}
		VM_OBJECT_WUNLOCK(object);
		break;
	case VDIR:
		MPASS(node->tn_dir.tn_parent != NULL);
		/* Only the root directory is its own parent. */
		if (node->tn_dir.tn_parent == node)
			vp->v_vflag |= VV_ROOT;
		break;

	default:
		panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type);
	}
	if (vp->v_type != VFIFO)
		VN_LOCK_ASHARE(vp);

	error = insmntque1(vp, mp);
	if (error != 0) {
		/* Need to clear v_object for insmntque failure. */
		tmpfs_destroy_vobject(vp, vp->v_object);
		vp->v_object = NULL;
		vp->v_data = NULL;
		vp->v_op = &dead_vnodeops;
		vgone(vp);
		vput(vp);
		vp = NULL;
	} else {
		vn_set_state(vp, VSTATE_CONSTRUCTED);
	}

unlock:
	TMPFS_NODE_LOCK(node);

	/* Record the result (NULL on failure) and end the ALLOCATING state. */
	MPASS(node->tn_vpstate & TMPFS_VNODE_ALLOCATING);
	node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING;
	node->tn_vnode = vp;

	/* Wake threads sleeping on tn_vpstate in the ALLOCATING branch. */
	if (node->tn_vpstate & TMPFS_VNODE_WANT) {
		node->tn_vpstate &= ~TMPFS_VNODE_WANT;
		TMPFS_NODE_UNLOCK(node);
		wakeup((caddr_t) &node->tn_vpstate);
	} else
		TMPFS_NODE_UNLOCK(node);

out:
	if (error == 0) {
		*vpp = vp;

#ifdef INVARIANTS
		MPASS(*vpp != NULL);
		ASSERT_VOP_LOCKED(*vpp, __func__);
		TMPFS_NODE_LOCK(node);
		MPASS(*vpp == node->tn_vnode);
		TMPFS_NODE_UNLOCK(node);
#endif
	}
	/* Drop the reference taken by tmpfs_ref_node() at entry. */
	tmpfs_free_node(tm, node);

	return (error);
}

/*
 * Destroys the association between the vnode vp and the node it
 * references.
1178 */ 1179 void 1180 tmpfs_free_vp(struct vnode *vp) 1181 { 1182 struct tmpfs_node *node; 1183 1184 node = VP_TO_TMPFS_NODE(vp); 1185 1186 TMPFS_NODE_ASSERT_LOCKED(node); 1187 node->tn_vnode = NULL; 1188 if ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) 1189 wakeup(&node->tn_vnode); 1190 node->tn_vpstate &= ~TMPFS_VNODE_WRECLAIM; 1191 vp->v_data = NULL; 1192 } 1193 1194 /* 1195 * Allocates a new file of type 'type' and adds it to the parent directory 1196 * 'dvp'; this addition is done using the component name given in 'cnp'. 1197 * The ownership of the new file is automatically assigned based on the 1198 * credentials of the caller (through 'cnp'), the group is set based on 1199 * the parent directory and the mode is determined from the 'vap' argument. 1200 * If successful, *vpp holds a vnode to the newly created file and zero 1201 * is returned. Otherwise *vpp is NULL and the function returns an 1202 * appropriate error code. 1203 */ 1204 int 1205 tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap, 1206 struct componentname *cnp, const char *target) 1207 { 1208 int error; 1209 struct tmpfs_dirent *de; 1210 struct tmpfs_mount *tmp; 1211 struct tmpfs_node *dnode; 1212 struct tmpfs_node *node; 1213 struct tmpfs_node *parent; 1214 1215 ASSERT_VOP_ELOCKED(dvp, "tmpfs_alloc_file"); 1216 1217 tmp = VFS_TO_TMPFS(dvp->v_mount); 1218 dnode = VP_TO_TMPFS_DIR(dvp); 1219 *vpp = NULL; 1220 1221 /* If the entry we are creating is a directory, we cannot overflow 1222 * the number of links of its parent, because it will get a new 1223 * link. */ 1224 if (vap->va_type == VDIR) { 1225 /* Ensure that we do not overflow the maximum number of links 1226 * imposed by the system. */ 1227 MPASS(dnode->tn_links <= TMPFS_LINK_MAX); 1228 if (dnode->tn_links == TMPFS_LINK_MAX) { 1229 return (EMLINK); 1230 } 1231 1232 parent = dnode; 1233 MPASS(parent != NULL); 1234 } else 1235 parent = NULL; 1236 1237 /* Allocate a node that represents the new file. 
*/ 1238 error = tmpfs_alloc_node(dvp->v_mount, tmp, vap->va_type, 1239 cnp->cn_cred->cr_uid, dnode->tn_gid, vap->va_mode, parent, 1240 target, vap->va_rdev, &node); 1241 if (error != 0) 1242 return (error); 1243 1244 /* Allocate a directory entry that points to the new file. */ 1245 error = tmpfs_alloc_dirent(tmp, node, cnp->cn_nameptr, cnp->cn_namelen, 1246 &de); 1247 if (error != 0) { 1248 tmpfs_free_node(tmp, node); 1249 return (error); 1250 } 1251 1252 /* Allocate a vnode for the new file. */ 1253 error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp); 1254 if (error != 0) { 1255 tmpfs_free_dirent(tmp, de); 1256 tmpfs_free_node(tmp, node); 1257 return (error); 1258 } 1259 1260 /* Now that all required items are allocated, we can proceed to 1261 * insert the new node into the directory, an operation that 1262 * cannot fail. */ 1263 if (cnp->cn_flags & ISWHITEOUT) 1264 tmpfs_dir_whiteout_remove(dvp, cnp); 1265 tmpfs_dir_attach(dvp, de); 1266 return (0); 1267 } 1268 1269 struct tmpfs_dirent * 1270 tmpfs_dir_first(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) 1271 { 1272 struct tmpfs_dirent *de; 1273 1274 de = RB_MIN(tmpfs_dir, &dnode->tn_dir.tn_dirhead); 1275 dc->tdc_tree = de; 1276 if (de != NULL && tmpfs_dirent_duphead(de)) 1277 de = LIST_FIRST(&de->ud.td_duphead); 1278 dc->tdc_current = de; 1279 1280 return (dc->tdc_current); 1281 } 1282 1283 struct tmpfs_dirent * 1284 tmpfs_dir_next(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) 1285 { 1286 struct tmpfs_dirent *de; 1287 1288 MPASS(dc->tdc_tree != NULL); 1289 if (tmpfs_dirent_dup(dc->tdc_current)) { 1290 dc->tdc_current = LIST_NEXT(dc->tdc_current, uh.td_dup.entries); 1291 if (dc->tdc_current != NULL) 1292 return (dc->tdc_current); 1293 } 1294 dc->tdc_tree = dc->tdc_current = RB_NEXT(tmpfs_dir, 1295 &dnode->tn_dir.tn_dirhead, dc->tdc_tree); 1296 if ((de = dc->tdc_current) != NULL && tmpfs_dirent_duphead(de)) { 1297 dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); 1298 
MPASS(dc->tdc_current != NULL); 1299 } 1300 1301 return (dc->tdc_current); 1302 } 1303 1304 /* Lookup directory entry in RB-Tree. Function may return duphead entry. */ 1305 static struct tmpfs_dirent * 1306 tmpfs_dir_xlookup_hash(struct tmpfs_node *dnode, uint32_t hash) 1307 { 1308 struct tmpfs_dirent *de, dekey; 1309 1310 dekey.td_hash = hash; 1311 de = RB_FIND(tmpfs_dir, &dnode->tn_dir.tn_dirhead, &dekey); 1312 return (de); 1313 } 1314 1315 /* Lookup directory entry by cookie, initialize directory cursor accordingly. */ 1316 static struct tmpfs_dirent * 1317 tmpfs_dir_lookup_cookie(struct tmpfs_node *node, off_t cookie, 1318 struct tmpfs_dir_cursor *dc) 1319 { 1320 struct tmpfs_dir *dirhead = &node->tn_dir.tn_dirhead; 1321 struct tmpfs_dirent *de, dekey; 1322 1323 MPASS(cookie >= TMPFS_DIRCOOKIE_MIN); 1324 1325 if (cookie == node->tn_dir.tn_readdir_lastn && 1326 (de = node->tn_dir.tn_readdir_lastp) != NULL) { 1327 /* Protect against possible race, tn_readdir_last[pn] 1328 * may be updated with only shared vnode lock held. */ 1329 if (cookie == tmpfs_dirent_cookie(de)) 1330 goto out; 1331 } 1332 1333 if ((cookie & TMPFS_DIRCOOKIE_DUP) != 0) { 1334 LIST_FOREACH(de, &node->tn_dir.tn_dupindex, 1335 uh.td_dup.index_entries) { 1336 MPASS(tmpfs_dirent_dup(de)); 1337 if (de->td_cookie == cookie) 1338 goto out; 1339 /* dupindex list is sorted. 
*/ 1340 if (de->td_cookie < cookie) { 1341 de = NULL; 1342 goto out; 1343 } 1344 } 1345 MPASS(de == NULL); 1346 goto out; 1347 } 1348 1349 if ((cookie & TMPFS_DIRCOOKIE_MASK) != cookie) { 1350 de = NULL; 1351 } else { 1352 dekey.td_hash = cookie; 1353 /* Recover if direntry for cookie was removed */ 1354 de = RB_NFIND(tmpfs_dir, dirhead, &dekey); 1355 } 1356 dc->tdc_tree = de; 1357 dc->tdc_current = de; 1358 if (de != NULL && tmpfs_dirent_duphead(de)) { 1359 dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); 1360 MPASS(dc->tdc_current != NULL); 1361 } 1362 return (dc->tdc_current); 1363 1364 out: 1365 dc->tdc_tree = de; 1366 dc->tdc_current = de; 1367 if (de != NULL && tmpfs_dirent_dup(de)) 1368 dc->tdc_tree = tmpfs_dir_xlookup_hash(node, 1369 de->td_hash); 1370 return (dc->tdc_current); 1371 } 1372 1373 /* 1374 * Looks for a directory entry in the directory represented by node. 1375 * 'cnp' describes the name of the entry to look for. Note that the . 1376 * and .. components are not allowed as they do not physically exist 1377 * within directories. 1378 * 1379 * Returns a pointer to the entry when found, otherwise NULL. 1380 */ 1381 struct tmpfs_dirent * 1382 tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, 1383 struct componentname *cnp) 1384 { 1385 struct tmpfs_dir_duphead *duphead; 1386 struct tmpfs_dirent *de; 1387 uint32_t hash; 1388 1389 MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.')); 1390 MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' 
&& 1391 cnp->cn_nameptr[1] == '.'))); 1392 TMPFS_VALIDATE_DIR(node); 1393 1394 hash = tmpfs_dirent_hash(cnp->cn_nameptr, cnp->cn_namelen); 1395 de = tmpfs_dir_xlookup_hash(node, hash); 1396 if (de != NULL && tmpfs_dirent_duphead(de)) { 1397 duphead = &de->ud.td_duphead; 1398 LIST_FOREACH(de, duphead, uh.td_dup.entries) { 1399 if (TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, 1400 cnp->cn_namelen)) 1401 break; 1402 } 1403 } else if (de != NULL) { 1404 if (!TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, 1405 cnp->cn_namelen)) 1406 de = NULL; 1407 } 1408 if (de != NULL && f != NULL && de->td_node != f) 1409 de = NULL; 1410 1411 return (de); 1412 } 1413 1414 /* 1415 * Attach duplicate-cookie directory entry nde to dnode and insert to dupindex 1416 * list, allocate new cookie value. 1417 */ 1418 static void 1419 tmpfs_dir_attach_dup(struct tmpfs_node *dnode, 1420 struct tmpfs_dir_duphead *duphead, struct tmpfs_dirent *nde) 1421 { 1422 struct tmpfs_dir_duphead *dupindex; 1423 struct tmpfs_dirent *de, *pde; 1424 1425 dupindex = &dnode->tn_dir.tn_dupindex; 1426 de = LIST_FIRST(dupindex); 1427 if (de == NULL || de->td_cookie < TMPFS_DIRCOOKIE_DUP_MAX) { 1428 if (de == NULL) 1429 nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; 1430 else 1431 nde->td_cookie = de->td_cookie + 1; 1432 MPASS(tmpfs_dirent_dup(nde)); 1433 LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries); 1434 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1435 return; 1436 } 1437 1438 /* 1439 * Cookie numbers are near exhaustion. Scan dupindex list for unused 1440 * numbers. dupindex list is sorted in descending order. Keep it so 1441 * after inserting nde. 1442 */ 1443 while (1) { 1444 pde = de; 1445 de = LIST_NEXT(de, uh.td_dup.index_entries); 1446 if (de == NULL && pde->td_cookie != TMPFS_DIRCOOKIE_DUP_MIN) { 1447 /* 1448 * Last element of the index doesn't have minimal cookie 1449 * value, use it. 
1450 */ 1451 nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; 1452 LIST_INSERT_AFTER(pde, nde, uh.td_dup.index_entries); 1453 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1454 return; 1455 } else if (de == NULL) { 1456 /* 1457 * We are so lucky have 2^30 hash duplicates in single 1458 * directory :) Return largest possible cookie value. 1459 * It should be fine except possible issues with 1460 * VOP_READDIR restart. 1461 */ 1462 nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MAX; 1463 LIST_INSERT_HEAD(dupindex, nde, 1464 uh.td_dup.index_entries); 1465 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1466 return; 1467 } 1468 if (de->td_cookie + 1 == pde->td_cookie || 1469 de->td_cookie >= TMPFS_DIRCOOKIE_DUP_MAX) 1470 continue; /* No hole or invalid cookie. */ 1471 nde->td_cookie = de->td_cookie + 1; 1472 MPASS(tmpfs_dirent_dup(nde)); 1473 MPASS(pde->td_cookie > nde->td_cookie); 1474 MPASS(nde->td_cookie > de->td_cookie); 1475 LIST_INSERT_BEFORE(de, nde, uh.td_dup.index_entries); 1476 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1477 return; 1478 } 1479 } 1480 1481 /* 1482 * Attaches the directory entry de to the directory represented by vp. 1483 * Note that this does not change the link count of the node pointed by 1484 * the directory entry, as this is done by tmpfs_alloc_dirent. 
1485 */ 1486 void 1487 tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de) 1488 { 1489 struct tmpfs_node *dnode; 1490 struct tmpfs_dirent *xde, *nde; 1491 1492 ASSERT_VOP_ELOCKED(vp, __func__); 1493 MPASS(de->td_namelen > 0); 1494 MPASS(de->td_hash >= TMPFS_DIRCOOKIE_MIN); 1495 MPASS(de->td_cookie == de->td_hash); 1496 1497 dnode = VP_TO_TMPFS_DIR(vp); 1498 dnode->tn_dir.tn_readdir_lastn = 0; 1499 dnode->tn_dir.tn_readdir_lastp = NULL; 1500 1501 MPASS(!tmpfs_dirent_dup(de)); 1502 xde = RB_INSERT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); 1503 if (xde != NULL && tmpfs_dirent_duphead(xde)) 1504 tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); 1505 else if (xde != NULL) { 1506 /* 1507 * Allocate new duphead. Swap xde with duphead to avoid 1508 * adding/removing elements with the same hash. 1509 */ 1510 MPASS(!tmpfs_dirent_dup(xde)); 1511 tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), NULL, NULL, 0, 1512 &nde); 1513 /* *nde = *xde; XXX gcc 4.2.1 may generate invalid code. */ 1514 memcpy(nde, xde, sizeof(*xde)); 1515 xde->td_cookie |= TMPFS_DIRCOOKIE_DUPHEAD; 1516 LIST_INIT(&xde->ud.td_duphead); 1517 xde->td_namelen = 0; 1518 xde->td_node = NULL; 1519 tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, nde); 1520 tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); 1521 } 1522 dnode->tn_size += sizeof(struct tmpfs_dirent); 1523 dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; 1524 dnode->tn_accessed = true; 1525 tmpfs_update(vp); 1526 } 1527 1528 /* 1529 * Detaches the directory entry de from the directory represented by vp. 1530 * Note that this does not change the link count of the node pointed by 1531 * the directory entry, as this is done by tmpfs_free_dirent. 
1532 */ 1533 void 1534 tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de) 1535 { 1536 struct tmpfs_mount *tmp; 1537 struct tmpfs_dir *head; 1538 struct tmpfs_node *dnode; 1539 struct tmpfs_dirent *xde; 1540 1541 ASSERT_VOP_ELOCKED(vp, __func__); 1542 1543 dnode = VP_TO_TMPFS_DIR(vp); 1544 head = &dnode->tn_dir.tn_dirhead; 1545 dnode->tn_dir.tn_readdir_lastn = 0; 1546 dnode->tn_dir.tn_readdir_lastp = NULL; 1547 1548 if (tmpfs_dirent_dup(de)) { 1549 /* Remove duphead if de was last entry. */ 1550 if (LIST_NEXT(de, uh.td_dup.entries) == NULL) { 1551 xde = tmpfs_dir_xlookup_hash(dnode, de->td_hash); 1552 MPASS(tmpfs_dirent_duphead(xde)); 1553 } else 1554 xde = NULL; 1555 LIST_REMOVE(de, uh.td_dup.entries); 1556 LIST_REMOVE(de, uh.td_dup.index_entries); 1557 if (xde != NULL) { 1558 if (LIST_EMPTY(&xde->ud.td_duphead)) { 1559 RB_REMOVE(tmpfs_dir, head, xde); 1560 tmp = VFS_TO_TMPFS(vp->v_mount); 1561 MPASS(xde->td_node == NULL); 1562 tmpfs_free_dirent(tmp, xde); 1563 } 1564 } 1565 de->td_cookie = de->td_hash; 1566 } else 1567 RB_REMOVE(tmpfs_dir, head, de); 1568 1569 dnode->tn_size -= sizeof(struct tmpfs_dirent); 1570 dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; 1571 dnode->tn_accessed = true; 1572 tmpfs_update(vp); 1573 } 1574 1575 void 1576 tmpfs_dir_destroy(struct tmpfs_mount *tmp, struct tmpfs_node *dnode) 1577 { 1578 struct tmpfs_dirent *de, *dde, *nde; 1579 1580 RB_FOREACH_SAFE(de, tmpfs_dir, &dnode->tn_dir.tn_dirhead, nde) { 1581 RB_REMOVE(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); 1582 /* Node may already be destroyed. */ 1583 de->td_node = NULL; 1584 if (tmpfs_dirent_duphead(de)) { 1585 while ((dde = LIST_FIRST(&de->ud.td_duphead)) != NULL) { 1586 LIST_REMOVE(dde, uh.td_dup.entries); 1587 dde->td_node = NULL; 1588 tmpfs_free_dirent(tmp, dde); 1589 } 1590 } 1591 tmpfs_free_dirent(tmp, de); 1592 } 1593 } 1594 1595 /* 1596 * Helper function for tmpfs_readdir. Creates a '.' 
entry for the given 1597 * directory and returns it in the uio space. The function returns 0 1598 * on success, -1 if there was not enough space in the uio structure to 1599 * hold the directory entry or an appropriate error code if another 1600 * error happens. 1601 */ 1602 static int 1603 tmpfs_dir_getdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node, 1604 struct uio *uio) 1605 { 1606 int error; 1607 struct dirent dent; 1608 1609 TMPFS_VALIDATE_DIR(node); 1610 MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOT); 1611 1612 dent.d_fileno = node->tn_id; 1613 dent.d_off = TMPFS_DIRCOOKIE_DOTDOT; 1614 dent.d_type = DT_DIR; 1615 dent.d_namlen = 1; 1616 dent.d_name[0] = '.'; 1617 dent.d_reclen = GENERIC_DIRSIZ(&dent); 1618 dirent_terminate(&dent); 1619 1620 if (dent.d_reclen > uio->uio_resid) 1621 error = EJUSTRETURN; 1622 else 1623 error = uiomove(&dent, dent.d_reclen, uio); 1624 1625 tmpfs_set_accessed(tm, node); 1626 1627 return (error); 1628 } 1629 1630 /* 1631 * Helper function for tmpfs_readdir. Creates a '..' entry for the given 1632 * directory and returns it in the uio space. The function returns 0 1633 * on success, -1 if there was not enough space in the uio structure to 1634 * hold the directory entry or an appropriate error code if another 1635 * error happens. 1636 */ 1637 static int 1638 tmpfs_dir_getdotdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node, 1639 struct uio *uio, off_t next) 1640 { 1641 struct tmpfs_node *parent; 1642 struct dirent dent; 1643 int error; 1644 1645 TMPFS_VALIDATE_DIR(node); 1646 MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT); 1647 1648 /* 1649 * Return ENOENT if the current node is already removed. 
1650 */ 1651 TMPFS_ASSERT_LOCKED(node); 1652 parent = node->tn_dir.tn_parent; 1653 if (parent == NULL) 1654 return (ENOENT); 1655 1656 dent.d_fileno = parent->tn_id; 1657 dent.d_off = next; 1658 dent.d_type = DT_DIR; 1659 dent.d_namlen = 2; 1660 dent.d_name[0] = '.'; 1661 dent.d_name[1] = '.'; 1662 dent.d_reclen = GENERIC_DIRSIZ(&dent); 1663 dirent_terminate(&dent); 1664 1665 if (dent.d_reclen > uio->uio_resid) 1666 error = EJUSTRETURN; 1667 else 1668 error = uiomove(&dent, dent.d_reclen, uio); 1669 1670 tmpfs_set_accessed(tm, node); 1671 1672 return (error); 1673 } 1674 1675 /* 1676 * Helper function for tmpfs_readdir. Returns as much directory entries 1677 * as can fit in the uio space. The read starts at uio->uio_offset. 1678 * The function returns 0 on success, -1 if there was not enough space 1679 * in the uio structure to hold the directory entry or an appropriate 1680 * error code if another error happens. 1681 */ 1682 int 1683 tmpfs_dir_getdents(struct tmpfs_mount *tm, struct tmpfs_node *node, 1684 struct uio *uio, int maxcookies, uint64_t *cookies, int *ncookies) 1685 { 1686 struct tmpfs_dir_cursor dc; 1687 struct tmpfs_dirent *de, *nde; 1688 off_t off; 1689 int error; 1690 1691 TMPFS_VALIDATE_DIR(node); 1692 1693 off = 0; 1694 1695 /* 1696 * Lookup the node from the current offset. The starting offset of 1697 * 0 will lookup both '.' and '..', and then the first real entry, 1698 * or EOF if there are none. Then find all entries for the dir that 1699 * fit into the buffer. Once no more entries are found (de == NULL), 1700 * the offset is set to TMPFS_DIRCOOKIE_EOF, which will cause the next 1701 * call to return 0. 
1702 */ 1703 switch (uio->uio_offset) { 1704 case TMPFS_DIRCOOKIE_DOT: 1705 error = tmpfs_dir_getdotdent(tm, node, uio); 1706 if (error != 0) 1707 return (error); 1708 uio->uio_offset = off = TMPFS_DIRCOOKIE_DOTDOT; 1709 if (cookies != NULL) 1710 cookies[(*ncookies)++] = off; 1711 /* FALLTHROUGH */ 1712 case TMPFS_DIRCOOKIE_DOTDOT: 1713 de = tmpfs_dir_first(node, &dc); 1714 off = tmpfs_dirent_cookie(de); 1715 error = tmpfs_dir_getdotdotdent(tm, node, uio, off); 1716 if (error != 0) 1717 return (error); 1718 uio->uio_offset = off; 1719 if (cookies != NULL) 1720 cookies[(*ncookies)++] = off; 1721 /* EOF. */ 1722 if (de == NULL) 1723 return (0); 1724 break; 1725 case TMPFS_DIRCOOKIE_EOF: 1726 return (0); 1727 default: 1728 de = tmpfs_dir_lookup_cookie(node, uio->uio_offset, &dc); 1729 if (de == NULL) 1730 return (EINVAL); 1731 if (cookies != NULL) 1732 off = tmpfs_dirent_cookie(de); 1733 } 1734 1735 /* 1736 * Read as much entries as possible; i.e., until we reach the end of the 1737 * directory or we exhaust uio space. 1738 */ 1739 do { 1740 struct dirent d; 1741 1742 /* 1743 * Create a dirent structure representing the current tmpfs_node 1744 * and fill it. 
1745 */ 1746 if (de->td_node == NULL) { 1747 d.d_fileno = 1; 1748 d.d_type = DT_WHT; 1749 } else { 1750 d.d_fileno = de->td_node->tn_id; 1751 switch (de->td_node->tn_type) { 1752 case VBLK: 1753 d.d_type = DT_BLK; 1754 break; 1755 1756 case VCHR: 1757 d.d_type = DT_CHR; 1758 break; 1759 1760 case VDIR: 1761 d.d_type = DT_DIR; 1762 break; 1763 1764 case VFIFO: 1765 d.d_type = DT_FIFO; 1766 break; 1767 1768 case VLNK: 1769 d.d_type = DT_LNK; 1770 break; 1771 1772 case VREG: 1773 d.d_type = DT_REG; 1774 break; 1775 1776 case VSOCK: 1777 d.d_type = DT_SOCK; 1778 break; 1779 1780 default: 1781 panic("tmpfs_dir_getdents: type %p %d", 1782 de->td_node, (int)de->td_node->tn_type); 1783 } 1784 } 1785 d.d_namlen = de->td_namelen; 1786 MPASS(de->td_namelen < sizeof(d.d_name)); 1787 (void)memcpy(d.d_name, de->ud.td_name, de->td_namelen); 1788 d.d_reclen = GENERIC_DIRSIZ(&d); 1789 1790 /* 1791 * Stop reading if the directory entry we are treating is bigger 1792 * than the amount of data that can be returned. 1793 */ 1794 if (d.d_reclen > uio->uio_resid) { 1795 error = EJUSTRETURN; 1796 break; 1797 } 1798 1799 nde = tmpfs_dir_next(node, &dc); 1800 d.d_off = tmpfs_dirent_cookie(nde); 1801 dirent_terminate(&d); 1802 1803 /* 1804 * Copy the new dirent structure into the output buffer and 1805 * advance pointers. 1806 */ 1807 error = uiomove(&d, d.d_reclen, uio); 1808 if (error == 0) { 1809 de = nde; 1810 if (cookies != NULL) { 1811 off = tmpfs_dirent_cookie(de); 1812 MPASS(*ncookies < maxcookies); 1813 cookies[(*ncookies)++] = off; 1814 } 1815 } 1816 } while (error == 0 && uio->uio_resid > 0 && de != NULL); 1817 1818 /* Skip setting off when using cookies as it is already done above. */ 1819 if (cookies == NULL) 1820 off = tmpfs_dirent_cookie(de); 1821 1822 /* Update the offset and cache. 
*/ 1823 uio->uio_offset = off; 1824 node->tn_dir.tn_readdir_lastn = off; 1825 node->tn_dir.tn_readdir_lastp = de; 1826 1827 tmpfs_set_accessed(tm, node); 1828 return (error); 1829 } 1830 1831 int 1832 tmpfs_dir_whiteout_add(struct vnode *dvp, struct componentname *cnp) 1833 { 1834 struct tmpfs_dirent *de; 1835 struct tmpfs_node *dnode; 1836 int error; 1837 1838 error = tmpfs_alloc_dirent(VFS_TO_TMPFS(dvp->v_mount), NULL, 1839 cnp->cn_nameptr, cnp->cn_namelen, &de); 1840 if (error != 0) 1841 return (error); 1842 dnode = VP_TO_TMPFS_DIR(dvp); 1843 tmpfs_dir_attach(dvp, de); 1844 dnode->tn_dir.tn_wht_size += sizeof(*de); 1845 return (0); 1846 } 1847 1848 void 1849 tmpfs_dir_whiteout_remove(struct vnode *dvp, struct componentname *cnp) 1850 { 1851 struct tmpfs_dirent *de; 1852 struct tmpfs_node *dnode; 1853 1854 dnode = VP_TO_TMPFS_DIR(dvp); 1855 de = tmpfs_dir_lookup(dnode, NULL, cnp); 1856 MPASS(de != NULL && de->td_node == NULL); 1857 MPASS(dnode->tn_dir.tn_wht_size >= sizeof(*de)); 1858 dnode->tn_dir.tn_wht_size -= sizeof(*de); 1859 tmpfs_dir_detach(dvp, de); 1860 tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de); 1861 } 1862 1863 /* 1864 * Frees any dirents still associated with the directory represented 1865 * by dvp in preparation for the removal of the directory. This is 1866 * required when removing a directory which contains only whiteout 1867 * entries. 
1868 */ 1869 void 1870 tmpfs_dir_clear_whiteouts(struct vnode *dvp) 1871 { 1872 struct tmpfs_dir_cursor dc; 1873 struct tmpfs_dirent *de; 1874 struct tmpfs_node *dnode; 1875 1876 dnode = VP_TO_TMPFS_DIR(dvp); 1877 1878 while ((de = tmpfs_dir_first(dnode, &dc)) != NULL) { 1879 KASSERT(de->td_node == NULL, ("%s: non-whiteout dirent %p", 1880 __func__, de)); 1881 dnode->tn_dir.tn_wht_size -= sizeof(*de); 1882 tmpfs_dir_detach(dvp, de); 1883 tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de); 1884 } 1885 MPASS(dnode->tn_size == 0); 1886 MPASS(dnode->tn_dir.tn_wht_size == 0); 1887 } 1888 1889 /* 1890 * Resizes the aobj associated with the regular file pointed to by 'vp' to the 1891 * size 'newsize'. 'vp' must point to a vnode that represents a regular file. 1892 * 'newsize' must be positive. 1893 * 1894 * Returns zero on success or an appropriate error code on failure. 1895 */ 1896 int 1897 tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr) 1898 { 1899 struct tmpfs_node *node; 1900 vm_object_t uobj; 1901 vm_pindex_t idx, newpages, oldpages; 1902 off_t oldsize; 1903 int base, error; 1904 1905 MPASS(vp->v_type == VREG); 1906 MPASS(newsize >= 0); 1907 1908 node = VP_TO_TMPFS_NODE(vp); 1909 uobj = node->tn_reg.tn_aobj; 1910 1911 /* 1912 * Convert the old and new sizes to the number of pages needed to 1913 * store them. It may happen that we do not need to do anything 1914 * because the last allocated page can accommodate the change on 1915 * its own. 1916 */ 1917 oldsize = node->tn_size; 1918 oldpages = OFF_TO_IDX(oldsize + PAGE_MASK); 1919 MPASS(oldpages == uobj->size); 1920 newpages = OFF_TO_IDX(newsize + PAGE_MASK); 1921 1922 if (__predict_true(newpages == oldpages && newsize >= oldsize)) { 1923 node->tn_size = newsize; 1924 return (0); 1925 } 1926 1927 VM_OBJECT_WLOCK(uobj); 1928 if (newsize < oldsize) { 1929 /* 1930 * Zero the truncated part of the last page. 
1931 */ 1932 base = newsize & PAGE_MASK; 1933 if (base != 0) { 1934 idx = OFF_TO_IDX(newsize); 1935 error = tmpfs_partial_page_invalidate(uobj, idx, base, 1936 PAGE_SIZE, ignerr); 1937 if (error != 0) { 1938 VM_OBJECT_WUNLOCK(uobj); 1939 return (error); 1940 } 1941 } 1942 1943 /* 1944 * Release any swap space and free any whole pages. 1945 */ 1946 if (newpages < oldpages) 1947 vm_object_page_remove(uobj, newpages, 0, 0); 1948 } 1949 uobj->size = newpages; 1950 VM_OBJECT_WUNLOCK(uobj); 1951 1952 node->tn_size = newsize; 1953 return (0); 1954 } 1955 1956 /* 1957 * Punch hole in the aobj associated with the regular file pointed to by 'vp'. 1958 * Requests completely beyond the end-of-file are converted to no-op. 1959 * 1960 * Returns 0 on success or error code from tmpfs_partial_page_invalidate() on 1961 * failure. 1962 */ 1963 int 1964 tmpfs_reg_punch_hole(struct vnode *vp, off_t *offset, off_t *length) 1965 { 1966 struct tmpfs_node *node; 1967 vm_object_t object; 1968 vm_pindex_t pistart, pi, piend; 1969 int startofs, endofs, end; 1970 off_t off, len; 1971 int error; 1972 1973 KASSERT(*length <= OFF_MAX - *offset, ("%s: offset + length overflows", 1974 __func__)); 1975 node = VP_TO_TMPFS_NODE(vp); 1976 KASSERT(node->tn_type == VREG, ("%s: node is not regular file", 1977 __func__)); 1978 object = node->tn_reg.tn_aobj; 1979 off = *offset; 1980 len = omin(node->tn_size - off, *length); 1981 startofs = off & PAGE_MASK; 1982 endofs = (off + len) & PAGE_MASK; 1983 pistart = OFF_TO_IDX(off); 1984 piend = OFF_TO_IDX(off + len); 1985 pi = OFF_TO_IDX((vm_ooffset_t)off + PAGE_MASK); 1986 error = 0; 1987 1988 /* Handle the case when offset is on or beyond file size. */ 1989 if (len <= 0) { 1990 *length = 0; 1991 return (0); 1992 } 1993 1994 VM_OBJECT_WLOCK(object); 1995 1996 /* 1997 * If there is a partial page at the beginning of the hole-punching 1998 * request, fill the partial page with zeroes. 1999 */ 2000 if (startofs != 0) { 2001 end = pistart != piend ? 
PAGE_SIZE : endofs; 2002 error = tmpfs_partial_page_invalidate(object, pistart, startofs, 2003 end, FALSE); 2004 if (error != 0) 2005 goto out; 2006 off += end - startofs; 2007 len -= end - startofs; 2008 } 2009 2010 /* 2011 * Toss away the full pages in the affected area. 2012 */ 2013 if (pi < piend) { 2014 vm_object_page_remove(object, pi, piend, 0); 2015 off += IDX_TO_OFF(piend - pi); 2016 len -= IDX_TO_OFF(piend - pi); 2017 } 2018 2019 /* 2020 * If there is a partial page at the end of the hole-punching request, 2021 * fill the partial page with zeroes. 2022 */ 2023 if (endofs != 0 && pistart != piend) { 2024 error = tmpfs_partial_page_invalidate(object, piend, 0, endofs, 2025 FALSE); 2026 if (error != 0) 2027 goto out; 2028 off += endofs; 2029 len -= endofs; 2030 } 2031 2032 out: 2033 VM_OBJECT_WUNLOCK(object); 2034 *offset = off; 2035 *length = len; 2036 return (error); 2037 } 2038 2039 void 2040 tmpfs_check_mtime(struct vnode *vp) 2041 { 2042 struct tmpfs_node *node; 2043 struct vm_object *obj; 2044 2045 ASSERT_VOP_ELOCKED(vp, "check_mtime"); 2046 if (vp->v_type != VREG) 2047 return; 2048 obj = vp->v_object; 2049 KASSERT(obj->type == tmpfs_pager_type && 2050 (obj->flags & (OBJ_SWAP | OBJ_TMPFS)) == 2051 (OBJ_SWAP | OBJ_TMPFS), ("non-tmpfs obj")); 2052 /* unlocked read */ 2053 if (obj->generation != obj->cleangeneration) { 2054 VM_OBJECT_WLOCK(obj); 2055 if (obj->generation != obj->cleangeneration) { 2056 obj->cleangeneration = obj->generation; 2057 node = VP_TO_TMPFS_NODE(vp); 2058 node->tn_status |= TMPFS_NODE_MODIFIED | 2059 TMPFS_NODE_CHANGED; 2060 } 2061 VM_OBJECT_WUNLOCK(obj); 2062 } 2063 } 2064 2065 /* 2066 * Change flags of the given vnode. 2067 * Caller should execute tmpfs_update on vp after a successful execution. 2068 * The vnode must be locked on entry and remain locked on exit. 
2069 */ 2070 int 2071 tmpfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred, 2072 struct thread *td) 2073 { 2074 int error; 2075 struct tmpfs_node *node; 2076 2077 ASSERT_VOP_ELOCKED(vp, "chflags"); 2078 2079 node = VP_TO_TMPFS_NODE(vp); 2080 2081 if ((flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK | 2082 UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | 2083 UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE | 2084 UF_SPARSE | UF_SYSTEM)) != 0) 2085 return (EOPNOTSUPP); 2086 2087 /* Disallow this operation if the file system is mounted read-only. */ 2088 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2089 return (EROFS); 2090 2091 /* 2092 * Callers may only modify the file flags on objects they 2093 * have VADMIN rights for. 2094 */ 2095 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2096 return (error); 2097 /* 2098 * Unprivileged processes are not permitted to unset system 2099 * flags, or modify flags if any system flags are set. 2100 */ 2101 if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) { 2102 if (node->tn_flags & 2103 (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { 2104 error = securelevel_gt(cred, 0); 2105 if (error) 2106 return (error); 2107 } 2108 } else { 2109 if (node->tn_flags & 2110 (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || 2111 ((flags ^ node->tn_flags) & SF_SETTABLE)) 2112 return (EPERM); 2113 } 2114 node->tn_flags = flags; 2115 node->tn_status |= TMPFS_NODE_CHANGED; 2116 2117 ASSERT_VOP_ELOCKED(vp, "chflags2"); 2118 2119 return (0); 2120 } 2121 2122 /* 2123 * Change access mode on the given vnode. 2124 * Caller should execute tmpfs_update on vp after a successful execution. 2125 * The vnode must be locked on entry and remain locked on exit. 
2126 */ 2127 int 2128 tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, 2129 struct thread *td) 2130 { 2131 int error; 2132 struct tmpfs_node *node; 2133 mode_t newmode; 2134 2135 ASSERT_VOP_ELOCKED(vp, "chmod"); 2136 ASSERT_VOP_IN_SEQC(vp); 2137 2138 node = VP_TO_TMPFS_NODE(vp); 2139 2140 /* Disallow this operation if the file system is mounted read-only. */ 2141 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2142 return (EROFS); 2143 2144 /* Immutable or append-only files cannot be modified, either. */ 2145 if (node->tn_flags & (IMMUTABLE | APPEND)) 2146 return (EPERM); 2147 2148 /* 2149 * To modify the permissions on a file, must possess VADMIN 2150 * for that file. 2151 */ 2152 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2153 return (error); 2154 2155 /* 2156 * Privileged processes may set the sticky bit on non-directories, 2157 * as well as set the setgid bit on a file with a group that the 2158 * process is not a member of. 2159 */ 2160 if (vp->v_type != VDIR && (mode & S_ISTXT)) { 2161 if (priv_check_cred(cred, PRIV_VFS_STICKYFILE)) 2162 return (EFTYPE); 2163 } 2164 if (!groupmember(node->tn_gid, cred) && (mode & S_ISGID)) { 2165 error = priv_check_cred(cred, PRIV_VFS_SETGID); 2166 if (error) 2167 return (error); 2168 } 2169 2170 newmode = node->tn_mode & ~ALLPERMS; 2171 newmode |= mode & ALLPERMS; 2172 atomic_store_short(&node->tn_mode, newmode); 2173 2174 node->tn_status |= TMPFS_NODE_CHANGED; 2175 2176 ASSERT_VOP_ELOCKED(vp, "chmod2"); 2177 2178 return (0); 2179 } 2180 2181 /* 2182 * Change ownership of the given vnode. At least one of uid or gid must 2183 * be different than VNOVAL. If one is set to that value, the attribute 2184 * is unchanged. 2185 * Caller should execute tmpfs_update on vp after a successful execution. 2186 * The vnode must be locked on entry and remain locked on exit. 
2187 */ 2188 int 2189 tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, 2190 struct thread *td) 2191 { 2192 int error; 2193 struct tmpfs_node *node; 2194 uid_t ouid; 2195 gid_t ogid; 2196 mode_t newmode; 2197 2198 ASSERT_VOP_ELOCKED(vp, "chown"); 2199 ASSERT_VOP_IN_SEQC(vp); 2200 2201 node = VP_TO_TMPFS_NODE(vp); 2202 2203 /* Assign default values if they are unknown. */ 2204 MPASS(uid != VNOVAL || gid != VNOVAL); 2205 if (uid == VNOVAL) 2206 uid = node->tn_uid; 2207 if (gid == VNOVAL) 2208 gid = node->tn_gid; 2209 MPASS(uid != VNOVAL && gid != VNOVAL); 2210 2211 /* Disallow this operation if the file system is mounted read-only. */ 2212 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2213 return (EROFS); 2214 2215 /* Immutable or append-only files cannot be modified, either. */ 2216 if (node->tn_flags & (IMMUTABLE | APPEND)) 2217 return (EPERM); 2218 2219 /* 2220 * To modify the ownership of a file, must possess VADMIN for that 2221 * file. 2222 */ 2223 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2224 return (error); 2225 2226 /* 2227 * To change the owner of a file, or change the group of a file to a 2228 * group of which we are not a member, the caller must have 2229 * privilege. 2230 */ 2231 if ((uid != node->tn_uid || 2232 (gid != node->tn_gid && !groupmember(gid, cred))) && 2233 (error = priv_check_cred(cred, PRIV_VFS_CHOWN))) 2234 return (error); 2235 2236 ogid = node->tn_gid; 2237 ouid = node->tn_uid; 2238 2239 node->tn_uid = uid; 2240 node->tn_gid = gid; 2241 2242 node->tn_status |= TMPFS_NODE_CHANGED; 2243 2244 if ((node->tn_mode & (S_ISUID | S_ISGID)) != 0 && 2245 (ouid != uid || ogid != gid)) { 2246 if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) { 2247 newmode = node->tn_mode & ~(S_ISUID | S_ISGID); 2248 atomic_store_short(&node->tn_mode, newmode); 2249 } 2250 } 2251 2252 ASSERT_VOP_ELOCKED(vp, "chown2"); 2253 2254 return (0); 2255 } 2256 2257 /* 2258 * Change size of the given vnode. 
2259 * Caller should execute tmpfs_update on vp after a successful execution. 2260 * The vnode must be locked on entry and remain locked on exit. 2261 */ 2262 int 2263 tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred, 2264 struct thread *td) 2265 { 2266 int error; 2267 struct tmpfs_node *node; 2268 2269 ASSERT_VOP_ELOCKED(vp, "chsize"); 2270 2271 node = VP_TO_TMPFS_NODE(vp); 2272 2273 /* Decide whether this is a valid operation based on the file type. */ 2274 error = 0; 2275 switch (vp->v_type) { 2276 case VDIR: 2277 return (EISDIR); 2278 2279 case VREG: 2280 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2281 return (EROFS); 2282 break; 2283 2284 case VBLK: 2285 /* FALLTHROUGH */ 2286 case VCHR: 2287 /* FALLTHROUGH */ 2288 case VFIFO: 2289 /* 2290 * Allow modifications of special files even if in the file 2291 * system is mounted read-only (we are not modifying the 2292 * files themselves, but the objects they represent). 2293 */ 2294 return (0); 2295 2296 default: 2297 /* Anything else is unsupported. */ 2298 return (EOPNOTSUPP); 2299 } 2300 2301 /* Immutable or append-only files cannot be modified, either. */ 2302 if (node->tn_flags & (IMMUTABLE | APPEND)) 2303 return (EPERM); 2304 2305 error = vn_rlimit_trunc(size, td); 2306 if (error != 0) 2307 return (error); 2308 2309 error = tmpfs_truncate(vp, size); 2310 /* 2311 * tmpfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents 2312 * for us, as will update tn_status; no need to do that here. 2313 */ 2314 2315 ASSERT_VOP_ELOCKED(vp, "chsize2"); 2316 2317 return (error); 2318 } 2319 2320 /* 2321 * Change access and modification times of the given vnode. 2322 * Caller should execute tmpfs_update on vp after a successful execution. 2323 * The vnode must be locked on entry and remain locked on exit. 
2324 */ 2325 int 2326 tmpfs_chtimes(struct vnode *vp, struct vattr *vap, 2327 struct ucred *cred, struct thread *td) 2328 { 2329 int error; 2330 struct tmpfs_node *node; 2331 2332 ASSERT_VOP_ELOCKED(vp, "chtimes"); 2333 2334 node = VP_TO_TMPFS_NODE(vp); 2335 2336 /* Disallow this operation if the file system is mounted read-only. */ 2337 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2338 return (EROFS); 2339 2340 /* Immutable or append-only files cannot be modified, either. */ 2341 if (node->tn_flags & (IMMUTABLE | APPEND)) 2342 return (EPERM); 2343 2344 error = vn_utimes_perm(vp, vap, cred, td); 2345 if (error != 0) 2346 return (error); 2347 2348 if (vap->va_atime.tv_sec != VNOVAL) 2349 node->tn_accessed = true; 2350 if (vap->va_mtime.tv_sec != VNOVAL) 2351 node->tn_status |= TMPFS_NODE_MODIFIED; 2352 if (vap->va_birthtime.tv_sec != VNOVAL) 2353 node->tn_status |= TMPFS_NODE_MODIFIED; 2354 tmpfs_itimes(vp, &vap->va_atime, &vap->va_mtime); 2355 if (vap->va_birthtime.tv_sec != VNOVAL) 2356 node->tn_birthtime = vap->va_birthtime; 2357 ASSERT_VOP_ELOCKED(vp, "chtimes2"); 2358 2359 return (0); 2360 } 2361 2362 void 2363 tmpfs_set_status(struct tmpfs_mount *tm, struct tmpfs_node *node, int status) 2364 { 2365 2366 if ((node->tn_status & status) == status || tm->tm_ronly) 2367 return; 2368 TMPFS_NODE_LOCK(node); 2369 node->tn_status |= status; 2370 TMPFS_NODE_UNLOCK(node); 2371 } 2372 2373 void 2374 tmpfs_set_accessed(struct tmpfs_mount *tm, struct tmpfs_node *node) 2375 { 2376 if (node->tn_accessed || tm->tm_ronly) 2377 return; 2378 atomic_store_8(&node->tn_accessed, true); 2379 } 2380 2381 /* Sync timestamps */ 2382 void 2383 tmpfs_itimes(struct vnode *vp, const struct timespec *acc, 2384 const struct timespec *mod) 2385 { 2386 struct tmpfs_node *node; 2387 struct timespec now; 2388 2389 ASSERT_VOP_LOCKED(vp, "tmpfs_itimes"); 2390 node = VP_TO_TMPFS_NODE(vp); 2391 2392 if (!node->tn_accessed && 2393 (node->tn_status & (TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED)) == 0) 2394 
return; 2395 2396 vfs_timestamp(&now); 2397 TMPFS_NODE_LOCK(node); 2398 if (node->tn_accessed) { 2399 if (acc == NULL) 2400 acc = &now; 2401 node->tn_atime = *acc; 2402 } 2403 if (node->tn_status & TMPFS_NODE_MODIFIED) { 2404 if (mod == NULL) 2405 mod = &now; 2406 node->tn_mtime = *mod; 2407 } 2408 if (node->tn_status & TMPFS_NODE_CHANGED) 2409 node->tn_ctime = now; 2410 node->tn_status &= ~(TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED); 2411 node->tn_accessed = false; 2412 TMPFS_NODE_UNLOCK(node); 2413 2414 /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ 2415 random_harvest_queue(node, sizeof(*node), RANDOM_FS_ATIME); 2416 } 2417 2418 int 2419 tmpfs_truncate(struct vnode *vp, off_t length) 2420 { 2421 struct tmpfs_node *node; 2422 int error; 2423 2424 if (length < 0) 2425 return (EINVAL); 2426 if (length > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) 2427 return (EFBIG); 2428 2429 node = VP_TO_TMPFS_NODE(vp); 2430 error = node->tn_size == length ? 0 : tmpfs_reg_resize(vp, length, 2431 FALSE); 2432 if (error == 0) 2433 node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; 2434 tmpfs_update(vp); 2435 2436 return (error); 2437 } 2438 2439 static __inline int 2440 tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b) 2441 { 2442 if (a->td_hash > b->td_hash) 2443 return (1); 2444 else if (a->td_hash < b->td_hash) 2445 return (-1); 2446 return (0); 2447 } 2448 2449 RB_GENERATE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); 2450