1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ddb.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/capsicum.h> 46 #include <sys/counter.h> 47 #include <sys/filedesc.h> 48 #include <sys/fnv_hash.h> 49 #include <sys/kernel.h> 50 #include <sys/ktr.h> 51 #include <sys/lock.h> 52 #include <sys/malloc.h> 53 #include <sys/fcntl.h> 54 #include <sys/jail.h> 55 #include <sys/mount.h> 56 #include <sys/namei.h> 57 #include <sys/proc.h> 58 #include <sys/seqc.h> 59 #include <sys/sdt.h> 60 #include <sys/smr.h> 61 #include <sys/smp.h> 62 #include <sys/syscallsubr.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysproto.h> 65 #include <sys/vnode.h> 66 #include <ck_queue.h> 67 #ifdef KTRACE 68 #include <sys/ktrace.h> 69 #endif 70 71 #include <sys/capsicum.h> 72 73 #include <security/audit/audit.h> 74 #include <security/mac/mac_framework.h> 75 76 #ifdef DDB 77 #include <ddb/ddb.h> 78 #endif 79 80 #include <vm/uma.h> 81 82 SDT_PROVIDER_DECLARE(vfs); 83 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 84 "struct vnode *"); 85 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 86 "char *"); 87 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *", 88 "const char *"); 89 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *", 90 "struct namecache *", "int", "int"); 91 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 92 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 93 "char *", "struct vnode *"); 94 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 95 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 96 "struct vnode *", "char *"); 97 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 98 "struct vnode *"); 99 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 100 "struct vnode *", "char *"); 101 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 102 "char *"); 103 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", 104 "struct componentname *"); 105 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", 106 "struct componentname *"); 107 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 108 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 109 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 110 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 111 "struct vnode *"); 112 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 113 "char *"); 114 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *", 115 "char *"); 116 117 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); 118 SDT_PROBE_DECLARE(vfs, namei, lookup, entry); 119 SDT_PROBE_DECLARE(vfs, namei, lookup, return); 120 121 /* 122 * This structure describes the elements in the cache of recent 123 * names looked up by namei. 
124 */ 125 struct negstate { 126 u_char neg_flag; 127 }; 128 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), 129 "the state must fit in a union with a pointer without growing it"); 130 131 struct namecache { 132 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 133 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 134 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */ 135 struct vnode *nc_dvp; /* vnode of parent of name */ 136 union { 137 struct vnode *nu_vp; /* vnode the name refers to */ 138 struct negstate nu_neg;/* negative entry state */ 139 } n_un; 140 u_char nc_flag; /* flag bits */ 141 u_char nc_nlen; /* length of name */ 142 char nc_name[0]; /* segment name + nul */ 143 }; 144 145 /* 146 * struct namecache_ts repeats struct namecache layout up to the 147 * nc_nlen member. 148 * struct namecache_ts is used in place of struct namecache when time(s) need 149 * to be stored. The nc_dotdottime field is used when a cache entry is mapping 150 * both a non-dotdot directory name plus dotdot for the directory's 151 * parent. 152 * 153 * See below for alignment requirement. 154 */ 155 struct namecache_ts { 156 struct timespec nc_time; /* timespec provided by fs */ 157 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 158 int nc_ticks; /* ticks value when entry was added */ 159 struct namecache nc_nc; 160 }; 161 162 /* 163 * At least mips n32 performs 64-bit accesses to timespec as found 164 * in namecache_ts and requires them to be aligned. Since others 165 * may be in the same spot suffer a little bit and enforce the 166 * alignment for everyone. Note this is a nop for 64-bit platforms. 167 */ 168 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) 169 #define CACHE_PATH_CUTOFF 39 170 171 #define CACHE_ZONE_SMALL_SIZE (sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1) 172 #define CACHE_ZONE_SMALL_TS_SIZE (sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1) 173 #define CACHE_ZONE_LARGE_SIZE (sizeof(struct namecache) + NAME_MAX + 1) 174 #define CACHE_ZONE_LARGE_TS_SIZE (sizeof(struct namecache_ts) + NAME_MAX + 1) 175 176 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 177 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 178 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 179 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 180 181 #define nc_vp n_un.nu_vp 182 #define nc_neg n_un.nu_neg 183 184 /* 185 * Flags in namecache.nc_flag 186 */ 187 #define NCF_WHITE 0x01 188 #define NCF_ISDOTDOT 0x02 189 #define NCF_TS 0x04 190 #define NCF_DTS 0x08 191 #define NCF_DVDROP 0x10 192 #define NCF_NEGATIVE 0x20 193 #define NCF_INVALID 0x40 194 #define NCF_WIP 0x80 195 196 /* 197 * Flags in negstate.neg_flag 198 */ 199 #define NEG_HOT 0x01 200 201 /* 202 * Mark an entry as invalid. 203 * 204 * This is called before it starts getting deconstructed. 205 */ 206 static void 207 cache_ncp_invalidate(struct namecache *ncp) 208 { 209 210 KASSERT((ncp->nc_flag & NCF_INVALID) == 0, 211 ("%s: entry %p already invalid", __func__, ncp)); 212 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); 213 atomic_thread_fence_rel(); 214 } 215 216 /* 217 * Check whether the entry can be safely used. 218 * 219 * All places which elide locks are supposed to call this after they are 220 * done with reading from an entry. 
221 */ 222 static bool 223 cache_ncp_canuse(struct namecache *ncp) 224 { 225 226 atomic_thread_fence_acq(); 227 return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0); 228 } 229 230 /* 231 * Name caching works as follows: 232 * 233 * Names found by directory scans are retained in a cache 234 * for future reference. It is managed LRU, so frequently 235 * used names will hang around. Cache is indexed by hash value 236 * obtained from (dvp, name) where dvp refers to the directory 237 * containing name. 238 * 239 * If it is a "negative" entry, (i.e. for a name that is known NOT to 240 * exist) the vnode pointer will be NULL. 241 * 242 * Upon reaching the last segment of a path, if the reference 243 * is for DELETE, or NOCACHE is set (rewrite), and the 244 * name is located in the cache, it will be dropped. 245 * 246 * These locks are used (in the order in which they can be taken): 247 * NAME TYPE ROLE 248 * vnodelock mtx vnode lists and v_cache_dd field protection 249 * bucketlock mtx for access to given set of hash buckets 250 * neglist mtx negative entry LRU management 251 * 252 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread 253 * shrinking the LRU list. 254 * 255 * It is legal to take multiple vnodelock and bucketlock locks. The locking 256 * order is lower address first. Both are recursive. 257 * 258 * "." lookups are lockless. 259 * 260 * ".." and vnode -> name lookups require vnodelock. 261 * 262 * name -> vnode lookup requires the relevant bucketlock to be held for reading. 263 * 264 * Insertions and removals of entries require involved vnodes and bucketlocks 265 * to be locked to provide safe operation against other threads modifying the 266 * cache. 267 * 268 * Some lookups result in removal of the found entry (e.g. getting rid of a 269 * negative entry with the intent to create a positive one), which poses a 270 * problem when multiple threads reach the state. Similarly, two different 271 * threads can purge two different vnodes and try to remove the same name. 272 * 273 * If the already held vnode lock is lower than the second required lock, we 274 * can just take the other lock. However, in the opposite case, this could 275 * deadlock. As such, this is resolved by trylocking and if that fails unlocking 276 * the first node, locking everything in order and revalidating the state. 277 */ 278 279 VFS_SMR_DECLARE; 280 281 /* 282 * Structures associated with name caching. 
283 */ 284 #define NCHHASH(hash) \ 285 (&nchashtbl[(hash) & nchash]) 286 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 287 static u_long __read_mostly nchash; /* size of hash table */ 288 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 289 "Size of namecache hash table"); 290 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ 291 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 292 "Ratio of negative namecache entries"); 293 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 294 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 295 u_int ncsizefactor = 2; 296 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, 297 "Size factor for namecache"); 298 static u_int __read_mostly ncpurgeminvnodes; 299 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0, 300 "Number of vnodes below which purgevfs ignores the request"); 301 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ 302 303 struct nchstats nchstats; /* cache effectiveness statistics */ 304 305 static bool __read_frequently cache_fast_revlookup = true; 306 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW, 307 &cache_fast_revlookup, 0, ""); 308 309 static struct mtx __exclusive_cache_line ncneg_shrink_lock; 310 311 struct neglist { 312 struct mtx nl_lock; 313 TAILQ_HEAD(, namecache) nl_list; 314 } __aligned(CACHE_LINE_SIZE); 315 316 static struct neglist __read_mostly *neglists; 317 static struct neglist ncneg_hot; 318 static u_long numhotneg; 319 320 #define ncneghash 3 321 #define numneglists (ncneghash + 1) 322 static inline struct neglist * 323 NCP2NEGLIST(struct namecache *ncp) 324 { 325 326 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 327 } 328 329 static inline struct negstate * 330 NCP2NEGSTATE(struct namecache *ncp) 331 { 332 333 MPASS(ncp->nc_flag & NCF_NEGATIVE); 334 return (&ncp->nc_neg); 335 } 336 337 #define numbucketlocks (ncbuckethash + 1) 338 static u_int __read_mostly ncbuckethash; 339 static struct mtx_padalign __read_mostly *bucketlocks; 340 #define HASH2BUCKETLOCK(hash) \ 341 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)])) 342 343 #define numvnodelocks (ncvnodehash + 1) 344 static u_int __read_mostly ncvnodehash; 345 static struct mtx __read_mostly *vnodelocks; 346 static inline struct mtx * 347 VP2VNODELOCK(struct vnode *vp) 348 { 349 350 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 351 } 352 353 /* 354 * UMA zones for the VFS cache. 355 * 356 * The small cache is used for entries with short names, which are the 357 * most common. The large cache is used for entries which are too big to 358 * fit in the small cache. 
359 */ 360 static uma_zone_t __read_mostly cache_zone_small; 361 static uma_zone_t __read_mostly cache_zone_small_ts; 362 static uma_zone_t __read_mostly cache_zone_large; 363 static uma_zone_t __read_mostly cache_zone_large_ts; 364 365 static struct namecache * 366 cache_alloc(int len, int ts) 367 { 368 struct namecache_ts *ncp_ts; 369 struct namecache *ncp; 370 371 if (__predict_false(ts)) { 372 if (len <= CACHE_PATH_CUTOFF) 373 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); 374 else 375 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); 376 ncp = &ncp_ts->nc_nc; 377 } else { 378 if (len <= CACHE_PATH_CUTOFF) 379 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); 380 else 381 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); 382 } 383 return (ncp); 384 } 385 386 static void 387 cache_free(struct namecache *ncp) 388 { 389 struct namecache_ts *ncp_ts; 390 391 MPASS(ncp != NULL); 392 if ((ncp->nc_flag & NCF_DVDROP) != 0) 393 vdrop(ncp->nc_dvp); 394 if (__predict_false(ncp->nc_flag & NCF_TS)) { 395 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 396 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 397 uma_zfree_smr(cache_zone_small_ts, ncp_ts); 398 else 399 uma_zfree_smr(cache_zone_large_ts, ncp_ts); 400 } else { 401 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 402 uma_zfree_smr(cache_zone_small, ncp); 403 else 404 uma_zfree_smr(cache_zone_large, ncp); 405 } 406 } 407 408 static void 409 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 410 { 411 struct namecache_ts *ncp_ts; 412 413 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 414 (tsp == NULL && ticksp == NULL), 415 ("No NCF_TS")); 416 417 if (tsp == NULL) 418 return; 419 420 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 421 *tsp = ncp_ts->nc_time; 422 *ticksp = ncp_ts->nc_ticks; 423 } 424 425 #ifdef DEBUG_CACHE 426 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 427 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 428 "VFS namecache enabled"); 429 #endif 430 431 /* Export size information to userland */ 432 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 433 sizeof(struct namecache), "sizeof(struct namecache)"); 434 435 /* 436 * The new name cache statistics 437 */ 438 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 439 "Name cache statistics"); 440 #define STATNODE_ULONG(name, descr) \ 441 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); 442 #define STATNODE_COUNTER(name, descr) \ 443 static COUNTER_U64_DEFINE_EARLY(name); \ 444 SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \ 445 descr); 446 STATNODE_ULONG(numneg, "Number of negative cache entries"); 447 STATNODE_ULONG(numcache, "Number of cache entries"); 448 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held"); 449 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit"); 450 STATNODE_COUNTER(dothits, "Number of '.' hits"); 451 STATNODE_COUNTER(dotdothits, "Number of '..' 
hits"); 452 STATNODE_COUNTER(nummiss, "Number of cache misses"); 453 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 454 STATNODE_COUNTER(numposzaps, 455 "Number of cache hits (positive) we do not want to cache"); 456 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 457 STATNODE_COUNTER(numnegzaps, 458 "Number of cache hits (negative) we do not want to cache"); 459 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 460 /* These count for vn_getcwd(), too. */ 461 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 462 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 463 STATNODE_COUNTER(numfullpathfail2, 464 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 465 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 466 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 467 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 468 "Number of successful removals after relocking"); 469 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 470 "Number of times zap_and_exit failed to lock"); 471 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 472 "Number of times zap_and_exit failed to lock"); 473 static long cache_lock_vnodes_cel_3_failures; 474 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 475 "Number of times 3-way vnode locking failed"); 476 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 477 STATNODE_COUNTER(numneg_evicted, 478 "Number of negative entries evicted when adding a new entry"); 479 STATNODE_COUNTER(shrinking_skipped, 480 "Number of times shrinking was already in progress"); 481 482 static void cache_zap_locked(struct namecache *ncp); 483 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, 484 char **freebuf, size_t *buflen); 485 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 486 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend); 487 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 488 char **retbuf, size_t *buflen); 489 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 490 char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 491 492 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 493 494 static int cache_yield; 495 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, 496 "Number of times cache called yield"); 497 498 static void __noinline 499 cache_maybe_yield(void) 500 { 501 502 if (should_yield()) { 503 cache_yield++; 504 kern_yield(PRI_USER); 505 } 506 } 507 508 static inline void 509 cache_assert_vlp_locked(struct mtx *vlp) 510 { 511 512 if (vlp != NULL) 513 mtx_assert(vlp, MA_OWNED); 514 } 515 516 static inline void 517 cache_assert_vnode_locked(struct vnode *vp) 518 { 519 struct mtx *vlp; 520 521 vlp = VP2VNODELOCK(vp); 522 cache_assert_vlp_locked(vlp); 523 } 524 525 /* 526 * TODO: With the value stored we can do better than computing the hash based 527 * on the address. The choice of FNV should also be revisited. 
528 */ 529 static void 530 cache_prehash(struct vnode *vp) 531 { 532 533 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); 534 } 535 536 static uint32_t 537 cache_get_hash(char *name, u_char len, struct vnode *dvp) 538 { 539 540 return (fnv_32_buf(name, len, dvp->v_nchash)); 541 } 542 543 static inline struct nchashhead * 544 NCP2BUCKET(struct namecache *ncp) 545 { 546 uint32_t hash; 547 548 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 549 return (NCHHASH(hash)); 550 } 551 552 static inline struct mtx * 553 NCP2BUCKETLOCK(struct namecache *ncp) 554 { 555 uint32_t hash; 556 557 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 558 return (HASH2BUCKETLOCK(hash)); 559 } 560 561 #ifdef INVARIANTS 562 static void 563 cache_assert_bucket_locked(struct namecache *ncp) 564 { 565 struct mtx *blp; 566 567 blp = NCP2BUCKETLOCK(ncp); 568 mtx_assert(blp, MA_OWNED); 569 } 570 571 static void 572 cache_assert_bucket_unlocked(struct namecache *ncp) 573 { 574 struct mtx *blp; 575 576 blp = NCP2BUCKETLOCK(ncp); 577 mtx_assert(blp, MA_NOTOWNED); 578 } 579 #else 580 #define cache_assert_bucket_locked(x) do { } while (0) 581 #define cache_assert_bucket_unlocked(x) do { } while (0) 582 #endif 583 584 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 585 static void 586 _cache_sort_vnodes(void **p1, void **p2) 587 { 588 void *tmp; 589 590 MPASS(*p1 != NULL || *p2 != NULL); 591 592 if (*p1 > *p2) { 593 tmp = *p2; 594 *p2 = *p1; 595 *p1 = tmp; 596 } 597 } 598 599 static void 600 cache_lock_all_buckets(void) 601 { 602 u_int i; 603 604 for (i = 0; i < numbucketlocks; i++) 605 mtx_lock(&bucketlocks[i]); 606 } 607 608 static void 609 cache_unlock_all_buckets(void) 610 { 611 u_int i; 612 613 for (i = 0; i < numbucketlocks; i++) 614 mtx_unlock(&bucketlocks[i]); 615 } 616 617 static void 618 cache_lock_all_vnodes(void) 619 { 620 u_int i; 621 622 for (i = 0; i < numvnodelocks; i++) 623 mtx_lock(&vnodelocks[i]); 624 } 625 626 static void 627 cache_unlock_all_vnodes(void) 628 { 629 u_int i; 630 631 for (i = 0; i < numvnodelocks; i++) 632 mtx_unlock(&vnodelocks[i]); 633 } 634 635 static int 636 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 637 { 638 639 cache_sort_vnodes(&vlp1, &vlp2); 640 641 if (vlp1 != NULL) { 642 if (!mtx_trylock(vlp1)) 643 return (EAGAIN); 644 } 645 if (!mtx_trylock(vlp2)) { 646 if (vlp1 != NULL) 647 mtx_unlock(vlp1); 648 return (EAGAIN); 649 } 650 651 return (0); 652 } 653 654 static void 655 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 656 { 657 658 MPASS(vlp1 != NULL || vlp2 != NULL); 659 MPASS(vlp1 <= vlp2); 660 661 if (vlp1 != NULL) 662 mtx_lock(vlp1); 663 if (vlp2 != NULL) 664 mtx_lock(vlp2); 665 } 666 667 static void 668 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 669 { 670 671 MPASS(vlp1 != NULL || vlp2 != NULL); 672 673 if (vlp1 != NULL) 674 mtx_unlock(vlp1); 675 if (vlp2 != NULL) 676 mtx_unlock(vlp2); 677 } 678 679 static int 680 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 681 { 682 struct nchstats snap; 683 684 if (req->oldptr == NULL) 685 return (SYSCTL_OUT(req, 0, sizeof(snap))); 686 687 snap = nchstats; 688 snap.ncs_goodhits = counter_u64_fetch(numposhits); 689 snap.ncs_neghits = counter_u64_fetch(numneghits); 690 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 691 counter_u64_fetch(numnegzaps); 692 snap.ncs_miss = counter_u64_fetch(nummisszap) + 693 counter_u64_fetch(nummiss); 694 695 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 696 } 697 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, 
CTLTYPE_OPAQUE | CTLFLAG_RD | 698 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 699 "VFS cache effectiveness statistics"); 700 701 #ifdef DIAGNOSTIC 702 /* 703 * Grab an atomic snapshot of the name cache hash chain lengths 704 */ 705 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 706 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 707 "hash table stats"); 708 709 static int 710 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 711 { 712 struct nchashhead *ncpp; 713 struct namecache *ncp; 714 int i, error, n_nchash, *cntbuf; 715 716 retry: 717 n_nchash = nchash + 1; /* nchash is max index, not count */ 718 if (req->oldptr == NULL) 719 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 720 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 721 cache_lock_all_buckets(); 722 if (n_nchash != nchash + 1) { 723 cache_unlock_all_buckets(); 724 free(cntbuf, M_TEMP); 725 goto retry; 726 } 727 /* Scan hash tables counting entries */ 728 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 729 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) 730 cntbuf[i]++; 731 cache_unlock_all_buckets(); 732 for (error = 0, i = 0; i < n_nchash; i++) 733 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 734 break; 735 free(cntbuf, M_TEMP); 736 return (error); 737 } 738 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 739 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 740 "nchash chain lengths"); 741 742 static int 743 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 744 { 745 int error; 746 struct nchashhead *ncpp; 747 struct namecache *ncp; 748 int n_nchash; 749 int count, maxlength, used, pct; 750 751 if (!req->oldptr) 752 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 753 754 cache_lock_all_buckets(); 755 n_nchash = nchash + 1; /* nchash is max index, not count */ 756 used = 0; 757 maxlength = 0; 758 759 /* Scan hash tables for applicable entries */ 760 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 761 count = 0; 762 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { 763 count++; 764 } 765 if (count) 766 used++; 767 if (maxlength < count) 768 maxlength = count; 769 } 770 n_nchash = nchash + 1; 771 cache_unlock_all_buckets(); 772 pct = (used * 100) / (n_nchash / 100); 773 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 774 if (error) 775 return (error); 776 error = SYSCTL_OUT(req, &used, sizeof(used)); 777 if (error) 778 return (error); 779 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 780 if (error) 781 return (error); 782 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 783 if (error) 784 return (error); 785 return (0); 786 } 787 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 788 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 789 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 790 #endif 791 792 /* 793 * Negative entries management 794 * 795 * A variation of LRU scheme is used. New entries are hashed into one of 796 * numneglists cold lists. Entries get promoted to the hot list on first hit. 797 * 798 * The shrinker will demote hot list head and evict from the cold list in a 799 * round-robin manner. 
800 */ 801 static void 802 cache_negative_init(struct namecache *ncp) 803 { 804 struct negstate *negstate; 805 806 ncp->nc_flag |= NCF_NEGATIVE; 807 negstate = NCP2NEGSTATE(ncp); 808 negstate->neg_flag = 0; 809 } 810 811 static void 812 cache_negative_hit(struct namecache *ncp) 813 { 814 struct neglist *neglist; 815 struct negstate *negstate; 816 817 negstate = NCP2NEGSTATE(ncp); 818 if ((negstate->neg_flag & NEG_HOT) != 0) 819 return; 820 neglist = NCP2NEGLIST(ncp); 821 mtx_lock(&ncneg_hot.nl_lock); 822 mtx_lock(&neglist->nl_lock); 823 if ((negstate->neg_flag & NEG_HOT) == 0) { 824 numhotneg++; 825 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 826 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 827 negstate->neg_flag |= NEG_HOT; 828 } 829 mtx_unlock(&neglist->nl_lock); 830 mtx_unlock(&ncneg_hot.nl_lock); 831 } 832 833 static void 834 cache_negative_insert(struct namecache *ncp) 835 { 836 struct neglist *neglist; 837 838 MPASS(ncp->nc_flag & NCF_NEGATIVE); 839 cache_assert_bucket_locked(ncp); 840 neglist = NCP2NEGLIST(ncp); 841 mtx_lock(&neglist->nl_lock); 842 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 843 mtx_unlock(&neglist->nl_lock); 844 atomic_add_rel_long(&numneg, 1); 845 } 846 847 static void 848 cache_negative_remove(struct namecache *ncp) 849 { 850 struct neglist *neglist; 851 struct negstate *negstate; 852 bool hot_locked = false; 853 bool list_locked = false; 854 855 cache_assert_bucket_locked(ncp); 856 neglist = NCP2NEGLIST(ncp); 857 negstate = NCP2NEGSTATE(ncp); 858 if ((negstate->neg_flag & NEG_HOT) != 0) { 859 hot_locked = true; 860 mtx_lock(&ncneg_hot.nl_lock); 861 if ((negstate->neg_flag & NEG_HOT) == 0) { 862 list_locked = true; 863 mtx_lock(&neglist->nl_lock); 864 } 865 } else { 866 list_locked = true; 867 mtx_lock(&neglist->nl_lock); 868 /* 869 * We may be racing against promotion in lockless lookup. 
870 */ 871 if ((negstate->neg_flag & NEG_HOT) != 0) { 872 mtx_unlock(&neglist->nl_lock); 873 hot_locked = true; 874 mtx_lock(&ncneg_hot.nl_lock); 875 mtx_lock(&neglist->nl_lock); 876 } 877 } 878 if ((negstate->neg_flag & NEG_HOT) != 0) { 879 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); 880 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 881 numhotneg--; 882 } else { 883 mtx_assert(&neglist->nl_lock, MA_OWNED); 884 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 885 } 886 if (list_locked) 887 mtx_unlock(&neglist->nl_lock); 888 if (hot_locked) 889 mtx_unlock(&ncneg_hot.nl_lock); 890 atomic_subtract_rel_long(&numneg, 1); 891 } 892 893 static void 894 cache_negative_shrink_select(struct namecache **ncpp, 895 struct neglist **neglistpp) 896 { 897 struct neglist *neglist; 898 struct namecache *ncp; 899 static u_int cycle; 900 u_int i; 901 902 *ncpp = ncp = NULL; 903 904 for (i = 0; i < numneglists; i++) { 905 neglist = &neglists[(cycle + i) % numneglists]; 906 if (TAILQ_FIRST(&neglist->nl_list) == NULL) 907 continue; 908 mtx_lock(&neglist->nl_lock); 909 ncp = TAILQ_FIRST(&neglist->nl_list); 910 if (ncp != NULL) 911 break; 912 mtx_unlock(&neglist->nl_lock); 913 } 914 915 *neglistpp = neglist; 916 *ncpp = ncp; 917 cycle++; 918 } 919 920 static void 921 cache_negative_zap_one(void) 922 { 923 struct namecache *ncp, *ncp2; 924 struct neglist *neglist; 925 struct negstate *negstate; 926 struct mtx *dvlp; 927 struct mtx *blp; 928 929 if (mtx_owner(&ncneg_shrink_lock) != NULL || 930 !mtx_trylock(&ncneg_shrink_lock)) { 931 counter_u64_add(shrinking_skipped, 1); 932 return; 933 } 934 935 mtx_lock(&ncneg_hot.nl_lock); 936 ncp = TAILQ_FIRST(&ncneg_hot.nl_list); 937 if (ncp != NULL) { 938 neglist = NCP2NEGLIST(ncp); 939 negstate = NCP2NEGSTATE(ncp); 940 mtx_lock(&neglist->nl_lock); 941 MPASS((negstate->neg_flag & NEG_HOT) != 0); 942 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 943 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 944 negstate->neg_flag &= ~NEG_HOT; 945 numhotneg--; 946 mtx_unlock(&neglist->nl_lock); 947 } 948 mtx_unlock(&ncneg_hot.nl_lock); 949 950 cache_negative_shrink_select(&ncp, &neglist); 951 952 mtx_unlock(&ncneg_shrink_lock); 953 if (ncp == NULL) 954 return; 955 956 MPASS(ncp->nc_flag & NCF_NEGATIVE); 957 dvlp = VP2VNODELOCK(ncp->nc_dvp); 958 blp = NCP2BUCKETLOCK(ncp); 959 mtx_unlock(&neglist->nl_lock); 960 mtx_lock(dvlp); 961 mtx_lock(blp); 962 /* 963 * Enter SMR to safely check the negative list. 964 * Even if the found pointer matches, the entry may now be reallocated 965 * and used by a different vnode. 966 */ 967 vfs_smr_enter(); 968 ncp2 = TAILQ_FIRST(&neglist->nl_list); 969 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || 970 blp != NCP2BUCKETLOCK(ncp2)) { 971 vfs_smr_exit(); 972 ncp = NULL; 973 } else { 974 vfs_smr_exit(); 975 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp, 976 ncp->nc_name); 977 cache_zap_locked(ncp); 978 counter_u64_add(numneg_evicted, 1); 979 } 980 mtx_unlock(blp); 981 mtx_unlock(dvlp); 982 if (ncp != NULL) 983 cache_free(ncp); 984 } 985 986 /* 987 * cache_zap_locked(): 988 * 989 * Removes a namecache entry from cache, whether it contains an actual 990 * pointer to a vnode or if it is just a negative cache entry. 
991 */ 992 static void 993 cache_zap_locked(struct namecache *ncp) 994 { 995 struct nchashhead *ncpp; 996 997 if (!(ncp->nc_flag & NCF_NEGATIVE)) 998 cache_assert_vnode_locked(ncp->nc_vp); 999 cache_assert_vnode_locked(ncp->nc_dvp); 1000 cache_assert_bucket_locked(ncp); 1001 1002 cache_ncp_invalidate(ncp); 1003 1004 ncpp = NCP2BUCKET(ncp); 1005 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); 1006 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 1007 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 1008 ncp->nc_name, ncp->nc_vp); 1009 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 1010 if (ncp == ncp->nc_vp->v_cache_dd) { 1011 vn_seqc_write_begin_unheld(ncp->nc_vp); 1012 ncp->nc_vp->v_cache_dd = NULL; 1013 vn_seqc_write_end(ncp->nc_vp); 1014 } 1015 } else { 1016 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 1017 ncp->nc_name); 1018 cache_negative_remove(ncp); 1019 } 1020 if (ncp->nc_flag & NCF_ISDOTDOT) { 1021 if (ncp == ncp->nc_dvp->v_cache_dd) { 1022 vn_seqc_write_begin_unheld(ncp->nc_dvp); 1023 ncp->nc_dvp->v_cache_dd = NULL; 1024 vn_seqc_write_end(ncp->nc_dvp); 1025 } 1026 } else { 1027 LIST_REMOVE(ncp, nc_src); 1028 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 1029 ncp->nc_flag |= NCF_DVDROP; 1030 counter_u64_add(numcachehv, -1); 1031 } 1032 } 1033 atomic_subtract_rel_long(&numcache, 1); 1034 } 1035 1036 static void 1037 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 1038 { 1039 struct mtx *blp; 1040 1041 MPASS(ncp->nc_dvp == vp); 1042 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1043 cache_assert_vnode_locked(vp); 1044 1045 blp = NCP2BUCKETLOCK(ncp); 1046 mtx_lock(blp); 1047 cache_zap_locked(ncp); 1048 mtx_unlock(blp); 1049 } 1050 1051 static bool 1052 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 1053 struct mtx **vlpp) 1054 { 1055 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1056 struct mtx *blp; 1057 1058 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1059 cache_assert_vnode_locked(vp); 1060 1061 if (ncp->nc_flag & NCF_NEGATIVE) { 1062 if (*vlpp != NULL) { 1063 mtx_unlock(*vlpp); 1064 *vlpp = NULL; 1065 } 1066 cache_zap_negative_locked_vnode_kl(ncp, vp); 1067 return (true); 1068 } 1069 1070 pvlp = VP2VNODELOCK(vp); 1071 blp = NCP2BUCKETLOCK(ncp); 1072 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1073 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1074 1075 if (*vlpp == vlp1 || *vlpp == vlp2) { 1076 to_unlock = *vlpp; 1077 *vlpp = NULL; 1078 } else { 1079 if (*vlpp != NULL) { 1080 mtx_unlock(*vlpp); 1081 *vlpp = NULL; 1082 } 1083 cache_sort_vnodes(&vlp1, &vlp2); 1084 if (vlp1 == pvlp) { 1085 mtx_lock(vlp2); 1086 to_unlock = vlp2; 1087 } else { 1088 if (!mtx_trylock(vlp1)) 1089 goto out_relock; 1090 to_unlock = vlp1; 1091 } 1092 } 1093 mtx_lock(blp); 1094 cache_zap_locked(ncp); 1095 mtx_unlock(blp); 1096 if (to_unlock != NULL) 1097 mtx_unlock(to_unlock); 1098 return (true); 1099 1100 out_relock: 1101 mtx_unlock(vlp2); 1102 mtx_lock(vlp1); 1103 mtx_lock(vlp2); 1104 MPASS(*vlpp == NULL); 1105 *vlpp = vlp1; 1106 return (false); 1107 } 1108 1109 static int __noinline 1110 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) 1111 { 1112 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1113 struct mtx *blp; 1114 int error = 0; 1115 1116 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1117 cache_assert_vnode_locked(vp); 1118 1119 pvlp = VP2VNODELOCK(vp); 1120 if (ncp->nc_flag & NCF_NEGATIVE) { 1121 cache_zap_negative_locked_vnode_kl(ncp, vp); 1122 goto out; 1123 } 1124 1125 blp = NCP2BUCKETLOCK(ncp); 1126 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1127 vlp2 = 
VP2VNODELOCK(ncp->nc_vp); 1128 cache_sort_vnodes(&vlp1, &vlp2); 1129 if (vlp1 == pvlp) { 1130 mtx_lock(vlp2); 1131 to_unlock = vlp2; 1132 } else { 1133 if (!mtx_trylock(vlp1)) { 1134 /* 1135 * TODO: Very wasteful but rare. 1136 */ 1137 mtx_unlock(pvlp); 1138 mtx_lock(vlp1); 1139 mtx_lock(vlp2); 1140 mtx_unlock(vlp2); 1141 mtx_unlock(vlp1); 1142 return (EAGAIN); 1143 } 1144 to_unlock = vlp1; 1145 } 1146 mtx_lock(blp); 1147 cache_zap_locked(ncp); 1148 mtx_unlock(blp); 1149 mtx_unlock(to_unlock); 1150 out: 1151 mtx_unlock(pvlp); 1152 return (error); 1153 } 1154 1155 /* 1156 * If trylocking failed we can get here. We know enough to take all needed locks 1157 * in the right order and re-lookup the entry. 1158 */ 1159 static int 1160 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1161 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1162 struct mtx *blp) 1163 { 1164 struct namecache *rncp; 1165 1166 cache_assert_bucket_unlocked(ncp); 1167 1168 cache_sort_vnodes(&dvlp, &vlp); 1169 cache_lock_vnodes(dvlp, vlp); 1170 mtx_lock(blp); 1171 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1172 if (rncp == ncp && rncp->nc_dvp == dvp && 1173 rncp->nc_nlen == cnp->cn_namelen && 1174 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1175 break; 1176 } 1177 if (rncp != NULL) { 1178 cache_zap_locked(rncp); 1179 mtx_unlock(blp); 1180 cache_unlock_vnodes(dvlp, vlp); 1181 counter_u64_add(zap_and_exit_bucket_relock_success, 1); 1182 return (0); 1183 } 1184 1185 mtx_unlock(blp); 1186 cache_unlock_vnodes(dvlp, vlp); 1187 return (EAGAIN); 1188 } 1189 1190 static int __noinline 1191 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp, 1192 uint32_t hash, struct mtx *blp) 1193 { 1194 struct mtx *dvlp, *vlp; 1195 struct vnode *dvp; 1196 1197 cache_assert_bucket_locked(ncp); 1198 1199 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1200 vlp = NULL; 1201 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1202 vlp = VP2VNODELOCK(ncp->nc_vp); 1203 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1204 cache_zap_locked(ncp); 1205 mtx_unlock(blp); 1206 cache_unlock_vnodes(dvlp, vlp); 1207 return (0); 1208 } 1209 1210 dvp = ncp->nc_dvp; 1211 mtx_unlock(blp); 1212 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1213 } 1214 1215 static int 1216 cache_zap_locked_bucket_kl(struct namecache *ncp, struct mtx *blp, 1217 struct mtx **vlpp1, struct mtx **vlpp2) 1218 { 1219 struct mtx *dvlp, *vlp; 1220 1221 cache_assert_bucket_locked(ncp); 1222 1223 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1224 vlp = NULL; 1225 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1226 vlp = VP2VNODELOCK(ncp->nc_vp); 1227 cache_sort_vnodes(&dvlp, &vlp); 1228 1229 if (*vlpp1 == dvlp && *vlpp2 == vlp) { 1230 cache_zap_locked(ncp); 1231 cache_unlock_vnodes(dvlp, vlp); 1232 *vlpp1 = NULL; 1233 *vlpp2 = NULL; 1234 return (0); 1235 } 1236 1237 if (*vlpp1 != NULL) 1238 mtx_unlock(*vlpp1); 1239 if (*vlpp2 != NULL) 1240 mtx_unlock(*vlpp2); 1241 *vlpp1 = NULL; 1242 *vlpp2 = NULL; 1243 1244 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1245 cache_zap_locked(ncp); 1246 cache_unlock_vnodes(dvlp, vlp); 1247 return (0); 1248 } 1249 1250 mtx_unlock(blp); 1251 *vlpp1 = dvlp; 1252 *vlpp2 = vlp; 1253 if (*vlpp1 != NULL) 1254 mtx_lock(*vlpp1); 1255 mtx_lock(*vlpp2); 1256 mtx_lock(blp); 1257 return (EAGAIN); 1258 } 1259 1260 static __noinline int 1261 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) 1262 { 1263 struct namecache *ncp; 1264 struct mtx *blp; 1265 struct mtx *dvlp, *dvlp2; 1266 uint32_t hash; 1267 int 
error; 1268 1269 if (cnp->cn_namelen == 2 && 1270 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1271 dvlp = VP2VNODELOCK(dvp); 1272 dvlp2 = NULL; 1273 mtx_lock(dvlp); 1274 retry_dotdot: 1275 ncp = dvp->v_cache_dd; 1276 if (ncp == NULL) { 1277 mtx_unlock(dvlp); 1278 if (dvlp2 != NULL) 1279 mtx_unlock(dvlp2); 1280 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1281 return (0); 1282 } 1283 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1284 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) 1285 goto retry_dotdot; 1286 MPASS(dvp->v_cache_dd == NULL); 1287 mtx_unlock(dvlp); 1288 if (dvlp2 != NULL) 1289 mtx_unlock(dvlp2); 1290 cache_free(ncp); 1291 } else { 1292 vn_seqc_write_begin(dvp); 1293 dvp->v_cache_dd = NULL; 1294 vn_seqc_write_end(dvp); 1295 mtx_unlock(dvlp); 1296 if (dvlp2 != NULL) 1297 mtx_unlock(dvlp2); 1298 } 1299 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1300 return (1); 1301 } 1302 1303 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1304 blp = HASH2BUCKETLOCK(hash); 1305 retry: 1306 if (CK_SLIST_EMPTY(NCHHASH(hash))) 1307 goto out_no_entry; 1308 1309 mtx_lock(blp); 1310 1311 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1312 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1313 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1314 break; 1315 } 1316 1317 if (ncp == NULL) { 1318 mtx_unlock(blp); 1319 goto out_no_entry; 1320 } 1321 1322 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1323 if (__predict_false(error != 0)) { 1324 zap_and_exit_bucket_fail++; 1325 goto retry; 1326 } 1327 counter_u64_add(numposzaps, 1); 1328 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1329 cache_free(ncp); 1330 return (1); 1331 out_no_entry: 1332 counter_u64_add(nummisszap, 1); 1333 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1334 return (0); 1335 } 1336 1337 static int __noinline 1338 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1339 struct timespec *tsp, int *ticksp) 1340 { 1341 int ltype; 1342 1343 *vpp = dvp; 1344 counter_u64_add(dothits, 1); 1345 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1346 if (tsp != NULL) 1347 timespecclear(tsp); 1348 if (ticksp != NULL) 1349 *ticksp = ticks; 1350 vrefact(*vpp); 1351 /* 1352 * When we lookup "." we still can be asked to lock it 1353 * differently... 
1354 */ 1355 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1356 if (ltype != VOP_ISLOCKED(*vpp)) { 1357 if (ltype == LK_EXCLUSIVE) { 1358 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1359 if (VN_IS_DOOMED((*vpp))) { 1360 /* forced unmount */ 1361 vrele(*vpp); 1362 *vpp = NULL; 1363 return (ENOENT); 1364 } 1365 } else 1366 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1367 } 1368 return (-1); 1369 } 1370 1371 static int __noinline 1372 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1373 struct timespec *tsp, int *ticksp) 1374 { 1375 struct namecache_ts *ncp_ts; 1376 struct namecache *ncp; 1377 struct mtx *dvlp; 1378 enum vgetstate vs; 1379 int error, ltype; 1380 bool whiteout; 1381 1382 MPASS((cnp->cn_flags & ISDOTDOT) != 0); 1383 1384 if ((cnp->cn_flags & MAKEENTRY) == 0) { 1385 cache_remove_cnp(dvp, cnp); 1386 return (0); 1387 } 1388 1389 counter_u64_add(dotdothits, 1); 1390 retry: 1391 dvlp = VP2VNODELOCK(dvp); 1392 mtx_lock(dvlp); 1393 ncp = dvp->v_cache_dd; 1394 if (ncp == NULL) { 1395 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL); 1396 mtx_unlock(dvlp); 1397 return (0); 1398 } 1399 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1400 if (ncp->nc_flag & NCF_NEGATIVE) 1401 *vpp = NULL; 1402 else 1403 *vpp = ncp->nc_vp; 1404 } else 1405 *vpp = ncp->nc_dvp; 1406 if (*vpp == NULL) 1407 goto negative_success; 1408 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); 1409 cache_out_ts(ncp, tsp, ticksp); 1410 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1411 NCF_DTS && tsp != NULL) { 1412 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1413 *tsp = ncp_ts->nc_dotdottime; 1414 } 1415 1416 MPASS(dvp != *vpp); 1417 ltype = VOP_ISLOCKED(dvp); 1418 VOP_UNLOCK(dvp); 1419 vs = vget_prep(*vpp); 1420 mtx_unlock(dvlp); 1421 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1422 vn_lock(dvp, ltype | LK_RETRY); 1423 if (VN_IS_DOOMED(dvp)) { 1424 if (error == 0) 1425 vput(*vpp); 1426 *vpp = NULL; 1427 return (ENOENT); 1428 } 1429 if (error) { 1430 *vpp = NULL; 1431 goto retry; 1432 } 1433 return (-1); 1434 negative_success: 1435 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1436 if (cnp->cn_flags & ISLASTCN) { 1437 counter_u64_add(numnegzaps, 1); 1438 error = cache_zap_locked_vnode(ncp, dvp); 1439 if (__predict_false(error != 0)) { 1440 zap_and_exit_bucket_fail2++; 1441 goto retry; 1442 } 1443 cache_free(ncp); 1444 return (0); 1445 } 1446 } 1447 1448 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 1449 cache_out_ts(ncp, tsp, ticksp); 1450 counter_u64_add(numneghits, 1); 1451 whiteout = (ncp->nc_flag & NCF_WHITE); 1452 cache_negative_hit(ncp); 1453 mtx_unlock(dvlp); 1454 if (whiteout) 1455 cnp->cn_flags |= ISWHITEOUT; 1456 return (ENOENT); 1457 } 1458 1459 /** 1460 * Lookup a name in the name cache 1461 * 1462 * # Arguments 1463 * 1464 * - dvp: Parent directory in which to search. 1465 * - vpp: Return argument. Will contain desired vnode on cache hit. 1466 * - cnp: Parameters of the name search. The most interesting bits of 1467 * the cn_flags field have the following meanings: 1468 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1469 * it up. 1470 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1471 * - tsp: Return storage for cache timestamp. On a successful (positive 1472 * or negative) lookup, tsp will be filled with any timespec that 1473 * was stored when this cache entry was created. However, it will 1474 * be clear for "." entries. 1475 * - ticks: Return storage for alternate cache timestamp. 
On a successful 1476 * (positive or negative) lookup, it will contain the ticks value 1477 * that was current when the cache entry was created, unless cnp 1478 * was ".". 1479 * 1480 * Either both tsp and ticks have to be provided or neither of them. 1481 * 1482 * # Returns 1483 * 1484 * - -1: A positive cache hit. vpp will contain the desired vnode. 1485 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1486 * to a forced unmount. vpp will not be modified. If the entry 1487 * is a whiteout, then the ISWHITEOUT flag will be set in 1488 * cnp->cn_flags. 1489 * - 0: A cache miss. vpp will not be modified. 1490 * 1491 * # Locking 1492 * 1493 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1494 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1495 * lock is not recursively acquired. 1496 */ 1497 static int __noinline 1498 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1499 struct timespec *tsp, int *ticksp) 1500 { 1501 struct namecache *ncp; 1502 struct mtx *blp; 1503 uint32_t hash; 1504 enum vgetstate vs; 1505 int error; 1506 bool whiteout; 1507 1508 MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY); 1509 1510 retry: 1511 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1512 blp = HASH2BUCKETLOCK(hash); 1513 mtx_lock(blp); 1514 1515 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1516 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1517 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1518 break; 1519 } 1520 1521 if (__predict_false(ncp == NULL)) { 1522 mtx_unlock(blp); 1523 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1524 NULL); 1525 counter_u64_add(nummiss, 1); 1526 return (0); 1527 } 1528 1529 if (ncp->nc_flag & NCF_NEGATIVE) 1530 goto negative_success; 1531 1532 counter_u64_add(numposhits, 1); 1533 *vpp = ncp->nc_vp; 1534 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1535 cache_out_ts(ncp, tsp, ticksp); 1536 MPASS(dvp != *vpp); 1537 vs = vget_prep(*vpp); 1538 mtx_unlock(blp); 1539 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1540 if (error) { 1541 *vpp = NULL; 1542 goto retry; 1543 } 1544 return (-1); 1545 negative_success: 1546 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1547 if (cnp->cn_flags & ISLASTCN) { 1548 counter_u64_add(numnegzaps, 1); 1549 error = cache_zap_locked_vnode(ncp, dvp); 1550 if (__predict_false(error != 0)) { 1551 zap_and_exit_bucket_fail2++; 1552 goto retry; 1553 } 1554 cache_free(ncp); 1555 return (0); 1556 } 1557 } 1558 1559 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 1560 cache_out_ts(ncp, tsp, ticksp); 1561 counter_u64_add(numneghits, 1); 1562 whiteout = (ncp->nc_flag & NCF_WHITE); 1563 cache_negative_hit(ncp); 1564 mtx_unlock(blp); 1565 if (whiteout) 1566 cnp->cn_flags |= ISWHITEOUT; 1567 return (ENOENT); 1568 } 1569 1570 int 1571 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1572 struct timespec *tsp, int *ticksp) 1573 { 1574 struct namecache *ncp; 1575 struct negstate *negstate; 1576 uint32_t hash; 1577 enum vgetstate vs; 1578 int error; 1579 bool whiteout; 1580 u_short nc_flag; 1581 1582 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); 1583 1584 #ifdef DEBUG_CACHE 1585 if (__predict_false(!doingcache)) { 1586 cnp->cn_flags &= ~MAKEENTRY; 1587 return (0); 1588 } 1589 #endif 1590 1591 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1592 if (cnp->cn_namelen == 1) 1593 
return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1594 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') 1595 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); 1596 } 1597 1598 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 1599 1600 if ((cnp->cn_flags & MAKEENTRY) == 0) { 1601 cache_remove_cnp(dvp, cnp); 1602 return (0); 1603 } 1604 1605 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1606 vfs_smr_enter(); 1607 1608 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1609 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1610 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1611 break; 1612 } 1613 1614 if (__predict_false(ncp == NULL)) { 1615 vfs_smr_exit(); 1616 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1617 NULL); 1618 counter_u64_add(nummiss, 1); 1619 return (0); 1620 } 1621 1622 nc_flag = atomic_load_char(&ncp->nc_flag); 1623 if (nc_flag & NCF_NEGATIVE) 1624 goto negative_success; 1625 1626 counter_u64_add(numposhits, 1); 1627 *vpp = ncp->nc_vp; 1628 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1629 cache_out_ts(ncp, tsp, ticksp); 1630 MPASS(dvp != *vpp); 1631 if (!cache_ncp_canuse(ncp)) { 1632 vfs_smr_exit(); 1633 *vpp = NULL; 1634 goto out_fallback; 1635 } 1636 vs = vget_prep_smr(*vpp); 1637 vfs_smr_exit(); 1638 if (__predict_false(vs == VGET_NONE)) { 1639 *vpp = NULL; 1640 goto out_fallback; 1641 } 1642 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1643 if (error) { 1644 *vpp = NULL; 1645 goto out_fallback; 1646 } 1647 return (-1); 1648 negative_success: 1649 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1650 if (cnp->cn_flags & ISLASTCN) { 1651 vfs_smr_exit(); 1652 goto out_fallback; 1653 } 1654 } 1655 1656 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 1657 cache_out_ts(ncp, tsp, ticksp); 1658 counter_u64_add(numneghits, 1); 1659 whiteout = (ncp->nc_flag & NCF_WHITE); 1660 /* 1661 * TODO: We need to take locks to promote an entry. Code doing it 1662 * in SMR lookup can be modified to be shared. 
1663 */ 1664 negstate = NCP2NEGSTATE(ncp); 1665 if ((negstate->neg_flag & NEG_HOT) == 0 || 1666 !cache_ncp_canuse(ncp)) { 1667 vfs_smr_exit(); 1668 goto out_fallback; 1669 } 1670 vfs_smr_exit(); 1671 if (whiteout) 1672 cnp->cn_flags |= ISWHITEOUT; 1673 return (ENOENT); 1674 out_fallback: 1675 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp)); 1676 } 1677 1678 struct celockstate { 1679 struct mtx *vlp[3]; 1680 struct mtx *blp[2]; 1681 }; 1682 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1683 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1684 1685 static inline void 1686 cache_celockstate_init(struct celockstate *cel) 1687 { 1688 1689 bzero(cel, sizeof(*cel)); 1690 } 1691 1692 static void 1693 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1694 struct vnode *dvp) 1695 { 1696 struct mtx *vlp1, *vlp2; 1697 1698 MPASS(cel->vlp[0] == NULL); 1699 MPASS(cel->vlp[1] == NULL); 1700 MPASS(cel->vlp[2] == NULL); 1701 1702 MPASS(vp != NULL || dvp != NULL); 1703 1704 vlp1 = VP2VNODELOCK(vp); 1705 vlp2 = VP2VNODELOCK(dvp); 1706 cache_sort_vnodes(&vlp1, &vlp2); 1707 1708 if (vlp1 != NULL) { 1709 mtx_lock(vlp1); 1710 cel->vlp[0] = vlp1; 1711 } 1712 mtx_lock(vlp2); 1713 cel->vlp[1] = vlp2; 1714 } 1715 1716 static void 1717 cache_unlock_vnodes_cel(struct celockstate *cel) 1718 { 1719 1720 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1721 1722 if (cel->vlp[0] != NULL) 1723 mtx_unlock(cel->vlp[0]); 1724 if (cel->vlp[1] != NULL) 1725 mtx_unlock(cel->vlp[1]); 1726 if (cel->vlp[2] != NULL) 1727 mtx_unlock(cel->vlp[2]); 1728 } 1729 1730 static bool 1731 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1732 { 1733 struct mtx *vlp; 1734 bool ret; 1735 1736 cache_assert_vlp_locked(cel->vlp[0]); 1737 cache_assert_vlp_locked(cel->vlp[1]); 1738 MPASS(cel->vlp[2] == NULL); 1739 1740 MPASS(vp != NULL); 1741 vlp = VP2VNODELOCK(vp); 1742 1743 ret = true; 1744 if (vlp >= cel->vlp[1]) { 1745 mtx_lock(vlp); 1746 } else { 1747 if (mtx_trylock(vlp)) 1748 goto out; 1749 cache_lock_vnodes_cel_3_failures++; 1750 cache_unlock_vnodes_cel(cel); 1751 if (vlp < cel->vlp[0]) { 1752 mtx_lock(vlp); 1753 mtx_lock(cel->vlp[0]); 1754 mtx_lock(cel->vlp[1]); 1755 } else { 1756 if (cel->vlp[0] != NULL) 1757 mtx_lock(cel->vlp[0]); 1758 mtx_lock(vlp); 1759 mtx_lock(cel->vlp[1]); 1760 } 1761 ret = false; 1762 } 1763 out: 1764 cel->vlp[2] = vlp; 1765 return (ret); 1766 } 1767 1768 static void 1769 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 1770 struct mtx *blp2) 1771 { 1772 1773 MPASS(cel->blp[0] == NULL); 1774 MPASS(cel->blp[1] == NULL); 1775 1776 cache_sort_vnodes(&blp1, &blp2); 1777 1778 if (blp1 != NULL) { 1779 mtx_lock(blp1); 1780 cel->blp[0] = blp1; 1781 } 1782 mtx_lock(blp2); 1783 cel->blp[1] = blp2; 1784 } 1785 1786 static void 1787 cache_unlock_buckets_cel(struct celockstate *cel) 1788 { 1789 1790 if (cel->blp[0] != NULL) 1791 mtx_unlock(cel->blp[0]); 1792 mtx_unlock(cel->blp[1]); 1793 } 1794 1795 /* 1796 * Lock part of the cache affected by the insertion. 1797 * 1798 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1799 * However, insertion can result in removal of an old entry. In this 1800 * case we have an additional vnode and bucketlock pair to lock. 1801 * 1802 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1803 * preserving the locking order (smaller address first). 
1804 */ 1805 static void 1806 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1807 uint32_t hash) 1808 { 1809 struct namecache *ncp; 1810 struct mtx *blps[2]; 1811 1812 blps[0] = HASH2BUCKETLOCK(hash); 1813 for (;;) { 1814 blps[1] = NULL; 1815 cache_lock_vnodes_cel(cel, dvp, vp); 1816 if (vp == NULL || vp->v_type != VDIR) 1817 break; 1818 ncp = vp->v_cache_dd; 1819 if (ncp == NULL) 1820 break; 1821 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1822 break; 1823 MPASS(ncp->nc_dvp == vp); 1824 blps[1] = NCP2BUCKETLOCK(ncp); 1825 if (ncp->nc_flag & NCF_NEGATIVE) 1826 break; 1827 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1828 break; 1829 /* 1830 * All vnodes got re-locked. Re-validate the state and if 1831 * nothing changed we are done. Otherwise restart. 1832 */ 1833 if (ncp == vp->v_cache_dd && 1834 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1835 blps[1] == NCP2BUCKETLOCK(ncp) && 1836 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1837 break; 1838 cache_unlock_vnodes_cel(cel); 1839 cel->vlp[0] = NULL; 1840 cel->vlp[1] = NULL; 1841 cel->vlp[2] = NULL; 1842 } 1843 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1844 } 1845 1846 static void 1847 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1848 uint32_t hash) 1849 { 1850 struct namecache *ncp; 1851 struct mtx *blps[2]; 1852 1853 blps[0] = HASH2BUCKETLOCK(hash); 1854 for (;;) { 1855 blps[1] = NULL; 1856 cache_lock_vnodes_cel(cel, dvp, vp); 1857 ncp = dvp->v_cache_dd; 1858 if (ncp == NULL) 1859 break; 1860 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1861 break; 1862 MPASS(ncp->nc_dvp == dvp); 1863 blps[1] = NCP2BUCKETLOCK(ncp); 1864 if (ncp->nc_flag & NCF_NEGATIVE) 1865 break; 1866 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1867 break; 1868 if (ncp == dvp->v_cache_dd && 1869 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1870 blps[1] == NCP2BUCKETLOCK(ncp) && 1871 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1872 break; 1873 cache_unlock_vnodes_cel(cel); 1874 cel->vlp[0] = NULL; 1875 cel->vlp[1] = NULL; 1876 cel->vlp[2] = NULL; 1877 } 1878 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1879 } 1880 1881 static void 1882 cache_enter_unlock(struct celockstate *cel) 1883 { 1884 1885 cache_unlock_buckets_cel(cel); 1886 cache_unlock_vnodes_cel(cel); 1887 } 1888 1889 static void __noinline 1890 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 1891 struct componentname *cnp) 1892 { 1893 struct celockstate cel; 1894 struct namecache *ncp; 1895 uint32_t hash; 1896 int len; 1897 1898 if (dvp->v_cache_dd == NULL) 1899 return; 1900 len = cnp->cn_namelen; 1901 cache_celockstate_init(&cel); 1902 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1903 cache_enter_lock_dd(&cel, dvp, vp, hash); 1904 vn_seqc_write_begin(dvp); 1905 ncp = dvp->v_cache_dd; 1906 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 1907 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 1908 cache_zap_locked(ncp); 1909 } else { 1910 ncp = NULL; 1911 } 1912 dvp->v_cache_dd = NULL; 1913 vn_seqc_write_end(dvp); 1914 cache_enter_unlock(&cel); 1915 if (ncp != NULL) 1916 cache_free(ncp); 1917 } 1918 1919 /* 1920 * Add an entry to the cache. 
1921 */ 1922 void 1923 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 1924 struct timespec *tsp, struct timespec *dtsp) 1925 { 1926 struct celockstate cel; 1927 struct namecache *ncp, *n2, *ndd; 1928 struct namecache_ts *ncp_ts; 1929 struct nchashhead *ncpp; 1930 uint32_t hash; 1931 int flag; 1932 int len; 1933 u_long lnumcache; 1934 1935 VNPASS(!VN_IS_DOOMED(dvp), dvp); 1936 VNPASS(dvp->v_type != VNON, dvp); 1937 if (vp != NULL) { 1938 VNPASS(!VN_IS_DOOMED(vp), vp); 1939 VNPASS(vp->v_type != VNON, vp); 1940 } 1941 1942 #ifdef DEBUG_CACHE 1943 if (__predict_false(!doingcache)) 1944 return; 1945 #endif 1946 1947 flag = 0; 1948 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1949 if (cnp->cn_namelen == 1) 1950 return; 1951 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 1952 cache_enter_dotdot_prep(dvp, vp, cnp); 1953 flag = NCF_ISDOTDOT; 1954 } 1955 } 1956 1957 /* 1958 * Avoid blowout in namecache entries. 1959 */ 1960 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1961 if (__predict_false(lnumcache >= ncsize)) { 1962 atomic_add_long(&numcache, -1); 1963 counter_u64_add(numdrops, 1); 1964 return; 1965 } 1966 1967 cache_celockstate_init(&cel); 1968 ndd = NULL; 1969 ncp_ts = NULL; 1970 1971 /* 1972 * Calculate the hash key and setup as much of the new 1973 * namecache entry as possible before acquiring the lock. 1974 */ 1975 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1976 ncp->nc_flag = flag | NCF_WIP; 1977 ncp->nc_vp = vp; 1978 if (vp == NULL) 1979 cache_negative_init(ncp); 1980 ncp->nc_dvp = dvp; 1981 if (tsp != NULL) { 1982 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1983 ncp_ts->nc_time = *tsp; 1984 ncp_ts->nc_ticks = ticks; 1985 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1986 if (dtsp != NULL) { 1987 ncp_ts->nc_dotdottime = *dtsp; 1988 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1989 } 1990 } 1991 len = ncp->nc_nlen = cnp->cn_namelen; 1992 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1993 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 1994 ncp->nc_name[len] = '\0'; 1995 cache_enter_lock(&cel, dvp, vp, hash); 1996 1997 /* 1998 * See if this vnode or negative entry is already in the cache 1999 * with this name. This can happen with concurrent lookups of 2000 * the same path name. 2001 */ 2002 ncpp = NCHHASH(hash); 2003 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 2004 if (n2->nc_dvp == dvp && 2005 n2->nc_nlen == cnp->cn_namelen && 2006 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 2007 MPASS(cache_ncp_canuse(n2)); 2008 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 2009 KASSERT(vp == NULL, 2010 ("%s: found entry pointing to a different vnode (%p != %p)", 2011 __func__, NULL, vp)); 2012 else 2013 KASSERT(n2->nc_vp == vp, 2014 ("%s: found entry pointing to a different vnode (%p != %p)", 2015 __func__, n2->nc_vp, vp)); 2016 /* 2017 * Entries are supposed to be immutable unless in the 2018 * process of getting destroyed. Accommodating for 2019 * changing timestamps is possible but not worth it. 2020 * This should be harmless in terms of correctness, in 2021 * the worst case resulting in an earlier expiration. 2022 * Alternatively, the found entry can be replaced 2023 * altogether. 
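 * As implemented, the freshly allocated entry is discarded and the existing
 * one is kept (see out_unlock_free below).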
2024 */ 2025 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 2026 #if 0 2027 if (tsp != NULL) { 2028 KASSERT((n2->nc_flag & NCF_TS) != 0, 2029 ("no NCF_TS")); 2030 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 2031 n2_ts->nc_time = ncp_ts->nc_time; 2032 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2033 if (dtsp != NULL) { 2034 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2035 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2036 } 2037 } 2038 #endif 2039 goto out_unlock_free; 2040 } 2041 } 2042 2043 if (flag == NCF_ISDOTDOT) { 2044 /* 2045 * See if we are trying to add .. entry, but some other lookup 2046 * has populated v_cache_dd pointer already. 2047 */ 2048 if (dvp->v_cache_dd != NULL) 2049 goto out_unlock_free; 2050 KASSERT(vp == NULL || vp->v_type == VDIR, 2051 ("wrong vnode type %p", vp)); 2052 vn_seqc_write_begin(dvp); 2053 dvp->v_cache_dd = ncp; 2054 vn_seqc_write_end(dvp); 2055 } 2056 2057 if (vp != NULL) { 2058 if (flag != NCF_ISDOTDOT) { 2059 /* 2060 * For this case, the cache entry maps both the 2061 * directory name in it and the name ".." for the 2062 * directory's parent. 2063 */ 2064 vn_seqc_write_begin(vp); 2065 if ((ndd = vp->v_cache_dd) != NULL) { 2066 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2067 cache_zap_locked(ndd); 2068 else 2069 ndd = NULL; 2070 } 2071 vp->v_cache_dd = ncp; 2072 vn_seqc_write_end(vp); 2073 } else if (vp->v_type != VDIR) { 2074 if (vp->v_cache_dd != NULL) { 2075 vn_seqc_write_begin(vp); 2076 vp->v_cache_dd = NULL; 2077 vn_seqc_write_end(vp); 2078 } 2079 } 2080 } 2081 2082 if (flag != NCF_ISDOTDOT) { 2083 if (LIST_EMPTY(&dvp->v_cache_src)) { 2084 vhold(dvp); 2085 counter_u64_add(numcachehv, 1); 2086 } 2087 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2088 } 2089 2090 /* 2091 * If the entry is "negative", we place it into the 2092 * "negative" cache queue, otherwise, we place it into the 2093 * destination vnode's cache entries queue. 2094 */ 2095 if (vp != NULL) { 2096 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2097 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2098 vp); 2099 } else { 2100 if (cnp->cn_flags & ISWHITEOUT) 2101 ncp->nc_flag |= NCF_WHITE; 2102 cache_negative_insert(ncp); 2103 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2104 ncp->nc_name); 2105 } 2106 2107 /* 2108 * Insert the new namecache entry into the appropriate chain 2109 * within the cache entries table. 2110 */ 2111 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2112 2113 atomic_thread_fence_rel(); 2114 /* 2115 * Mark the entry as fully constructed. 2116 * It is immutable past this point until its removal. 
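 * The release fence above ensures that lockless (SMR) readers which observe
 * the cleared NCF_WIP flag below also observe a fully initialized entry.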
2117 */ 2118 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2119 2120 cache_enter_unlock(&cel); 2121 if (numneg * ncnegfactor > lnumcache) 2122 cache_negative_zap_one(); 2123 if (ndd != NULL) 2124 cache_free(ndd); 2125 return; 2126 out_unlock_free: 2127 cache_enter_unlock(&cel); 2128 atomic_add_long(&numcache, -1); 2129 cache_free(ncp); 2130 return; 2131 } 2132 2133 static u_int 2134 cache_roundup_2(u_int val) 2135 { 2136 u_int res; 2137 2138 for (res = 1; res <= val; res <<= 1) 2139 continue; 2140 2141 return (res); 2142 } 2143 2144 static struct nchashhead * 2145 nchinittbl(u_long elements, u_long *hashmask) 2146 { 2147 struct nchashhead *hashtbl; 2148 u_long hashsize, i; 2149 2150 hashsize = cache_roundup_2(elements) / 2; 2151 2152 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2153 for (i = 0; i < hashsize; i++) 2154 CK_SLIST_INIT(&hashtbl[i]); 2155 *hashmask = hashsize - 1; 2156 return (hashtbl); 2157 } 2158 2159 static void 2160 ncfreetbl(struct nchashhead *hashtbl) 2161 { 2162 2163 free(hashtbl, M_VFSCACHE); 2164 } 2165 2166 /* 2167 * Name cache initialization, from vfs_init() when we are booting 2168 */ 2169 static void 2170 nchinit(void *dummy __unused) 2171 { 2172 u_int i; 2173 2174 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2175 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2176 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2177 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2178 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2179 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2180 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2181 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2182 2183 VFS_SMR_ZONE_SET(cache_zone_small); 2184 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2185 VFS_SMR_ZONE_SET(cache_zone_large); 2186 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2187 2188 ncsize = desiredvnodes * ncsizefactor; 2189 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2190 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2191 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2192 ncbuckethash = 7; 2193 if (ncbuckethash > nchash) 2194 ncbuckethash = nchash; 2195 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2196 M_WAITOK | M_ZERO); 2197 for (i = 0; i < numbucketlocks; i++) 2198 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2199 ncvnodehash = ncbuckethash; 2200 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2201 M_WAITOK | M_ZERO); 2202 for (i = 0; i < numvnodelocks; i++) 2203 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2204 ncpurgeminvnodes = numbucketlocks * 2; 2205 2206 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 2207 M_WAITOK | M_ZERO); 2208 for (i = 0; i < numneglists; i++) { 2209 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2210 TAILQ_INIT(&neglists[i].nl_list); 2211 } 2212 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 2213 TAILQ_INIT(&ncneg_hot.nl_list); 2214 2215 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2216 } 2217 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2218 2219 void 2220 cache_vnode_init(struct vnode *vp) 2221 { 2222 2223 LIST_INIT(&vp->v_cache_src); 2224 TAILQ_INIT(&vp->v_cache_dst); 2225 vp->v_cache_dd = NULL; 2226 cache_prehash(vp); 2227 } 2228 2229 void 2230 
cache_changesize(u_long newmaxvnodes) 2231 { 2232 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2233 u_long new_nchash, old_nchash; 2234 struct namecache *ncp; 2235 uint32_t hash; 2236 u_long newncsize; 2237 int i; 2238 2239 newncsize = newmaxvnodes * ncsizefactor; 2240 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2241 if (newmaxvnodes < numbucketlocks) 2242 newmaxvnodes = numbucketlocks; 2243 2244 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2245 /* If same hash table size, nothing to do */ 2246 if (nchash == new_nchash) { 2247 ncfreetbl(new_nchashtbl); 2248 return; 2249 } 2250 /* 2251 * Move everything from the old hash table to the new table. 2252 * None of the namecache entries in the table can be removed 2253 * because to do so, they have to be removed from the hash table. 2254 */ 2255 cache_lock_all_vnodes(); 2256 cache_lock_all_buckets(); 2257 old_nchashtbl = nchashtbl; 2258 old_nchash = nchash; 2259 nchashtbl = new_nchashtbl; 2260 nchash = new_nchash; 2261 for (i = 0; i <= old_nchash; i++) { 2262 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2263 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2264 ncp->nc_dvp); 2265 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2266 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2267 } 2268 } 2269 ncsize = newncsize; 2270 cache_unlock_all_buckets(); 2271 cache_unlock_all_vnodes(); 2272 ncfreetbl(old_nchashtbl); 2273 } 2274 2275 /* 2276 * Invalidate all entries from and to a particular vnode. 2277 */ 2278 static void 2279 cache_purge_impl(struct vnode *vp) 2280 { 2281 TAILQ_HEAD(, namecache) ncps; 2282 struct namecache *ncp, *nnp; 2283 struct mtx *vlp, *vlp2; 2284 2285 TAILQ_INIT(&ncps); 2286 vlp = VP2VNODELOCK(vp); 2287 vlp2 = NULL; 2288 mtx_assert(vlp, MA_OWNED); 2289 retry: 2290 while (!LIST_EMPTY(&vp->v_cache_src)) { 2291 ncp = LIST_FIRST(&vp->v_cache_src); 2292 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2293 goto retry; 2294 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2295 } 2296 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2297 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2298 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2299 goto retry; 2300 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2301 } 2302 ncp = vp->v_cache_dd; 2303 if (ncp != NULL) { 2304 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2305 ("lost dotdot link")); 2306 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2307 goto retry; 2308 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2309 } 2310 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2311 mtx_unlock(vlp); 2312 if (vlp2 != NULL) 2313 mtx_unlock(vlp2); 2314 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2315 cache_free(ncp); 2316 } 2317 } 2318 2319 void 2320 cache_purge(struct vnode *vp) 2321 { 2322 struct mtx *vlp; 2323 2324 SDT_PROBE1(vfs, namecache, purge, done, vp); 2325 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2326 vp->v_cache_dd == NULL) 2327 return; 2328 vlp = VP2VNODELOCK(vp); 2329 mtx_lock(vlp); 2330 cache_purge_impl(vp); 2331 } 2332 2333 /* 2334 * Only to be used by vgone. 2335 */ 2336 void 2337 cache_purge_vgone(struct vnode *vp) 2338 { 2339 struct mtx *vlp; 2340 2341 VNPASS(VN_IS_DOOMED(vp), vp); 2342 vlp = VP2VNODELOCK(vp); 2343 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2344 vp->v_cache_dd == NULL)) { 2345 mtx_lock(vlp); 2346 cache_purge_impl(vp); 2347 mtx_assert(vlp, MA_NOTOWNED); 2348 return; 2349 } 2350 2351 /* 2352 * All the NULL pointer state we found above may be transient. 
2353 * Serialize against a possible thread doing cache_purge. 2354 */ 2355 mtx_wait_unlocked(vlp); 2356 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2357 vp->v_cache_dd == NULL)) { 2358 mtx_lock(vlp); 2359 cache_purge_impl(vp); 2360 mtx_assert(vlp, MA_NOTOWNED); 2361 return; 2362 } 2363 return; 2364 } 2365 2366 /* 2367 * Invalidate all negative entries for a particular directory vnode. 2368 */ 2369 void 2370 cache_purge_negative(struct vnode *vp) 2371 { 2372 TAILQ_HEAD(, namecache) ncps; 2373 struct namecache *ncp, *nnp; 2374 struct mtx *vlp; 2375 2376 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2377 if (LIST_EMPTY(&vp->v_cache_src)) 2378 return; 2379 TAILQ_INIT(&ncps); 2380 vlp = VP2VNODELOCK(vp); 2381 mtx_lock(vlp); 2382 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2383 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2384 continue; 2385 cache_zap_negative_locked_vnode_kl(ncp, vp); 2386 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2387 } 2388 mtx_unlock(vlp); 2389 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2390 cache_free(ncp); 2391 } 2392 } 2393 2394 void 2395 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2396 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2397 { 2398 2399 ASSERT_VOP_IN_SEQC(fdvp); 2400 ASSERT_VOP_IN_SEQC(fvp); 2401 ASSERT_VOP_IN_SEQC(tdvp); 2402 if (tvp != NULL) 2403 ASSERT_VOP_IN_SEQC(tvp); 2404 2405 cache_purge(fvp); 2406 if (tvp != NULL) { 2407 cache_purge(tvp); 2408 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2409 ("%s: lingering negative entry", __func__)); 2410 } else { 2411 cache_remove_cnp(tdvp, tcnp); 2412 } 2413 } 2414 2415 /* 2416 * Flush all entries referencing a particular filesystem. 2417 */ 2418 void 2419 cache_purgevfs(struct mount *mp, bool force) 2420 { 2421 TAILQ_HEAD(, namecache) ncps; 2422 struct mtx *vlp1, *vlp2; 2423 struct mtx *blp; 2424 struct nchashhead *bucket; 2425 struct namecache *ncp, *nnp; 2426 u_long i, j, n_nchash; 2427 int error; 2428 2429 /* Scan hash tables for applicable entries */ 2430 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2431 if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) 2432 return; 2433 TAILQ_INIT(&ncps); 2434 n_nchash = nchash + 1; 2435 vlp1 = vlp2 = NULL; 2436 for (i = 0; i < numbucketlocks; i++) { 2437 blp = (struct mtx *)&bucketlocks[i]; 2438 mtx_lock(blp); 2439 for (j = i; j < n_nchash; j += numbucketlocks) { 2440 retry: 2441 bucket = &nchashtbl[j]; 2442 CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { 2443 cache_assert_bucket_locked(ncp); 2444 if (ncp->nc_dvp->v_mount != mp) 2445 continue; 2446 error = cache_zap_locked_bucket_kl(ncp, blp, 2447 &vlp1, &vlp2); 2448 if (error != 0) 2449 goto retry; 2450 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); 2451 } 2452 } 2453 mtx_unlock(blp); 2454 if (vlp1 == NULL && vlp2 == NULL) 2455 cache_maybe_yield(); 2456 } 2457 if (vlp1 != NULL) 2458 mtx_unlock(vlp1); 2459 if (vlp2 != NULL) 2460 mtx_unlock(vlp2); 2461 2462 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2463 cache_free(ncp); 2464 } 2465 } 2466 2467 /* 2468 * Perform canonical checks and cache lookup and pass on to filesystem 2469 * through the vop_cachedlookup only if needed. 
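 *
 * cache_lookup() returns 0 on a miss (in which case the lookup is passed on
 * to the filesystem via VOP_CACHEDLOOKUP()), -1 on a positive hit with *vpp
 * filled in, and an errno such as ENOENT on a negative hit.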
2470 */ 2471 2472 int 2473 vfs_cache_lookup(struct vop_lookup_args *ap) 2474 { 2475 struct vnode *dvp; 2476 int error; 2477 struct vnode **vpp = ap->a_vpp; 2478 struct componentname *cnp = ap->a_cnp; 2479 int flags = cnp->cn_flags; 2480 2481 *vpp = NULL; 2482 dvp = ap->a_dvp; 2483 2484 if (dvp->v_type != VDIR) 2485 return (ENOTDIR); 2486 2487 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2488 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2489 return (EROFS); 2490 2491 error = vn_dir_check_exec(dvp, cnp); 2492 if (error != 0) 2493 return (error); 2494 2495 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2496 if (error == 0) 2497 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2498 if (error == -1) 2499 return (0); 2500 return (error); 2501 } 2502 2503 /* Implementation of the getcwd syscall. */ 2504 int 2505 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2506 { 2507 char *buf, *retbuf; 2508 size_t buflen; 2509 int error; 2510 2511 buflen = uap->buflen; 2512 if (__predict_false(buflen < 2)) 2513 return (EINVAL); 2514 if (buflen > MAXPATHLEN) 2515 buflen = MAXPATHLEN; 2516 2517 buf = uma_zalloc(namei_zone, M_WAITOK); 2518 error = vn_getcwd(buf, &retbuf, &buflen); 2519 if (error == 0) 2520 error = copyout(retbuf, uap->buf, buflen); 2521 uma_zfree(namei_zone, buf); 2522 return (error); 2523 } 2524 2525 int 2526 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2527 { 2528 struct pwd *pwd; 2529 int error; 2530 2531 vfs_smr_enter(); 2532 pwd = pwd_get_smr(); 2533 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2534 buflen, false, 0); 2535 VFS_SMR_ASSERT_NOT_ENTERED(); 2536 if (error < 0) { 2537 pwd = pwd_hold(curthread); 2538 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2539 retbuf, buflen); 2540 pwd_drop(pwd); 2541 } 2542 2543 #ifdef KTRACE 2544 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2545 ktrnamei(*retbuf); 2546 #endif 2547 return (error); 2548 } 2549 2550 static int 2551 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2552 size_t size, int flags, enum uio_seg pathseg) 2553 { 2554 struct nameidata nd; 2555 char *retbuf, *freebuf; 2556 int error; 2557 2558 if (flags != 0) 2559 return (EINVAL); 2560 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2561 pathseg, path, fd, &cap_fstat_rights, td); 2562 if ((error = namei(&nd)) != 0) 2563 return (error); 2564 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2565 if (error == 0) { 2566 error = copyout(retbuf, buf, size); 2567 free(freebuf, M_TEMP); 2568 } 2569 NDFREE(&nd, 0); 2570 return (error); 2571 } 2572 2573 int 2574 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2575 { 2576 2577 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2578 uap->flags, UIO_USERSPACE)); 2579 } 2580 2581 /* 2582 * Retrieve the full filesystem path that correspond to a vnode from the name 2583 * cache (if available) 2584 */ 2585 int 2586 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2587 { 2588 struct pwd *pwd; 2589 char *buf; 2590 size_t buflen; 2591 int error; 2592 2593 if (__predict_false(vp == NULL)) 2594 return (EINVAL); 2595 2596 buflen = MAXPATHLEN; 2597 buf = malloc(buflen, M_TEMP, M_WAITOK); 2598 vfs_smr_enter(); 2599 pwd = pwd_get_smr(); 2600 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0); 2601 VFS_SMR_ASSERT_NOT_ENTERED(); 2602 if (error < 0) { 2603 pwd = pwd_hold(curthread); 2604 error = vn_fullpath_any(vp, pwd->pwd_rdir, 
buf, retbuf, &buflen); 2605 pwd_drop(pwd); 2606 } 2607 if (error == 0) 2608 *freebuf = buf; 2609 else 2610 free(buf, M_TEMP); 2611 return (error); 2612 } 2613 2614 /* 2615 * This function is similar to vn_fullpath, but it attempts to lookup the 2616 * pathname relative to the global root mount point. This is required for the 2617 * auditing sub-system, as audited pathnames must be absolute, relative to the 2618 * global root mount point. 2619 */ 2620 int 2621 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2622 { 2623 char *buf; 2624 size_t buflen; 2625 int error; 2626 2627 if (__predict_false(vp == NULL)) 2628 return (EINVAL); 2629 buflen = MAXPATHLEN; 2630 buf = malloc(buflen, M_TEMP, M_WAITOK); 2631 vfs_smr_enter(); 2632 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0); 2633 VFS_SMR_ASSERT_NOT_ENTERED(); 2634 if (error < 0) { 2635 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2636 } 2637 if (error == 0) 2638 *freebuf = buf; 2639 else 2640 free(buf, M_TEMP); 2641 return (error); 2642 } 2643 2644 static struct namecache * 2645 vn_dd_from_dst(struct vnode *vp) 2646 { 2647 struct namecache *ncp; 2648 2649 cache_assert_vnode_locked(vp); 2650 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2651 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2652 return (ncp); 2653 } 2654 return (NULL); 2655 } 2656 2657 int 2658 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2659 { 2660 struct vnode *dvp; 2661 struct namecache *ncp; 2662 struct mtx *vlp; 2663 int error; 2664 2665 vlp = VP2VNODELOCK(*vp); 2666 mtx_lock(vlp); 2667 ncp = (*vp)->v_cache_dd; 2668 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 2669 KASSERT(ncp == vn_dd_from_dst(*vp), 2670 ("%s: mismatch for dd entry (%p != %p)", __func__, 2671 ncp, vn_dd_from_dst(*vp))); 2672 } else { 2673 ncp = vn_dd_from_dst(*vp); 2674 } 2675 if (ncp != NULL) { 2676 if (*buflen < ncp->nc_nlen) { 2677 mtx_unlock(vlp); 2678 vrele(*vp); 2679 counter_u64_add(numfullpathfail4, 1); 2680 error = ENOMEM; 2681 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2682 vp, NULL); 2683 return (error); 2684 } 2685 *buflen -= ncp->nc_nlen; 2686 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2687 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2688 ncp->nc_name, vp); 2689 dvp = *vp; 2690 *vp = ncp->nc_dvp; 2691 vref(*vp); 2692 mtx_unlock(vlp); 2693 vrele(dvp); 2694 return (0); 2695 } 2696 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2697 2698 mtx_unlock(vlp); 2699 vn_lock(*vp, LK_SHARED | LK_RETRY); 2700 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2701 vput(*vp); 2702 if (error) { 2703 counter_u64_add(numfullpathfail2, 1); 2704 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2705 return (error); 2706 } 2707 2708 *vp = dvp; 2709 if (VN_IS_DOOMED(dvp)) { 2710 /* forced unmount */ 2711 vrele(dvp); 2712 error = ENOENT; 2713 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2714 return (error); 2715 } 2716 /* 2717 * *vp has its use count incremented still. 2718 */ 2719 2720 return (0); 2721 } 2722 2723 /* 2724 * Resolve a directory to a pathname. 2725 * 2726 * The name of the directory can always be found in the namecache or fetched 2727 * from the filesystem. There is also guaranteed to be only one parent, meaning 2728 * we can just follow vnodes up until we find the root. 2729 * 2730 * The vnode must be referenced. 
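 * The reference is consumed on both success and failure: the walk releases
 * each vnode as it moves to the parent and the final vnode is dropped before
 * returning. The path is assembled backwards, from the end of the buffer
 * towards its beginning, so on success *retbuf points into buf at the start
 * of the constructed path.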
2731 */ 2732 static int 2733 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2734 size_t *len, bool slash_prefixed, size_t addend) 2735 { 2736 #ifdef KDTRACE_HOOKS 2737 struct vnode *startvp = vp; 2738 #endif 2739 struct vnode *vp1; 2740 size_t buflen; 2741 int error; 2742 2743 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2744 VNPASS(vp->v_usecount > 0, vp); 2745 2746 buflen = *len; 2747 2748 if (!slash_prefixed) { 2749 MPASS(*len >= 2); 2750 buflen--; 2751 buf[buflen] = '\0'; 2752 } 2753 2754 error = 0; 2755 2756 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2757 counter_u64_add(numfullpathcalls, 1); 2758 while (vp != rdir && vp != rootvnode) { 2759 /* 2760 * The vp vnode must be already fully constructed, 2761 * since it is either found in namecache or obtained 2762 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2763 * without obtaining the vnode lock. 2764 */ 2765 if ((vp->v_vflag & VV_ROOT) != 0) { 2766 vn_lock(vp, LK_RETRY | LK_SHARED); 2767 2768 /* 2769 * With the vnode locked, check for races with 2770 * unmount, forced or not. Note that we 2771 * already verified that vp is not equal to 2772 * the root vnode, which means that 2773 * mnt_vnodecovered can be NULL only for the 2774 * case of unmount. 2775 */ 2776 if (VN_IS_DOOMED(vp) || 2777 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2778 vp1->v_mountedhere != vp->v_mount) { 2779 vput(vp); 2780 error = ENOENT; 2781 SDT_PROBE3(vfs, namecache, fullpath, return, 2782 error, vp, NULL); 2783 break; 2784 } 2785 2786 vref(vp1); 2787 vput(vp); 2788 vp = vp1; 2789 continue; 2790 } 2791 if (vp->v_type != VDIR) { 2792 vrele(vp); 2793 counter_u64_add(numfullpathfail1, 1); 2794 error = ENOTDIR; 2795 SDT_PROBE3(vfs, namecache, fullpath, return, 2796 error, vp, NULL); 2797 break; 2798 } 2799 error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen); 2800 if (error) 2801 break; 2802 if (buflen == 0) { 2803 vrele(vp); 2804 error = ENOMEM; 2805 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2806 startvp, NULL); 2807 break; 2808 } 2809 buf[--buflen] = '/'; 2810 slash_prefixed = true; 2811 } 2812 if (error) 2813 return (error); 2814 if (!slash_prefixed) { 2815 if (buflen == 0) { 2816 vrele(vp); 2817 counter_u64_add(numfullpathfail4, 1); 2818 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2819 startvp, NULL); 2820 return (ENOMEM); 2821 } 2822 buf[--buflen] = '/'; 2823 } 2824 counter_u64_add(numfullpathfound, 1); 2825 vrele(vp); 2826 2827 *retbuf = buf + buflen; 2828 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2829 *len -= buflen; 2830 *len += addend; 2831 return (0); 2832 } 2833 2834 /* 2835 * Resolve an arbitrary vnode to a pathname. 
2836 * 2837 * Note 2 caveats: 2838 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2839 * resolve to a different path than the one used to find it 2840 * - namecache is not mandatory, meaning names are not guaranteed to be added 2841 * (in which case resolving fails) 2842 */ 2843 static void __inline 2844 cache_rev_failed_impl(int *reason, int line) 2845 { 2846 2847 *reason = line; 2848 } 2849 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 2850 2851 static int 2852 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 2853 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend) 2854 { 2855 #ifdef KDTRACE_HOOKS 2856 struct vnode *startvp = vp; 2857 #endif 2858 struct vnode *tvp; 2859 struct mount *mp; 2860 struct namecache *ncp; 2861 size_t orig_buflen; 2862 int reason; 2863 int error; 2864 #ifdef KDTRACE_HOOKS 2865 int i; 2866 #endif 2867 seqc_t vp_seqc, tvp_seqc; 2868 u_char nc_flag; 2869 2870 VFS_SMR_ASSERT_ENTERED(); 2871 2872 if (!cache_fast_revlookup) { 2873 vfs_smr_exit(); 2874 return (-1); 2875 } 2876 2877 orig_buflen = *buflen; 2878 2879 if (!slash_prefixed) { 2880 MPASS(*buflen >= 2); 2881 *buflen -= 1; 2882 buf[*buflen] = '\0'; 2883 } 2884 2885 if (vp == rdir || vp == rootvnode) { 2886 if (!slash_prefixed) { 2887 *buflen -= 1; 2888 buf[*buflen] = '/'; 2889 } 2890 goto out_ok; 2891 } 2892 2893 #ifdef KDTRACE_HOOKS 2894 i = 0; 2895 #endif 2896 error = -1; 2897 ncp = NULL; /* for sdt probe down below */ 2898 vp_seqc = vn_seqc_read_any(vp); 2899 if (seqc_in_modify(vp_seqc)) { 2900 cache_rev_failed(&reason); 2901 goto out_abort; 2902 } 2903 2904 for (;;) { 2905 #ifdef KDTRACE_HOOKS 2906 i++; 2907 #endif 2908 if ((vp->v_vflag & VV_ROOT) != 0) { 2909 mp = atomic_load_ptr(&vp->v_mount); 2910 if (mp == NULL) { 2911 cache_rev_failed(&reason); 2912 goto out_abort; 2913 } 2914 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 2915 tvp_seqc = vn_seqc_read_any(tvp); 2916 if (seqc_in_modify(tvp_seqc)) { 2917 cache_rev_failed(&reason); 2918 goto out_abort; 2919 } 2920 if (!vn_seqc_consistent(vp, vp_seqc)) { 2921 cache_rev_failed(&reason); 2922 goto out_abort; 2923 } 2924 vp = tvp; 2925 vp_seqc = tvp_seqc; 2926 continue; 2927 } 2928 ncp = atomic_load_ptr(&vp->v_cache_dd); 2929 if (ncp == NULL) { 2930 cache_rev_failed(&reason); 2931 goto out_abort; 2932 } 2933 nc_flag = atomic_load_char(&ncp->nc_flag); 2934 if ((nc_flag & NCF_ISDOTDOT) != 0) { 2935 cache_rev_failed(&reason); 2936 goto out_abort; 2937 } 2938 if (!cache_ncp_canuse(ncp)) { 2939 cache_rev_failed(&reason); 2940 goto out_abort; 2941 } 2942 if (ncp->nc_nlen >= *buflen) { 2943 cache_rev_failed(&reason); 2944 error = ENOMEM; 2945 goto out_abort; 2946 } 2947 *buflen -= ncp->nc_nlen; 2948 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2949 *buflen -= 1; 2950 buf[*buflen] = '/'; 2951 tvp = ncp->nc_dvp; 2952 tvp_seqc = vn_seqc_read_any(tvp); 2953 if (seqc_in_modify(tvp_seqc)) { 2954 cache_rev_failed(&reason); 2955 goto out_abort; 2956 } 2957 if (!vn_seqc_consistent(vp, vp_seqc)) { 2958 cache_rev_failed(&reason); 2959 goto out_abort; 2960 } 2961 vp = tvp; 2962 vp_seqc = tvp_seqc; 2963 if (vp == rdir || vp == rootvnode) 2964 break; 2965 } 2966 out_ok: 2967 vfs_smr_exit(); 2968 *retbuf = buf + *buflen; 2969 *buflen = orig_buflen - *buflen + addend; 2970 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 2971 return (0); 2972 2973 out_abort: 2974 *buflen = orig_buflen; 2975 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 2976 
vfs_smr_exit(); 2977 return (error); 2978 } 2979 2980 static int 2981 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2982 size_t *buflen) 2983 { 2984 size_t orig_buflen; 2985 bool slash_prefixed; 2986 int error; 2987 2988 if (*buflen < 2) 2989 return (EINVAL); 2990 2991 orig_buflen = *buflen; 2992 2993 vref(vp); 2994 slash_prefixed = false; 2995 if (vp->v_type != VDIR) { 2996 *buflen -= 1; 2997 buf[*buflen] = '\0'; 2998 error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen); 2999 if (error) 3000 return (error); 3001 if (*buflen == 0) { 3002 vrele(vp); 3003 return (ENOMEM); 3004 } 3005 *buflen -= 1; 3006 buf[*buflen] = '/'; 3007 slash_prefixed = true; 3008 } 3009 3010 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed, 3011 orig_buflen - *buflen)); 3012 } 3013 3014 /* 3015 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 3016 * 3017 * Since the namecache does not track hardlinks, the caller is expected to first 3018 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 3019 * 3020 * Then we have 2 cases: 3021 * - if the found vnode is a directory, the path can be constructed just by 3022 * following names up the chain 3023 * - otherwise we populate the buffer with the saved name and start resolving 3024 * from the parent 3025 */ 3026 static int 3027 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 3028 size_t *buflen) 3029 { 3030 char *buf, *tmpbuf; 3031 struct pwd *pwd; 3032 struct componentname *cnp; 3033 struct vnode *vp; 3034 size_t addend; 3035 int error; 3036 bool slash_prefixed; 3037 enum vtype type; 3038 3039 if (*buflen < 2) 3040 return (EINVAL); 3041 if (*buflen > MAXPATHLEN) 3042 *buflen = MAXPATHLEN; 3043 3044 slash_prefixed = false; 3045 3046 buf = malloc(*buflen, M_TEMP, M_WAITOK); 3047 3048 addend = 0; 3049 vp = ndp->ni_vp; 3050 /* 3051 * Check for VBAD to work around the vp_crossmp bug in lookup(). 3052 * 3053 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 3054 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 3055 * If the type is VDIR (like in this very case) we can skip looking 3056 * at ni_dvp in the first place. However, since vnodes get passed here 3057 * unlocked the target may transition to doomed state (type == VBAD) 3058 * before we get to evaluate the condition. If this happens, we will 3059 * populate part of the buffer and descend to vn_fullpath_dir with 3060 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 3061 * 3062 * This should be atomic_load(&vp->v_type) but it is illegal to take 3063 * an address of a bit field, even if said field is sized to char. 3064 * Work around the problem by reading the value into a full-sized enum 3065 * and then re-reading it with atomic_load which will still prevent 3066 * the compiler from re-reading down the road.
3067 */ 3068 type = vp->v_type; 3069 type = atomic_load_int(&type); 3070 if (type == VBAD) { 3071 error = ENOENT; 3072 goto out_bad; 3073 } 3074 if (type != VDIR) { 3075 cnp = &ndp->ni_cnd; 3076 addend = cnp->cn_namelen + 2; 3077 if (*buflen < addend) { 3078 error = ENOMEM; 3079 goto out_bad; 3080 } 3081 *buflen -= addend; 3082 tmpbuf = buf + *buflen; 3083 tmpbuf[0] = '/'; 3084 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 3085 tmpbuf[addend - 1] = '\0'; 3086 slash_prefixed = true; 3087 vp = ndp->ni_dvp; 3088 } 3089 3090 vfs_smr_enter(); 3091 pwd = pwd_get_smr(); 3092 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3093 slash_prefixed, addend); 3094 VFS_SMR_ASSERT_NOT_ENTERED(); 3095 if (error < 0) { 3096 pwd = pwd_hold(curthread); 3097 vref(vp); 3098 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3099 slash_prefixed, addend); 3100 pwd_drop(pwd); 3101 if (error != 0) 3102 goto out_bad; 3103 } 3104 3105 *freebuf = buf; 3106 3107 return (0); 3108 out_bad: 3109 free(buf, M_TEMP); 3110 return (error); 3111 } 3112 3113 struct vnode * 3114 vn_dir_dd_ino(struct vnode *vp) 3115 { 3116 struct namecache *ncp; 3117 struct vnode *ddvp; 3118 struct mtx *vlp; 3119 enum vgetstate vs; 3120 3121 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3122 vlp = VP2VNODELOCK(vp); 3123 mtx_lock(vlp); 3124 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3125 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3126 continue; 3127 ddvp = ncp->nc_dvp; 3128 vs = vget_prep(ddvp); 3129 mtx_unlock(vlp); 3130 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3131 return (NULL); 3132 return (ddvp); 3133 } 3134 mtx_unlock(vlp); 3135 return (NULL); 3136 } 3137 3138 int 3139 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3140 { 3141 struct namecache *ncp; 3142 struct mtx *vlp; 3143 int l; 3144 3145 vlp = VP2VNODELOCK(vp); 3146 mtx_lock(vlp); 3147 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3148 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3149 break; 3150 if (ncp == NULL) { 3151 mtx_unlock(vlp); 3152 return (ENOENT); 3153 } 3154 l = min(ncp->nc_nlen, buflen - 1); 3155 memcpy(buf, ncp->nc_name, l); 3156 mtx_unlock(vlp); 3157 buf[l] = '\0'; 3158 return (0); 3159 } 3160 3161 /* 3162 * This function updates path string to vnode's full global path 3163 * and checks the size of the new path string against the pathlen argument. 3164 * 3165 * Requires a locked, referenced vnode. 3166 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3167 * 3168 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3169 * because it falls back to the ".." lookup if the namecache lookup fails. 3170 */ 3171 int 3172 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3173 u_int pathlen) 3174 { 3175 struct nameidata nd; 3176 struct vnode *vp1; 3177 char *rpath, *fbuf; 3178 int error; 3179 3180 ASSERT_VOP_ELOCKED(vp, __func__); 3181 3182 /* Construct global filesystem path from vp. */ 3183 VOP_UNLOCK(vp); 3184 error = vn_fullpath_global(vp, &rpath, &fbuf); 3185 3186 if (error != 0) { 3187 vrele(vp); 3188 return (error); 3189 } 3190 3191 if (strlen(rpath) >= pathlen) { 3192 vrele(vp); 3193 error = ENAMETOOLONG; 3194 goto out; 3195 } 3196 3197 /* 3198 * Re-lookup the vnode by path to detect a possible rename. 3199 * As a side effect, the vnode is relocked. 3200 * If vnode was renamed, return ENOENT. 
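 * The check is by vnode identity: if the original path no longer resolves to
 * the same vnode, ENOENT is returned; otherwise path is overwritten in place
 * with the reconstructed global path (rpath).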
3201 */ 3202 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3203 UIO_SYSSPACE, path, td); 3204 error = namei(&nd); 3205 if (error != 0) { 3206 vrele(vp); 3207 goto out; 3208 } 3209 NDFREE(&nd, NDF_ONLY_PNBUF); 3210 vp1 = nd.ni_vp; 3211 vrele(vp); 3212 if (vp1 == vp) 3213 strcpy(path, rpath); 3214 else { 3215 vput(vp1); 3216 error = ENOENT; 3217 } 3218 3219 out: 3220 free(fbuf, M_TEMP); 3221 return (error); 3222 } 3223 3224 #ifdef DDB 3225 static void 3226 db_print_vpath(struct vnode *vp) 3227 { 3228 3229 while (vp != NULL) { 3230 db_printf("%p: ", vp); 3231 if (vp == rootvnode) { 3232 db_printf("/"); 3233 vp = NULL; 3234 } else { 3235 if (vp->v_vflag & VV_ROOT) { 3236 db_printf("<mount point>"); 3237 vp = vp->v_mount->mnt_vnodecovered; 3238 } else { 3239 struct namecache *ncp; 3240 char *ncn; 3241 int i; 3242 3243 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3244 if (ncp != NULL) { 3245 ncn = ncp->nc_name; 3246 for (i = 0; i < ncp->nc_nlen; i++) 3247 db_printf("%c", *ncn++); 3248 vp = ncp->nc_dvp; 3249 } else { 3250 vp = NULL; 3251 } 3252 } 3253 } 3254 db_printf("\n"); 3255 } 3256 3257 return; 3258 } 3259 3260 DB_SHOW_COMMAND(vpath, db_show_vpath) 3261 { 3262 struct vnode *vp; 3263 3264 if (!have_addr) { 3265 db_printf("usage: show vpath <struct vnode *>\n"); 3266 return; 3267 } 3268 3269 vp = (struct vnode *)addr; 3270 db_print_vpath(vp); 3271 } 3272 3273 #endif 3274 3275 static bool __read_frequently cache_fast_lookup = true; 3276 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3277 &cache_fast_lookup, 0, ""); 3278 3279 #define CACHE_FPL_FAILED -2020 3280 3281 static void 3282 cache_fpl_cleanup_cnp(struct componentname *cnp) 3283 { 3284 3285 uma_zfree(namei_zone, cnp->cn_pnbuf); 3286 #ifdef DIAGNOSTIC 3287 cnp->cn_pnbuf = NULL; 3288 cnp->cn_nameptr = NULL; 3289 #endif 3290 } 3291 3292 static void 3293 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3294 { 3295 struct componentname *cnp; 3296 3297 cnp = &ndp->ni_cnd; 3298 while (*(cnp->cn_nameptr) == '/') { 3299 cnp->cn_nameptr++; 3300 ndp->ni_pathlen--; 3301 } 3302 3303 *dpp = ndp->ni_rootdir; 3304 } 3305 3306 /* 3307 * Components of nameidata (or objects it can point to) which may 3308 * need restoring in case fast path lookup fails. 
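 *
 * The state is captured with cache_fpl_checkpoint() before parsing starts and
 * put back with cache_fpl_restore() when the lookup has to be handed over to
 * the slow path.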
3309 */ 3310 struct nameidata_saved { 3311 long cn_namelen; 3312 char *cn_nameptr; 3313 size_t ni_pathlen; 3314 int cn_flags; 3315 }; 3316 3317 struct cache_fpl { 3318 struct nameidata *ndp; 3319 struct componentname *cnp; 3320 struct pwd *pwd; 3321 struct vnode *dvp; 3322 struct vnode *tvp; 3323 seqc_t dvp_seqc; 3324 seqc_t tvp_seqc; 3325 struct nameidata_saved snd; 3326 int line; 3327 enum cache_fpl_status status:8; 3328 bool in_smr; 3329 }; 3330 3331 static void 3332 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3333 { 3334 3335 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3336 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3337 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3338 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3339 } 3340 3341 static void 3342 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3343 { 3344 3345 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3346 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3347 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3348 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3349 } 3350 3351 #ifdef INVARIANTS 3352 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3353 struct cache_fpl *_fpl = (fpl); \ 3354 MPASS(_fpl->in_smr == true); \ 3355 VFS_SMR_ASSERT_ENTERED(); \ 3356 }) 3357 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3358 struct cache_fpl *_fpl = (fpl); \ 3359 MPASS(_fpl->in_smr == false); \ 3360 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3361 }) 3362 #else 3363 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3364 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3365 #endif 3366 3367 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3368 struct cache_fpl *_fpl = (fpl); \ 3369 vfs_smr_enter(); \ 3370 _fpl->in_smr = true; \ 3371 }) 3372 3373 #define cache_fpl_smr_enter(fpl) ({ \ 3374 struct cache_fpl *_fpl = (fpl); \ 3375 MPASS(_fpl->in_smr == false); \ 3376 vfs_smr_enter(); \ 3377 _fpl->in_smr = true; \ 3378 }) 3379 3380 #define cache_fpl_smr_exit(fpl) ({ \ 3381 struct cache_fpl *_fpl = (fpl); \ 3382 MPASS(_fpl->in_smr == true); \ 3383 vfs_smr_exit(); \ 3384 _fpl->in_smr = false; \ 3385 }) 3386 3387 static int 3388 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3389 { 3390 3391 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3392 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3393 ("%s: converting to abort from %d at %d, set at %d\n", 3394 __func__, fpl->status, line, fpl->line)); 3395 } 3396 fpl->status = CACHE_FPL_STATUS_ABORTED; 3397 fpl->line = line; 3398 return (CACHE_FPL_FAILED); 3399 } 3400 3401 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3402 3403 static int 3404 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3405 { 3406 3407 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3408 ("%s: setting to partial at %d, but already set to %d at %d\n", 3409 __func__, line, fpl->status, fpl->line)); 3410 cache_fpl_smr_assert_entered(fpl); 3411 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3412 fpl->line = line; 3413 return (CACHE_FPL_FAILED); 3414 } 3415 3416 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3417 3418 static int 3419 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3420 { 3421 3422 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3423 ("%s: setting to handled at %d, but already set to %d at %d\n", 3424 __func__, line, fpl->status, fpl->line)); 3425 cache_fpl_smr_assert_not_entered(fpl); 3426 MPASS(error != CACHE_FPL_FAILED); 3427 fpl->status = CACHE_FPL_STATUS_HANDLED; 3428 fpl->line = line; 3429 return (error); 
3430 } 3431 3432 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3433 3434 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3435 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ 3436 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3437 3438 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3439 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3440 3441 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3442 "supported and internal flags overlap"); 3443 3444 static bool 3445 cache_fpl_islastcn(struct nameidata *ndp) 3446 { 3447 3448 return (*ndp->ni_next == 0); 3449 } 3450 3451 static bool 3452 cache_fpl_isdotdot(struct componentname *cnp) 3453 { 3454 3455 if (cnp->cn_namelen == 2 && 3456 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3457 return (true); 3458 return (false); 3459 } 3460 3461 static bool 3462 cache_can_fplookup(struct cache_fpl *fpl) 3463 { 3464 struct nameidata *ndp; 3465 struct componentname *cnp; 3466 struct thread *td; 3467 3468 ndp = fpl->ndp; 3469 cnp = fpl->cnp; 3470 td = cnp->cn_thread; 3471 3472 if (!cache_fast_lookup) { 3473 cache_fpl_aborted(fpl); 3474 return (false); 3475 } 3476 #ifdef MAC 3477 if (mac_vnode_check_lookup_enabled()) { 3478 cache_fpl_aborted(fpl); 3479 return (false); 3480 } 3481 #endif 3482 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3483 cache_fpl_aborted(fpl); 3484 return (false); 3485 } 3486 if (ndp->ni_dirfd != AT_FDCWD) { 3487 cache_fpl_aborted(fpl); 3488 return (false); 3489 } 3490 if (IN_CAPABILITY_MODE(td)) { 3491 cache_fpl_aborted(fpl); 3492 return (false); 3493 } 3494 if (AUDITING_TD(td)) { 3495 cache_fpl_aborted(fpl); 3496 return (false); 3497 } 3498 if (ndp->ni_startdir != NULL) { 3499 cache_fpl_aborted(fpl); 3500 return (false); 3501 } 3502 return (true); 3503 } 3504 3505 static bool 3506 cache_fplookup_vnode_supported(struct vnode *vp) 3507 { 3508 3509 return (vp->v_type != VLNK); 3510 } 3511 3512 /* 3513 * Move a negative entry to the hot list. 3514 * 3515 * We have to take locks, but they may be contended and in the worst 3516 * case we may need to go off CPU. We don't want to spin within the 3517 * smr section and we can't block with it. Instead we are going to 3518 * look up the entry again. 3519 */ 3520 static int __noinline 3521 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3522 uint32_t hash) 3523 { 3524 struct componentname *cnp; 3525 struct namecache *ncp; 3526 struct neglist *neglist; 3527 struct negstate *negstate; 3528 struct vnode *dvp; 3529 u_char nc_flag; 3530 3531 cnp = fpl->cnp; 3532 dvp = fpl->dvp; 3533 3534 if (!vhold_smr(dvp)) 3535 return (cache_fpl_aborted(fpl)); 3536 3537 neglist = NCP2NEGLIST(oncp); 3538 cache_fpl_smr_exit(fpl); 3539 3540 mtx_lock(&ncneg_hot.nl_lock); 3541 mtx_lock(&neglist->nl_lock); 3542 /* 3543 * For hash iteration. 3544 */ 3545 cache_fpl_smr_enter(fpl); 3546 3547 /* 3548 * Avoid all surprises by only succeeding if we got the same entry and 3549 * bailing completely otherwise. 3550 * 3551 * In particular at this point there can be a new ncp which matches the 3552 * search but hashes to a different neglist. 3553 */ 3554 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3555 if (ncp == oncp) 3556 break; 3557 } 3558 3559 /* 3560 * No match to begin with. 3561 */ 3562 if (__predict_false(ncp == NULL)) { 3563 goto out_abort; 3564 } 3565 3566 /* 3567 * The newly found entry may be something different... 
3568 */ 3569 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3570 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 3571 goto out_abort; 3572 } 3573 3574 /* 3575 * ... and not even negative. 3576 */ 3577 nc_flag = atomic_load_char(&ncp->nc_flag); 3578 if ((nc_flag & NCF_NEGATIVE) == 0) { 3579 goto out_abort; 3580 } 3581 3582 if (__predict_false(!cache_ncp_canuse(ncp))) { 3583 goto out_abort; 3584 } 3585 3586 negstate = NCP2NEGSTATE(ncp); 3587 if ((negstate->neg_flag & NEG_HOT) == 0) { 3588 numhotneg++; 3589 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 3590 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 3591 negstate->neg_flag |= NEG_HOT; 3592 } 3593 3594 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 3595 counter_u64_add(numneghits, 1); 3596 cache_fpl_smr_exit(fpl); 3597 mtx_unlock(&neglist->nl_lock); 3598 mtx_unlock(&ncneg_hot.nl_lock); 3599 vdrop(dvp); 3600 return (cache_fpl_handled(fpl, ENOENT)); 3601 out_abort: 3602 cache_fpl_smr_exit(fpl); 3603 mtx_unlock(&neglist->nl_lock); 3604 mtx_unlock(&ncneg_hot.nl_lock); 3605 vdrop(dvp); 3606 return (cache_fpl_aborted(fpl)); 3607 } 3608 3609 /* 3610 * The target vnode is not supported, prepare for the slow path to take over. 3611 */ 3612 static int __noinline 3613 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3614 { 3615 struct nameidata *ndp; 3616 struct componentname *cnp; 3617 enum vgetstate dvs; 3618 struct vnode *dvp; 3619 struct pwd *pwd; 3620 seqc_t dvp_seqc; 3621 3622 ndp = fpl->ndp; 3623 cnp = fpl->cnp; 3624 dvp = fpl->dvp; 3625 dvp_seqc = fpl->dvp_seqc; 3626 3627 dvs = vget_prep_smr(dvp); 3628 if (__predict_false(dvs == VGET_NONE)) { 3629 cache_fpl_smr_exit(fpl); 3630 return (cache_fpl_aborted(fpl)); 3631 } 3632 3633 cache_fpl_smr_exit(fpl); 3634 3635 vget_finish_ref(dvp, dvs); 3636 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3637 vrele(dvp); 3638 return (cache_fpl_aborted(fpl)); 3639 } 3640 3641 pwd = pwd_hold(curthread); 3642 if (fpl->pwd != pwd) { 3643 vrele(dvp); 3644 pwd_drop(pwd); 3645 return (cache_fpl_aborted(fpl)); 3646 } 3647 3648 cache_fpl_restore(fpl, &fpl->snd); 3649 3650 ndp->ni_startdir = dvp; 3651 cnp->cn_flags |= MAKEENTRY; 3652 if (cache_fpl_islastcn(ndp)) 3653 cnp->cn_flags |= ISLASTCN; 3654 if (cache_fpl_isdotdot(cnp)) 3655 cnp->cn_flags |= ISDOTDOT; 3656 3657 return (0); 3658 } 3659 3660 static int 3661 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3662 { 3663 struct componentname *cnp; 3664 struct vnode *tvp; 3665 seqc_t tvp_seqc; 3666 int error, lkflags; 3667 3668 cnp = fpl->cnp; 3669 tvp = fpl->tvp; 3670 tvp_seqc = fpl->tvp_seqc; 3671 3672 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3673 lkflags = LK_SHARED; 3674 if ((cnp->cn_flags & LOCKSHARED) == 0) 3675 lkflags = LK_EXCLUSIVE; 3676 error = vget_finish(tvp, lkflags, tvs); 3677 if (__predict_false(error != 0)) { 3678 return (cache_fpl_aborted(fpl)); 3679 } 3680 } else { 3681 vget_finish_ref(tvp, tvs); 3682 } 3683 3684 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3685 if ((cnp->cn_flags & LOCKLEAF) != 0) 3686 vput(tvp); 3687 else 3688 vrele(tvp); 3689 return (cache_fpl_aborted(fpl)); 3690 } 3691 3692 return (cache_fpl_handled(fpl, 0)); 3693 } 3694 3695 /* 3696 * They want to possibly modify the state of the namecache. 3697 * 3698 * Don't try to match the API contract, just leave. 
3699 * TODO: this leaves scalability on the table 3700 */ 3701 static int 3702 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3703 { 3704 struct componentname *cnp; 3705 3706 cnp = fpl->cnp; 3707 MPASS(cnp->cn_nameiop != LOOKUP); 3708 return (cache_fpl_partial(fpl)); 3709 } 3710 3711 static int __noinline 3712 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3713 { 3714 struct componentname *cnp; 3715 enum vgetstate dvs, tvs; 3716 struct vnode *dvp, *tvp; 3717 seqc_t dvp_seqc, tvp_seqc; 3718 int error; 3719 3720 cnp = fpl->cnp; 3721 dvp = fpl->dvp; 3722 dvp_seqc = fpl->dvp_seqc; 3723 tvp = fpl->tvp; 3724 tvp_seqc = fpl->tvp_seqc; 3725 3726 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3727 3728 /* 3729 * This is less efficient than it can be for simplicity. 3730 */ 3731 dvs = vget_prep_smr(dvp); 3732 if (__predict_false(dvs == VGET_NONE)) { 3733 return (cache_fpl_aborted(fpl)); 3734 } 3735 tvs = vget_prep_smr(tvp); 3736 if (__predict_false(tvs == VGET_NONE)) { 3737 cache_fpl_smr_exit(fpl); 3738 vget_abort(dvp, dvs); 3739 return (cache_fpl_aborted(fpl)); 3740 } 3741 3742 cache_fpl_smr_exit(fpl); 3743 3744 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3745 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3746 if (__predict_false(error != 0)) { 3747 vget_abort(tvp, tvs); 3748 return (cache_fpl_aborted(fpl)); 3749 } 3750 } else { 3751 vget_finish_ref(dvp, dvs); 3752 } 3753 3754 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3755 vget_abort(tvp, tvs); 3756 if ((cnp->cn_flags & LOCKPARENT) != 0) 3757 vput(dvp); 3758 else 3759 vrele(dvp); 3760 return (cache_fpl_aborted(fpl)); 3761 } 3762 3763 error = cache_fplookup_final_child(fpl, tvs); 3764 if (__predict_false(error != 0)) { 3765 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3766 if ((cnp->cn_flags & LOCKPARENT) != 0) 3767 vput(dvp); 3768 else 3769 vrele(dvp); 3770 return (error); 3771 } 3772 3773 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3774 return (0); 3775 } 3776 3777 static int 3778 cache_fplookup_final(struct cache_fpl *fpl) 3779 { 3780 struct componentname *cnp; 3781 enum vgetstate tvs; 3782 struct vnode *dvp, *tvp; 3783 seqc_t dvp_seqc, tvp_seqc; 3784 3785 cnp = fpl->cnp; 3786 dvp = fpl->dvp; 3787 dvp_seqc = fpl->dvp_seqc; 3788 tvp = fpl->tvp; 3789 tvp_seqc = fpl->tvp_seqc; 3790 3791 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3792 3793 if (cnp->cn_nameiop != LOOKUP) { 3794 return (cache_fplookup_final_modifying(fpl)); 3795 } 3796 3797 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3798 return (cache_fplookup_final_withparent(fpl)); 3799 3800 tvs = vget_prep_smr(tvp); 3801 if (__predict_false(tvs == VGET_NONE)) { 3802 return (cache_fpl_partial(fpl)); 3803 } 3804 3805 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3806 cache_fpl_smr_exit(fpl); 3807 vget_abort(tvp, tvs); 3808 return (cache_fpl_aborted(fpl)); 3809 } 3810 3811 cache_fpl_smr_exit(fpl); 3812 return (cache_fplookup_final_child(fpl, tvs)); 3813 } 3814 3815 static int __noinline 3816 cache_fplookup_dot(struct cache_fpl *fpl) 3817 { 3818 struct vnode *dvp; 3819 3820 dvp = fpl->dvp; 3821 3822 fpl->tvp = dvp; 3823 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3824 if (seqc_in_modify(fpl->tvp_seqc)) { 3825 return (cache_fpl_aborted(fpl)); 3826 } 3827 3828 counter_u64_add(dothits, 1); 3829 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3830 3831 return (0); 3832 } 3833 3834 static int __noinline 3835 cache_fplookup_dotdot(struct cache_fpl *fpl) 3836 { 3837 struct nameidata *ndp; 3838 struct componentname *cnp; 3839 struct namecache *ncp; 3840 struct vnode 
*dvp; 3841 struct prison *pr; 3842 u_char nc_flag; 3843 3844 ndp = fpl->ndp; 3845 cnp = fpl->cnp; 3846 dvp = fpl->dvp; 3847 3848 /* 3849 * XXX this is racy the same way regular lookup is 3850 */ 3851 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3852 pr = pr->pr_parent) 3853 if (dvp == pr->pr_root) 3854 break; 3855 3856 if (dvp == ndp->ni_rootdir || 3857 dvp == ndp->ni_topdir || 3858 dvp == rootvnode || 3859 pr != NULL) { 3860 fpl->tvp = dvp; 3861 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3862 if (seqc_in_modify(fpl->tvp_seqc)) { 3863 return (cache_fpl_aborted(fpl)); 3864 } 3865 return (0); 3866 } 3867 3868 if ((dvp->v_vflag & VV_ROOT) != 0) { 3869 /* 3870 * TODO 3871 * The opposite of climb mount is needed here. 3872 */ 3873 return (cache_fpl_aborted(fpl)); 3874 } 3875 3876 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3877 if (ncp == NULL) { 3878 return (cache_fpl_aborted(fpl)); 3879 } 3880 3881 nc_flag = atomic_load_char(&ncp->nc_flag); 3882 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3883 if ((nc_flag & NCF_NEGATIVE) != 0) 3884 return (cache_fpl_aborted(fpl)); 3885 fpl->tvp = ncp->nc_vp; 3886 } else { 3887 fpl->tvp = ncp->nc_dvp; 3888 } 3889 3890 if (__predict_false(!cache_ncp_canuse(ncp))) { 3891 return (cache_fpl_aborted(fpl)); 3892 } 3893 3894 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 3895 if (seqc_in_modify(fpl->tvp_seqc)) { 3896 return (cache_fpl_partial(fpl)); 3897 } 3898 3899 counter_u64_add(dotdothits, 1); 3900 return (0); 3901 } 3902 3903 static int 3904 cache_fplookup_next(struct cache_fpl *fpl) 3905 { 3906 struct componentname *cnp; 3907 struct namecache *ncp; 3908 struct negstate *negstate; 3909 struct vnode *dvp, *tvp; 3910 u_char nc_flag; 3911 uint32_t hash; 3912 bool neg_hot; 3913 3914 cnp = fpl->cnp; 3915 dvp = fpl->dvp; 3916 3917 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 3918 return (cache_fplookup_dot(fpl)); 3919 } 3920 3921 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3922 3923 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3924 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3925 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 3926 break; 3927 } 3928 3929 /* 3930 * If there is no entry we have to punt to the slow path to perform 3931 * actual lookup. Should there be nothing with this name a negative 3932 * entry will be created. 3933 */ 3934 if (__predict_false(ncp == NULL)) { 3935 return (cache_fpl_partial(fpl)); 3936 } 3937 3938 tvp = atomic_load_ptr(&ncp->nc_vp); 3939 nc_flag = atomic_load_char(&ncp->nc_flag); 3940 if ((nc_flag & NCF_NEGATIVE) != 0) { 3941 /* 3942 * If they want to create an entry we need to replace this one. 
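 * Replacing the negative entry is left to the slow path, hence the bail-out
 * below for any nameiop other than LOOKUP.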
3943 */ 3944 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 3945 return (cache_fpl_partial(fpl)); 3946 } 3947 negstate = NCP2NEGSTATE(ncp); 3948 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0); 3949 if (__predict_false(!cache_ncp_canuse(ncp))) { 3950 return (cache_fpl_partial(fpl)); 3951 } 3952 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 3953 return (cache_fpl_partial(fpl)); 3954 } 3955 if (!neg_hot) { 3956 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 3957 } 3958 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 3959 ncp->nc_name); 3960 counter_u64_add(numneghits, 1); 3961 cache_fpl_smr_exit(fpl); 3962 return (cache_fpl_handled(fpl, ENOENT)); 3963 } 3964 3965 if (__predict_false(!cache_ncp_canuse(ncp))) { 3966 return (cache_fpl_partial(fpl)); 3967 } 3968 3969 fpl->tvp = tvp; 3970 fpl->tvp_seqc = vn_seqc_read_any(tvp); 3971 if (seqc_in_modify(fpl->tvp_seqc)) { 3972 return (cache_fpl_partial(fpl)); 3973 } 3974 3975 if (!cache_fplookup_vnode_supported(tvp)) { 3976 return (cache_fpl_partial(fpl)); 3977 } 3978 3979 counter_u64_add(numposhits, 1); 3980 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 3981 return (0); 3982 } 3983 3984 static bool 3985 cache_fplookup_mp_supported(struct mount *mp) 3986 { 3987 3988 if (mp == NULL) 3989 return (false); 3990 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 3991 return (false); 3992 return (true); 3993 } 3994 3995 /* 3996 * Walk up the mount stack (if any). 3997 * 3998 * Correctness is provided in the following ways: 3999 * - all vnodes are protected from freeing with SMR 4000 * - struct mount objects are type stable making them always safe to access 4001 * - stability of the particular mount is provided by busying it 4002 * - relationship between the vnode which is mounted on and the mount is 4003 * verified with the vnode sequence counter after busying 4004 * - association between root vnode of the mount and the mount is protected 4005 * by busy 4006 * 4007 * From that point on we can read the sequence counter of the root vnode 4008 * and get the next mount on the stack (if any) using the same protection. 4009 * 4010 * By the end of successful walk we are guaranteed the reached state was 4011 * indeed present at least at some point which matches the regular lookup. 
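 *
 * Note that "busying" here is the lightweight per-CPU variant provided by
 * vfs_op_thread_enter_crit() and vfs_op_thread_exit_crit().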
4012 */ 4013 static int __noinline 4014 cache_fplookup_climb_mount(struct cache_fpl *fpl) 4015 { 4016 struct mount *mp, *prev_mp; 4017 struct vnode *vp; 4018 seqc_t vp_seqc; 4019 4020 vp = fpl->tvp; 4021 vp_seqc = fpl->tvp_seqc; 4022 4023 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 4024 mp = atomic_load_ptr(&vp->v_mountedhere); 4025 if (mp == NULL) 4026 return (0); 4027 4028 prev_mp = NULL; 4029 for (;;) { 4030 if (!vfs_op_thread_enter_crit(mp)) { 4031 if (prev_mp != NULL) 4032 vfs_op_thread_exit_crit(prev_mp); 4033 return (cache_fpl_partial(fpl)); 4034 } 4035 if (prev_mp != NULL) 4036 vfs_op_thread_exit_crit(prev_mp); 4037 if (!vn_seqc_consistent(vp, vp_seqc)) { 4038 vfs_op_thread_exit_crit(mp); 4039 return (cache_fpl_partial(fpl)); 4040 } 4041 if (!cache_fplookup_mp_supported(mp)) { 4042 vfs_op_thread_exit_crit(mp); 4043 return (cache_fpl_partial(fpl)); 4044 } 4045 vp = atomic_load_ptr(&mp->mnt_rootvnode); 4046 if (vp == NULL || VN_IS_DOOMED(vp)) { 4047 vfs_op_thread_exit_crit(mp); 4048 return (cache_fpl_partial(fpl)); 4049 } 4050 vp_seqc = vn_seqc_read_any(vp); 4051 if (seqc_in_modify(vp_seqc)) { 4052 vfs_op_thread_exit_crit(mp); 4053 return (cache_fpl_partial(fpl)); 4054 } 4055 prev_mp = mp; 4056 mp = atomic_load_ptr(&vp->v_mountedhere); 4057 if (mp == NULL) 4058 break; 4059 } 4060 4061 vfs_op_thread_exit_crit(prev_mp); 4062 fpl->tvp = vp; 4063 fpl->tvp_seqc = vp_seqc; 4064 return (0); 4065 } 4066 4067 static bool 4068 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 4069 { 4070 struct mount *mp; 4071 struct vnode *vp; 4072 4073 vp = fpl->tvp; 4074 4075 /* 4076 * Hack: while this is a union, the pointer tends to be NULL so save on 4077 * a branch. 4078 */ 4079 mp = atomic_load_ptr(&vp->v_mountedhere); 4080 if (mp == NULL) 4081 return (false); 4082 if (vp->v_type == VDIR) 4083 return (true); 4084 return (false); 4085 } 4086 4087 /* 4088 * Parse the path. 4089 * 4090 * The code is mostly copy-pasted from regular lookup, see lookup(). 4091 * The structure is maintained along with comments for easier maintenance. 4092 * Deduplicating the code will become feasible after fast path lookup 4093 * becomes more feature-complete. 4094 */ 4095 static int 4096 cache_fplookup_parse(struct cache_fpl *fpl) 4097 { 4098 struct nameidata *ndp; 4099 struct componentname *cnp; 4100 char *cp; 4101 4102 ndp = fpl->ndp; 4103 cnp = fpl->cnp; 4104 4105 /* 4106 * Search a new directory. 4107 * 4108 * The last component of the filename is left accessible via 4109 * cnp->cn_nameptr for callers that need the name. Callers needing 4110 * the name set the SAVENAME flag. When done, they assume 4111 * responsibility for freeing the pathname buffer. 4112 */ 4113 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 4114 continue; 4115 cnp->cn_namelen = cp - cnp->cn_nameptr; 4116 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 4117 cache_fpl_smr_exit(fpl); 4118 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 4119 } 4120 ndp->ni_pathlen -= cnp->cn_namelen; 4121 KASSERT(ndp->ni_pathlen <= PATH_MAX, 4122 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 4123 ndp->ni_next = cp; 4124 4125 /* 4126 * Replace multiple slashes by a single slash and trailing slashes 4127 * by a null. This must be done before VOP_LOOKUP() because some 4128 * fs's don't know about trailing slashes. Remember if there were 4129 * trailing slashes to handle symlinks, existing non-directories 4130 * and non-existing files that won't be directories specially later. 
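 * Unlike the regular lookup, the fast path does not modify the pathname
 * buffer; a trailing slash merely causes a fall back to the slow path (see
 * the TODO below).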
	 */
	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
		cp++;
		ndp->ni_pathlen--;
		if (*cp == '\0') {
			/*
			 * TODO
			 * Regular lookup performs the following:
			 * *ndp->ni_next = '\0';
			 * cnp->cn_flags |= TRAILINGSLASH;
			 *
			 * This is problematic since it modifies data read
			 * from userspace. Then, if the fast path lookup were
			 * to abort, we would have to either restore it or
			 * convey the flag. Since this is a corner case just
			 * ignore it for simplicity.
			 */
			return (cache_fpl_partial(fpl));
		}
	}
	ndp->ni_next = cp;

	/*
	 * Check for degenerate name (e.g. / or "")
	 * which is a way of talking about a directory,
	 * e.g. like "/." or ".".
	 *
	 * TODO
	 * Another corner case handled by the regular lookup.
	 */
	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
		return (cache_fpl_partial(fpl));
	}
	return (0);
}

static void
cache_fplookup_parse_advance(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cnp->cn_nameptr = ndp->ni_next;
	while (*cnp->cn_nameptr == '/') {
		cnp->cn_nameptr++;
		ndp->ni_pathlen--;
	}
}

static int __noinline
cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
{

	switch (error) {
	case EAGAIN:
		/*
		 * Can happen when racing against vgone.
		 */
	case EOPNOTSUPP:
		cache_fpl_partial(fpl);
		break;
	default:
		/*
		 * See the API contract for VOP_FPLOOKUP_VEXEC.
		 */
		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_smr_exit(fpl);
			cache_fpl_handled(fpl, error);
		}
		break;
	}
	return (error);
}

static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct mount *mp;
	int error;

	error = CACHE_FPL_FAILED;
	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cache_fpl_checkpoint(fpl, &fpl->snd);

	fpl->dvp = dvp;
	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
	if (seqc_in_modify(fpl->dvp_seqc)) {
		cache_fpl_aborted(fpl);
		goto out;
	}
	mp = atomic_load_ptr(&fpl->dvp->v_mount);
	if (!cache_fplookup_mp_supported(mp)) {
		cache_fpl_aborted(fpl);
		goto out;
	}

	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

	for (;;) {
		error = cache_fplookup_parse(fpl);
		if (__predict_false(error != 0)) {
			break;
		}

		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
		if (__predict_false(error != 0)) {
			error = cache_fplookup_failed_vexec(fpl, error);
			break;
		}

		if (__predict_false(cache_fpl_isdotdot(cnp))) {
			error = cache_fplookup_dotdot(fpl);
			if (__predict_false(error != 0)) {
				break;
			}
		} else {
			error = cache_fplookup_next(fpl);
			if (__predict_false(error != 0)) {
				break;
			}

			VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

			if (cache_fplookup_need_climb_mount(fpl)) {
				error = cache_fplookup_climb_mount(fpl);
				if (__predict_false(error != 0)) {
					break;
				}
			}
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		if (cache_fpl_islastcn(ndp)) {
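			/*
			 * Reached the last path component. Attempt to finish
			 * the entire lookup without leaving the fast path.
			 */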
			error = cache_fplookup_final(fpl);
			break;
		}

		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
			break;
		}

		fpl->dvp = fpl->tvp;
		fpl->dvp_seqc = fpl->tvp_seqc;

		cache_fplookup_parse_advance(fpl);
		cache_fpl_checkpoint(fpl, &fpl->snd);
	}
out:
	switch (fpl->status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		cache_fpl_smr_assert_entered(fpl);
		return (cache_fplookup_partial_setup(fpl));
	case CACHE_FPL_STATUS_ABORTED:
		if (fpl->in_smr)
			cache_fpl_smr_exit(fpl);
		return (CACHE_FPL_FAILED);
	case CACHE_FPL_STATUS_HANDLED:
		MPASS(error != CACHE_FPL_FAILED);
		cache_fpl_smr_assert_not_entered(fpl);
		if (__predict_false(error != 0)) {
			ndp->ni_dvp = NULL;
			ndp->ni_vp = NULL;
			cache_fpl_cleanup_cnp(cnp);
			return (error);
		}
		ndp->ni_dvp = fpl->dvp;
		ndp->ni_vp = fpl->tvp;
		if (cnp->cn_flags & SAVENAME)
			cnp->cn_flags |= HASBUF;
		else
			cache_fpl_cleanup_cnp(cnp);
		return (error);
	}
}

/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the
 * criteria outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 *	vn_lock(current);
 *	for (;;) {
 *		next = find();
 *		vn_lock(next);
 *		vn_unlock(current);
 *		current = next;
 *		if (last)
 *			break;
 *	}
 *	return (current);
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding the respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
 * we made the jump. This includes things like permissions, mount points etc.
 * Counter modification is provided by enclosing relevant places in
 * vn_seqc_write_begin()/end() calls.
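 *
 * For instance, should the permissions of a traversed directory change in
 * parallel (with the modification enclosed in the aforementioned calls), the
 * sequence counter check performed after the jump fails and the fast path
 * aborts, leaving the work to the regular lookup.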
 *
 * Thus this translates to:
 *
 *	vfs_smr_enter();
 *	dvp_seqc = seqc_read_any(dvp);
 *	if (seqc_in_modify(dvp_seqc))		// someone is altering the vnode
 *		abort();
 *	for (;;) {
 *		tvp = find();
 *		tvp_seqc = seqc_read_any(tvp);
 *		if (seqc_in_modify(tvp_seqc))	// someone is altering the target vnode
 *			abort();
 *		if (!seqc_consistent(dvp, dvp_seqc))	// someone is altering the vnode
 *			abort();
 *		dvp = tvp;		// we know nothing of importance has changed
 *		dvp_seqc = tvp_seqc;	// store the counter for the tvp iteration
 *		if (last)
 *			break;
 *	}
 *	vget();				// secure the vnode
 *	if (!seqc_consistent(tvp, tvp_seqc))	// final check
 *		abort();
 *	// at this point we know nothing has changed for any parent<->child pair
 *	// as they were crossed during the lookup, meaning we matched the guarantee
 *	// of the locked variant
 *	return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection which they must never exit
 * - EAGAIN can be returned to denote checking could not be performed; it is
 *   always valid to return it
 * - if the sequence counter has not changed the result must be valid
 * - if the sequence counter has changed both false positives and false negatives
 *   are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning absent other means it
 *   should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 *
 * (An illustrative sketch of such a routine can be found in the comment at
 * the end of this file.)
 */
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	struct nameidata_saved orig;
	int error;

	MPASS(ndp->ni_lcf == 0);

	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.ndp = ndp;
	fpl.cnp = &ndp->ni_cnd;
	MPASS(curthread == fpl.cnp->cn_thread);

	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
		MPASS(fpl.cnp->cn_nameiop != LOOKUP);

	if (!cache_can_fplookup(&fpl)) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		*status = fpl.status;
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint(&fpl, &orig);

	cache_fpl_smr_enter_initial(&fpl);
	pwd = pwd_get_smr();
	fpl.pwd = pwd;
	ndp->ni_rootdir = pwd->pwd_rdir;
	ndp->ni_topdir = pwd->pwd_jdir;

	cnp = fpl.cnp;
	cnp->cn_nameptr = cnp->cn_pnbuf;
	if (cnp->cn_pnbuf[0] == '/') {
		cache_fpl_handle_root(ndp, &dvp);
	} else {
		MPASS(ndp->ni_dirfd == AT_FDCWD);
		dvp = pwd->pwd_cdir;
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);

	error = cache_fplookup_impl(dvp, &fpl);
	cache_fpl_smr_assert_not_entered(&fpl);
	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);

	*status = fpl.status;
	switch (fpl.status) {
	case CACHE_FPL_STATUS_UNSET:
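		/*
		 * cache_fplookup_impl() is expected to have settled on a
		 * definitive status before returning.
		 */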
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_HANDLED:
		SDT_PROBE3(vfs, namei, lookup, return, error,
		    (error == 0 ? ndp->ni_vp : NULL), true);
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		*pwdp = fpl.pwd;
		/*
		 * Status restored by cache_fplookup_partial_setup.
		 */
		break;
	case CACHE_FPL_STATUS_ABORTED:
		cache_fpl_restore(&fpl, &orig);
		break;
	}
	return (error);
}
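
/*
 * Illustrative sketch (not part of the code above): roughly what a minimal
 * VOP_FPLOOKUP_VEXEC routine doing plain unix permission checks could look
 * like, per the API contract documented before cache_fplookup(). The
 * filesystem ("foofs"), its node structure and the fn_* fields are made up
 * for the example; in-tree filesystems differ in how they fetch and protect
 * their per-vnode data.
 *
 *	static int
 *	foofs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct vnode *vp = v->a_vp;
 *		struct foofs_node *fn;
 *
 *		// ->v_data may be zeroed by a concurrent VOP_RECLAIM, hence
 *		// the atomic load and the NULL check. Returning EAGAIN makes
 *		// the caller fall back, which is always a valid answer.
 *		fn = atomic_load_ptr(&vp->v_data);
 *		if (__predict_false(fn == NULL))
 *			return (EAGAIN);
 *
 *		// foofs is assumed to modify fn_mode/fn_uid/fn_gid only with
 *		// the vnode sequence counter marked as being written to,
 *		// which makes a stale answer harmless.
 *		return (vaccess_vexec_smr(fn->fn_mode, fn->fn_uid, fn->fn_gid,
 *		    v->a_cred));
 *	}
 */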