/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#ifdef INVARIANTS
#include <machine/_inttypes.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

/*
 * High level overview of name caching in the VFS layer.
 *
 * Originally caching was implemented as part of UFS, later extracted to allow
 * use by other filesystems. A decision was made to make it optional and
 * completely detached from the rest of the kernel, which comes with limitations
 * outlined near the end of this comment block.
 *
 * This fundamental choice needs to be revisited. In the meantime, the current
 * state is described below. The significance of all notable routines is
 * explained in comments placed above their implementation. Scattered
 * throughout the file are TODO comments indicating shortcomings which can be
 * fixed without reworking everything (most of the fixes will likely be
 * reusable). Various details are omitted from this explanation to not clutter
 * the overview; they have to be checked by reading the code and associated
 * commentary.
 *
 * Keep in mind that it's individual path components which are cached, not full
 * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries,
 * one for each name.
 *
 * I. Data organization
 *
 * Entries are described by "struct namecache" objects and stored in a hash
 * table. See cache_get_hash for more information.
 *
 * "struct vnode" contains pointers to source entries (names which can be found
 * when traversing through said vnode), destination entries (names of that
 * vnode; see "Limitations" for a breakdown on the subject) and a pointer to
 * the parent vnode.
 *
 * The (directory vnode; name) tuple reliably determines the target entry if
 * it exists.
 *
 * Since there are no small locks at this time (all are 32 bytes in size on
 * LP64), the code works around the problem by introducing lock arrays to
 * protect hash buckets and vnode lists.
 *
 * II. Filesystem integration
 *
 * Filesystems participating in name caching do the following (a sketch of the
 * resulting vop vector follows below):
 * - set vop_lookup routine to vfs_cache_lookup
 * - set vop_cachedlookup to whatever can perform the lookup if the above fails
 * - if they support lockless lookup (see below), vop_fplookup_vexec and
 *   vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
 *   mount point
 * - call cache_purge or cache_vop_* routines to eliminate stale entries as
 *   applicable
 * - call cache_enter to add entries depending on the MAKEENTRY flag
 *
 * With the above in mind, there are 2 entry points when doing lookups:
 * - ... -> namei -> cache_fplookup -- this is the default
 * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei
 *   should the above fail
 *
 * Example code flow for how an entry is added:
 * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
 * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
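 *
 * For illustration, a minimal sketch of the vop vector wiring described above,
 * for a hypothetical filesystem "myfs" (the vop fields and default_vnodeops
 * are real, "myfs_cachedlookup" stands in for the filesystem's own lookup
 * routine):
 *
 *	static struct vop_vector myfs_vnodeops = {
 *		.vop_default		= &default_vnodeops,
 *		.vop_lookup		= vfs_cache_lookup,
 *		.vop_cachedlookup	= myfs_cachedlookup,
 *	};
 *
 * With this in place VOP_LOOKUP is resolved from the cache when possible and
 * VOP_CACHEDLOOKUP (here myfs_cachedlookup) is only called on a miss.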
 *
 * III. Performance considerations
 *
 * For the lockless case, forward lookup avoids any writes to shared areas
 * apart from the terminal path component. In other words non-modifying lookups
 * of different files don't suffer any scalability problems in the namecache.
 * Looking up the same file is limited by VFS and goes beyond the scope of this
 * file.
 *
 * At least on amd64 the single-threaded bottleneck for long paths is hashing
 * (see cache_get_hash). There are cases where the code issues the acquire
 * fence multiple times; they can be combined on architectures which suffer
 * from it.
 *
 * For the locked case, each encountered vnode has to be referenced and locked
 * in order to be handed out to the caller (normally that's namei). This
 * introduces a significant single-threaded hit and serialization when running
 * multi-threaded.
 *
 * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached --
 * it avoids any writes to shared areas for any of the components.
 *
 * Unrelated insertions are partially serialized on updating the global entry
 * counter and possibly serialized on colliding bucket or vnode locks.
 *
 * IV. Observability
 *
 * Note that not everything has an explicit dtrace probe nor should it have
 * one; thus some of the one-liners below depend on implementation details.
 *
 * Examples:
 *
 * # Check what lookups failed to be handled in a lockless manner. Column 1 is
 * # line number, column 2 is status code (see cache_fpl_status)
 * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
 *
 * # Lengths of names added by binary name
 * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
 *
 * # Same as above but only those which exceed 64 characters
 * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }'
 *
 * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what
 * # path is it
 * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }'
 *
 * V. Limitations and implementation defects
 *
 * - since it is possible there is no entry for an open file, tools like
 *   "procstat" may fail to resolve fd -> vnode -> path to anything
 * - even if a filesystem adds an entry, it may get purged (e.g., due to memory
 *   shortage) in which case the above problem applies
 * - hardlinks are not tracked, thus if a vnode is reachable in more than one
 *   way, resolving a name may return a different path than the one used to
 *   open it (even if said path is still valid)
 * - by default entries are not added for newly created files
 * - adding an entry may need to evict a negative entry first, which happens in
 *   2 distinct places (evicting on lookup, adding in a later VOP) making it
 *   impossible to simply reuse it
 * - there is a simple scheme to evict negative entries as the cache is
 *   approaching its capacity, but it is very unclear if doing so is a good
 *   idea to begin with
 * - vnodes are subject to being recycled even if the target inode is left in
 *   memory, which loses the name cache entries when it perhaps should not.
 *   In the case of tmpfs names get duplicated -- kept by the filesystem itself
 *   and the namecache separately
 * - struct namecache has a fixed size and comes in 2 variants, often wasting
 *   space. It is now hard to replace with malloc due to the dependence on SMR.
 * - lack of better integration with the kernel also turns nullfs into a layered
 *   filesystem instead of something which can take advantage of caching
 */

static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache");

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t");
SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

static char __read_frequently cache_fast_lookup_enabled = true;

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
	u_char neg_hit;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
		struct negstate nu_neg;	/* negative entry state */
	} n_un;
	u_char nc_flag;			/* flag bits */
	u_char nc_nlen;			/* length of name */
	char nc_name[];			/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored. The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name and dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct timespec	nc_time;	/* timespec provided by fs */
	struct timespec	nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	int	nc_pad;
	struct namecache nc_nc;
};

TAILQ_HEAD(cache_freebatch, namecache);

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned. Since others
 * may be in the same spot, suffer a little bit and enforce the
 * alignment for everyone. Note this is a nop for 64-bit platforms.
 */
#define	CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)

/*
 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
 * 4.4 BSD codebase. Later on struct namecache was tweaked to become
 * smaller and the value was bumped to retain the total size, but it
 * was never re-evaluated for suitability. A simple test counting
 * lengths during package building shows that the value of 45 covers
 * about 86% of all added entries, reaching 99% at 65.
 *
 * Regardless of the above, use of dedicated zones instead of malloc may be
 * inducing additional waste. This may be hard to address as said zones are
 * tied to VFS SMR. Even if retaining them, the current split should be
 * re-evaluated.
 */
#ifdef __LP64__
#define	CACHE_PATH_CUTOFF	45
#define	CACHE_LARGE_PAD		6
#else
#define	CACHE_PATH_CUTOFF	41
#define	CACHE_LARGE_PAD		2
#endif

#define	CACHE_ZONE_SMALL_SIZE		(offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
#define	CACHE_ZONE_SMALL_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
#define	CACHE_ZONE_LARGE_SIZE		(offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
#define	CACHE_ZONE_LARGE_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
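/*
 * A worked example of the asserts above, assuming LP64 with 8-byte pointers
 * and 8-byte time_t (the exact offsets are ABI-dependent and not guaranteed):
 * offsetof(struct namecache, nc_name) comes to 58 bytes (40 bytes of list
 * linkage, 16 for nc_dvp plus the union, 2 for the u_chars), which makes
 * CACHE_ZONE_SMALL_SIZE 58 + 45 + 1 = 104 and CACHE_ZONE_LARGE_SIZE
 * 58 + 255 + 1 + 6 = 320, both multiples of CACHE_ZONE_ALIGNMENT + 1 == 8.
 * Keeping the sizes aligned like this is what CACHE_LARGE_PAD is for.
 */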
#define	nc_vp	n_un.nu_vp
#define	nc_neg	n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define	NEG_HOT		0x01

static bool	cache_neg_evict_cond(u_long lnumcache);

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
#define cache_ncp_canuse(ncp)	({					\
	struct namecache *_ncp = (ncp);					\
	u_char _nc_flag;						\
									\
	atomic_thread_fence_acq();					\
	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);	\
})

/*
 * Like the above but also checks NCF_WHITE.
 */
#define cache_fpl_neg_ncp_canuse(ncp)	({				\
	struct namecache *_ncp = (ncp);					\
	u_char _nc_flag;						\
									\
	atomic_thread_fence_acq();					\
	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0);	\
})
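/*
 * A sketch of the lockless reader protocol the above pair supports (the real
 * consumers are the lockless lookup paths later in the file, e.g.
 * cache_fplookup; "..." elides the hash chain traversal):
 *
 *	vfs_smr_enter();
 *	ncp = ...;			find the entry
 *	vp = ncp->nc_vp;		copy out whatever is needed
 *	if (!cache_ncp_canuse(ncp)) {	then validate
 *		vfs_smr_exit();
 *		fall back to the locked lookup
 *	}
 *
 * cache_ncp_invalidate stores NCF_INVALID before the entry is torn down (with
 * a release fence in between), while readers load the fields first and only
 * then check the flag (with an acquire fence in between). A reader which does
 * not observe NCF_INVALID therefore read fields from before the teardown
 * started.
 */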
VFS_SMR_DECLARE;

static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache parameters");

static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RD, &ncsize, 0,
    "Total namecache capacity");

u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");

static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");

/*
 * Negative entry % of namecache capacity above which automatic eviction is allowed.
 *
 * Check cache_neg_evict_cond for details.
 */
static u_int ncnegminpct = 3;

static u_int __read_mostly	neg_min; /* the above recomputed against ncsize */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
    "Negative entry count above which automatic eviction is allowed");

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static u_int __exclusive_cache_line neg_cycle;

#define	ncneghash	3
#define	numneglists	(ncneghash + 1)

struct neglist {
	struct mtx		nl_evict_lock;
	struct mtx		nl_lock __aligned(CACHE_LINE_SIZE);
	TAILQ_HEAD(, namecache) nl_list;
	TAILQ_HEAD(, namecache) nl_hotlist;
	u_long			nl_hotnum;
} __aligned(CACHE_LINE_SIZE);

static struct neglist neglists[numneglists];

static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly	ncbuckethash;
static struct mtx_padalign __read_mostly	*bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly	ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	*tsp = ncp_ts->nc_time;
	*ticksp = ncp_ts->nc_ticks;
}

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");

#define STATNODE_ULONG(name, varname, descr)					\
	SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define STATNODE_COUNTER(name, varname, descr)					\
	static COUNTER_U64_DEFINE_EARLY(varname);				\
	SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
	    descr);
STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
STATNODE_ULONG(count, numcache, "Number of cache entries");
STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(poszaps, numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(negzaps, numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
/* These count for vn_getcwd(), too. */
STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");

/*
 * Debug or developer statistics.
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache debugging");
#define DEBUGNODE_ULONG(name, varname, descr)					\
	SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define DEBUGNODE_COUNTER(name, varname, descr)					\
	static COUNTER_U64_DEFINE_EARLY(varname);				\
	SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
	    descr);
DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
    "Number of successful removals after relocking");
static long zap_bucket_fail;
DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
static long zap_bucket_fail2;
DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
static long cache_lock_vnodes_cel_3_failures;
DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");

static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, size_t addend);
static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen);
static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *len, size_t addend);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

static inline void
cache_assert_vlp_locked(struct mtx *vlp)
{

	if (vlp != NULL)
		mtx_assert(vlp, MA_OWNED);
}

static inline void
cache_assert_vnode_locked(struct vnode *vp)
{
	struct mtx *vlp;

	vlp = VP2VNODELOCK(vp);
	cache_assert_vlp_locked(vlp);
}

/*
 * Directory vnodes with entries are held for two reasons:
 * 1. make them less of a target for reclamation in vnlru
 * 2. suffer smaller performance penalty in locked lookup as requeueing is avoided
 *
 * It will be feasible to stop doing it altogether if all filesystems start
 * supporting lockless lookup.
 */
static void
cache_hold_vnode(struct vnode *vp)
{

	cache_assert_vnode_locked(vp);
	VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
	vhold(vp);
	counter_u64_add(numcachehv, 1);
}

static void
cache_drop_vnode(struct vnode *vp)
{

	/*
	 * Called after all locks are dropped, meaning we can't assert
	 * on the state of v_cache_src.
	 */
	vdrop(vp);
	counter_u64_add(numcachehv, -1);
}

/*
 * UMA zones.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

char *
cache_symlink_alloc(size_t size, int flags)
{

	if (size < CACHE_ZONE_SMALL_SIZE) {
		return (uma_zalloc_smr(cache_zone_small, flags));
	}
	if (size < CACHE_ZONE_LARGE_SIZE) {
		return (uma_zalloc_smr(cache_zone_large, flags));
	}
	counter_u64_add(symlinktoobig, 1);
	SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
	return (NULL);
}

void
cache_symlink_free(char *string, size_t size)
{

	MPASS(string != NULL);
	KASSERT(size < CACHE_ZONE_LARGE_SIZE,
	    ("%s: size %zu too big", __func__, size));

	if (size < CACHE_ZONE_SMALL_SIZE) {
		uma_zfree_smr(cache_zone_small, string);
		return;
	}
	if (size < CACHE_ZONE_LARGE_SIZE) {
		uma_zfree_smr(cache_zone_large, string);
		return;
	}
	__assert_unreachable();
}

static struct namecache *
cache_alloc_uma(int len, bool ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free_uma(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static struct namecache *
cache_alloc(int len, bool ts)
{
	u_long lnumcache;

	/*
	 * Avoid blowout in namecache entries.
	 *
	 * Bugs:
	 * 1. filesystems may end up trying to add an already existing entry
	 * (for example this can happen after a cache miss during concurrent
	 * lookup), in which case we will call cache_neg_evict despite not
	 * adding anything.
	 * 2. the routine may fail to free anything and no provisions are made
	 * to make it try harder (see the inside for failure modes)
	 * 3. it only ever looks at negative entries.
	 */
	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
	if (cache_neg_evict_cond(lnumcache)) {
		lnumcache = atomic_load_long(&numcache);
	}
	if (__predict_false(lnumcache >= ncsize)) {
		atomic_subtract_long(&numcache, 1);
		counter_u64_add(numdrops, 1);
		return (NULL);
	}
	return (cache_alloc_uma(len, ts));
}

static void
cache_free(struct namecache *ncp)
{

	MPASS(ncp != NULL);
	if ((ncp->nc_flag & NCF_DVDROP) != 0) {
		cache_drop_vnode(ncp->nc_dvp);
	}
	cache_free_uma(ncp);
	atomic_subtract_long(&numcache, 1);
}

static void
cache_free_batch(struct cache_freebatch *batch)
{
	struct namecache *ncp, *nnp;
	int i;

	i = 0;
	if (TAILQ_EMPTY(batch))
		goto out;
	TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
		if ((ncp->nc_flag & NCF_DVDROP) != 0) {
			cache_drop_vnode(ncp->nc_dvp);
		}
		cache_free_uma(ncp);
		i++;
	}
	atomic_subtract_long(&numcache, i);
out:
	SDT_PROBE1(vfs, namecache, purge, batch, i);
}

/*
 * Hashing.
 *
 * The code was made to use FNV in 2001 and this choice needs to be revisited.
 *
 * Short summary of the difficulty:
 * The longest name which can be inserted is NAME_MAX characters in length (or
 * 255 at the time of writing this comment), while the majority of names used
 * in practice are significantly shorter (mostly below 10). More importantly,
 * the majority of lookups performed find names even shorter than that.
 *
 * This poses a problem where hashes which do better than FNV past word size
 * (or so) tend to come with additional overhead when finalizing the result,
 * making them noticeably slower for the most commonly used range.
 *
 * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c
 *
 * When looking it up the most time consuming part by a large margin (at least
 * on amd64) is hashing. Replacing FNV with something which pessimizes short
 * input would make the slowest part stand out even more.
 */
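/*
 * To make the above concrete: hashing is done per path component and is
 * seeded with the parent directory's v_nchash (see cache_prehash and
 * cache_get_hash below), so for the example path each of "usr", "obj", ...,
 * "vnode_if.c" is hashed separately against its own parent. Since FNV folds
 * one byte at a time, the iterator variant further down produces the same
 * value as the one-shot routine; roughly (a sketch, not a drop-in):
 *
 *	hash = cache_get_hash_iter_start(dvp);
 *	for (i = 0; i < len; i++)
 *		hash = cache_get_hash_iter(name[i], hash);
 *	hash = cache_get_hash_iter_finish(hash);
 *	MPASS(hash == cache_get_hash(name, len, dvp));
 */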
/*
 * TODO: With the value stored we can do better than computing the hash based
 * on the address.
 */
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}

static uint32_t
cache_get_hash_iter_start(struct vnode *dvp)
{

	return (dvp->v_nchash);
}

static uint32_t
cache_get_hash_iter(char c, uint32_t hash)
{

	return (fnv_32_buf(&c, 1, hash));
}

static uint32_t
cache_get_hash_iter_finish(uint32_t hash)
{

	return (hash);
}

static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct mtx *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_OWNED);
}

static void
cache_assert_bucket_unlocked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_NOTOWNED);
}
#else
#define cache_assert_bucket_locked(x) do { } while (0)
#define cache_assert_bucket_unlocked(x) do { } while (0)
#endif

#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_lock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_unlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

static void
cache_recalc_neg_min(void)
{

	neg_min = (ncsize * ncnegminpct) / 100;
}

static int
sysctl_negminpct(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int error;

	val = ncnegminpct;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (val == ncnegminpct)
		return (0);
	if (val < 0 || val > 99)
		return (EINVAL);
	ncnegminpct = val;
	cache_recalc_neg_min();
	return (0);
}

SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
    "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");

#ifdef DEBUG_CACHE
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * Various workloads create plenty of negative entries and barely use them
 * afterwards. Moreover, malicious users can keep performing bogus lookups
 * adding even more entries. For example "make tinderbox" as of writing this
 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 * negative.
 *
 * As such, a rather aggressive eviction method is needed. The currently
 * employed method is a placeholder.
 *
 * Entries are split over numneglists separate lists, each of which is further
 * split into hot and cold entries. Entries get promoted after getting a hit.
 * Eviction happens on addition of a new entry.
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache negative entry statistics");

SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
    "Number of negative cache entries");

static COUNTER_U64_DEFINE_EARLY(neg_created);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
    "Number of created negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evicted);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
    "Number of evicted negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
    &neg_evict_skipped_empty,
    "Number of times evicting failed due to lack of entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
    &neg_evict_skipped_missed,
    "Number of times evicting failed due to target entry disappearing");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
    &neg_evict_skipped_contended,
    "Number of times evicting failed due to contention");

SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
    "Number of cache hits (negative)");

static int
sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
{
	int i, out;

	out = 0;
	for (i = 0; i < numneglists; i++)
		out += neglists[i].nl_hotnum;

	return (SYSCTL_OUT(req, &out, sizeof(out)));
}
SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
    "Number of hot negative entries");

static void
cache_neg_init(struct namecache *ncp)
{
	struct negstate *ns;

	ncp->nc_flag |= NCF_NEGATIVE;
	ns = NCP2NEGSTATE(ncp);
	ns->neg_flag = 0;
	ns->neg_hit = 0;
	counter_u64_add(neg_created, 1);
}

#define CACHE_NEG_PROMOTION_THRESH	2

static bool
cache_neg_hit_prep(struct namecache *ncp)
{
	struct negstate *ns;
	u_char n;

	ns = NCP2NEGSTATE(ncp);
	n = atomic_load_char(&ns->neg_hit);
	for (;;) {
		if (n >= CACHE_NEG_PROMOTION_THRESH)
			return (false);
		if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
			break;
	}
	return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
}

/*
 * Nothing to do here but it is provided for completeness as some
 * cache_neg_hit_prep callers may end up returning without even
 * trying to promote.
 */
#define cache_neg_hit_abort(ncp)	do { } while (0)

static void
cache_neg_hit_finish(struct namecache *ncp)
{

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
	counter_u64_add(numneghits, 1);
}
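/*
 * Expected calling pattern for the three routines above, mirroring what the
 * lookup code later in this file does on a negative hit:
 *
 *	if (cache_neg_hit_prep(ncp))
 *		cache_neg_promote(ncp);		crossed the promotion threshold
 *	else
 *		cache_neg_hit_finish(ncp);
 *
 * with cache_neg_hit_abort() used instead when the caller bails out after
 * cache_neg_hit_prep without deciding either way.
 */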
/*
 * Move a negative entry to the hot list.
 */
static void
cache_neg_promote_locked(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	if ((ns->neg_flag & NEG_HOT) == 0) {
		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
		nl->nl_hotnum++;
		ns->neg_flag |= NEG_HOT;
	}
}

/*
 * Move a hot negative entry to the cold list.
 */
static void
cache_neg_demote_locked(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	MPASS(ns->neg_flag & NEG_HOT);
	TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	nl->nl_hotnum--;
	ns->neg_flag &= ~NEG_HOT;
	atomic_store_char(&ns->neg_hit, 0);
}

/*
 * Move a negative entry to the hot list if it matches the lookup.
 *
 * We have to take locks, but they may be contended and in the worst
 * case we may need to go off CPU. We don't want to spin within the
 * smr section and we can't block with it. Exiting the section means
 * the found entry could have been evicted. We are going to look it
 * up again.
 */
static bool
cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
    struct namecache *oncp, uint32_t hash)
{
	struct namecache *ncp;
	struct neglist *nl;
	u_char nc_flag;

	nl = NCP2NEGLIST(oncp);

	mtx_lock(&nl->nl_lock);
	/*
	 * For hash iteration.
	 */
	vfs_smr_enter();

	/*
	 * Avoid all surprises by only succeeding if we got the same entry and
	 * bailing completely otherwise.
	 * XXX There are no provisions to keep the vnode around, meaning we may
	 * end up promoting a negative entry for a *new* vnode and returning
	 * ENOENT on its account. This is the error we want to return anyway
	 * and promotion is harmless.
	 *
	 * In particular at this point there can be a new ncp which matches the
	 * search but hashes to a different neglist.
	 */
	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp == oncp)
			break;
	}

	/*
	 * No match to begin with.
	 */
	if (__predict_false(ncp == NULL)) {
		goto out_abort;
	}

	/*
	 * The newly found entry may be something different...
	 */
	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
		goto out_abort;
	}

	/*
	 * ... and not even negative.
	 */
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_NEGATIVE) == 0) {
		goto out_abort;
	}

	if (!cache_ncp_canuse(ncp)) {
		goto out_abort;
	}

	cache_neg_promote_locked(ncp);
	cache_neg_hit_finish(ncp);
	vfs_smr_exit();
	mtx_unlock(&nl->nl_lock);
	return (true);
out_abort:
	vfs_smr_exit();
	mtx_unlock(&nl->nl_lock);
	return (false);
}

static void
cache_neg_promote(struct namecache *ncp)
{
	struct neglist *nl;

	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	cache_neg_promote_locked(ncp);
	mtx_unlock(&nl->nl_lock);
}

static void
cache_neg_insert(struct namecache *ncp)
{
	struct neglist *nl;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	mtx_unlock(&nl->nl_lock);
	atomic_add_long(&numneg, 1);
}

static void
cache_neg_remove(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	ns = NCP2NEGSTATE(ncp);
	mtx_lock(&nl->nl_lock);
	if ((ns->neg_flag & NEG_HOT) != 0) {
		TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
		nl->nl_hotnum--;
	} else {
		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
	}
	mtx_unlock(&nl->nl_lock);
	atomic_subtract_long(&numneg, 1);
}

static struct neglist *
cache_neg_evict_select_list(void)
{
	struct neglist *nl;
	u_int c;

	c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
	nl = &neglists[c % numneglists];
	if (!mtx_trylock(&nl->nl_evict_lock)) {
		counter_u64_add(neg_evict_skipped_contended, 1);
		return (NULL);
	}
	return (nl);
}

static struct namecache *
cache_neg_evict_select_entry(struct neglist *nl)
{
	struct namecache *ncp, *lncp;
	struct negstate *ns, *lns;
	int i;

	mtx_assert(&nl->nl_evict_lock, MA_OWNED);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	ncp = TAILQ_FIRST(&nl->nl_list);
	if (ncp == NULL)
		return (NULL);
	lncp = ncp;
	lns = NCP2NEGSTATE(lncp);
	for (i = 1; i < 4; i++) {
		ncp = TAILQ_NEXT(ncp, nc_dst);
		if (ncp == NULL)
			break;
		ns = NCP2NEGSTATE(ncp);
		if (ns->neg_hit < lns->neg_hit) {
			lncp = ncp;
			lns = ns;
		}
	}
	return (lncp);
}

static bool
cache_neg_evict(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *nl;
	struct vnode *dvp;
	struct mtx *dvlp;
	struct mtx *blp;
	uint32_t hash;
	u_char nlen;
	bool evicted;

	nl = cache_neg_evict_select_list();
	if (nl == NULL) {
		return (false);
	}

	mtx_lock(&nl->nl_lock);
	ncp = TAILQ_FIRST(&nl->nl_hotlist);
	if (ncp != NULL) {
		cache_neg_demote_locked(ncp);
	}
	ncp = cache_neg_evict_select_entry(nl);
	if (ncp == NULL) {
		counter_u64_add(neg_evict_skipped_empty, 1);
		mtx_unlock(&nl->nl_lock);
		mtx_unlock(&nl->nl_evict_lock);
		return (false);
	}
	nlen = ncp->nc_nlen;
	dvp = ncp->nc_dvp;
	hash = cache_get_hash(ncp->nc_name, nlen, dvp);
	dvlp = VP2VNODELOCK(dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_unlock(&nl->nl_lock);
	mtx_unlock(&nl->nl_evict_lock);
	mtx_lock(dvlp);
	mtx_lock(blp);
	/*
	 * Note that since all locks were dropped above, the entry may be
	 * gone or reallocated to be something else.
	 */
	CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
		if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
		    ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
			break;
	}
	if (ncp2 == NULL) {
		counter_u64_add(neg_evict_skipped_missed, 1);
		ncp = NULL;
		evicted = false;
	} else {
		MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
		MPASS(blp == NCP2BUCKETLOCK(ncp));
		SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(neg_evicted, 1);
		evicted = true;
	}
	mtx_unlock(blp);
	mtx_unlock(dvlp);
	if (ncp != NULL)
		cache_free(ncp);
	return (evicted);
}

/*
 * Maybe evict a negative entry to create more room.
 *
 * The ncnegfactor parameter limits what fraction of the total count
 * can consist of negative entries. However, if the cache is just
 * warming up this leads to excessive evictions. As such, ncnegminpct
 * (recomputed to neg_min) dictates whether the above should be
 * applied.
 *
 * Try evicting if the cache is close to full capacity regardless of
 * other considerations.
 */
static bool
cache_neg_evict_cond(u_long lnumcache)
{
	u_long lnumneg;

	if (ncsize - 1000 < lnumcache)
		goto out_evict;
	lnumneg = atomic_load_long(&numneg);
	if (lnumneg < neg_min)
		return (false);
	if (lnumneg * ncnegfactor < lnumcache)
		return (false);
out_evict:
	return (cache_neg_evict());
}
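/*
 * A worked example of the condition above (numbers are illustrative, not
 * defaults taken from any particular system): with ncsize = 200000 and the
 * default ncnegminpct = 3, neg_min is 6000; with the default ncnegfactor = 5
 * an eviction is attempted once there are at least 6000 negative entries and
 * negative entries make up at least 1/5 of the total count, or unconditionally
 * once the total count is within 1000 of ncsize.
 */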
/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;
	struct vnode *dvp, *vp;

	dvp = ncp->nc_dvp;
	vp = ncp->nc_vp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(vp);
	cache_assert_vnode_locked(dvp);
	cache_assert_bucket_locked(ncp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
		TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
		if (ncp == vp->v_cache_dd) {
			atomic_store_ptr(&vp->v_cache_dd, NULL);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
		cache_neg_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == dvp->v_cache_dd) {
			atomic_store_ptr(&dvp->v_cache_dd, NULL);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
		}
	}
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct mtx *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

/*
 * If trylocking failed we can get here. We know enough to take all needed locks
 * in the right order and re-lookup the entry.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct mtx *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_unlocked(ncp);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	mtx_lock(blp);
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_bucket_relock_success, 1);
		return (0);
	}

	mtx_unlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct mtx *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	mtx_unlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct mtx *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			atomic_store_ptr(&dvp->v_cache_dd, NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
		return (1);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (ncp == NULL) {
		mtx_unlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_bucket_fail++;
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
	cache_free(ncp);
	return (1);
out_no_entry:
	counter_u64_add(nummisszap, 1);
	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
	return (0);
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static int __noinline
cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct mtx *dvlp;
	enum vgetstate vs;
	int error, ltype;
	bool whiteout;

	MPASS((cnp->cn_flags & ISDOTDOT) != 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

retry:
	dvlp = VP2VNODELOCK(dvp);
	mtx_lock(dvlp);
	ncp = dvp->v_cache_dd;
	if (ncp == NULL) {
		SDT_PROBE2(vfs, namecache, lookup, miss, dvp, "..");
		mtx_unlock(dvlp);
		return (0);
	}
	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
		if (ncp->nc_flag & NCF_NEGATIVE)
			*vpp = NULL;
		else
			*vpp = ncp->nc_vp;
	} else
		*vpp = ncp->nc_dvp;
	if (*vpp == NULL)
		goto negative_success;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
	    NCF_DTS && tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		*tsp = ncp_ts->nc_dotdottime;
	}

	MPASS(dvp != *vpp);
	ltype = VOP_ISLOCKED(dvp);
	VOP_UNLOCK(dvp);
	vs = vget_prep(*vpp);
	mtx_unlock(dvlp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	vn_lock(dvp, ltype | LK_RETRY);
	if (VN_IS_DOOMED(dvp)) {
		if (error == 0)
			vput(*vpp);
		*vpp = NULL;
		return (ENOENT);
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			cache_zap_negative_locked_vnode_kl(ncp, dvp);
			mtx_unlock(dvlp);
			cache_free(ncp);
			return (0);
		}
	}

	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_out_ts(ncp, tsp, ticksp);
	if (cache_neg_hit_prep(ncp))
		cache_neg_promote(ncp);
	else
		cache_neg_hit_finish(ncp);
	mtx_unlock(dvlp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it will
 *		be clear for "." entries.
 * - ticksp:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * Either both tsp and ticksp have to be provided or neither of them.
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  vpp will not be modified.  If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
On a successful 1911 * (positive or negative) lookup, it will contain the ticks value 1912 * that was current when the cache entry was created, unless cnp 1913 * was ".". 1914 * 1915 * Either both tsp and ticks have to be provided or neither of them. 1916 * 1917 * # Returns 1918 * 1919 * - -1: A positive cache hit. vpp will contain the desired vnode. 1920 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1921 * to a forced unmount. vpp will not be modified. If the entry 1922 * is a whiteout, then the ISWHITEOUT flag will be set in 1923 * cnp->cn_flags. 1924 * - 0: A cache miss. vpp will not be modified. 1925 * 1926 * # Locking 1927 * 1928 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1929 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1930 * lock is not recursively acquired. 1931 */ 1932 static int __noinline 1933 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1934 struct timespec *tsp, int *ticksp) 1935 { 1936 struct namecache *ncp; 1937 struct mtx *blp; 1938 uint32_t hash; 1939 enum vgetstate vs; 1940 int error; 1941 bool whiteout; 1942 1943 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 1944 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0); 1945 1946 retry: 1947 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1948 blp = HASH2BUCKETLOCK(hash); 1949 mtx_lock(blp); 1950 1951 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1952 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1953 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1954 break; 1955 } 1956 1957 if (__predict_false(ncp == NULL)) { 1958 mtx_unlock(blp); 1959 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); 1960 counter_u64_add(nummiss, 1); 1961 return (0); 1962 } 1963 1964 if (ncp->nc_flag & NCF_NEGATIVE) 1965 goto negative_success; 1966 1967 counter_u64_add(numposhits, 1); 1968 *vpp = ncp->nc_vp; 1969 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1970 cache_out_ts(ncp, tsp, ticksp); 1971 MPASS(dvp != *vpp); 1972 vs = vget_prep(*vpp); 1973 mtx_unlock(blp); 1974 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1975 if (error) { 1976 *vpp = NULL; 1977 goto retry; 1978 } 1979 return (-1); 1980 negative_success: 1981 /* 1982 * We don't get here with regular lookup apart from corner cases. 
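 * The lockless variant in cache_lookup() services regular hits on negative
 * entries and only punts to this fallback when it cannot complete safely,
 * e.g., when the entry has to be zapped for a CREATE of the last component.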
1983 */ 1984 if (__predict_true(cnp->cn_nameiop == CREATE)) { 1985 if (cnp->cn_flags & ISLASTCN) { 1986 counter_u64_add(numnegzaps, 1); 1987 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1988 if (__predict_false(error != 0)) { 1989 zap_bucket_fail2++; 1990 goto retry; 1991 } 1992 cache_free(ncp); 1993 return (0); 1994 } 1995 } 1996 1997 whiteout = (ncp->nc_flag & NCF_WHITE); 1998 cache_out_ts(ncp, tsp, ticksp); 1999 if (cache_neg_hit_prep(ncp)) 2000 cache_neg_promote(ncp); 2001 else 2002 cache_neg_hit_finish(ncp); 2003 mtx_unlock(blp); 2004 if (whiteout) 2005 cnp->cn_flags |= ISWHITEOUT; 2006 return (ENOENT); 2007 } 2008 2009 int 2010 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 2011 struct timespec *tsp, int *ticksp) 2012 { 2013 struct namecache *ncp; 2014 uint32_t hash; 2015 enum vgetstate vs; 2016 int error; 2017 bool whiteout, neg_promote; 2018 u_short nc_flag; 2019 2020 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); 2021 2022 #ifdef DEBUG_CACHE 2023 if (__predict_false(!doingcache)) { 2024 cnp->cn_flags &= ~MAKEENTRY; 2025 return (0); 2026 } 2027 #endif 2028 2029 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2030 if (cnp->cn_namelen == 1) 2031 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 2032 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') 2033 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); 2034 } 2035 2036 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 2037 2038 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { 2039 cache_remove_cnp(dvp, cnp); 2040 return (0); 2041 } 2042 2043 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2044 vfs_smr_enter(); 2045 2046 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 2047 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 2048 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 2049 break; 2050 } 2051 2052 if (__predict_false(ncp == NULL)) { 2053 vfs_smr_exit(); 2054 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); 2055 counter_u64_add(nummiss, 1); 2056 return (0); 2057 } 2058 2059 nc_flag = atomic_load_char(&ncp->nc_flag); 2060 if (nc_flag & NCF_NEGATIVE) 2061 goto negative_success; 2062 2063 counter_u64_add(numposhits, 1); 2064 *vpp = ncp->nc_vp; 2065 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 2066 cache_out_ts(ncp, tsp, ticksp); 2067 MPASS(dvp != *vpp); 2068 if (!cache_ncp_canuse(ncp)) { 2069 vfs_smr_exit(); 2070 *vpp = NULL; 2071 goto out_fallback; 2072 } 2073 vs = vget_prep_smr(*vpp); 2074 vfs_smr_exit(); 2075 if (__predict_false(vs == VGET_NONE)) { 2076 *vpp = NULL; 2077 goto out_fallback; 2078 } 2079 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 2080 if (error) { 2081 *vpp = NULL; 2082 goto out_fallback; 2083 } 2084 return (-1); 2085 negative_success: 2086 if (cnp->cn_nameiop == CREATE) { 2087 if (cnp->cn_flags & ISLASTCN) { 2088 vfs_smr_exit(); 2089 goto out_fallback; 2090 } 2091 } 2092 2093 cache_out_ts(ncp, tsp, ticksp); 2094 whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE); 2095 neg_promote = cache_neg_hit_prep(ncp); 2096 if (!cache_ncp_canuse(ncp)) { 2097 cache_neg_hit_abort(ncp); 2098 vfs_smr_exit(); 2099 goto out_fallback; 2100 } 2101 if (neg_promote) { 2102 vfs_smr_exit(); 2103 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) 2104 goto out_fallback; 2105 } else { 2106 cache_neg_hit_finish(ncp); 2107 vfs_smr_exit(); 2108 } 2109 if (whiteout) 2110 cnp->cn_flags |= ISWHITEOUT; 2111 return (ENOENT); 2112 out_fallback: 2113 return (cache_lookup_fallback(dvp, 
vpp, cnp, tsp, ticksp)); 2114 } 2115 2116 struct celockstate { 2117 struct mtx *vlp[3]; 2118 struct mtx *blp[2]; 2119 }; 2120 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 2121 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 2122 2123 static inline void 2124 cache_celockstate_init(struct celockstate *cel) 2125 { 2126 2127 bzero(cel, sizeof(*cel)); 2128 } 2129 2130 static void 2131 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 2132 struct vnode *dvp) 2133 { 2134 struct mtx *vlp1, *vlp2; 2135 2136 MPASS(cel->vlp[0] == NULL); 2137 MPASS(cel->vlp[1] == NULL); 2138 MPASS(cel->vlp[2] == NULL); 2139 2140 MPASS(vp != NULL || dvp != NULL); 2141 2142 vlp1 = VP2VNODELOCK(vp); 2143 vlp2 = VP2VNODELOCK(dvp); 2144 cache_sort_vnodes(&vlp1, &vlp2); 2145 2146 if (vlp1 != NULL) { 2147 mtx_lock(vlp1); 2148 cel->vlp[0] = vlp1; 2149 } 2150 mtx_lock(vlp2); 2151 cel->vlp[1] = vlp2; 2152 } 2153 2154 static void 2155 cache_unlock_vnodes_cel(struct celockstate *cel) 2156 { 2157 2158 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 2159 2160 if (cel->vlp[0] != NULL) 2161 mtx_unlock(cel->vlp[0]); 2162 if (cel->vlp[1] != NULL) 2163 mtx_unlock(cel->vlp[1]); 2164 if (cel->vlp[2] != NULL) 2165 mtx_unlock(cel->vlp[2]); 2166 } 2167 2168 static bool 2169 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 2170 { 2171 struct mtx *vlp; 2172 bool ret; 2173 2174 cache_assert_vlp_locked(cel->vlp[0]); 2175 cache_assert_vlp_locked(cel->vlp[1]); 2176 MPASS(cel->vlp[2] == NULL); 2177 2178 MPASS(vp != NULL); 2179 vlp = VP2VNODELOCK(vp); 2180 2181 ret = true; 2182 if (vlp >= cel->vlp[1]) { 2183 mtx_lock(vlp); 2184 } else { 2185 if (mtx_trylock(vlp)) 2186 goto out; 2187 cache_lock_vnodes_cel_3_failures++; 2188 cache_unlock_vnodes_cel(cel); 2189 if (vlp < cel->vlp[0]) { 2190 mtx_lock(vlp); 2191 mtx_lock(cel->vlp[0]); 2192 mtx_lock(cel->vlp[1]); 2193 } else { 2194 if (cel->vlp[0] != NULL) 2195 mtx_lock(cel->vlp[0]); 2196 mtx_lock(vlp); 2197 mtx_lock(cel->vlp[1]); 2198 } 2199 ret = false; 2200 } 2201 out: 2202 cel->vlp[2] = vlp; 2203 return (ret); 2204 } 2205 2206 static void 2207 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 2208 struct mtx *blp2) 2209 { 2210 2211 MPASS(cel->blp[0] == NULL); 2212 MPASS(cel->blp[1] == NULL); 2213 2214 cache_sort_vnodes(&blp1, &blp2); 2215 2216 if (blp1 != NULL) { 2217 mtx_lock(blp1); 2218 cel->blp[0] = blp1; 2219 } 2220 mtx_lock(blp2); 2221 cel->blp[1] = blp2; 2222 } 2223 2224 static void 2225 cache_unlock_buckets_cel(struct celockstate *cel) 2226 { 2227 2228 if (cel->blp[0] != NULL) 2229 mtx_unlock(cel->blp[0]); 2230 mtx_unlock(cel->blp[1]); 2231 } 2232 2233 /* 2234 * Lock part of the cache affected by the insertion. 2235 * 2236 * This means vnodelocks for dvp, vp and the relevant bucketlock. 2237 * However, insertion can result in removal of an old entry. In this 2238 * case we have an additional vnode and bucketlock pair to lock. 2239 * 2240 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 2241 * preserving the locking order (smaller address first). 
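 *
 * As an example, inserting an entry for a directory vp may require zapping
 * the ".." entry hanging off vp->v_cache_dd, which in turn means also locking
 * the vnode that entry points to and the bucket it hashes to.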
2242 */ 2243 static void 2244 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2245 uint32_t hash) 2246 { 2247 struct namecache *ncp; 2248 struct mtx *blps[2]; 2249 u_char nc_flag; 2250 2251 blps[0] = HASH2BUCKETLOCK(hash); 2252 for (;;) { 2253 blps[1] = NULL; 2254 cache_lock_vnodes_cel(cel, dvp, vp); 2255 if (vp == NULL || vp->v_type != VDIR) 2256 break; 2257 ncp = atomic_load_consume_ptr(&vp->v_cache_dd); 2258 if (ncp == NULL) 2259 break; 2260 nc_flag = atomic_load_char(&ncp->nc_flag); 2261 if ((nc_flag & NCF_ISDOTDOT) == 0) 2262 break; 2263 MPASS(ncp->nc_dvp == vp); 2264 blps[1] = NCP2BUCKETLOCK(ncp); 2265 if ((nc_flag & NCF_NEGATIVE) != 0) 2266 break; 2267 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2268 break; 2269 /* 2270 * All vnodes got re-locked. Re-validate the state and if 2271 * nothing changed we are done. Otherwise restart. 2272 */ 2273 if (ncp == vp->v_cache_dd && 2274 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2275 blps[1] == NCP2BUCKETLOCK(ncp) && 2276 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2277 break; 2278 cache_unlock_vnodes_cel(cel); 2279 cel->vlp[0] = NULL; 2280 cel->vlp[1] = NULL; 2281 cel->vlp[2] = NULL; 2282 } 2283 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2284 } 2285 2286 static void 2287 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2288 uint32_t hash) 2289 { 2290 struct namecache *ncp; 2291 struct mtx *blps[2]; 2292 u_char nc_flag; 2293 2294 blps[0] = HASH2BUCKETLOCK(hash); 2295 for (;;) { 2296 blps[1] = NULL; 2297 cache_lock_vnodes_cel(cel, dvp, vp); 2298 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); 2299 if (ncp == NULL) 2300 break; 2301 nc_flag = atomic_load_char(&ncp->nc_flag); 2302 if ((nc_flag & NCF_ISDOTDOT) == 0) 2303 break; 2304 MPASS(ncp->nc_dvp == dvp); 2305 blps[1] = NCP2BUCKETLOCK(ncp); 2306 if ((nc_flag & NCF_NEGATIVE) != 0) 2307 break; 2308 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2309 break; 2310 if (ncp == dvp->v_cache_dd && 2311 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2312 blps[1] == NCP2BUCKETLOCK(ncp) && 2313 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2314 break; 2315 cache_unlock_vnodes_cel(cel); 2316 cel->vlp[0] = NULL; 2317 cel->vlp[1] = NULL; 2318 cel->vlp[2] = NULL; 2319 } 2320 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2321 } 2322 2323 static void 2324 cache_enter_unlock(struct celockstate *cel) 2325 { 2326 2327 cache_unlock_buckets_cel(cel); 2328 cache_unlock_vnodes_cel(cel); 2329 } 2330 2331 static void __noinline 2332 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 2333 struct componentname *cnp) 2334 { 2335 struct celockstate cel; 2336 struct namecache *ncp; 2337 uint32_t hash; 2338 int len; 2339 2340 if (atomic_load_ptr(&dvp->v_cache_dd) == NULL) 2341 return; 2342 len = cnp->cn_namelen; 2343 cache_celockstate_init(&cel); 2344 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2345 cache_enter_lock_dd(&cel, dvp, vp, hash); 2346 ncp = dvp->v_cache_dd; 2347 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 2348 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 2349 cache_zap_locked(ncp); 2350 } else { 2351 ncp = NULL; 2352 } 2353 atomic_store_ptr(&dvp->v_cache_dd, NULL); 2354 cache_enter_unlock(&cel); 2355 if (ncp != NULL) 2356 cache_free(ncp); 2357 } 2358 2359 /* 2360 * Add an entry to the cache. 
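 *
 * The entry maps the name in cnp as seen in directory dvp to the vnode vp.
 * A NULL vp creates a negative entry. tsp and dtsp optionally supply the
 * timestamps stored for cache_lookup() to report back.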
2361 */ 2362 void 2363 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2364 struct timespec *tsp, struct timespec *dtsp) 2365 { 2366 struct celockstate cel; 2367 struct namecache *ncp, *n2, *ndd; 2368 struct namecache_ts *ncp_ts; 2369 struct nchashhead *ncpp; 2370 uint32_t hash; 2371 int flag; 2372 int len; 2373 2374 KASSERT(cnp->cn_namelen <= NAME_MAX, 2375 ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen, 2376 NAME_MAX)); 2377 VNPASS(!VN_IS_DOOMED(dvp), dvp); 2378 VNPASS(dvp->v_type != VNON, dvp); 2379 if (vp != NULL) { 2380 VNPASS(!VN_IS_DOOMED(vp), vp); 2381 VNPASS(vp->v_type != VNON, vp); 2382 } 2383 if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { 2384 KASSERT(dvp == vp, 2385 ("%s: different vnodes for dot entry (%p; %p)\n", __func__, 2386 dvp, vp)); 2387 } else { 2388 KASSERT(dvp != vp, 2389 ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__, 2390 cnp->cn_nameptr, dvp)); 2391 } 2392 2393 #ifdef DEBUG_CACHE 2394 if (__predict_false(!doingcache)) 2395 return; 2396 #endif 2397 2398 flag = 0; 2399 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2400 if (cnp->cn_namelen == 1) 2401 return; 2402 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 2403 cache_enter_dotdot_prep(dvp, vp, cnp); 2404 flag = NCF_ISDOTDOT; 2405 } 2406 } 2407 2408 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 2409 if (ncp == NULL) 2410 return; 2411 2412 cache_celockstate_init(&cel); 2413 ndd = NULL; 2414 ncp_ts = NULL; 2415 2416 /* 2417 * Calculate the hash key and setup as much of the new 2418 * namecache entry as possible before acquiring the lock. 2419 */ 2420 ncp->nc_flag = flag | NCF_WIP; 2421 ncp->nc_vp = vp; 2422 if (vp == NULL) 2423 cache_neg_init(ncp); 2424 ncp->nc_dvp = dvp; 2425 if (tsp != NULL) { 2426 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 2427 ncp_ts->nc_time = *tsp; 2428 ncp_ts->nc_ticks = ticks; 2429 ncp_ts->nc_nc.nc_flag |= NCF_TS; 2430 if (dtsp != NULL) { 2431 ncp_ts->nc_dotdottime = *dtsp; 2432 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 2433 } 2434 } 2435 len = ncp->nc_nlen = cnp->cn_namelen; 2436 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2437 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 2438 ncp->nc_name[len] = '\0'; 2439 cache_enter_lock(&cel, dvp, vp, hash); 2440 2441 /* 2442 * See if this vnode or negative entry is already in the cache 2443 * with this name. This can happen with concurrent lookups of 2444 * the same path name. 2445 */ 2446 ncpp = NCHHASH(hash); 2447 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 2448 if (n2->nc_dvp == dvp && 2449 n2->nc_nlen == cnp->cn_namelen && 2450 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 2451 MPASS(cache_ncp_canuse(n2)); 2452 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 2453 KASSERT(vp == NULL, 2454 ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]", 2455 __func__, NULL, vp, cnp->cn_nameptr)); 2456 else 2457 KASSERT(n2->nc_vp == vp, 2458 ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]", 2459 __func__, n2->nc_vp, vp, cnp->cn_nameptr)); 2460 /* 2461 * Entries are supposed to be immutable unless in the 2462 * process of getting destroyed. Accommodating for 2463 * changing timestamps is possible but not worth it. 2464 * This should be harmless in terms of correctness, in 2465 * the worst case resulting in an earlier expiration. 2466 * Alternatively, the found entry can be replaced 2467 * altogether. 
2468 */ 2469 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 2470 #if 0 2471 if (tsp != NULL) { 2472 KASSERT((n2->nc_flag & NCF_TS) != 0, 2473 ("no NCF_TS")); 2474 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 2475 n2_ts->nc_time = ncp_ts->nc_time; 2476 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2477 if (dtsp != NULL) { 2478 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2479 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2480 } 2481 } 2482 #endif 2483 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, 2484 vp); 2485 goto out_unlock_free; 2486 } 2487 } 2488 2489 if (flag == NCF_ISDOTDOT) { 2490 /* 2491 * See if we are trying to add .. entry, but some other lookup 2492 * has populated v_cache_dd pointer already. 2493 */ 2494 if (dvp->v_cache_dd != NULL) 2495 goto out_unlock_free; 2496 KASSERT(vp == NULL || vp->v_type == VDIR, 2497 ("wrong vnode type %p", vp)); 2498 atomic_thread_fence_rel(); 2499 atomic_store_ptr(&dvp->v_cache_dd, ncp); 2500 } 2501 2502 if (vp != NULL) { 2503 if (flag != NCF_ISDOTDOT) { 2504 /* 2505 * For this case, the cache entry maps both the 2506 * directory name in it and the name ".." for the 2507 * directory's parent. 2508 */ 2509 if ((ndd = vp->v_cache_dd) != NULL) { 2510 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2511 cache_zap_locked(ndd); 2512 else 2513 ndd = NULL; 2514 } 2515 atomic_thread_fence_rel(); 2516 atomic_store_ptr(&vp->v_cache_dd, ncp); 2517 } else if (vp->v_type != VDIR) { 2518 if (vp->v_cache_dd != NULL) { 2519 atomic_store_ptr(&vp->v_cache_dd, NULL); 2520 } 2521 } 2522 } 2523 2524 if (flag != NCF_ISDOTDOT) { 2525 if (LIST_EMPTY(&dvp->v_cache_src)) { 2526 cache_hold_vnode(dvp); 2527 } 2528 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2529 } 2530 2531 /* 2532 * If the entry is "negative", we place it into the 2533 * "negative" cache queue, otherwise, we place it into the 2534 * destination vnode's cache entries queue. 2535 */ 2536 if (vp != NULL) { 2537 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2538 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2539 vp); 2540 } else { 2541 if (cnp->cn_flags & ISWHITEOUT) 2542 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE); 2543 cache_neg_insert(ncp); 2544 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2545 ncp->nc_name); 2546 } 2547 2548 /* 2549 * Insert the new namecache entry into the appropriate chain 2550 * within the cache entries table. 2551 */ 2552 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2553 2554 atomic_thread_fence_rel(); 2555 /* 2556 * Mark the entry as fully constructed. 2557 * It is immutable past this point until its removal. 2558 */ 2559 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2560 2561 cache_enter_unlock(&cel); 2562 if (ndd != NULL) 2563 cache_free(ndd); 2564 return; 2565 out_unlock_free: 2566 cache_enter_unlock(&cel); 2567 cache_free(ncp); 2568 return; 2569 } 2570 2571 /* 2572 * A variant of the above accepting flags. 2573 * 2574 * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it. 2575 * 2576 * TODO: this routine is a hack. It blindly removes the old entry, even if it 2577 * happens to match and it is doing it in an inefficient manner. It was added 2578 * to accommodate NFS which runs into a case where the target for a given name 2579 * may change from under it. Note this does nothing to solve the following 2580 * race: 2 callers of cache_enter_time_flags pass a different target vnode for 2581 * the same [dvp, cnp]. It may be argued that code doing this is broken. 
2582 */ 2583 void 2584 cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2585 struct timespec *tsp, struct timespec *dtsp, int flags) 2586 { 2587 2588 MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0); 2589 2590 if (flags & VFS_CACHE_DROPOLD) 2591 cache_remove_cnp(dvp, cnp); 2592 cache_enter_time(dvp, vp, cnp, tsp, dtsp); 2593 } 2594 2595 static u_long 2596 cache_roundup_2(u_long val) 2597 { 2598 u_long res; 2599 2600 for (res = 1; res <= val; res <<= 1) 2601 continue; 2602 2603 return (res); 2604 } 2605 2606 static struct nchashhead * 2607 nchinittbl(u_long elements, u_long *hashmask) 2608 { 2609 struct nchashhead *hashtbl; 2610 u_long hashsize, i; 2611 2612 hashsize = cache_roundup_2(elements) / 2; 2613 2614 hashtbl = malloc(hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2615 for (i = 0; i < hashsize; i++) 2616 CK_SLIST_INIT(&hashtbl[i]); 2617 *hashmask = hashsize - 1; 2618 return (hashtbl); 2619 } 2620 2621 static void 2622 ncfreetbl(struct nchashhead *hashtbl) 2623 { 2624 2625 free(hashtbl, M_VFSCACHE); 2626 } 2627 2628 /* 2629 * Name cache initialization, from vfs_init() when we are booting 2630 */ 2631 static void 2632 nchinit(void *dummy __unused) 2633 { 2634 u_int i; 2635 2636 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2637 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2638 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2639 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2640 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2641 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2642 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2643 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2644 2645 VFS_SMR_ZONE_SET(cache_zone_small); 2646 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2647 VFS_SMR_ZONE_SET(cache_zone_large); 2648 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2649 2650 ncsize = desiredvnodes * ncsizefactor; 2651 cache_recalc_neg_min(); 2652 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2653 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2654 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2655 ncbuckethash = 7; 2656 if (ncbuckethash > nchash) 2657 ncbuckethash = nchash; 2658 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2659 M_WAITOK | M_ZERO); 2660 for (i = 0; i < numbucketlocks; i++) 2661 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2662 ncvnodehash = ncbuckethash; 2663 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2664 M_WAITOK | M_ZERO); 2665 for (i = 0; i < numvnodelocks; i++) 2666 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2667 2668 for (i = 0; i < numneglists; i++) { 2669 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); 2670 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2671 TAILQ_INIT(&neglists[i].nl_list); 2672 TAILQ_INIT(&neglists[i].nl_hotlist); 2673 } 2674 } 2675 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2676 2677 void 2678 cache_vnode_init(struct vnode *vp) 2679 { 2680 2681 LIST_INIT(&vp->v_cache_src); 2682 TAILQ_INIT(&vp->v_cache_dst); 2683 vp->v_cache_dd = NULL; 2684 cache_prehash(vp); 2685 } 2686 2687 /* 2688 * Induce transient cache misses for lockless operation in cache_lookup() by 2689 * using a temporary hash table. 2690 * 2691 * This will force a fs lookup. 
 *
 * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time
 * to observe all CPUs not performing the lookup.
 */
static void
cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash)
{

	MPASS(temphash < nchash);
	/*
	 * Change the size. The new size is smaller and can safely be used
	 * against the existing table. All lookups which now hash wrong will
	 * result in a cache miss, which all callers are supposed to know how
	 * to handle.
	 */
	atomic_store_long(&nchash, temphash);
	atomic_thread_fence_rel();
	vfs_smr_synchronize();
	/*
	 * At this point everyone sees the updated hash value, but they still
	 * see the old table.
	 */
	atomic_store_ptr(&nchashtbl, temptbl);
	atomic_thread_fence_rel();
	vfs_smr_synchronize();
	/*
	 * At this point everyone sees the updated table pointer and size pair.
	 */
}

/*
 * Set the new hash table.
 *
 * Similarly to cache_changesize_set_temp(), this has to synchronize against
 * lockless operation in cache_lookup().
 */
static void
cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash)
{

	MPASS(nchash < new_hash);
	/*
	 * Change the pointer first. This won't result in out of bounds access
	 * since the temporary table is guaranteed to be smaller.
	 */
	atomic_store_ptr(&nchashtbl, new_tbl);
	atomic_thread_fence_rel();
	vfs_smr_synchronize();
	/*
	 * At this point everyone sees the updated pointer value, but they
	 * still see the old size.
	 */
	atomic_store_long(&nchash, new_hash);
	atomic_thread_fence_rel();
	vfs_smr_synchronize();
	/*
	 * At this point everyone sees the updated table pointer and size pair.
	 */
}

void
cache_changesize(u_long newmaxvnodes)
{
	struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl;
	u_long new_nchash, old_nchash, temphash;
	struct namecache *ncp;
	uint32_t hash;
	u_long newncsize;
	u_long i;

	newncsize = newmaxvnodes * ncsizefactor;
	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
	if (newmaxvnodes < numbucketlocks)
		newmaxvnodes = numbucketlocks;

	new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
	/* If same hash table size, nothing to do */
	if (nchash == new_nchash) {
		ncfreetbl(new_nchashtbl);
		return;
	}

	temptbl = nchinittbl(1, &temphash);

	/*
	 * Move everything from the old hash table to the new table.
	 * None of the namecache entries in the table can be removed
	 * because to do so, they have to be removed from the hash table.
2780 */ 2781 cache_lock_all_vnodes(); 2782 cache_lock_all_buckets(); 2783 old_nchashtbl = nchashtbl; 2784 old_nchash = nchash; 2785 cache_changesize_set_temp(temptbl, temphash); 2786 for (i = 0; i <= old_nchash; i++) { 2787 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2788 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2789 ncp->nc_dvp); 2790 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2791 CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash); 2792 } 2793 } 2794 ncsize = newncsize; 2795 cache_recalc_neg_min(); 2796 cache_changesize_set_new(new_nchashtbl, new_nchash); 2797 cache_unlock_all_buckets(); 2798 cache_unlock_all_vnodes(); 2799 ncfreetbl(old_nchashtbl); 2800 ncfreetbl(temptbl); 2801 } 2802 2803 /* 2804 * Remove all entries from and to a particular vnode. 2805 */ 2806 static void 2807 cache_purge_impl(struct vnode *vp) 2808 { 2809 struct cache_freebatch batch; 2810 struct namecache *ncp; 2811 struct mtx *vlp, *vlp2; 2812 2813 TAILQ_INIT(&batch); 2814 vlp = VP2VNODELOCK(vp); 2815 vlp2 = NULL; 2816 mtx_lock(vlp); 2817 retry: 2818 while (!LIST_EMPTY(&vp->v_cache_src)) { 2819 ncp = LIST_FIRST(&vp->v_cache_src); 2820 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2821 goto retry; 2822 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2823 } 2824 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2825 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2826 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2827 goto retry; 2828 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2829 } 2830 ncp = vp->v_cache_dd; 2831 if (ncp != NULL) { 2832 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2833 ("lost dotdot link")); 2834 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2835 goto retry; 2836 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2837 } 2838 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2839 mtx_unlock(vlp); 2840 if (vlp2 != NULL) 2841 mtx_unlock(vlp2); 2842 cache_free_batch(&batch); 2843 } 2844 2845 /* 2846 * Opportunistic check to see if there is anything to do. 2847 */ 2848 static bool 2849 cache_has_entries(struct vnode *vp) 2850 { 2851 2852 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2853 atomic_load_ptr(&vp->v_cache_dd) == NULL) 2854 return (false); 2855 return (true); 2856 } 2857 2858 void 2859 cache_purge(struct vnode *vp) 2860 { 2861 2862 SDT_PROBE1(vfs, namecache, purge, done, vp); 2863 if (!cache_has_entries(vp)) 2864 return; 2865 cache_purge_impl(vp); 2866 } 2867 2868 /* 2869 * Only to be used by vgone. 2870 */ 2871 void 2872 cache_purge_vgone(struct vnode *vp) 2873 { 2874 struct mtx *vlp; 2875 2876 VNPASS(VN_IS_DOOMED(vp), vp); 2877 if (cache_has_entries(vp)) { 2878 cache_purge_impl(vp); 2879 return; 2880 } 2881 2882 /* 2883 * Serialize against a potential thread doing cache_purge. 2884 */ 2885 vlp = VP2VNODELOCK(vp); 2886 mtx_wait_unlocked(vlp); 2887 if (cache_has_entries(vp)) { 2888 cache_purge_impl(vp); 2889 return; 2890 } 2891 return; 2892 } 2893 2894 /* 2895 * Remove all negative entries for a particular directory vnode. 
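 *
 * Entries are collected into a batch under the vnode lock and only freed
 * after it is dropped.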
 */
void
cache_purge_negative(struct vnode *vp)
{
	struct cache_freebatch batch;
	struct namecache *ncp, *nnp;
	struct mtx *vlp;

	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
	if (LIST_EMPTY(&vp->v_cache_src))
		return;
	TAILQ_INIT(&batch);
	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
		if (!(ncp->nc_flag & NCF_NEGATIVE))
			continue;
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
	}
	mtx_unlock(vlp);
	cache_free_batch(&batch);
}

/*
 * Entry points for modifying VOP operations.
 */
void
cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
    struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
{

	ASSERT_VOP_IN_SEQC(fdvp);
	ASSERT_VOP_IN_SEQC(fvp);
	ASSERT_VOP_IN_SEQC(tdvp);
	if (tvp != NULL)
		ASSERT_VOP_IN_SEQC(tvp);

	cache_purge(fvp);
	if (tvp != NULL) {
		cache_purge(tvp);
		KASSERT(!cache_remove_cnp(tdvp, tcnp),
		    ("%s: lingering negative entry", __func__));
	} else {
		cache_remove_cnp(tdvp, tcnp);
	}

	/*
	 * TODO
	 *
	 * Historically renaming always purged all relevant entries, but that
	 * is quite wasteful. In particular, it turns out that in many cases
	 * the target file is immediately accessed after rename, inducing a
	 * cache miss.
	 *
	 * Recode this to reduce relocking and reuse the existing entry (if any)
	 * instead of just removing it above and allocating a new one here.
	 */
	cache_enter(tdvp, fvp, tcnp);
}

void
cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
{

	ASSERT_VOP_IN_SEQC(dvp);
	ASSERT_VOP_IN_SEQC(vp);
	cache_purge(vp);
}

#ifdef INVARIANTS
/*
 * Validate that if an entry exists it matches.
 */
void
cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct mtx *blp;
	uint32_t hash;

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		return;
	blp = HASH2BUCKETLOCK(hash);
	mtx_lock(blp);
	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) {
			if (ncp->nc_vp != vp)
				panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n",
				    __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp);
		}
	}
	mtx_unlock(blp);
}

void
cache_assert_no_entries(struct vnode *vp)
{

	VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp);
	VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
	VNPASS(vp->v_cache_dd == NULL, vp);
}
#endif

/*
 * Flush all entries referencing a particular filesystem.
 */
void
cache_purgevfs(struct mount *mp)
{
	struct vnode *vp, *mvp;
	size_t visited __sdt_used, purged __sdt_used;

	visited = purged = 0;
	/*
	 * Somewhat wasteful iteration over all vnodes. Would be better to
	 * support filtering and avoid the interlock to begin with.
3016 */ 3017 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 3018 visited++; 3019 if (!cache_has_entries(vp)) { 3020 VI_UNLOCK(vp); 3021 continue; 3022 } 3023 vholdl(vp); 3024 VI_UNLOCK(vp); 3025 cache_purge(vp); 3026 purged++; 3027 vdrop(vp); 3028 } 3029 3030 SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged); 3031 } 3032 3033 /* 3034 * Perform canonical checks and cache lookup and pass on to filesystem 3035 * through the vop_cachedlookup only if needed. 3036 */ 3037 3038 int 3039 vfs_cache_lookup(struct vop_lookup_args *ap) 3040 { 3041 struct vnode *dvp; 3042 int error; 3043 struct vnode **vpp = ap->a_vpp; 3044 struct componentname *cnp = ap->a_cnp; 3045 int flags = cnp->cn_flags; 3046 3047 *vpp = NULL; 3048 dvp = ap->a_dvp; 3049 3050 if (dvp->v_type != VDIR) 3051 return (ENOTDIR); 3052 3053 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 3054 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 3055 return (EROFS); 3056 3057 error = vn_dir_check_exec(dvp, cnp); 3058 if (error != 0) 3059 return (error); 3060 3061 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 3062 if (error == 0) 3063 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 3064 if (error == -1) 3065 return (0); 3066 return (error); 3067 } 3068 3069 /* Implementation of the getcwd syscall. */ 3070 int 3071 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 3072 { 3073 char *buf, *retbuf; 3074 size_t buflen; 3075 int error; 3076 3077 buflen = uap->buflen; 3078 if (__predict_false(buflen < 2)) 3079 return (EINVAL); 3080 if (buflen > MAXPATHLEN) 3081 buflen = MAXPATHLEN; 3082 3083 buf = uma_zalloc(namei_zone, M_WAITOK); 3084 error = vn_getcwd(buf, &retbuf, &buflen); 3085 if (error == 0) 3086 error = copyout(retbuf, uap->buf, buflen); 3087 uma_zfree(namei_zone, buf); 3088 return (error); 3089 } 3090 3091 int 3092 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 3093 { 3094 struct pwd *pwd; 3095 int error; 3096 3097 vfs_smr_enter(); 3098 pwd = pwd_get_smr(); 3099 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 3100 buflen, 0); 3101 VFS_SMR_ASSERT_NOT_ENTERED(); 3102 if (error < 0) { 3103 pwd = pwd_hold(curthread); 3104 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 3105 retbuf, buflen); 3106 pwd_drop(pwd); 3107 } 3108 3109 #ifdef KTRACE 3110 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 3111 ktrnamei(*retbuf); 3112 #endif 3113 return (error); 3114 } 3115 3116 /* 3117 * Canonicalize a path by walking it forward and back. 3118 * 3119 * BUGS: 3120 * - Nothing guarantees the integrity of the entire chain. Consider the case 3121 * where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of 3122 * "foo" into "quux" during the backwards walk. The result will be 3123 * "quux/bar/baz/qux", which could not have been obtained by an incremental 3124 * walk in userspace. Moreover, the path we return is inaccessible if the 3125 * calling thread lacks permission to traverse "quux". 
 */
static int
kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
    size_t size, int flags, enum uio_seg pathseg)
{
	struct nameidata nd;
	char *retbuf, *freebuf;
	int error;

	if (flags != 0)
		return (EINVAL);
	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1,
	    pathseg, path, fd, &cap_fstat_rights);
	if ((error = namei(&nd)) != 0)
		return (error);

	if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR &&
	    (nd.ni_vp->v_vflag & VV_ROOT) != 0) {
		/*
		 * This happens if vp is a file mount. The call to
		 * vn_fullpath_hardlink can panic if path resolution can't be
		 * handled without the directory.
		 *
		 * To resolve this, we find the vnode which was mounted on -
		 * this should have a unique global path since we disallow
		 * mounting on linked files.
		 */
		struct vnode *covered_vp;
		error = vn_lock(nd.ni_vp, LK_SHARED);
		if (error != 0)
			goto out;
		covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered;
		vref(covered_vp);
		VOP_UNLOCK(nd.ni_vp);
		error = vn_fullpath(covered_vp, &retbuf, &freebuf);
		vrele(covered_vp);
	} else {
		error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, nd.ni_cnd.cn_nameptr,
		    nd.ni_cnd.cn_namelen, &retbuf, &freebuf, &size);
	}
	if (error == 0) {
		error = copyout(retbuf, buf, size);
		free(freebuf, M_TEMP);
	}
out:
	vrele(nd.ni_vp);
	vrele(nd.ni_dvp);
	NDFREE_PNBUF(&nd);
	return (error);
}

int
sys___realpathat(struct thread *td, struct __realpathat_args *uap)
{

	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
	    uap->flags, UIO_USERSPACE));
}

/*
 * Retrieve the full filesystem path that corresponds to a vnode from the name
 * cache (if available).
 */
int
vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
{
	struct pwd *pwd;
	char *buf;
	size_t buflen;
	int error;

	if (__predict_false(vp == NULL))
		return (EINVAL);

	buflen = MAXPATHLEN;
	buf = malloc(buflen, M_TEMP, M_WAITOK);
	vfs_smr_enter();
	pwd = pwd_get_smr();
	error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
	VFS_SMR_ASSERT_NOT_ENTERED();
	if (error < 0) {
		pwd = pwd_hold(curthread);
		error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
		pwd_drop(pwd);
	}
	if (error == 0)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}

/*
 * This function is similar to vn_fullpath, but it attempts to look up the
 * pathname relative to the global root mount point. This is required for the
 * auditing sub-system, as audited pathnames must be absolute, relative to the
 * global root mount point.
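 *
 * Unlike vn_fullpath(), the walk terminates at rootvnode rather than the
 * calling process's root directory.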
3223 */ 3224 int 3225 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 3226 { 3227 char *buf; 3228 size_t buflen; 3229 int error; 3230 3231 if (__predict_false(vp == NULL)) 3232 return (EINVAL); 3233 buflen = MAXPATHLEN; 3234 buf = malloc(buflen, M_TEMP, M_WAITOK); 3235 vfs_smr_enter(); 3236 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); 3237 VFS_SMR_ASSERT_NOT_ENTERED(); 3238 if (error < 0) { 3239 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 3240 } 3241 if (error == 0) 3242 *freebuf = buf; 3243 else 3244 free(buf, M_TEMP); 3245 return (error); 3246 } 3247 3248 static struct namecache * 3249 vn_dd_from_dst(struct vnode *vp) 3250 { 3251 struct namecache *ncp; 3252 3253 cache_assert_vnode_locked(vp); 3254 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 3255 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3256 return (ncp); 3257 } 3258 return (NULL); 3259 } 3260 3261 int 3262 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) 3263 { 3264 struct vnode *dvp; 3265 struct namecache *ncp; 3266 struct mtx *vlp; 3267 int error; 3268 3269 vlp = VP2VNODELOCK(*vp); 3270 mtx_lock(vlp); 3271 ncp = (*vp)->v_cache_dd; 3272 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 3273 KASSERT(ncp == vn_dd_from_dst(*vp), 3274 ("%s: mismatch for dd entry (%p != %p)", __func__, 3275 ncp, vn_dd_from_dst(*vp))); 3276 } else { 3277 ncp = vn_dd_from_dst(*vp); 3278 } 3279 if (ncp != NULL) { 3280 if (*buflen < ncp->nc_nlen) { 3281 mtx_unlock(vlp); 3282 vrele(*vp); 3283 counter_u64_add(numfullpathfail4, 1); 3284 error = ENOMEM; 3285 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3286 vp, NULL); 3287 return (error); 3288 } 3289 *buflen -= ncp->nc_nlen; 3290 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3291 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 3292 ncp->nc_name, vp); 3293 dvp = *vp; 3294 *vp = ncp->nc_dvp; 3295 vref(*vp); 3296 mtx_unlock(vlp); 3297 vrele(dvp); 3298 return (0); 3299 } 3300 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 3301 3302 mtx_unlock(vlp); 3303 vn_lock(*vp, LK_SHARED | LK_RETRY); 3304 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); 3305 vput(*vp); 3306 if (error) { 3307 counter_u64_add(numfullpathfail2, 1); 3308 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3309 return (error); 3310 } 3311 3312 *vp = dvp; 3313 if (VN_IS_DOOMED(dvp)) { 3314 /* forced unmount */ 3315 vrele(dvp); 3316 error = ENOENT; 3317 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3318 return (error); 3319 } 3320 /* 3321 * *vp has its use count incremented still. 3322 */ 3323 3324 return (0); 3325 } 3326 3327 /* 3328 * Resolve a directory to a pathname. 3329 * 3330 * The name of the directory can always be found in the namecache or fetched 3331 * from the filesystem. There is also guaranteed to be only one parent, meaning 3332 * we can just follow vnodes up until we find the root. 3333 * 3334 * The vnode must be referenced. 
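 *
 * On success the path is placed at the end of the provided buffer, *retbuf
 * points at its first character and *len is updated to reflect the amount of
 * buffer space used.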
3335 */ 3336 static int 3337 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3338 size_t *len, size_t addend) 3339 { 3340 #ifdef KDTRACE_HOOKS 3341 struct vnode *startvp = vp; 3342 #endif 3343 struct vnode *vp1; 3344 size_t buflen; 3345 int error; 3346 bool slash_prefixed; 3347 3348 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 3349 VNPASS(vp->v_usecount > 0, vp); 3350 3351 buflen = *len; 3352 3353 slash_prefixed = true; 3354 if (addend == 0) { 3355 MPASS(*len >= 2); 3356 buflen--; 3357 buf[buflen] = '\0'; 3358 slash_prefixed = false; 3359 } 3360 3361 error = 0; 3362 3363 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 3364 counter_u64_add(numfullpathcalls, 1); 3365 while (vp != rdir && vp != rootvnode) { 3366 /* 3367 * The vp vnode must be already fully constructed, 3368 * since it is either found in namecache or obtained 3369 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 3370 * without obtaining the vnode lock. 3371 */ 3372 if ((vp->v_vflag & VV_ROOT) != 0) { 3373 vn_lock(vp, LK_RETRY | LK_SHARED); 3374 3375 /* 3376 * With the vnode locked, check for races with 3377 * unmount, forced or not. Note that we 3378 * already verified that vp is not equal to 3379 * the root vnode, which means that 3380 * mnt_vnodecovered can be NULL only for the 3381 * case of unmount. 3382 */ 3383 if (VN_IS_DOOMED(vp) || 3384 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 3385 vp1->v_mountedhere != vp->v_mount) { 3386 vput(vp); 3387 error = ENOENT; 3388 SDT_PROBE3(vfs, namecache, fullpath, return, 3389 error, vp, NULL); 3390 break; 3391 } 3392 3393 vref(vp1); 3394 vput(vp); 3395 vp = vp1; 3396 continue; 3397 } 3398 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 3399 error = vn_vptocnp(&vp, buf, &buflen); 3400 if (error) 3401 break; 3402 if (buflen == 0) { 3403 vrele(vp); 3404 error = ENOMEM; 3405 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3406 startvp, NULL); 3407 break; 3408 } 3409 buf[--buflen] = '/'; 3410 slash_prefixed = true; 3411 } 3412 if (error) 3413 return (error); 3414 if (!slash_prefixed) { 3415 if (buflen == 0) { 3416 vrele(vp); 3417 counter_u64_add(numfullpathfail4, 1); 3418 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 3419 startvp, NULL); 3420 return (ENOMEM); 3421 } 3422 buf[--buflen] = '/'; 3423 } 3424 counter_u64_add(numfullpathfound, 1); 3425 vrele(vp); 3426 3427 *retbuf = buf + buflen; 3428 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 3429 *len -= buflen; 3430 *len += addend; 3431 return (0); 3432 } 3433 3434 /* 3435 * Resolve an arbitrary vnode to a pathname. 
3436 * 3437 * Note 2 caveats: 3438 * - hardlinks are not tracked, thus if the vnode is not a directory this can 3439 * resolve to a different path than the one used to find it 3440 * - namecache is not mandatory, meaning names are not guaranteed to be added 3441 * (in which case resolving fails) 3442 */ 3443 static void __inline 3444 cache_rev_failed_impl(int *reason, int line) 3445 { 3446 3447 *reason = line; 3448 } 3449 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 3450 3451 static int 3452 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 3453 char **retbuf, size_t *buflen, size_t addend) 3454 { 3455 #ifdef KDTRACE_HOOKS 3456 struct vnode *startvp = vp; 3457 #endif 3458 struct vnode *tvp; 3459 struct mount *mp; 3460 struct namecache *ncp; 3461 size_t orig_buflen; 3462 int reason; 3463 int error; 3464 #ifdef KDTRACE_HOOKS 3465 int i; 3466 #endif 3467 seqc_t vp_seqc, tvp_seqc; 3468 u_char nc_flag; 3469 3470 VFS_SMR_ASSERT_ENTERED(); 3471 3472 if (!atomic_load_char(&cache_fast_lookup_enabled)) { 3473 vfs_smr_exit(); 3474 return (-1); 3475 } 3476 3477 orig_buflen = *buflen; 3478 3479 if (addend == 0) { 3480 MPASS(*buflen >= 2); 3481 *buflen -= 1; 3482 buf[*buflen] = '\0'; 3483 } 3484 3485 if (vp == rdir || vp == rootvnode) { 3486 if (addend == 0) { 3487 *buflen -= 1; 3488 buf[*buflen] = '/'; 3489 } 3490 goto out_ok; 3491 } 3492 3493 #ifdef KDTRACE_HOOKS 3494 i = 0; 3495 #endif 3496 error = -1; 3497 ncp = NULL; /* for sdt probe down below */ 3498 vp_seqc = vn_seqc_read_any(vp); 3499 if (seqc_in_modify(vp_seqc)) { 3500 cache_rev_failed(&reason); 3501 goto out_abort; 3502 } 3503 3504 for (;;) { 3505 #ifdef KDTRACE_HOOKS 3506 i++; 3507 #endif 3508 if ((vp->v_vflag & VV_ROOT) != 0) { 3509 mp = atomic_load_ptr(&vp->v_mount); 3510 if (mp == NULL) { 3511 cache_rev_failed(&reason); 3512 goto out_abort; 3513 } 3514 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 3515 tvp_seqc = vn_seqc_read_any(tvp); 3516 if (seqc_in_modify(tvp_seqc)) { 3517 cache_rev_failed(&reason); 3518 goto out_abort; 3519 } 3520 if (!vn_seqc_consistent(vp, vp_seqc)) { 3521 cache_rev_failed(&reason); 3522 goto out_abort; 3523 } 3524 vp = tvp; 3525 vp_seqc = tvp_seqc; 3526 continue; 3527 } 3528 ncp = atomic_load_consume_ptr(&vp->v_cache_dd); 3529 if (ncp == NULL) { 3530 cache_rev_failed(&reason); 3531 goto out_abort; 3532 } 3533 nc_flag = atomic_load_char(&ncp->nc_flag); 3534 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3535 cache_rev_failed(&reason); 3536 goto out_abort; 3537 } 3538 if (ncp->nc_nlen >= *buflen) { 3539 cache_rev_failed(&reason); 3540 error = ENOMEM; 3541 goto out_abort; 3542 } 3543 *buflen -= ncp->nc_nlen; 3544 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3545 *buflen -= 1; 3546 buf[*buflen] = '/'; 3547 tvp = ncp->nc_dvp; 3548 tvp_seqc = vn_seqc_read_any(tvp); 3549 if (seqc_in_modify(tvp_seqc)) { 3550 cache_rev_failed(&reason); 3551 goto out_abort; 3552 } 3553 if (!vn_seqc_consistent(vp, vp_seqc)) { 3554 cache_rev_failed(&reason); 3555 goto out_abort; 3556 } 3557 /* 3558 * Acquire fence provided by vn_seqc_read_any above. 
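 * It keeps the v_cache_dd re-check and the cache_ncp_canuse() call below
 * from being reordered before the sequence counter reads.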
3559 */ 3560 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) { 3561 cache_rev_failed(&reason); 3562 goto out_abort; 3563 } 3564 if (!cache_ncp_canuse(ncp)) { 3565 cache_rev_failed(&reason); 3566 goto out_abort; 3567 } 3568 vp = tvp; 3569 vp_seqc = tvp_seqc; 3570 if (vp == rdir || vp == rootvnode) 3571 break; 3572 } 3573 out_ok: 3574 vfs_smr_exit(); 3575 *retbuf = buf + *buflen; 3576 *buflen = orig_buflen - *buflen + addend; 3577 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 3578 return (0); 3579 3580 out_abort: 3581 *buflen = orig_buflen; 3582 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 3583 vfs_smr_exit(); 3584 return (error); 3585 } 3586 3587 static int 3588 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3589 size_t *buflen) 3590 { 3591 size_t orig_buflen, addend; 3592 int error; 3593 3594 if (*buflen < 2) 3595 return (EINVAL); 3596 3597 orig_buflen = *buflen; 3598 3599 vref(vp); 3600 addend = 0; 3601 if (vp->v_type != VDIR) { 3602 *buflen -= 1; 3603 buf[*buflen] = '\0'; 3604 error = vn_vptocnp(&vp, buf, buflen); 3605 if (error) 3606 return (error); 3607 if (*buflen == 0) { 3608 vrele(vp); 3609 return (ENOMEM); 3610 } 3611 *buflen -= 1; 3612 buf[*buflen] = '/'; 3613 addend = orig_buflen - *buflen; 3614 } 3615 3616 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); 3617 } 3618 3619 /* 3620 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 3621 * 3622 * Since the namecache does not track hardlinks, the caller is expected to 3623 * first look up the target vnode with WANTPARENT flag passed to namei to get 3624 * dvp and vp. 3625 * 3626 * Then we have 2 cases: 3627 * - if the found vnode is a directory, the path can be constructed just by 3628 * following names up the chain 3629 * - otherwise we populate the buffer with the saved name and start resolving 3630 * from the parent 3631 */ 3632 int 3633 vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp, 3634 const char *hrdl_name, size_t hrdl_name_length, 3635 char **retbuf, char **freebuf, size_t *buflen) 3636 { 3637 char *buf, *tmpbuf; 3638 struct pwd *pwd; 3639 size_t addend; 3640 int error; 3641 __enum_uint8(vtype) type; 3642 3643 if (*buflen < 2) 3644 return (EINVAL); 3645 if (*buflen > MAXPATHLEN) 3646 *buflen = MAXPATHLEN; 3647 3648 buf = malloc(*buflen, M_TEMP, M_WAITOK); 3649 3650 addend = 0; 3651 3652 /* 3653 * Check for VBAD to work around the vp_crossmp bug in lookup(). 3654 * 3655 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 3656 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 3657 * If the type is VDIR (like in this very case) we can skip looking 3658 * at ni_dvp in the first place. However, since vnodes get passed here 3659 * unlocked the target may transition to doomed state (type == VBAD) 3660 * before we get to evaluate the condition. If this happens, we will 3661 * populate part of the buffer and descend to vn_fullpath_dir with 3662 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 
3663 */ 3664 type = atomic_load_8(&vp->v_type); 3665 if (type == VBAD) { 3666 error = ENOENT; 3667 goto out_bad; 3668 } 3669 if (type != VDIR) { 3670 addend = hrdl_name_length + 2; 3671 if (*buflen < addend) { 3672 error = ENOMEM; 3673 goto out_bad; 3674 } 3675 *buflen -= addend; 3676 tmpbuf = buf + *buflen; 3677 tmpbuf[0] = '/'; 3678 memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length); 3679 tmpbuf[addend - 1] = '\0'; 3680 vp = dvp; 3681 } 3682 3683 vfs_smr_enter(); 3684 pwd = pwd_get_smr(); 3685 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3686 addend); 3687 VFS_SMR_ASSERT_NOT_ENTERED(); 3688 if (error < 0) { 3689 pwd = pwd_hold(curthread); 3690 vref(vp); 3691 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3692 addend); 3693 pwd_drop(pwd); 3694 } 3695 if (error != 0) 3696 goto out_bad; 3697 3698 *freebuf = buf; 3699 3700 return (0); 3701 out_bad: 3702 free(buf, M_TEMP); 3703 return (error); 3704 } 3705 3706 struct vnode * 3707 vn_dir_dd_ino(struct vnode *vp) 3708 { 3709 struct namecache *ncp; 3710 struct vnode *ddvp; 3711 struct mtx *vlp; 3712 enum vgetstate vs; 3713 3714 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3715 vlp = VP2VNODELOCK(vp); 3716 mtx_lock(vlp); 3717 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3718 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3719 continue; 3720 ddvp = ncp->nc_dvp; 3721 vs = vget_prep(ddvp); 3722 mtx_unlock(vlp); 3723 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3724 return (NULL); 3725 return (ddvp); 3726 } 3727 mtx_unlock(vlp); 3728 return (NULL); 3729 } 3730 3731 int 3732 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3733 { 3734 struct namecache *ncp; 3735 struct mtx *vlp; 3736 int l; 3737 3738 vlp = VP2VNODELOCK(vp); 3739 mtx_lock(vlp); 3740 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3741 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3742 break; 3743 if (ncp == NULL) { 3744 mtx_unlock(vlp); 3745 return (ENOENT); 3746 } 3747 l = min(ncp->nc_nlen, buflen - 1); 3748 memcpy(buf, ncp->nc_name, l); 3749 mtx_unlock(vlp); 3750 buf[l] = '\0'; 3751 return (0); 3752 } 3753 3754 /* 3755 * This function updates path string to vnode's full global path 3756 * and checks the size of the new path string against the pathlen argument. 3757 * 3758 * Requires a locked, referenced vnode. 3759 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3760 * 3761 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3762 * because it falls back to the ".." lookup if the namecache lookup fails. 3763 */ 3764 int 3765 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3766 u_int pathlen) 3767 { 3768 struct nameidata nd; 3769 struct vnode *vp1; 3770 char *rpath, *fbuf; 3771 int error; 3772 3773 ASSERT_VOP_ELOCKED(vp, __func__); 3774 3775 /* Construct global filesystem path from vp. */ 3776 VOP_UNLOCK(vp); 3777 error = vn_fullpath_global(vp, &rpath, &fbuf); 3778 3779 if (error != 0) { 3780 vrele(vp); 3781 return (error); 3782 } 3783 3784 if (strlen(rpath) >= pathlen) { 3785 vrele(vp); 3786 error = ENAMETOOLONG; 3787 goto out; 3788 } 3789 3790 /* 3791 * Re-lookup the vnode by path to detect a possible rename. 3792 * As a side effect, the vnode is relocked. 3793 * If vnode was renamed, return ENOENT. 
3794 */ 3795 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path); 3796 error = namei(&nd); 3797 if (error != 0) { 3798 vrele(vp); 3799 goto out; 3800 } 3801 NDFREE_PNBUF(&nd); 3802 vp1 = nd.ni_vp; 3803 vrele(vp); 3804 if (vp1 == vp) 3805 strcpy(path, rpath); 3806 else { 3807 vput(vp1); 3808 error = ENOENT; 3809 } 3810 3811 out: 3812 free(fbuf, M_TEMP); 3813 return (error); 3814 } 3815 3816 /* 3817 * This is similar to vn_path_to_global_path but allows for regular 3818 * files which may not be present in the cache. 3819 * 3820 * Requires a locked, referenced vnode. 3821 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3822 */ 3823 int 3824 vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp, 3825 struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name, 3826 size_t leaf_length) 3827 { 3828 struct nameidata nd; 3829 struct vnode *vp1; 3830 char *rpath, *fbuf; 3831 size_t len; 3832 int error; 3833 3834 ASSERT_VOP_ELOCKED(vp, __func__); 3835 3836 /* 3837 * Construct global filesystem path from dvp, vp and leaf 3838 * name. 3839 */ 3840 VOP_UNLOCK(vp); 3841 len = pathlen; 3842 error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length, 3843 &rpath, &fbuf, &len); 3844 3845 if (error != 0) { 3846 vrele(vp); 3847 return (error); 3848 } 3849 3850 if (strlen(rpath) >= pathlen) { 3851 vrele(vp); 3852 error = ENAMETOOLONG; 3853 goto out; 3854 } 3855 3856 /* 3857 * Re-lookup the vnode by path to detect a possible rename. 3858 * As a side effect, the vnode is relocked. 3859 * If vnode was renamed, return ENOENT. 3860 */ 3861 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path); 3862 error = namei(&nd); 3863 if (error != 0) { 3864 vrele(vp); 3865 goto out; 3866 } 3867 NDFREE_PNBUF(&nd); 3868 vp1 = nd.ni_vp; 3869 vrele(vp); 3870 if (vp1 == vp) 3871 strcpy(path, rpath); 3872 else { 3873 vput(vp1); 3874 error = ENOENT; 3875 } 3876 3877 out: 3878 free(fbuf, M_TEMP); 3879 return (error); 3880 } 3881 3882 #ifdef DDB 3883 static void 3884 db_print_vpath(struct vnode *vp) 3885 { 3886 3887 while (vp != NULL) { 3888 db_printf("%p: ", vp); 3889 if (vp == rootvnode) { 3890 db_printf("/"); 3891 vp = NULL; 3892 } else { 3893 if (vp->v_vflag & VV_ROOT) { 3894 db_printf("<mount point>"); 3895 vp = vp->v_mount->mnt_vnodecovered; 3896 } else { 3897 struct namecache *ncp; 3898 char *ncn; 3899 int i; 3900 3901 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3902 if (ncp != NULL) { 3903 ncn = ncp->nc_name; 3904 for (i = 0; i < ncp->nc_nlen; i++) 3905 db_printf("%c", *ncn++); 3906 vp = ncp->nc_dvp; 3907 } else { 3908 vp = NULL; 3909 } 3910 } 3911 } 3912 db_printf("\n"); 3913 } 3914 3915 return; 3916 } 3917 3918 DB_SHOW_COMMAND(vpath, db_show_vpath) 3919 { 3920 struct vnode *vp; 3921 3922 if (!have_addr) { 3923 db_printf("usage: show vpath <struct vnode *>\n"); 3924 return; 3925 } 3926 3927 vp = (struct vnode *)addr; 3928 db_print_vpath(vp); 3929 } 3930 3931 #endif 3932 3933 static int cache_fast_lookup = 1; 3934 3935 #define CACHE_FPL_FAILED -2020 3936 3937 static int 3938 cache_vop_bad_vexec(struct vop_fplookup_vexec_args *v) 3939 { 3940 vn_printf(v->a_vp, "no proper vop_fplookup_vexec\n"); 3941 panic("no proper vop_fplookup_vexec"); 3942 } 3943 3944 static int 3945 cache_vop_bad_symlink(struct vop_fplookup_symlink_args *v) 3946 { 3947 vn_printf(v->a_vp, "no proper vop_fplookup_symlink\n"); 3948 panic("no proper vop_fplookup_symlink"); 3949 } 3950 3951 void 3952 cache_vop_vector_register(struct vop_vector *v) 3953 { 3954 size_t ops; 3955 3956 ops = 0; 
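	/* Count how many of the fast path lookup VOPs the vector provides. */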
        if (v->vop_fplookup_vexec != NULL) {
                ops++;
        }
        if (v->vop_fplookup_symlink != NULL) {
                ops++;
        }

        if (ops == 2) {
                return;
        }

        if (ops == 0) {
                v->vop_fplookup_vexec = cache_vop_bad_vexec;
                v->vop_fplookup_symlink = cache_vop_bad_symlink;
                return;
        }

        printf("%s: invalid vop vector %p -- either all or none fplookup vops "
            "need to be provided\n", __func__, v);
        if (v->vop_fplookup_vexec == NULL) {
                printf("%s: missing vop_fplookup_vexec\n", __func__);
        }
        if (v->vop_fplookup_symlink == NULL) {
                printf("%s: missing vop_fplookup_symlink\n", __func__);
        }
        panic("bad vop vector %p", v);
}

#ifdef INVARIANTS
void
cache_validate_vop_vector(struct mount *mp, struct vop_vector *vops)
{
        if (mp == NULL)
                return;

        if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
                return;

        if (vops->vop_fplookup_vexec == NULL ||
            vops->vop_fplookup_vexec == cache_vop_bad_vexec)
                panic("bad vop_fplookup_vexec on vector %p for filesystem %s",
                    vops, mp->mnt_vfc->vfc_name);

        if (vops->vop_fplookup_symlink == NULL ||
            vops->vop_fplookup_symlink == cache_vop_bad_symlink)
                panic("bad vop_fplookup_symlink on vector %p for filesystem %s",
                    vops, mp->mnt_vfc->vfc_name);
}
#endif

void
cache_fast_lookup_enabled_recalc(void)
{
        int lookup_flag;
        int mac_on;

#ifdef MAC
        mac_on = mac_vnode_check_lookup_enabled();
        mac_on |= mac_vnode_check_readlink_enabled();
#else
        mac_on = 0;
#endif

        lookup_flag = atomic_load_int(&cache_fast_lookup);
        if (lookup_flag && !mac_on) {
                atomic_store_char(&cache_fast_lookup_enabled, true);
        } else {
                atomic_store_char(&cache_fast_lookup_enabled, false);
        }
}

static int
sysctl_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
{
        int error, old;

        old = atomic_load_int(&cache_fast_lookup);
        error = sysctl_handle_int(oidp, arg1, arg2, req);
        if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup))
                cache_fast_lookup_enabled_recalc();
        return (error);
}
SYSCTL_PROC(_vfs, OID_AUTO, cache_fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
    &cache_fast_lookup, 0, sysctl_vfs_cache_fast_lookup, "IU", "");

/*
 * Components of nameidata (or objects it can point to) which may
 * need restoring in case fast path lookup fails.
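 *
 * These are saved by cache_fpl_checkpoint_outer() before the lookup starts
 * and put back by cache_fpl_restore_partial()/cache_fpl_restore_abort()
 * when the work is handed over to the regular (locked) lookup.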
4045 */ 4046 struct nameidata_outer { 4047 size_t ni_pathlen; 4048 int cn_flags; 4049 }; 4050 4051 struct nameidata_saved { 4052 #ifdef INVARIANTS 4053 char *cn_nameptr; 4054 size_t ni_pathlen; 4055 #endif 4056 }; 4057 4058 #ifdef INVARIANTS 4059 struct cache_fpl_debug { 4060 size_t ni_pathlen; 4061 }; 4062 #endif 4063 4064 struct cache_fpl { 4065 struct nameidata *ndp; 4066 struct componentname *cnp; 4067 char *nulchar; 4068 struct vnode *dvp; 4069 struct vnode *tvp; 4070 seqc_t dvp_seqc; 4071 seqc_t tvp_seqc; 4072 uint32_t hash; 4073 struct nameidata_saved snd; 4074 struct nameidata_outer snd_outer; 4075 int line; 4076 enum cache_fpl_status status:8; 4077 bool in_smr; 4078 bool fsearch; 4079 struct pwd **pwd; 4080 #ifdef INVARIANTS 4081 struct cache_fpl_debug debug; 4082 #endif 4083 }; 4084 4085 static bool cache_fplookup_mp_supported(struct mount *mp); 4086 static bool cache_fplookup_is_mp(struct cache_fpl *fpl); 4087 static int cache_fplookup_cross_mount(struct cache_fpl *fpl); 4088 static int cache_fplookup_partial_setup(struct cache_fpl *fpl); 4089 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl); 4090 static int cache_fplookup_trailingslash(struct cache_fpl *fpl); 4091 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl); 4092 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl); 4093 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n); 4094 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n); 4095 4096 static void 4097 cache_fpl_cleanup_cnp(struct componentname *cnp) 4098 { 4099 4100 uma_zfree(namei_zone, cnp->cn_pnbuf); 4101 cnp->cn_pnbuf = NULL; 4102 cnp->cn_nameptr = NULL; 4103 } 4104 4105 static struct vnode * 4106 cache_fpl_handle_root(struct cache_fpl *fpl) 4107 { 4108 struct nameidata *ndp; 4109 struct componentname *cnp; 4110 4111 ndp = fpl->ndp; 4112 cnp = fpl->cnp; 4113 4114 MPASS(*(cnp->cn_nameptr) == '/'); 4115 cnp->cn_nameptr++; 4116 cache_fpl_pathlen_dec(fpl); 4117 4118 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 4119 do { 4120 cnp->cn_nameptr++; 4121 cache_fpl_pathlen_dec(fpl); 4122 } while (*(cnp->cn_nameptr) == '/'); 4123 } 4124 4125 return (ndp->ni_rootdir); 4126 } 4127 4128 static void 4129 cache_fpl_checkpoint_outer(struct cache_fpl *fpl) 4130 { 4131 4132 fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen; 4133 fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags; 4134 } 4135 4136 static void 4137 cache_fpl_checkpoint(struct cache_fpl *fpl) 4138 { 4139 4140 #ifdef INVARIANTS 4141 fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 4142 fpl->snd.ni_pathlen = fpl->debug.ni_pathlen; 4143 #endif 4144 } 4145 4146 static void 4147 cache_fpl_restore_partial(struct cache_fpl *fpl) 4148 { 4149 4150 fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags; 4151 #ifdef INVARIANTS 4152 fpl->debug.ni_pathlen = fpl->snd.ni_pathlen; 4153 #endif 4154 } 4155 4156 static void 4157 cache_fpl_restore_abort(struct cache_fpl *fpl) 4158 { 4159 4160 cache_fpl_restore_partial(fpl); 4161 /* 4162 * It is 0 on entry by API contract. 
4163 */ 4164 fpl->ndp->ni_resflags = 0; 4165 fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf; 4166 fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen; 4167 } 4168 4169 #ifdef INVARIANTS 4170 #define cache_fpl_smr_assert_entered(fpl) ({ \ 4171 struct cache_fpl *_fpl = (fpl); \ 4172 MPASS(_fpl->in_smr == true); \ 4173 VFS_SMR_ASSERT_ENTERED(); \ 4174 }) 4175 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 4176 struct cache_fpl *_fpl = (fpl); \ 4177 MPASS(_fpl->in_smr == false); \ 4178 VFS_SMR_ASSERT_NOT_ENTERED(); \ 4179 }) 4180 static void 4181 cache_fpl_assert_status(struct cache_fpl *fpl) 4182 { 4183 4184 switch (fpl->status) { 4185 case CACHE_FPL_STATUS_UNSET: 4186 __assert_unreachable(); 4187 break; 4188 case CACHE_FPL_STATUS_DESTROYED: 4189 case CACHE_FPL_STATUS_ABORTED: 4190 case CACHE_FPL_STATUS_PARTIAL: 4191 case CACHE_FPL_STATUS_HANDLED: 4192 break; 4193 } 4194 } 4195 #else 4196 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 4197 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 4198 #define cache_fpl_assert_status(fpl) do { } while (0) 4199 #endif 4200 4201 #define cache_fpl_smr_enter_initial(fpl) ({ \ 4202 struct cache_fpl *_fpl = (fpl); \ 4203 vfs_smr_enter(); \ 4204 _fpl->in_smr = true; \ 4205 }) 4206 4207 #define cache_fpl_smr_enter(fpl) ({ \ 4208 struct cache_fpl *_fpl = (fpl); \ 4209 MPASS(_fpl->in_smr == false); \ 4210 vfs_smr_enter(); \ 4211 _fpl->in_smr = true; \ 4212 }) 4213 4214 #define cache_fpl_smr_exit(fpl) ({ \ 4215 struct cache_fpl *_fpl = (fpl); \ 4216 MPASS(_fpl->in_smr == true); \ 4217 vfs_smr_exit(); \ 4218 _fpl->in_smr = false; \ 4219 }) 4220 4221 static int 4222 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line) 4223 { 4224 4225 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 4226 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 4227 ("%s: converting to abort from %d at %d, set at %d\n", 4228 __func__, fpl->status, line, fpl->line)); 4229 } 4230 cache_fpl_smr_assert_not_entered(fpl); 4231 fpl->status = CACHE_FPL_STATUS_ABORTED; 4232 fpl->line = line; 4233 return (CACHE_FPL_FAILED); 4234 } 4235 4236 #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__) 4237 4238 static int __noinline 4239 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 4240 { 4241 struct nameidata *ndp; 4242 struct componentname *cnp; 4243 4244 ndp = fpl->ndp; 4245 cnp = fpl->cnp; 4246 4247 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 4248 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 4249 ("%s: converting to abort from %d at %d, set at %d\n", 4250 __func__, fpl->status, line, fpl->line)); 4251 } 4252 fpl->status = CACHE_FPL_STATUS_ABORTED; 4253 fpl->line = line; 4254 if (fpl->in_smr) 4255 cache_fpl_smr_exit(fpl); 4256 cache_fpl_restore_abort(fpl); 4257 /* 4258 * Resolving symlinks overwrites data passed by the caller. 4259 * Let namei know. 
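 *
 * In particular cn_pnbuf may already contain an expanded symlink target
 * rather than the path supplied by the caller, so the lookup cannot simply
 * be restarted from scratch. The buffer is freed and the status is set to
 * DESTROYED below.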
4260 */ 4261 if (ndp->ni_loopcnt > 0) { 4262 fpl->status = CACHE_FPL_STATUS_DESTROYED; 4263 cache_fpl_cleanup_cnp(cnp); 4264 } 4265 return (CACHE_FPL_FAILED); 4266 } 4267 4268 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 4269 4270 static int __noinline 4271 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 4272 { 4273 4274 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 4275 ("%s: setting to partial at %d, but already set to %d at %d\n", 4276 __func__, line, fpl->status, fpl->line)); 4277 cache_fpl_smr_assert_entered(fpl); 4278 fpl->status = CACHE_FPL_STATUS_PARTIAL; 4279 fpl->line = line; 4280 return (cache_fplookup_partial_setup(fpl)); 4281 } 4282 4283 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 4284 4285 static int 4286 cache_fpl_handled_impl(struct cache_fpl *fpl, int line) 4287 { 4288 4289 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 4290 ("%s: setting to handled at %d, but already set to %d at %d\n", 4291 __func__, line, fpl->status, fpl->line)); 4292 cache_fpl_smr_assert_not_entered(fpl); 4293 fpl->status = CACHE_FPL_STATUS_HANDLED; 4294 fpl->line = line; 4295 return (0); 4296 } 4297 4298 #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__) 4299 4300 static int 4301 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line) 4302 { 4303 4304 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 4305 ("%s: setting to handled at %d, but already set to %d at %d\n", 4306 __func__, line, fpl->status, fpl->line)); 4307 MPASS(error != 0); 4308 MPASS(error != CACHE_FPL_FAILED); 4309 cache_fpl_smr_assert_not_entered(fpl); 4310 fpl->status = CACHE_FPL_STATUS_HANDLED; 4311 fpl->line = line; 4312 fpl->dvp = NULL; 4313 fpl->tvp = NULL; 4314 return (error); 4315 } 4316 4317 #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__) 4318 4319 static bool 4320 cache_fpl_terminated(struct cache_fpl *fpl) 4321 { 4322 4323 return (fpl->status != CACHE_FPL_STATUS_UNSET); 4324 } 4325 4326 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 4327 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ 4328 FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | ISRESTARTED | WILLBEDIR | \ 4329 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \ 4330 OPENWRITE | WANTIOCTLCAPS) 4331 4332 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 4333 (ISDOTDOT | MAKEENTRY | ISLASTCN) 4334 4335 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 4336 "supported and internal flags overlap"); 4337 4338 static bool 4339 cache_fpl_islastcn(struct nameidata *ndp) 4340 { 4341 4342 return (*ndp->ni_next == 0); 4343 } 4344 4345 static bool 4346 cache_fpl_istrailingslash(struct cache_fpl *fpl) 4347 { 4348 4349 MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf); 4350 return (*(fpl->nulchar - 1) == '/'); 4351 } 4352 4353 static bool 4354 cache_fpl_isdotdot(struct componentname *cnp) 4355 { 4356 4357 if (cnp->cn_namelen == 2 && 4358 cnp->cn_nameptr[1] == '.' 
&& cnp->cn_nameptr[0] == '.') 4359 return (true); 4360 return (false); 4361 } 4362 4363 static bool 4364 cache_can_fplookup(struct cache_fpl *fpl) 4365 { 4366 struct nameidata *ndp; 4367 struct componentname *cnp; 4368 struct thread *td; 4369 4370 ndp = fpl->ndp; 4371 cnp = fpl->cnp; 4372 td = curthread; 4373 4374 if (!atomic_load_char(&cache_fast_lookup_enabled)) { 4375 cache_fpl_aborted_early(fpl); 4376 return (false); 4377 } 4378 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 4379 cache_fpl_aborted_early(fpl); 4380 return (false); 4381 } 4382 if (IN_CAPABILITY_MODE(td)) { 4383 cache_fpl_aborted_early(fpl); 4384 return (false); 4385 } 4386 if (AUDITING_TD(td)) { 4387 cache_fpl_aborted_early(fpl); 4388 return (false); 4389 } 4390 if (ndp->ni_startdir != NULL) { 4391 cache_fpl_aborted_early(fpl); 4392 return (false); 4393 } 4394 return (true); 4395 } 4396 4397 static int __noinline 4398 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 4399 { 4400 struct nameidata *ndp; 4401 struct componentname *cnp; 4402 int error; 4403 bool fsearch; 4404 4405 ndp = fpl->ndp; 4406 cnp = fpl->cnp; 4407 4408 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); 4409 if (__predict_false(error != 0)) { 4410 return (cache_fpl_aborted(fpl)); 4411 } 4412 fpl->fsearch = fsearch; 4413 if ((*vpp)->v_type != VDIR) { 4414 if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) { 4415 cache_fpl_smr_exit(fpl); 4416 return (cache_fpl_handled_error(fpl, ENOTDIR)); 4417 } 4418 } 4419 return (0); 4420 } 4421 4422 static int __noinline 4423 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 4424 uint32_t hash) 4425 { 4426 struct componentname *cnp; 4427 struct vnode *dvp; 4428 4429 cnp = fpl->cnp; 4430 dvp = fpl->dvp; 4431 4432 cache_fpl_smr_exit(fpl); 4433 if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) 4434 return (cache_fpl_handled_error(fpl, ENOENT)); 4435 else 4436 return (cache_fpl_aborted(fpl)); 4437 } 4438 4439 /* 4440 * The target vnode is not supported, prepare for the slow path to take over. 4441 */ 4442 static int __noinline 4443 cache_fplookup_partial_setup(struct cache_fpl *fpl) 4444 { 4445 struct nameidata *ndp; 4446 struct componentname *cnp; 4447 enum vgetstate dvs; 4448 struct vnode *dvp; 4449 struct pwd *pwd; 4450 seqc_t dvp_seqc; 4451 4452 ndp = fpl->ndp; 4453 cnp = fpl->cnp; 4454 pwd = *(fpl->pwd); 4455 dvp = fpl->dvp; 4456 dvp_seqc = fpl->dvp_seqc; 4457 4458 if (!pwd_hold_smr(pwd)) { 4459 return (cache_fpl_aborted(fpl)); 4460 } 4461 4462 /* 4463 * Note that seqc is checked before the vnode is locked, so by 4464 * the time regular lookup gets to it it may have moved. 4465 * 4466 * Ultimately this does not affect correctness, any lookup errors 4467 * are userspace racing with itself. It is guaranteed that any 4468 * path which ultimately gets found could also have been found 4469 * by regular lookup going all the way in absence of concurrent 4470 * modifications. 
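 *
 * For example, should dvp get renamed or removed after the seqc was
 * checked but before the locked lookup takes over, the resulting failure
 * is indistinguishable from the same modification landing right after a
 * fully locked lookup completed.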
4471 */ 4472 dvs = vget_prep_smr(dvp); 4473 cache_fpl_smr_exit(fpl); 4474 if (__predict_false(dvs == VGET_NONE)) { 4475 pwd_drop(pwd); 4476 return (cache_fpl_aborted(fpl)); 4477 } 4478 4479 vget_finish_ref(dvp, dvs); 4480 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4481 vrele(dvp); 4482 pwd_drop(pwd); 4483 return (cache_fpl_aborted(fpl)); 4484 } 4485 4486 cache_fpl_restore_partial(fpl); 4487 #ifdef INVARIANTS 4488 if (cnp->cn_nameptr != fpl->snd.cn_nameptr) { 4489 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__, 4490 cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf); 4491 } 4492 #endif 4493 4494 ndp->ni_startdir = dvp; 4495 cnp->cn_flags |= MAKEENTRY; 4496 if (cache_fpl_islastcn(ndp)) 4497 cnp->cn_flags |= ISLASTCN; 4498 if (cache_fpl_isdotdot(cnp)) 4499 cnp->cn_flags |= ISDOTDOT; 4500 4501 /* 4502 * Skip potential extra slashes parsing did not take care of. 4503 * cache_fplookup_skip_slashes explains the mechanism. 4504 */ 4505 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 4506 do { 4507 cnp->cn_nameptr++; 4508 cache_fpl_pathlen_dec(fpl); 4509 } while (*(cnp->cn_nameptr) == '/'); 4510 } 4511 4512 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; 4513 #ifdef INVARIANTS 4514 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 4515 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 4516 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 4517 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 4518 } 4519 #endif 4520 return (0); 4521 } 4522 4523 static int 4524 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 4525 { 4526 struct componentname *cnp; 4527 struct vnode *tvp; 4528 seqc_t tvp_seqc; 4529 int error, lkflags; 4530 4531 cnp = fpl->cnp; 4532 tvp = fpl->tvp; 4533 tvp_seqc = fpl->tvp_seqc; 4534 4535 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4536 lkflags = LK_SHARED; 4537 if ((cnp->cn_flags & LOCKSHARED) == 0) 4538 lkflags = LK_EXCLUSIVE; 4539 error = vget_finish(tvp, lkflags, tvs); 4540 if (__predict_false(error != 0)) { 4541 return (cache_fpl_aborted(fpl)); 4542 } 4543 } else { 4544 vget_finish_ref(tvp, tvs); 4545 } 4546 4547 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 4548 if ((cnp->cn_flags & LOCKLEAF) != 0) 4549 vput(tvp); 4550 else 4551 vrele(tvp); 4552 return (cache_fpl_aborted(fpl)); 4553 } 4554 4555 return (cache_fpl_handled(fpl)); 4556 } 4557 4558 /* 4559 * They want to possibly modify the state of the namecache. 4560 */ 4561 static int __noinline 4562 cache_fplookup_final_modifying(struct cache_fpl *fpl) 4563 { 4564 struct nameidata *ndp __diagused; 4565 struct componentname *cnp; 4566 enum vgetstate dvs; 4567 struct vnode *dvp, *tvp; 4568 struct mount *mp; 4569 seqc_t dvp_seqc; 4570 int error; 4571 bool docache; 4572 4573 ndp = fpl->ndp; 4574 cnp = fpl->cnp; 4575 dvp = fpl->dvp; 4576 dvp_seqc = fpl->dvp_seqc; 4577 4578 MPASS(*(cnp->cn_nameptr) != '/'); 4579 MPASS(cache_fpl_islastcn(ndp)); 4580 if ((cnp->cn_flags & LOCKPARENT) == 0) 4581 MPASS((cnp->cn_flags & WANTPARENT) != 0); 4582 MPASS((cnp->cn_flags & TRAILINGSLASH) == 0); 4583 MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE || 4584 cnp->cn_nameiop == RENAME); 4585 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 4586 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 4587 4588 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; 4589 if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) 4590 docache = false; 4591 4592 /* 4593 * Regular lookup nulifies the slash, which we don't do here. 
4594 * Don't take chances with filesystem routines seeing it for 4595 * the last entry. 4596 */ 4597 if (cache_fpl_istrailingslash(fpl)) { 4598 return (cache_fpl_partial(fpl)); 4599 } 4600 4601 mp = atomic_load_ptr(&dvp->v_mount); 4602 if (__predict_false(mp == NULL)) { 4603 return (cache_fpl_aborted(fpl)); 4604 } 4605 4606 if (__predict_false(mp->mnt_flag & MNT_RDONLY)) { 4607 cache_fpl_smr_exit(fpl); 4608 /* 4609 * Original code keeps not checking for CREATE which 4610 * might be a bug. For now let the old lookup decide. 4611 */ 4612 if (cnp->cn_nameiop == CREATE) { 4613 return (cache_fpl_aborted(fpl)); 4614 } 4615 return (cache_fpl_handled_error(fpl, EROFS)); 4616 } 4617 4618 if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) { 4619 cache_fpl_smr_exit(fpl); 4620 return (cache_fpl_handled_error(fpl, EEXIST)); 4621 } 4622 4623 /* 4624 * Secure access to dvp; check cache_fplookup_partial_setup for 4625 * reasoning. 4626 * 4627 * XXX At least UFS requires its lookup routine to be called for 4628 * the last path component, which leads to some level of complication 4629 * and inefficiency: 4630 * - the target routine always locks the target vnode, but our caller 4631 * may not need it locked 4632 * - some of the VOP machinery asserts that the parent is locked, which 4633 * once more may be not required 4634 * 4635 * TODO: add a flag for filesystems which don't need this. 4636 */ 4637 dvs = vget_prep_smr(dvp); 4638 cache_fpl_smr_exit(fpl); 4639 if (__predict_false(dvs == VGET_NONE)) { 4640 return (cache_fpl_aborted(fpl)); 4641 } 4642 4643 vget_finish_ref(dvp, dvs); 4644 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4645 vrele(dvp); 4646 return (cache_fpl_aborted(fpl)); 4647 } 4648 4649 error = vn_lock(dvp, LK_EXCLUSIVE); 4650 if (__predict_false(error != 0)) { 4651 vrele(dvp); 4652 return (cache_fpl_aborted(fpl)); 4653 } 4654 4655 tvp = NULL; 4656 cnp->cn_flags |= ISLASTCN; 4657 if (docache) 4658 cnp->cn_flags |= MAKEENTRY; 4659 if (cache_fpl_isdotdot(cnp)) 4660 cnp->cn_flags |= ISDOTDOT; 4661 cnp->cn_lkflags = LK_EXCLUSIVE; 4662 error = VOP_LOOKUP(dvp, &tvp, cnp); 4663 switch (error) { 4664 case EJUSTRETURN: 4665 case 0: 4666 break; 4667 case ENOTDIR: 4668 case ENOENT: 4669 vput(dvp); 4670 return (cache_fpl_handled_error(fpl, error)); 4671 default: 4672 vput(dvp); 4673 return (cache_fpl_aborted(fpl)); 4674 } 4675 4676 fpl->tvp = tvp; 4677 4678 if (tvp == NULL) { 4679 MPASS(error == EJUSTRETURN); 4680 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4681 VOP_UNLOCK(dvp); 4682 } 4683 return (cache_fpl_handled(fpl)); 4684 } 4685 4686 /* 4687 * There are very hairy corner cases concerning various flag combinations 4688 * and locking state. In particular here we only hold one lock instead of 4689 * two. 4690 * 4691 * Skip the complexity as it is of no significance for normal workloads. 4692 */ 4693 if (__predict_false(tvp == dvp)) { 4694 vput(dvp); 4695 vrele(tvp); 4696 return (cache_fpl_aborted(fpl)); 4697 } 4698 4699 /* 4700 * If they want the symlink itself we are fine, but if they want to 4701 * follow it regular lookup has to be engaged. 4702 */ 4703 if (tvp->v_type == VLNK) { 4704 if ((cnp->cn_flags & FOLLOW) != 0) { 4705 vput(dvp); 4706 vput(tvp); 4707 return (cache_fpl_aborted(fpl)); 4708 } 4709 } 4710 4711 /* 4712 * Since we expect this to be the terminal vnode it should almost never 4713 * be a mount point. 
4714 */ 4715 if (__predict_false(cache_fplookup_is_mp(fpl))) { 4716 vput(dvp); 4717 vput(tvp); 4718 return (cache_fpl_aborted(fpl)); 4719 } 4720 4721 if ((cnp->cn_flags & FAILIFEXISTS) != 0) { 4722 vput(dvp); 4723 vput(tvp); 4724 return (cache_fpl_handled_error(fpl, EEXIST)); 4725 } 4726 4727 if ((cnp->cn_flags & LOCKLEAF) == 0) { 4728 VOP_UNLOCK(tvp); 4729 } 4730 4731 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4732 VOP_UNLOCK(dvp); 4733 } 4734 4735 return (cache_fpl_handled(fpl)); 4736 } 4737 4738 static int __noinline 4739 cache_fplookup_modifying(struct cache_fpl *fpl) 4740 { 4741 struct nameidata *ndp; 4742 4743 ndp = fpl->ndp; 4744 4745 if (!cache_fpl_islastcn(ndp)) { 4746 return (cache_fpl_partial(fpl)); 4747 } 4748 return (cache_fplookup_final_modifying(fpl)); 4749 } 4750 4751 static int __noinline 4752 cache_fplookup_final_withparent(struct cache_fpl *fpl) 4753 { 4754 struct componentname *cnp; 4755 enum vgetstate dvs, tvs; 4756 struct vnode *dvp, *tvp; 4757 seqc_t dvp_seqc; 4758 int error; 4759 4760 cnp = fpl->cnp; 4761 dvp = fpl->dvp; 4762 dvp_seqc = fpl->dvp_seqc; 4763 tvp = fpl->tvp; 4764 4765 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 4766 4767 /* 4768 * This is less efficient than it can be for simplicity. 4769 */ 4770 dvs = vget_prep_smr(dvp); 4771 if (__predict_false(dvs == VGET_NONE)) { 4772 return (cache_fpl_aborted(fpl)); 4773 } 4774 tvs = vget_prep_smr(tvp); 4775 if (__predict_false(tvs == VGET_NONE)) { 4776 cache_fpl_smr_exit(fpl); 4777 vget_abort(dvp, dvs); 4778 return (cache_fpl_aborted(fpl)); 4779 } 4780 4781 cache_fpl_smr_exit(fpl); 4782 4783 if ((cnp->cn_flags & LOCKPARENT) != 0) { 4784 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 4785 if (__predict_false(error != 0)) { 4786 vget_abort(tvp, tvs); 4787 return (cache_fpl_aborted(fpl)); 4788 } 4789 } else { 4790 vget_finish_ref(dvp, dvs); 4791 } 4792 4793 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4794 vget_abort(tvp, tvs); 4795 if ((cnp->cn_flags & LOCKPARENT) != 0) 4796 vput(dvp); 4797 else 4798 vrele(dvp); 4799 return (cache_fpl_aborted(fpl)); 4800 } 4801 4802 error = cache_fplookup_final_child(fpl, tvs); 4803 if (__predict_false(error != 0)) { 4804 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED || 4805 fpl->status == CACHE_FPL_STATUS_DESTROYED); 4806 if ((cnp->cn_flags & LOCKPARENT) != 0) 4807 vput(dvp); 4808 else 4809 vrele(dvp); 4810 return (error); 4811 } 4812 4813 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 4814 return (0); 4815 } 4816 4817 static int 4818 cache_fplookup_final(struct cache_fpl *fpl) 4819 { 4820 struct componentname *cnp; 4821 enum vgetstate tvs; 4822 struct vnode *dvp, *tvp; 4823 seqc_t dvp_seqc; 4824 4825 cnp = fpl->cnp; 4826 dvp = fpl->dvp; 4827 dvp_seqc = fpl->dvp_seqc; 4828 tvp = fpl->tvp; 4829 4830 MPASS(*(cnp->cn_nameptr) != '/'); 4831 4832 if (cnp->cn_nameiop != LOOKUP) { 4833 return (cache_fplookup_final_modifying(fpl)); 4834 } 4835 4836 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 4837 return (cache_fplookup_final_withparent(fpl)); 4838 4839 tvs = vget_prep_smr(tvp); 4840 if (__predict_false(tvs == VGET_NONE)) { 4841 return (cache_fpl_partial(fpl)); 4842 } 4843 4844 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4845 cache_fpl_smr_exit(fpl); 4846 vget_abort(tvp, tvs); 4847 return (cache_fpl_aborted(fpl)); 4848 } 4849 4850 cache_fpl_smr_exit(fpl); 4851 return (cache_fplookup_final_child(fpl, tvs)); 4852 } 4853 4854 /* 4855 * Comment from locked lookup: 4856 * Check for degenerate name (e.g. / or "") which is a way of talking about a 4857 * directory, e.g. like "/." 
or ".". 4858 */ 4859 static int __noinline 4860 cache_fplookup_degenerate(struct cache_fpl *fpl) 4861 { 4862 struct componentname *cnp; 4863 struct vnode *dvp; 4864 enum vgetstate dvs; 4865 int error, lkflags; 4866 #ifdef INVARIANTS 4867 char *cp; 4868 #endif 4869 4870 fpl->tvp = fpl->dvp; 4871 fpl->tvp_seqc = fpl->dvp_seqc; 4872 4873 cnp = fpl->cnp; 4874 dvp = fpl->dvp; 4875 4876 #ifdef INVARIANTS 4877 for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) { 4878 KASSERT(*cp == '/', 4879 ("%s: encountered non-slash; string [%s]\n", __func__, 4880 cnp->cn_pnbuf)); 4881 } 4882 #endif 4883 4884 if (__predict_false(cnp->cn_nameiop != LOOKUP)) { 4885 cache_fpl_smr_exit(fpl); 4886 return (cache_fpl_handled_error(fpl, EISDIR)); 4887 } 4888 4889 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) { 4890 return (cache_fplookup_final_withparent(fpl)); 4891 } 4892 4893 dvs = vget_prep_smr(dvp); 4894 cache_fpl_smr_exit(fpl); 4895 if (__predict_false(dvs == VGET_NONE)) { 4896 return (cache_fpl_aborted(fpl)); 4897 } 4898 4899 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4900 lkflags = LK_SHARED; 4901 if ((cnp->cn_flags & LOCKSHARED) == 0) 4902 lkflags = LK_EXCLUSIVE; 4903 error = vget_finish(dvp, lkflags, dvs); 4904 if (__predict_false(error != 0)) { 4905 return (cache_fpl_aborted(fpl)); 4906 } 4907 } else { 4908 vget_finish_ref(dvp, dvs); 4909 } 4910 return (cache_fpl_handled(fpl)); 4911 } 4912 4913 static int __noinline 4914 cache_fplookup_emptypath(struct cache_fpl *fpl) 4915 { 4916 struct nameidata *ndp; 4917 struct componentname *cnp; 4918 enum vgetstate tvs; 4919 struct vnode *tvp; 4920 int error, lkflags; 4921 4922 fpl->tvp = fpl->dvp; 4923 fpl->tvp_seqc = fpl->dvp_seqc; 4924 4925 ndp = fpl->ndp; 4926 cnp = fpl->cnp; 4927 tvp = fpl->tvp; 4928 4929 MPASS(*cnp->cn_pnbuf == '\0'); 4930 4931 if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) { 4932 cache_fpl_smr_exit(fpl); 4933 return (cache_fpl_handled_error(fpl, ENOENT)); 4934 } 4935 4936 MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0); 4937 4938 tvs = vget_prep_smr(tvp); 4939 cache_fpl_smr_exit(fpl); 4940 if (__predict_false(tvs == VGET_NONE)) { 4941 return (cache_fpl_aborted(fpl)); 4942 } 4943 4944 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4945 lkflags = LK_SHARED; 4946 if ((cnp->cn_flags & LOCKSHARED) == 0) 4947 lkflags = LK_EXCLUSIVE; 4948 error = vget_finish(tvp, lkflags, tvs); 4949 if (__predict_false(error != 0)) { 4950 return (cache_fpl_aborted(fpl)); 4951 } 4952 } else { 4953 vget_finish_ref(tvp, tvs); 4954 } 4955 4956 ndp->ni_resflags |= NIRES_EMPTYPATH; 4957 return (cache_fpl_handled(fpl)); 4958 } 4959 4960 static int __noinline 4961 cache_fplookup_noentry(struct cache_fpl *fpl) 4962 { 4963 struct nameidata *ndp; 4964 struct componentname *cnp; 4965 enum vgetstate dvs; 4966 struct vnode *dvp, *tvp; 4967 seqc_t dvp_seqc; 4968 int error; 4969 4970 ndp = fpl->ndp; 4971 cnp = fpl->cnp; 4972 dvp = fpl->dvp; 4973 dvp_seqc = fpl->dvp_seqc; 4974 4975 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 4976 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 4977 if (cnp->cn_nameiop == LOOKUP) 4978 MPASS((cnp->cn_flags & NOCACHE) == 0); 4979 MPASS(!cache_fpl_isdotdot(cnp)); 4980 4981 /* 4982 * Hack: delayed name len checking. 
4983 */ 4984 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 4985 cache_fpl_smr_exit(fpl); 4986 return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); 4987 } 4988 4989 if (cnp->cn_nameptr[0] == '/') { 4990 return (cache_fplookup_skip_slashes(fpl)); 4991 } 4992 4993 if (cnp->cn_pnbuf[0] == '\0') { 4994 return (cache_fplookup_emptypath(fpl)); 4995 } 4996 4997 if (cnp->cn_nameptr[0] == '\0') { 4998 if (fpl->tvp == NULL) { 4999 return (cache_fplookup_degenerate(fpl)); 5000 } 5001 return (cache_fplookup_trailingslash(fpl)); 5002 } 5003 5004 if (cnp->cn_nameiop != LOOKUP) { 5005 fpl->tvp = NULL; 5006 return (cache_fplookup_modifying(fpl)); 5007 } 5008 5009 /* 5010 * Only try to fill in the component if it is the last one, 5011 * otherwise not only there may be several to handle but the 5012 * walk may be complicated. 5013 */ 5014 if (!cache_fpl_islastcn(ndp)) { 5015 return (cache_fpl_partial(fpl)); 5016 } 5017 5018 /* 5019 * Regular lookup nulifies the slash, which we don't do here. 5020 * Don't take chances with filesystem routines seeing it for 5021 * the last entry. 5022 */ 5023 if (cache_fpl_istrailingslash(fpl)) { 5024 return (cache_fpl_partial(fpl)); 5025 } 5026 5027 /* 5028 * Secure access to dvp; check cache_fplookup_partial_setup for 5029 * reasoning. 5030 */ 5031 dvs = vget_prep_smr(dvp); 5032 cache_fpl_smr_exit(fpl); 5033 if (__predict_false(dvs == VGET_NONE)) { 5034 return (cache_fpl_aborted(fpl)); 5035 } 5036 5037 vget_finish_ref(dvp, dvs); 5038 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 5039 vrele(dvp); 5040 return (cache_fpl_aborted(fpl)); 5041 } 5042 5043 error = vn_lock(dvp, LK_SHARED); 5044 if (__predict_false(error != 0)) { 5045 vrele(dvp); 5046 return (cache_fpl_aborted(fpl)); 5047 } 5048 5049 tvp = NULL; 5050 /* 5051 * TODO: provide variants which don't require locking either vnode. 5052 */ 5053 cnp->cn_flags |= ISLASTCN | MAKEENTRY; 5054 cnp->cn_lkflags = LK_SHARED; 5055 if ((cnp->cn_flags & LOCKSHARED) == 0) { 5056 cnp->cn_lkflags = LK_EXCLUSIVE; 5057 } 5058 error = VOP_LOOKUP(dvp, &tvp, cnp); 5059 switch (error) { 5060 case EJUSTRETURN: 5061 case 0: 5062 break; 5063 case ENOTDIR: 5064 case ENOENT: 5065 vput(dvp); 5066 return (cache_fpl_handled_error(fpl, error)); 5067 default: 5068 vput(dvp); 5069 return (cache_fpl_aborted(fpl)); 5070 } 5071 5072 fpl->tvp = tvp; 5073 5074 if (tvp == NULL) { 5075 MPASS(error == EJUSTRETURN); 5076 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 5077 vput(dvp); 5078 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 5079 VOP_UNLOCK(dvp); 5080 } 5081 return (cache_fpl_handled(fpl)); 5082 } 5083 5084 if (tvp->v_type == VLNK) { 5085 if ((cnp->cn_flags & FOLLOW) != 0) { 5086 vput(dvp); 5087 vput(tvp); 5088 return (cache_fpl_aborted(fpl)); 5089 } 5090 } 5091 5092 if (__predict_false(cache_fplookup_is_mp(fpl))) { 5093 vput(dvp); 5094 vput(tvp); 5095 return (cache_fpl_aborted(fpl)); 5096 } 5097 5098 if ((cnp->cn_flags & LOCKLEAF) == 0) { 5099 VOP_UNLOCK(tvp); 5100 } 5101 5102 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 5103 vput(dvp); 5104 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 5105 VOP_UNLOCK(dvp); 5106 } 5107 return (cache_fpl_handled(fpl)); 5108 } 5109 5110 static int __noinline 5111 cache_fplookup_dot(struct cache_fpl *fpl) 5112 { 5113 int error; 5114 5115 MPASS(!seqc_in_modify(fpl->dvp_seqc)); 5116 5117 if (__predict_false(fpl->dvp->v_type != VDIR)) { 5118 cache_fpl_smr_exit(fpl); 5119 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5120 } 5121 5122 /* 5123 * Just re-assign the value. 
seqc will be checked later for the first 5124 * non-dot path component in line and/or before deciding to return the 5125 * vnode. 5126 */ 5127 fpl->tvp = fpl->dvp; 5128 fpl->tvp_seqc = fpl->dvp_seqc; 5129 5130 SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp); 5131 5132 error = 0; 5133 if (cache_fplookup_is_mp(fpl)) { 5134 error = cache_fplookup_cross_mount(fpl); 5135 } 5136 return (error); 5137 } 5138 5139 static int __noinline 5140 cache_fplookup_dotdot(struct cache_fpl *fpl) 5141 { 5142 struct nameidata *ndp; 5143 struct componentname *cnp; 5144 struct namecache *ncp; 5145 struct vnode *dvp; 5146 struct prison *pr; 5147 u_char nc_flag; 5148 5149 ndp = fpl->ndp; 5150 cnp = fpl->cnp; 5151 dvp = fpl->dvp; 5152 5153 MPASS(cache_fpl_isdotdot(cnp)); 5154 5155 /* 5156 * XXX this is racy the same way regular lookup is 5157 */ 5158 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 5159 pr = pr->pr_parent) 5160 if (dvp == pr->pr_root) 5161 break; 5162 5163 if (dvp == ndp->ni_rootdir || 5164 dvp == ndp->ni_topdir || 5165 dvp == rootvnode || 5166 pr != NULL) { 5167 fpl->tvp = dvp; 5168 fpl->tvp_seqc = vn_seqc_read_any(dvp); 5169 if (seqc_in_modify(fpl->tvp_seqc)) { 5170 return (cache_fpl_aborted(fpl)); 5171 } 5172 return (0); 5173 } 5174 5175 if ((dvp->v_vflag & VV_ROOT) != 0) { 5176 /* 5177 * TODO 5178 * The opposite of climb mount is needed here. 5179 */ 5180 return (cache_fpl_partial(fpl)); 5181 } 5182 5183 if (__predict_false(dvp->v_type != VDIR)) { 5184 cache_fpl_smr_exit(fpl); 5185 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5186 } 5187 5188 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); 5189 if (ncp == NULL) { 5190 return (cache_fpl_aborted(fpl)); 5191 } 5192 5193 nc_flag = atomic_load_char(&ncp->nc_flag); 5194 if ((nc_flag & NCF_ISDOTDOT) != 0) { 5195 if ((nc_flag & NCF_NEGATIVE) != 0) 5196 return (cache_fpl_aborted(fpl)); 5197 fpl->tvp = ncp->nc_vp; 5198 } else { 5199 fpl->tvp = ncp->nc_dvp; 5200 } 5201 5202 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 5203 if (seqc_in_modify(fpl->tvp_seqc)) { 5204 return (cache_fpl_partial(fpl)); 5205 } 5206 5207 /* 5208 * Acquire fence provided by vn_seqc_read_any above. 5209 */ 5210 if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) { 5211 return (cache_fpl_aborted(fpl)); 5212 } 5213 5214 if (!cache_ncp_canuse(ncp)) { 5215 return (cache_fpl_aborted(fpl)); 5216 } 5217 5218 return (0); 5219 } 5220 5221 static int __noinline 5222 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) 5223 { 5224 u_char nc_flag __diagused; 5225 bool neg_promote; 5226 5227 #ifdef INVARIANTS 5228 nc_flag = atomic_load_char(&ncp->nc_flag); 5229 MPASS((nc_flag & NCF_NEGATIVE) != 0); 5230 #endif 5231 /* 5232 * If they want to create an entry we need to replace this one. 5233 */ 5234 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 5235 fpl->tvp = NULL; 5236 return (cache_fplookup_modifying(fpl)); 5237 } 5238 neg_promote = cache_neg_hit_prep(ncp); 5239 if (!cache_fpl_neg_ncp_canuse(ncp)) { 5240 cache_neg_hit_abort(ncp); 5241 return (cache_fpl_partial(fpl)); 5242 } 5243 if (neg_promote) { 5244 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 5245 } 5246 cache_neg_hit_finish(ncp); 5247 cache_fpl_smr_exit(fpl); 5248 return (cache_fpl_handled_error(fpl, ENOENT)); 5249 } 5250 5251 /* 5252 * Resolve a symlink. Called by filesystem-specific routines. 5253 * 5254 * Code flow is: 5255 * ... 
-> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve 5256 */ 5257 int 5258 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len) 5259 { 5260 struct nameidata *ndp; 5261 struct componentname *cnp; 5262 size_t adjust; 5263 5264 ndp = fpl->ndp; 5265 cnp = fpl->cnp; 5266 5267 if (__predict_false(len == 0)) { 5268 return (ENOENT); 5269 } 5270 5271 if (__predict_false(len > MAXPATHLEN - 2)) { 5272 if (cache_fpl_istrailingslash(fpl)) { 5273 return (EAGAIN); 5274 } 5275 } 5276 5277 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1; 5278 #ifdef INVARIANTS 5279 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 5280 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 5281 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 5282 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 5283 } 5284 #endif 5285 5286 if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) { 5287 return (ENAMETOOLONG); 5288 } 5289 5290 if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) { 5291 return (ELOOP); 5292 } 5293 5294 adjust = len; 5295 if (ndp->ni_pathlen > 1) { 5296 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen); 5297 } else { 5298 if (cache_fpl_istrailingslash(fpl)) { 5299 adjust = len + 1; 5300 cnp->cn_pnbuf[len] = '/'; 5301 cnp->cn_pnbuf[len + 1] = '\0'; 5302 } else { 5303 cnp->cn_pnbuf[len] = '\0'; 5304 } 5305 } 5306 bcopy(string, cnp->cn_pnbuf, len); 5307 5308 ndp->ni_pathlen += adjust; 5309 cache_fpl_pathlen_add(fpl, adjust); 5310 cnp->cn_nameptr = cnp->cn_pnbuf; 5311 fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; 5312 fpl->tvp = NULL; 5313 return (0); 5314 } 5315 5316 static int __noinline 5317 cache_fplookup_symlink(struct cache_fpl *fpl) 5318 { 5319 struct mount *mp; 5320 struct nameidata *ndp; 5321 struct componentname *cnp; 5322 struct vnode *dvp, *tvp; 5323 int error; 5324 5325 ndp = fpl->ndp; 5326 cnp = fpl->cnp; 5327 dvp = fpl->dvp; 5328 tvp = fpl->tvp; 5329 5330 if (cache_fpl_islastcn(ndp)) { 5331 if ((cnp->cn_flags & FOLLOW) == 0) { 5332 return (cache_fplookup_final(fpl)); 5333 } 5334 } 5335 5336 mp = atomic_load_ptr(&dvp->v_mount); 5337 if (__predict_false(mp == NULL)) { 5338 return (cache_fpl_aborted(fpl)); 5339 } 5340 5341 /* 5342 * Note this check races against setting the flag just like regular 5343 * lookup. 5344 */ 5345 if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) { 5346 cache_fpl_smr_exit(fpl); 5347 return (cache_fpl_handled_error(fpl, EACCES)); 5348 } 5349 5350 error = VOP_FPLOOKUP_SYMLINK(tvp, fpl); 5351 if (__predict_false(error != 0)) { 5352 switch (error) { 5353 case EAGAIN: 5354 return (cache_fpl_partial(fpl)); 5355 case ENOENT: 5356 case ENAMETOOLONG: 5357 case ELOOP: 5358 cache_fpl_smr_exit(fpl); 5359 return (cache_fpl_handled_error(fpl, error)); 5360 default: 5361 return (cache_fpl_aborted(fpl)); 5362 } 5363 } 5364 5365 if (*(cnp->cn_nameptr) == '/') { 5366 fpl->dvp = cache_fpl_handle_root(fpl); 5367 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 5368 if (seqc_in_modify(fpl->dvp_seqc)) { 5369 return (cache_fpl_aborted(fpl)); 5370 } 5371 /* 5372 * The main loop assumes that ->dvp points to a vnode belonging 5373 * to a filesystem which can do lockless lookup, but the absolute 5374 * symlink can be wandering off to one which does not. 
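 *
 * E.g., an absolute symlink target may land on a filesystem without
 * MNTK_FPLOOKUP set, in which case the check below punts to the slow
 * path instead of continuing locklessly.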
5375 */ 5376 mp = atomic_load_ptr(&fpl->dvp->v_mount); 5377 if (__predict_false(mp == NULL)) { 5378 return (cache_fpl_aborted(fpl)); 5379 } 5380 if (!cache_fplookup_mp_supported(mp)) { 5381 cache_fpl_checkpoint(fpl); 5382 return (cache_fpl_partial(fpl)); 5383 } 5384 } 5385 return (0); 5386 } 5387 5388 static int 5389 cache_fplookup_next(struct cache_fpl *fpl) 5390 { 5391 struct componentname *cnp; 5392 struct namecache *ncp; 5393 struct vnode *dvp, *tvp; 5394 u_char nc_flag; 5395 uint32_t hash; 5396 int error; 5397 5398 cnp = fpl->cnp; 5399 dvp = fpl->dvp; 5400 hash = fpl->hash; 5401 5402 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 5403 if (cnp->cn_namelen == 1) { 5404 return (cache_fplookup_dot(fpl)); 5405 } 5406 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 5407 return (cache_fplookup_dotdot(fpl)); 5408 } 5409 } 5410 5411 MPASS(!cache_fpl_isdotdot(cnp)); 5412 5413 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 5414 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 5415 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 5416 break; 5417 } 5418 5419 if (__predict_false(ncp == NULL)) { 5420 return (cache_fplookup_noentry(fpl)); 5421 } 5422 5423 tvp = atomic_load_ptr(&ncp->nc_vp); 5424 nc_flag = atomic_load_char(&ncp->nc_flag); 5425 if ((nc_flag & NCF_NEGATIVE) != 0) { 5426 return (cache_fplookup_neg(fpl, ncp, hash)); 5427 } 5428 5429 if (!cache_ncp_canuse(ncp)) { 5430 return (cache_fpl_partial(fpl)); 5431 } 5432 5433 fpl->tvp = tvp; 5434 fpl->tvp_seqc = vn_seqc_read_any(tvp); 5435 if (seqc_in_modify(fpl->tvp_seqc)) { 5436 return (cache_fpl_partial(fpl)); 5437 } 5438 5439 counter_u64_add(numposhits, 1); 5440 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 5441 5442 error = 0; 5443 if (cache_fplookup_is_mp(fpl)) { 5444 error = cache_fplookup_cross_mount(fpl); 5445 } 5446 return (error); 5447 } 5448 5449 static bool 5450 cache_fplookup_mp_supported(struct mount *mp) 5451 { 5452 5453 MPASS(mp != NULL); 5454 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 5455 return (false); 5456 return (true); 5457 } 5458 5459 /* 5460 * Walk up the mount stack (if any). 5461 * 5462 * Correctness is provided in the following ways: 5463 * - all vnodes are protected from freeing with SMR 5464 * - struct mount objects are type stable making them always safe to access 5465 * - stability of the particular mount is provided by busying it 5466 * - relationship between the vnode which is mounted on and the mount is 5467 * verified with the vnode sequence counter after busying 5468 * - association between root vnode of the mount and the mount is protected 5469 * by busy 5470 * 5471 * From that point on we can read the sequence counter of the root vnode 5472 * and get the next mount on the stack (if any) using the same protection. 5473 * 5474 * By the end of successful walk we are guaranteed the reached state was 5475 * indeed present at least at some point which matches the regular lookup. 
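 *
 * For example, with several mounts stacked on top of the same directory,
 * each iteration busies the next mount, re-checks the seqc of the vnode
 * the walk arrived from and only then trusts that mount's root vnode,
 * repeating until no further mount is found on top.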
5476 */ 5477 static int __noinline 5478 cache_fplookup_climb_mount(struct cache_fpl *fpl) 5479 { 5480 struct mount *mp, *prev_mp; 5481 struct mount_pcpu *mpcpu, *prev_mpcpu; 5482 struct vnode *vp; 5483 seqc_t vp_seqc; 5484 5485 vp = fpl->tvp; 5486 vp_seqc = fpl->tvp_seqc; 5487 5488 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp); 5489 mp = atomic_load_ptr(&vp->v_mountedhere); 5490 if (__predict_false(mp == NULL)) { 5491 return (0); 5492 } 5493 5494 prev_mp = NULL; 5495 for (;;) { 5496 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 5497 if (prev_mp != NULL) 5498 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5499 return (cache_fpl_partial(fpl)); 5500 } 5501 if (prev_mp != NULL) 5502 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5503 if (!vn_seqc_consistent(vp, vp_seqc)) { 5504 vfs_op_thread_exit_crit(mp, mpcpu); 5505 return (cache_fpl_partial(fpl)); 5506 } 5507 if (!cache_fplookup_mp_supported(mp)) { 5508 vfs_op_thread_exit_crit(mp, mpcpu); 5509 return (cache_fpl_partial(fpl)); 5510 } 5511 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5512 if (vp == NULL) { 5513 vfs_op_thread_exit_crit(mp, mpcpu); 5514 return (cache_fpl_partial(fpl)); 5515 } 5516 vp_seqc = vn_seqc_read_any(vp); 5517 if (seqc_in_modify(vp_seqc)) { 5518 vfs_op_thread_exit_crit(mp, mpcpu); 5519 return (cache_fpl_partial(fpl)); 5520 } 5521 prev_mp = mp; 5522 prev_mpcpu = mpcpu; 5523 mp = atomic_load_ptr(&vp->v_mountedhere); 5524 if (mp == NULL) 5525 break; 5526 } 5527 5528 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5529 fpl->tvp = vp; 5530 fpl->tvp_seqc = vp_seqc; 5531 return (0); 5532 } 5533 5534 static int __noinline 5535 cache_fplookup_cross_mount(struct cache_fpl *fpl) 5536 { 5537 struct mount *mp; 5538 struct mount_pcpu *mpcpu; 5539 struct vnode *vp; 5540 seqc_t vp_seqc; 5541 5542 vp = fpl->tvp; 5543 vp_seqc = fpl->tvp_seqc; 5544 5545 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp); 5546 mp = atomic_load_ptr(&vp->v_mountedhere); 5547 if (__predict_false(mp == NULL)) { 5548 return (0); 5549 } 5550 5551 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 5552 return (cache_fpl_partial(fpl)); 5553 } 5554 if (!vn_seqc_consistent(vp, vp_seqc)) { 5555 vfs_op_thread_exit_crit(mp, mpcpu); 5556 return (cache_fpl_partial(fpl)); 5557 } 5558 if (!cache_fplookup_mp_supported(mp)) { 5559 vfs_op_thread_exit_crit(mp, mpcpu); 5560 return (cache_fpl_partial(fpl)); 5561 } 5562 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5563 if (__predict_false(vp == NULL)) { 5564 vfs_op_thread_exit_crit(mp, mpcpu); 5565 return (cache_fpl_partial(fpl)); 5566 } 5567 vp_seqc = vn_seqc_read_any(vp); 5568 vfs_op_thread_exit_crit(mp, mpcpu); 5569 if (seqc_in_modify(vp_seqc)) { 5570 return (cache_fpl_partial(fpl)); 5571 } 5572 mp = atomic_load_ptr(&vp->v_mountedhere); 5573 if (__predict_false(mp != NULL)) { 5574 /* 5575 * There are possibly more mount points on top. 5576 * Normally this does not happen so for simplicity just start 5577 * over. 5578 */ 5579 return (cache_fplookup_climb_mount(fpl)); 5580 } 5581 5582 fpl->tvp = vp; 5583 fpl->tvp_seqc = vp_seqc; 5584 return (0); 5585 } 5586 5587 /* 5588 * Check if a vnode is mounted on. 5589 */ 5590 static bool 5591 cache_fplookup_is_mp(struct cache_fpl *fpl) 5592 { 5593 struct vnode *vp; 5594 5595 vp = fpl->tvp; 5596 return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0); 5597 } 5598 5599 /* 5600 * Parse the path. 5601 * 5602 * The code was originally copy-pasted from regular lookup and despite 5603 * clean ups leaves performance on the table. 
Any modifications here 5604 * must take into account that in case off fallback the resulting 5605 * nameidata state has to be compatible with the original. 5606 */ 5607 5608 /* 5609 * Debug ni_pathlen tracking. 5610 */ 5611 #ifdef INVARIANTS 5612 static void 5613 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) 5614 { 5615 5616 fpl->debug.ni_pathlen += n; 5617 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, 5618 ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen)); 5619 } 5620 5621 static void 5622 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) 5623 { 5624 5625 fpl->debug.ni_pathlen -= n; 5626 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, 5627 ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen)); 5628 } 5629 5630 static void 5631 cache_fpl_pathlen_inc(struct cache_fpl *fpl) 5632 { 5633 5634 cache_fpl_pathlen_add(fpl, 1); 5635 } 5636 5637 static void 5638 cache_fpl_pathlen_dec(struct cache_fpl *fpl) 5639 { 5640 5641 cache_fpl_pathlen_sub(fpl, 1); 5642 } 5643 #else 5644 static void 5645 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) 5646 { 5647 } 5648 5649 static void 5650 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) 5651 { 5652 } 5653 5654 static void 5655 cache_fpl_pathlen_inc(struct cache_fpl *fpl) 5656 { 5657 } 5658 5659 static void 5660 cache_fpl_pathlen_dec(struct cache_fpl *fpl) 5661 { 5662 } 5663 #endif 5664 5665 static void 5666 cache_fplookup_parse(struct cache_fpl *fpl) 5667 { 5668 struct nameidata *ndp; 5669 struct componentname *cnp; 5670 struct vnode *dvp; 5671 char *cp; 5672 uint32_t hash; 5673 5674 ndp = fpl->ndp; 5675 cnp = fpl->cnp; 5676 dvp = fpl->dvp; 5677 5678 /* 5679 * Find the end of this path component, it is either / or nul. 5680 * 5681 * Store / as a temporary sentinel so that we only have one character 5682 * to test for. Pathnames tend to be short so this should not be 5683 * resulting in cache misses. 5684 * 5685 * TODO: fix this to be word-sized. 5686 */ 5687 MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf); 5688 KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar, 5689 ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n", 5690 __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1], 5691 fpl->nulchar, cnp->cn_pnbuf)); 5692 KASSERT(*fpl->nulchar == '\0', 5693 ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar, 5694 cnp->cn_pnbuf)); 5695 hash = cache_get_hash_iter_start(dvp); 5696 *fpl->nulchar = '/'; 5697 for (cp = cnp->cn_nameptr; *cp != '/'; cp++) { 5698 KASSERT(*cp != '\0', 5699 ("%s: encountered unexpected nul; string [%s]\n", __func__, 5700 cnp->cn_nameptr)); 5701 hash = cache_get_hash_iter(*cp, hash); 5702 continue; 5703 } 5704 *fpl->nulchar = '\0'; 5705 fpl->hash = cache_get_hash_iter_finish(hash); 5706 5707 cnp->cn_namelen = cp - cnp->cn_nameptr; 5708 cache_fpl_pathlen_sub(fpl, cnp->cn_namelen); 5709 5710 #ifdef INVARIANTS 5711 /* 5712 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since 5713 * we are going to fail this lookup with ENAMETOOLONG (see below). 5714 */ 5715 if (cnp->cn_namelen <= NAME_MAX) { 5716 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) { 5717 panic("%s: mismatched hash for [%s] len %ld", __func__, 5718 cnp->cn_nameptr, cnp->cn_namelen); 5719 } 5720 } 5721 #endif 5722 5723 /* 5724 * Hack: we have to check if the found path component's length exceeds 5725 * NAME_MAX. 
However, the condition is very rarely true and the check can
 * be elided in the common case -- if an entry was found in the cache,
 * then it could not have been too long to begin with.
 */
        ndp->ni_next = cp;
}

static void
cache_fplookup_parse_advance(struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;

        ndp = fpl->ndp;
        cnp = fpl->cnp;

        cnp->cn_nameptr = ndp->ni_next;
        KASSERT(*(cnp->cn_nameptr) == '/',
            ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
            cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
        cnp->cn_nameptr++;
        cache_fpl_pathlen_dec(fpl);
}

/*
 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
 *
 * Lockless lookup tries to elide checking for spurious slashes and, should
 * any be present, is guaranteed to fail to find an entry. In that case the
 * caller must check whether the name starts with a slash and call this
 * routine, which fast-forwards across the spurious slashes and sets the
 * state up for a retry.
 */
static int __noinline
cache_fplookup_skip_slashes(struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;

        ndp = fpl->ndp;
        cnp = fpl->cnp;

        MPASS(*(cnp->cn_nameptr) == '/');
        do {
                cnp->cn_nameptr++;
                cache_fpl_pathlen_dec(fpl);
        } while (*(cnp->cn_nameptr) == '/');

        /*
         * Go back to one slash so that cache_fplookup_parse_advance has
         * something to skip.
         */
        cnp->cn_nameptr--;
        cache_fpl_pathlen_inc(fpl);

        /*
         * cache_fplookup_parse_advance starts from ndp->ni_next.
         */
        ndp->ni_next = cnp->cn_nameptr;

        /*
         * See cache_fplookup_dot.
         */
        fpl->tvp = fpl->dvp;
        fpl->tvp_seqc = fpl->dvp_seqc;

        return (0);
}

/*
 * Handle trailing slashes (e.g., "foo/").
 *
 * If a trailing slash is found the terminal vnode must be a directory.
 * Regular lookup shortens the path by nullifying the first trailing slash and
 * sets the TRAILINGSLASH flag to denote this took place. Several checks on it
 * are performed later.
 *
 * Similarly to spurious slashes, lockless lookup handles this in a speculative
 * manner, relying on the invariant that a non-directory vnode will get a cache
 * miss. In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
 *
 * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
 * and marks it as the last path component, which avoids looping back.
 *
 * Only plain LOOKUP operations are supported for now, to limit the number of
 * corner cases which need handling.
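 *
 * For example, for "foo/bar///" the parsed component is empty: cn_nameptr
 * points at the terminating nul and cn_namelen is 0. The routine rewinds
 * cn_nameptr back to "bar///", restores cn_namelen to 3 and marks it as
 * the last path component.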
5810 */ 5811 static int __noinline 5812 cache_fplookup_trailingslash(struct cache_fpl *fpl) 5813 { 5814 #ifdef INVARIANTS 5815 size_t ni_pathlen; 5816 #endif 5817 struct nameidata *ndp; 5818 struct componentname *cnp; 5819 struct namecache *ncp; 5820 struct vnode *tvp; 5821 char *cn_nameptr_orig, *cn_nameptr_slash; 5822 seqc_t tvp_seqc; 5823 u_char nc_flag; 5824 5825 ndp = fpl->ndp; 5826 cnp = fpl->cnp; 5827 tvp = fpl->tvp; 5828 tvp_seqc = fpl->tvp_seqc; 5829 5830 MPASS(fpl->dvp == fpl->tvp); 5831 KASSERT(cache_fpl_istrailingslash(fpl), 5832 ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1, 5833 cnp->cn_pnbuf)); 5834 KASSERT(cnp->cn_nameptr[0] == '\0', 5835 ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0], 5836 cnp->cn_pnbuf)); 5837 KASSERT(cnp->cn_namelen == 0, 5838 ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen, 5839 cnp->cn_pnbuf)); 5840 MPASS(cnp->cn_nameptr > cnp->cn_pnbuf); 5841 5842 if (cnp->cn_nameiop != LOOKUP) { 5843 return (cache_fpl_aborted(fpl)); 5844 } 5845 5846 if (__predict_false(tvp->v_type != VDIR)) { 5847 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 5848 return (cache_fpl_aborted(fpl)); 5849 } 5850 cache_fpl_smr_exit(fpl); 5851 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5852 } 5853 5854 /* 5855 * Denote the last component. 5856 */ 5857 ndp->ni_next = &cnp->cn_nameptr[0]; 5858 MPASS(cache_fpl_islastcn(ndp)); 5859 5860 /* 5861 * Unwind trailing slashes. 5862 */ 5863 cn_nameptr_orig = cnp->cn_nameptr; 5864 while (cnp->cn_nameptr >= cnp->cn_pnbuf) { 5865 cnp->cn_nameptr--; 5866 if (cnp->cn_nameptr[0] != '/') { 5867 break; 5868 } 5869 } 5870 5871 /* 5872 * Unwind to the beginning of the path component. 5873 * 5874 * Note the path may or may not have started with a slash. 5875 */ 5876 cn_nameptr_slash = cnp->cn_nameptr; 5877 while (cnp->cn_nameptr > cnp->cn_pnbuf) { 5878 cnp->cn_nameptr--; 5879 if (cnp->cn_nameptr[0] == '/') { 5880 break; 5881 } 5882 } 5883 if (cnp->cn_nameptr[0] == '/') { 5884 cnp->cn_nameptr++; 5885 } 5886 5887 cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1; 5888 cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr); 5889 cache_fpl_checkpoint(fpl); 5890 5891 #ifdef INVARIANTS 5892 ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; 5893 if (ni_pathlen != fpl->debug.ni_pathlen) { 5894 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 5895 __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 5896 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 5897 } 5898 #endif 5899 5900 /* 5901 * If this was a "./" lookup the parent directory is already correct. 5902 */ 5903 if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) { 5904 return (0); 5905 } 5906 5907 /* 5908 * Otherwise we need to look it up. 5909 */ 5910 tvp = fpl->tvp; 5911 ncp = atomic_load_consume_ptr(&tvp->v_cache_dd); 5912 if (__predict_false(ncp == NULL)) { 5913 return (cache_fpl_aborted(fpl)); 5914 } 5915 nc_flag = atomic_load_char(&ncp->nc_flag); 5916 if ((nc_flag & NCF_ISDOTDOT) != 0) { 5917 return (cache_fpl_aborted(fpl)); 5918 } 5919 fpl->dvp = ncp->nc_dvp; 5920 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 5921 if (seqc_in_modify(fpl->dvp_seqc)) { 5922 return (cache_fpl_aborted(fpl)); 5923 } 5924 return (0); 5925 } 5926 5927 /* 5928 * See the API contract for VOP_FPLOOKUP_VEXEC. 
5929 */ 5930 static int __noinline 5931 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 5932 { 5933 struct componentname *cnp; 5934 struct vnode *dvp; 5935 seqc_t dvp_seqc; 5936 5937 cnp = fpl->cnp; 5938 dvp = fpl->dvp; 5939 dvp_seqc = fpl->dvp_seqc; 5940 5941 /* 5942 * Hack: delayed empty path checking. 5943 */ 5944 if (cnp->cn_pnbuf[0] == '\0') { 5945 return (cache_fplookup_emptypath(fpl)); 5946 } 5947 5948 /* 5949 * TODO: Due to ignoring trailing slashes lookup will perform a 5950 * permission check on the last dir when it should not be doing it. It 5951 * may fail, but said failure should be ignored. It is possible to fix 5952 * it up fully without resorting to regular lookup, but for now just 5953 * abort. 5954 */ 5955 if (cache_fpl_istrailingslash(fpl)) { 5956 return (cache_fpl_aborted(fpl)); 5957 } 5958 5959 /* 5960 * Hack: delayed degenerate path checking. 5961 */ 5962 if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) { 5963 return (cache_fplookup_degenerate(fpl)); 5964 } 5965 5966 /* 5967 * Hack: delayed name len checking. 5968 */ 5969 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 5970 cache_fpl_smr_exit(fpl); 5971 return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); 5972 } 5973 5974 /* 5975 * Hack: they may be looking up foo/bar, where foo is not a directory. 5976 * In such a case we need to return ENOTDIR, but we may happen to get 5977 * here with a different error. 5978 */ 5979 if (dvp->v_type != VDIR) { 5980 error = ENOTDIR; 5981 } 5982 5983 /* 5984 * Hack: handle O_SEARCH. 5985 * 5986 * Open Group Base Specifications Issue 7, 2018 edition states: 5987 * <quote> 5988 * If the access mode of the open file description associated with the 5989 * file descriptor is not O_SEARCH, the function shall check whether 5990 * directory searches are permitted using the current permissions of 5991 * the directory underlying the file descriptor. If the access mode is 5992 * O_SEARCH, the function shall not perform the check. 5993 * </quote> 5994 * 5995 * Regular lookup tests for the NOEXECCHECK flag for every path 5996 * component to decide whether to do the permission check. However, 5997 * since most lookups never have the flag (and when they do it is only 5998 * present for the first path component), lockless lookup only acts on 5999 * it if there is a permission problem. Here the flag is represented 6000 * with a boolean so that we don't have to clear it on the way out. 6001 * 6002 * For simplicity this always aborts. 6003 * TODO: check if this is the first lookup and ignore the permission 6004 * problem. Note the flag has to survive fallback (if it happens to be 6005 * performed). 6006 */ 6007 if (fpl->fsearch) { 6008 return (cache_fpl_aborted(fpl)); 6009 } 6010 6011 switch (error) { 6012 case EAGAIN: 6013 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 6014 error = cache_fpl_aborted(fpl); 6015 } else { 6016 cache_fpl_partial(fpl); 6017 } 6018 break; 6019 default: 6020 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 6021 error = cache_fpl_aborted(fpl); 6022 } else { 6023 cache_fpl_smr_exit(fpl); 6024 cache_fpl_handled_error(fpl, error); 6025 } 6026 break; 6027 } 6028 return (error); 6029 } 6030 6031 static int 6032 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 6033 { 6034 struct nameidata *ndp; 6035 struct componentname *cnp; 6036 struct mount *mp; 6037 int error; 6038 6039 ndp = fpl->ndp; 6040 cnp = fpl->cnp; 6041 6042 cache_fpl_checkpoint(fpl); 6043 6044 /* 6045 * The vnode at hand is almost always stable, skip checking for it. 

static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct mount *mp;
	int error;

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cache_fpl_checkpoint(fpl);

	/*
	 * The vnode at hand is almost always stable, skip checking for it.
	 * Worst case this postpones the check towards the end of the iteration
	 * of the main loop.
	 */
	fpl->dvp = dvp;
	fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);

	mp = atomic_load_ptr(&dvp->v_mount);
	if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
		return (cache_fpl_aborted(fpl));
	}

	MPASS(fpl->tvp == NULL);

	for (;;) {
		cache_fplookup_parse(fpl);

		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
		if (__predict_false(error != 0)) {
			error = cache_fplookup_failed_vexec(fpl, error);
			break;
		}

		error = cache_fplookup_next(fpl);
		if (__predict_false(cache_fpl_terminated(fpl))) {
			break;
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		if (fpl->tvp->v_type == VLNK) {
			error = cache_fplookup_symlink(fpl);
			if (cache_fpl_terminated(fpl)) {
				break;
			}
		} else {
			if (cache_fpl_islastcn(ndp)) {
				error = cache_fplookup_final(fpl);
				break;
			}

			if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
				error = cache_fpl_aborted(fpl);
				break;
			}

			fpl->dvp = fpl->tvp;
			fpl->dvp_seqc = fpl->tvp_seqc;
			cache_fplookup_parse_advance(fpl);
		}

		cache_fpl_checkpoint(fpl);
	}

	return (error);
}

/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the
 * criteria outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 *	vn_lock(current);
 *	for (;;) {
 *		next = find();
 *		vn_lock(next);
 *		vn_unlock(current);
 *		current = next;
 *		if (last)
 *			break;
 *	}
 *	return (current);
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
 * we made the jump. This includes things like permissions, mount points etc.
 * Counter modification is provided by enclosing relevant places in
 * vn_seqc_write_begin()/end() calls.
 *
 * Thus this translates to:
 *
 *	vfs_smr_enter();
 *	dvp_seqc = seqc_read_any(dvp);
 *	if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 *		abort();
 *	for (;;) {
 *		tvp = find();
 *		tvp_seqc = seqc_read_any(tvp);
 *		if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 *			abort();
 *		if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 *			abort();
 *		dvp = tvp; // we know nothing of importance has changed
 *		dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 *		if (last)
 *			break;
 *	}
 *	vget(); // secure the vnode
 *	if (!seqc_consistent(tvp, tvp_seqc)) // final check
 *		abort();
 *	// at this point we know nothing has changed for any parent<->child pair
 *	// as they were crossed during the lookup, meaning we matched the guarantee
 *	// of the locked variant
 *	return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection which they must never exit
 * - EAGAIN can be returned to denote the check could not be performed; it is
 *   always valid to return it
 * - if the sequence counter has not changed the result must be valid
 * - if the sequence counter has changed, both false positives and false negatives
 *   are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *   (an illustrative sketch follows this comment)
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning absent other means it
 *   should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 */
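
/*
 * Illustration only: a minimal sketch of a VOP_FPLOOKUP_VEXEC routine
 * honoring the above contract, for a hypothetical filesystem whose per-vnode
 * data ("struct examplefs_node" with n_mode/n_uid/n_gid fields) lives in
 * ->v_data and is vfs_smr-protected.  The filesystem and field names are
 * made up for the example; real implementations differ in detail.
 *
 *	static int
 *	examplefs_fplookup_vexec(struct vop_fplookup_vexec_args *ap)
 *	{
 *		struct examplefs_node *np;
 *
 *		// ->v_data may be getting torn down, load it atomically
 *		np = atomic_load_ptr(&ap->a_vp->v_data);
 *		if (__predict_false(np == NULL))
 *			return (EAGAIN); // cannot tell, let the caller decide
 *		return (vaccess_vexec_smr(np->n_mode, np->n_uid, np->n_gid,
 *		    ap->a_cred));
 *	}
 *
 * Note the routine never exits vfs_smr protection and never blocks; whenever
 * the check cannot be made it returns EAGAIN, leaving the decision to the
 * lookup code (see cache_fplookup_failed_vexec above).
 */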
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	int error;

	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.in_smr = false;
	fpl.ndp = ndp;
	fpl.cnp = cnp = &ndp->ni_cnd;
	MPASS(ndp->ni_lcf == 0);
	KASSERT((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
	    ("%s: internal flags found in cn_flags %" PRIx64, __func__,
	    cnp->cn_flags));
	MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
	MPASS(ndp->ni_resflags == 0);

	if (__predict_false(!cache_can_fplookup(&fpl))) {
		*status = fpl.status;
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint_outer(&fpl);

	cache_fpl_smr_enter_initial(&fpl);
#ifdef INVARIANTS
	fpl.debug.ni_pathlen = ndp->ni_pathlen;
#endif
	fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
	fpl.fsearch = false;
	fpl.tvp = NULL; /* for degenerate path handling */
	fpl.pwd = pwdp;
	pwd = pwd_get_smr();
	*(fpl.pwd) = pwd;
	namei_setup_rootdir(ndp, cnp, pwd);
	ndp->ni_topdir = pwd->pwd_jdir;

	if (cnp->cn_pnbuf[0] == '/') {
		dvp = cache_fpl_handle_root(&fpl);
		ndp->ni_resflags = NIRES_ABS;
	} else {
		if (ndp->ni_dirfd == AT_FDCWD) {
			dvp = pwd->pwd_cdir;
		} else {
			error = cache_fplookup_dirfd(&fpl, &dvp);
			if (__predict_false(error != 0)) {
				goto out;
			}
		}
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
	error = cache_fplookup_impl(dvp, &fpl);
out:
	cache_fpl_smr_assert_not_entered(&fpl);
	cache_fpl_assert_status(&fpl);
	*status = fpl.status;
	if (SDT_PROBES_ENABLED()) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		if (fpl.status == CACHE_FPL_STATUS_HANDLED)
			SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
			    ndp);
	}

	if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
		MPASS(error != CACHE_FPL_FAILED);
		if (error != 0) {
			cache_fpl_cleanup_cnp(fpl.cnp);
			MPASS(fpl.dvp == NULL);
			MPASS(fpl.tvp == NULL);
		}
		ndp->ni_dvp = fpl.dvp;
		ndp->ni_vp = fpl.tvp;
	}
	return (error);
}
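
/*
 * Illustration only: a simplified sketch of how a caller is expected to
 * consume the (status, error) pair produced by cache_fplookup, assuming the
 * usual namei arrangement (the real namei performs additional validation and
 * bookkeeping):
 *
 *	error = cache_fplookup(ndp, &status, &pwd);
 *	switch (status) {
 *	case CACHE_FPL_STATUS_HANDLED:
 *		// the lookup was fully serviced locklessly, error is final
 *		return (error);
 *	case CACHE_FPL_STATUS_PARTIAL:
 *	case CACHE_FPL_STATUS_ABORTED:
 *		// fall back to the locked lookup (VOP_LOOKUP et al.)
 *		...
 *	}
 */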