1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37 #include <sys/cdefs.h> 38 #include "opt_ddb.h" 39 #include "opt_ktrace.h" 40 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/capsicum.h> 44 #include <sys/counter.h> 45 #include <sys/filedesc.h> 46 #include <sys/fnv_hash.h> 47 #include <sys/kernel.h> 48 #include <sys/ktr.h> 49 #include <sys/lock.h> 50 #include <sys/malloc.h> 51 #include <sys/fcntl.h> 52 #include <sys/jail.h> 53 #include <sys/mount.h> 54 #include <sys/namei.h> 55 #include <sys/proc.h> 56 #include <sys/seqc.h> 57 #include <sys/sdt.h> 58 #include <sys/smr.h> 59 #include <sys/smp.h> 60 #include <sys/syscallsubr.h> 61 #include <sys/sysctl.h> 62 #include <sys/sysproto.h> 63 #include <sys/vnode.h> 64 #include <ck_queue.h> 65 #ifdef KTRACE 66 #include <sys/ktrace.h> 67 #endif 68 #ifdef INVARIANTS 69 #include <machine/_inttypes.h> 70 #endif 71 72 #include <security/audit/audit.h> 73 #include <security/mac/mac_framework.h> 74 75 #ifdef DDB 76 #include <ddb/ddb.h> 77 #endif 78 79 #include <vm/uma.h> 80 81 /* 82 * High level overview of name caching in the VFS layer. 83 * 84 * Originally caching was implemented as part of UFS, later extracted to allow 85 * use by other filesystems. A decision was made to make it optional and 86 * completely detached from the rest of the kernel, which comes with limitations 87 * outlined near the end of this comment block. 88 * 89 * This fundamental choice needs to be revisited. In the meantime, the current 90 * state is described below. Significance of all notable routines is explained 91 * in comments placed above their implementation. 
 * Scattered throughout the file are TODO comments indicating shortcomings
 * which can be fixed without reworking everything (most of the fixes will
 * likely be reusable). Various details are omitted from this explanation so
 * as not to clutter the overview; they have to be checked by reading the code
 * and the associated commentary.
 *
 * Keep in mind that it's individual path components which are cached, not
 * full paths. That is, for a fully cached path "foo/bar/baz" there are 3
 * entries, one for each name.
 *
 * I. Data organization
 *
 * Entries are described by "struct namecache" objects and stored in a hash
 * table. See cache_get_hash for more information.
 *
 * "struct vnode" contains pointers to source entries (names which can be
 * found when traversing through said vnode), destination entries (names of
 * that vnode; see "Limitations" for a breakdown on the subject) and a pointer
 * to the parent vnode.
 *
 * The (directory vnode; name) tuple reliably determines the target entry if
 * it exists.
 *
 * Since there are no small locks at this time (all are 32 bytes in size on
 * LP64), the code works around the problem by introducing lock arrays to
 * protect hash buckets and vnode lists.
 *
 * II. Filesystem integration
 *
 * Filesystems participating in name caching do the following:
 * - set the vop_lookup routine to vfs_cache_lookup
 * - set vop_cachedlookup to whatever can perform the lookup if the above fails
 * - if they support lockless lookup (see below), vop_fplookup_vexec and
 *   vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
 *   mount point
 * - call cache_purge or cache_vop_* routines to eliminate stale entries as
 *   applicable
 * - call cache_enter to add entries depending on the MAKEENTRY flag
 *
 * With the above in mind, there are 2 entry points when doing lookups:
 * - ... -> namei -> cache_fplookup -- this is the default
 * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei
 *   should the above fail
 *
 * Example code flow of how an entry is added:
 * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
 *  vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
 *
 * III. Performance considerations
 *
 * For the lockless case, forward lookup avoids any writes to shared areas
 * apart from the terminal path component. In other words, non-modifying
 * lookups of different files don't suffer any scalability problems in the
 * namecache. Looking up the same file is limited by VFS and goes beyond the
 * scope of this file.
 *
 * At least on amd64 the single-threaded bottleneck for long paths is hashing
 * (see cache_get_hash). There are cases where the code issues an acquire
 * fence multiple times; these can be combined on architectures which suffer
 * from it.
 *
 * For the locked case, each encountered vnode has to be referenced and locked
 * in order to be handed out to the caller (normally that's namei). This
 * introduces a significant single-threaded overhead and serialization when
 * multi-threaded.
 *
 * Reverse lookup (e.g., "getcwd") fully scales provided the path is fully
 * cached -- it avoids writes to shared areas for any of the components.
 *
 * Unrelated insertions are partially serialized on updating the global entry
 * counter and possibly serialized on colliding bucket or vnode locks.
 *
 * IV. Observability
 *
 * Note that not everything has an explicit dtrace probe, nor should it; thus
 * some of the one-liners below depend on implementation details.
 *
 * Examples:
 *
 * # Check what lookups failed to be handled in a lockless manner. Column 1 is
 * # the line number, column 2 is the status code (see cache_fpl_status)
 * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
 *
 * # Lengths of names added by binary name
 * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
 *
 * # Same as above but only those which exceed 64 characters
 * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }'
 *
 * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and
 * # what path it is
 * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }'
 *
 * V. Limitations and implementation defects
 *
 * - since it is possible there is no entry for an open file, tools like
 *   "procstat" may fail to resolve fd -> vnode -> path to anything
 * - even if a filesystem adds an entry, it may get purged (e.g., due to memory
 *   shortage) in which case the above problem applies
 * - hardlinks are not tracked, thus if a vnode is reachable in more than one
 *   way, resolving a name may return a different path than the one used to
 *   open it (even if said path is still valid)
 * - by default entries are not added for newly created files
 * - adding an entry may need to evict a negative entry first, which happens
 *   in 2 distinct places (evicting on lookup, adding in a later VOP), making
 *   it impossible to simply reuse the evicted entry
 * - there is a simple scheme to evict negative entries as the cache is
 *   approaching its capacity, but it is very unclear if doing so is a good
 *   idea to begin with
 * - vnodes are subject to being recycled even if the target inode is left in
 *   memory, which loses the name cache entries when it perhaps should not. In
 *   the case of tmpfs names get duplicated -- kept by the filesystem itself
 *   and by the namecache separately
 * - struct namecache has a fixed size and comes in 2 variants, often wasting
 *   space. It is now hard to replace with malloc due to the dependence on SMR.
202 * - lack of better integration with the kernel also turns nullfs into a layered 203 * filesystem instead of something which can take advantage of caching 204 */ 205 206 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 207 "Name cache"); 208 209 SDT_PROVIDER_DECLARE(vfs); 210 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 211 "struct vnode *"); 212 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *", 213 "struct vnode *"); 214 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 215 "char *"); 216 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *", 217 "const char *"); 218 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *", 219 "struct namecache *", "int", "int"); 220 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 221 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 222 "char *", "struct vnode *"); 223 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 224 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 225 "struct vnode *", "char *"); 226 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 227 "struct vnode *"); 228 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 229 "struct vnode *", "char *"); 230 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 231 "char *"); 232 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", 233 "struct componentname *"); 234 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", 235 "struct componentname *"); 236 SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t"); 237 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int"); 238 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 239 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 240 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 241 "struct vnode *"); 242 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 243 "char *"); 244 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *", 245 "char *"); 246 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t"); 247 248 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); 249 SDT_PROBE_DECLARE(vfs, namei, lookup, entry); 250 SDT_PROBE_DECLARE(vfs, namei, lookup, return); 251 252 static char __read_frequently cache_fast_lookup_enabled = true; 253 254 /* 255 * This structure describes the elements in the cache of recent 256 * names looked up by namei. 257 */ 258 struct negstate { 259 u_char neg_flag; 260 u_char neg_hit; 261 }; 262 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), 263 "the state must fit in a union with a pointer without growing it"); 264 265 struct namecache { 266 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 267 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 268 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */ 269 struct vnode *nc_dvp; /* vnode of parent of name */ 270 union { 271 struct vnode *nu_vp; /* vnode the name refers to */ 272 struct negstate nu_neg;/* negative entry state */ 273 } n_un; 274 u_char nc_flag; /* flag bits */ 275 u_char nc_nlen; /* length of name */ 276 char nc_name[]; /* segment name + nul */ 277 }; 278 279 /* 280 * struct namecache_ts repeats struct namecache layout up to the 281 * nc_nlen member. 
282 * struct namecache_ts is used in place of struct namecache when time(s) need 283 * to be stored. The nc_dotdottime field is used when a cache entry is mapping 284 * both a non-dotdot directory name plus dotdot for the directory's 285 * parent. 286 * 287 * See below for alignment requirement. 288 */ 289 struct namecache_ts { 290 struct timespec nc_time; /* timespec provided by fs */ 291 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 292 int nc_ticks; /* ticks value when entry was added */ 293 int nc_pad; 294 struct namecache nc_nc; 295 }; 296 297 TAILQ_HEAD(cache_freebatch, namecache); 298 299 /* 300 * At least mips n32 performs 64-bit accesses to timespec as found 301 * in namecache_ts and requires them to be aligned. Since others 302 * may be in the same spot suffer a little bit and enforce the 303 * alignment for everyone. Note this is a nop for 64-bit platforms. 304 */ 305 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) 306 307 /* 308 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the 309 * 4.4 BSD codebase. Later on struct namecache was tweaked to become 310 * smaller and the value was bumped to retain the total size, but it 311 * was never re-evaluated for suitability. A simple test counting 312 * lengths during package building shows that the value of 45 covers 313 * about 86% of all added entries, reaching 99% at 65. 314 * 315 * Regardless of the above, use of dedicated zones instead of malloc may be 316 * inducing additional waste. This may be hard to address as said zones are 317 * tied to VFS SMR. Even if retaining them, the current split should be 318 * re-evaluated. 319 */ 320 #ifdef __LP64__ 321 #define CACHE_PATH_CUTOFF 45 322 #define CACHE_LARGE_PAD 6 323 #else 324 #define CACHE_PATH_CUTOFF 41 325 #define CACHE_LARGE_PAD 2 326 #endif 327 328 #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1) 329 #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE) 330 #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD) 331 #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE) 332 333 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 334 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 335 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 336 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 337 338 #define nc_vp n_un.nu_vp 339 #define nc_neg n_un.nu_neg 340 341 /* 342 * Flags in namecache.nc_flag 343 */ 344 #define NCF_WHITE 0x01 345 #define NCF_ISDOTDOT 0x02 346 #define NCF_TS 0x04 347 #define NCF_DTS 0x08 348 #define NCF_DVDROP 0x10 349 #define NCF_NEGATIVE 0x20 350 #define NCF_INVALID 0x40 351 #define NCF_WIP 0x80 352 353 /* 354 * Flags in negstate.neg_flag 355 */ 356 #define NEG_HOT 0x01 357 358 static bool cache_neg_evict_cond(u_long lnumcache); 359 360 /* 361 * Mark an entry as invalid. 362 * 363 * This is called before it starts getting deconstructed. 364 */ 365 static void 366 cache_ncp_invalidate(struct namecache *ncp) 367 { 368 369 KASSERT((ncp->nc_flag & NCF_INVALID) == 0, 370 ("%s: entry %p already invalid", __func__, ncp)); 371 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); 372 atomic_thread_fence_rel(); 373 } 374 375 /* 376 * Check whether the entry can be safely used. 
377 * 378 * All places which elide locks are supposed to call this after they are 379 * done with reading from an entry. 380 */ 381 #define cache_ncp_canuse(ncp) ({ \ 382 struct namecache *_ncp = (ncp); \ 383 u_char _nc_flag; \ 384 \ 385 atomic_thread_fence_acq(); \ 386 _nc_flag = atomic_load_char(&_ncp->nc_flag); \ 387 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \ 388 }) 389 390 /* 391 * Like the above but also checks NCF_WHITE. 392 */ 393 #define cache_fpl_neg_ncp_canuse(ncp) ({ \ 394 struct namecache *_ncp = (ncp); \ 395 u_char _nc_flag; \ 396 \ 397 atomic_thread_fence_acq(); \ 398 _nc_flag = atomic_load_char(&_ncp->nc_flag); \ 399 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0); \ 400 }) 401 402 VFS_SMR_DECLARE; 403 404 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 405 "Name cache parameters"); 406 407 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ 408 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0, 409 "Total namecache capacity"); 410 411 u_int ncsizefactor = 2; 412 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0, 413 "Size factor for namecache"); 414 415 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ 416 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0, 417 "Ratio of negative namecache entries"); 418 419 /* 420 * Negative entry % of namecache capacity above which automatic eviction is allowed. 421 * 422 * Check cache_neg_evict_cond for details. 423 */ 424 static u_int ncnegminpct = 3; 425 426 static u_int __read_mostly neg_min; /* the above recomputed against ncsize */ 427 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0, 428 "Negative entry count above which automatic eviction is allowed"); 429 430 /* 431 * Structures associated with name caching. 
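 *
 * The hash table is an array of SMR-protected CK_SLIST buckets (nchashtbl)
 * indexed via NCHHASH(). Bucket mutexes and vnode list mutexes live in
 * separate arrays (bucketlocks and vnodelocks), found from the name hash and
 * the vnode pointer respectively.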
432 */ 433 #define NCHHASH(hash) \ 434 (&nchashtbl[(hash) & nchash]) 435 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 436 static u_long __read_mostly nchash; /* size of hash table */ 437 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 438 "Size of namecache hash table"); 439 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 440 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 441 442 struct nchstats nchstats; /* cache effectiveness statistics */ 443 444 static u_int __exclusive_cache_line neg_cycle; 445 446 #define ncneghash 3 447 #define numneglists (ncneghash + 1) 448 449 struct neglist { 450 struct mtx nl_evict_lock; 451 struct mtx nl_lock __aligned(CACHE_LINE_SIZE); 452 TAILQ_HEAD(, namecache) nl_list; 453 TAILQ_HEAD(, namecache) nl_hotlist; 454 u_long nl_hotnum; 455 } __aligned(CACHE_LINE_SIZE); 456 457 static struct neglist neglists[numneglists]; 458 459 static inline struct neglist * 460 NCP2NEGLIST(struct namecache *ncp) 461 { 462 463 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 464 } 465 466 static inline struct negstate * 467 NCP2NEGSTATE(struct namecache *ncp) 468 { 469 470 MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE); 471 return (&ncp->nc_neg); 472 } 473 474 #define numbucketlocks (ncbuckethash + 1) 475 static u_int __read_mostly ncbuckethash; 476 static struct mtx_padalign __read_mostly *bucketlocks; 477 #define HASH2BUCKETLOCK(hash) \ 478 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)])) 479 480 #define numvnodelocks (ncvnodehash + 1) 481 static u_int __read_mostly ncvnodehash; 482 static struct mtx __read_mostly *vnodelocks; 483 static inline struct mtx * 484 VP2VNODELOCK(struct vnode *vp) 485 { 486 487 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 488 } 489 490 static void 491 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 492 { 493 struct namecache_ts *ncp_ts; 494 495 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 496 (tsp == NULL && ticksp == NULL), 497 ("No NCF_TS")); 498 499 if (tsp == NULL) 500 return; 501 502 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 503 *tsp = ncp_ts->nc_time; 504 *ticksp = ncp_ts->nc_ticks; 505 } 506 507 #ifdef DEBUG_CACHE 508 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 509 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 510 "VFS namecache enabled"); 511 #endif 512 513 /* Export size information to userland */ 514 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 515 sizeof(struct namecache), "sizeof(struct namecache)"); 516 517 /* 518 * The new name cache statistics 519 */ 520 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 521 "Name cache statistics"); 522 523 #define STATNODE_ULONG(name, varname, descr) \ 524 SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 525 #define STATNODE_COUNTER(name, varname, descr) \ 526 static COUNTER_U64_DEFINE_EARLY(varname); \ 527 SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \ 528 descr); 529 STATNODE_ULONG(neg, numneg, "Number of negative cache entries"); 530 STATNODE_ULONG(count, numcache, "Number of cache entries"); 531 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held"); 532 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit"); 533 STATNODE_COUNTER(dothits, dothits, "Number 
of '.' hits"); 534 STATNODE_COUNTER(dotdothits, dotdothits, "Number of '..' hits"); 535 STATNODE_COUNTER(miss, nummiss, "Number of cache misses"); 536 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache"); 537 STATNODE_COUNTER(poszaps, numposzaps, 538 "Number of cache hits (positive) we do not want to cache"); 539 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)"); 540 STATNODE_COUNTER(negzaps, numnegzaps, 541 "Number of cache hits (negative) we do not want to cache"); 542 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)"); 543 /* These count for vn_getcwd(), too. */ 544 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls"); 545 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 546 STATNODE_COUNTER(fullpathfail2, numfullpathfail2, 547 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 548 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 549 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls"); 550 STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache"); 551 552 /* 553 * Debug or developer statistics. 554 */ 555 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 556 "Name cache debugging"); 557 #define DEBUGNODE_ULONG(name, varname, descr) \ 558 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 559 #define DEBUGNODE_COUNTER(name, varname, descr) \ 560 static COUNTER_U64_DEFINE_EARLY(varname); \ 561 SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \ 562 descr); 563 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success, 564 "Number of successful removals after relocking"); 565 static long zap_bucket_fail; 566 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, ""); 567 static long zap_bucket_fail2; 568 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, ""); 569 static long cache_lock_vnodes_cel_3_failures; 570 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures, 571 "Number of times 3-way vnode locking failed"); 572 573 static void cache_zap_locked(struct namecache *ncp); 574 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 575 char **retbuf, size_t *buflen, size_t addend); 576 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 577 char **retbuf, size_t *buflen); 578 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 579 char **retbuf, size_t *len, size_t addend); 580 581 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 582 583 static inline void 584 cache_assert_vlp_locked(struct mtx *vlp) 585 { 586 587 if (vlp != NULL) 588 mtx_assert(vlp, MA_OWNED); 589 } 590 591 static inline void 592 cache_assert_vnode_locked(struct vnode *vp) 593 { 594 struct mtx *vlp; 595 596 vlp = VP2VNODELOCK(vp); 597 cache_assert_vlp_locked(vlp); 598 } 599 600 /* 601 * Directory vnodes with entries are held for two reasons: 602 * 1. make them less of a target for reclamation in vnlru 603 * 2. suffer smaller performance penalty in locked lookup as requeieing is avoided 604 * 605 * It will be feasible to stop doing it altogether if all filesystems start 606 * supporting lockless lookup. 
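 *
 * The hold is taken in cache_hold_vnode() when the first entry for a
 * directory is added and released in cache_drop_vnode() once the last one is
 * gone, with the count mirrored in the numcachehv counter.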
607 */ 608 static void 609 cache_hold_vnode(struct vnode *vp) 610 { 611 612 cache_assert_vnode_locked(vp); 613 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); 614 vhold(vp); 615 counter_u64_add(numcachehv, 1); 616 } 617 618 static void 619 cache_drop_vnode(struct vnode *vp) 620 { 621 622 /* 623 * Called after all locks are dropped, meaning we can't assert 624 * on the state of v_cache_src. 625 */ 626 vdrop(vp); 627 counter_u64_add(numcachehv, -1); 628 } 629 630 /* 631 * UMA zones. 632 */ 633 static uma_zone_t __read_mostly cache_zone_small; 634 static uma_zone_t __read_mostly cache_zone_small_ts; 635 static uma_zone_t __read_mostly cache_zone_large; 636 static uma_zone_t __read_mostly cache_zone_large_ts; 637 638 char * 639 cache_symlink_alloc(size_t size, int flags) 640 { 641 642 if (size < CACHE_ZONE_SMALL_SIZE) { 643 return (uma_zalloc_smr(cache_zone_small, flags)); 644 } 645 if (size < CACHE_ZONE_LARGE_SIZE) { 646 return (uma_zalloc_smr(cache_zone_large, flags)); 647 } 648 counter_u64_add(symlinktoobig, 1); 649 SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size); 650 return (NULL); 651 } 652 653 void 654 cache_symlink_free(char *string, size_t size) 655 { 656 657 MPASS(string != NULL); 658 KASSERT(size < CACHE_ZONE_LARGE_SIZE, 659 ("%s: size %zu too big", __func__, size)); 660 661 if (size < CACHE_ZONE_SMALL_SIZE) { 662 uma_zfree_smr(cache_zone_small, string); 663 return; 664 } 665 if (size < CACHE_ZONE_LARGE_SIZE) { 666 uma_zfree_smr(cache_zone_large, string); 667 return; 668 } 669 __assert_unreachable(); 670 } 671 672 static struct namecache * 673 cache_alloc_uma(int len, bool ts) 674 { 675 struct namecache_ts *ncp_ts; 676 struct namecache *ncp; 677 678 if (__predict_false(ts)) { 679 if (len <= CACHE_PATH_CUTOFF) 680 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); 681 else 682 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); 683 ncp = &ncp_ts->nc_nc; 684 } else { 685 if (len <= CACHE_PATH_CUTOFF) 686 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); 687 else 688 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); 689 } 690 return (ncp); 691 } 692 693 static void 694 cache_free_uma(struct namecache *ncp) 695 { 696 struct namecache_ts *ncp_ts; 697 698 if (__predict_false(ncp->nc_flag & NCF_TS)) { 699 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 700 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 701 uma_zfree_smr(cache_zone_small_ts, ncp_ts); 702 else 703 uma_zfree_smr(cache_zone_large_ts, ncp_ts); 704 } else { 705 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 706 uma_zfree_smr(cache_zone_small, ncp); 707 else 708 uma_zfree_smr(cache_zone_large, ncp); 709 } 710 } 711 712 static struct namecache * 713 cache_alloc(int len, bool ts) 714 { 715 u_long lnumcache; 716 717 /* 718 * Avoid blowout in namecache entries. 719 * 720 * Bugs: 721 * 1. filesystems may end up trying to add an already existing entry 722 * (for example this can happen after a cache miss during concurrent 723 * lookup), in which case we will call cache_neg_evict despite not 724 * adding anything. 725 * 2. the routine may fail to free anything and no provisions are made 726 * to make it try harder (see the inside for failure modes) 727 * 3. it only ever looks at negative entries. 
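	 *
	 * Note the capacity check below is best-effort: numcache is bumped up
	 * front and rolled back on failure, so a transient overshoot by a few
	 * entries is possible with concurrent callers.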
728 */ 729 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 730 if (cache_neg_evict_cond(lnumcache)) { 731 lnumcache = atomic_load_long(&numcache); 732 } 733 if (__predict_false(lnumcache >= ncsize)) { 734 atomic_subtract_long(&numcache, 1); 735 counter_u64_add(numdrops, 1); 736 return (NULL); 737 } 738 return (cache_alloc_uma(len, ts)); 739 } 740 741 static void 742 cache_free(struct namecache *ncp) 743 { 744 745 MPASS(ncp != NULL); 746 if ((ncp->nc_flag & NCF_DVDROP) != 0) { 747 cache_drop_vnode(ncp->nc_dvp); 748 } 749 cache_free_uma(ncp); 750 atomic_subtract_long(&numcache, 1); 751 } 752 753 static void 754 cache_free_batch(struct cache_freebatch *batch) 755 { 756 struct namecache *ncp, *nnp; 757 int i; 758 759 i = 0; 760 if (TAILQ_EMPTY(batch)) 761 goto out; 762 TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) { 763 if ((ncp->nc_flag & NCF_DVDROP) != 0) { 764 cache_drop_vnode(ncp->nc_dvp); 765 } 766 cache_free_uma(ncp); 767 i++; 768 } 769 atomic_subtract_long(&numcache, i); 770 out: 771 SDT_PROBE1(vfs, namecache, purge, batch, i); 772 } 773 774 /* 775 * Hashing. 776 * 777 * The code was made to use FNV in 2001 and this choice needs to be revisited. 778 * 779 * Short summary of the difficulty: 780 * The longest name which can be inserted is NAME_MAX characters in length (or 781 * 255 at the time of writing this comment), while majority of names used in 782 * practice are significantly shorter (mostly below 10). More importantly 783 * majority of lookups performed find names are even shorter than that. 784 * 785 * This poses a problem where hashes which do better than FNV past word size 786 * (or so) tend to come with additional overhead when finalizing the result, 787 * making them noticeably slower for the most commonly used range. 788 * 789 * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c 790 * 791 * When looking it up the most time consuming part by a large margin (at least 792 * on amd64) is hashing. Replacing FNV with something which pessimizes short 793 * input would make the slowest part stand out even more. 794 */ 795 796 /* 797 * TODO: With the value stored we can do better than computing the hash based 798 * on the address. 
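 *
 * As an illustrative sketch (not compiled), a component hash can also be
 * built incrementally with the iterator helpers defined below, which is
 * equivalent to a single cache_get_hash() call over the full name:
 *
 *	hash = cache_get_hash_iter_start(dvp);
 *	for (i = 0; i < len; i++)
 *		hash = cache_get_hash_iter(name[i], hash);
 *	hash = cache_get_hash_iter_finish(hash);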
799 */ 800 static void 801 cache_prehash(struct vnode *vp) 802 { 803 804 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); 805 } 806 807 static uint32_t 808 cache_get_hash(char *name, u_char len, struct vnode *dvp) 809 { 810 811 return (fnv_32_buf(name, len, dvp->v_nchash)); 812 } 813 814 static uint32_t 815 cache_get_hash_iter_start(struct vnode *dvp) 816 { 817 818 return (dvp->v_nchash); 819 } 820 821 static uint32_t 822 cache_get_hash_iter(char c, uint32_t hash) 823 { 824 825 return (fnv_32_buf(&c, 1, hash)); 826 } 827 828 static uint32_t 829 cache_get_hash_iter_finish(uint32_t hash) 830 { 831 832 return (hash); 833 } 834 835 static inline struct nchashhead * 836 NCP2BUCKET(struct namecache *ncp) 837 { 838 uint32_t hash; 839 840 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 841 return (NCHHASH(hash)); 842 } 843 844 static inline struct mtx * 845 NCP2BUCKETLOCK(struct namecache *ncp) 846 { 847 uint32_t hash; 848 849 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 850 return (HASH2BUCKETLOCK(hash)); 851 } 852 853 #ifdef INVARIANTS 854 static void 855 cache_assert_bucket_locked(struct namecache *ncp) 856 { 857 struct mtx *blp; 858 859 blp = NCP2BUCKETLOCK(ncp); 860 mtx_assert(blp, MA_OWNED); 861 } 862 863 static void 864 cache_assert_bucket_unlocked(struct namecache *ncp) 865 { 866 struct mtx *blp; 867 868 blp = NCP2BUCKETLOCK(ncp); 869 mtx_assert(blp, MA_NOTOWNED); 870 } 871 #else 872 #define cache_assert_bucket_locked(x) do { } while (0) 873 #define cache_assert_bucket_unlocked(x) do { } while (0) 874 #endif 875 876 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 877 static void 878 _cache_sort_vnodes(void **p1, void **p2) 879 { 880 void *tmp; 881 882 MPASS(*p1 != NULL || *p2 != NULL); 883 884 if (*p1 > *p2) { 885 tmp = *p2; 886 *p2 = *p1; 887 *p1 = tmp; 888 } 889 } 890 891 static void 892 cache_lock_all_buckets(void) 893 { 894 u_int i; 895 896 for (i = 0; i < numbucketlocks; i++) 897 mtx_lock(&bucketlocks[i]); 898 } 899 900 static void 901 cache_unlock_all_buckets(void) 902 { 903 u_int i; 904 905 for (i = 0; i < numbucketlocks; i++) 906 mtx_unlock(&bucketlocks[i]); 907 } 908 909 static void 910 cache_lock_all_vnodes(void) 911 { 912 u_int i; 913 914 for (i = 0; i < numvnodelocks; i++) 915 mtx_lock(&vnodelocks[i]); 916 } 917 918 static void 919 cache_unlock_all_vnodes(void) 920 { 921 u_int i; 922 923 for (i = 0; i < numvnodelocks; i++) 924 mtx_unlock(&vnodelocks[i]); 925 } 926 927 static int 928 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 929 { 930 931 cache_sort_vnodes(&vlp1, &vlp2); 932 933 if (vlp1 != NULL) { 934 if (!mtx_trylock(vlp1)) 935 return (EAGAIN); 936 } 937 if (!mtx_trylock(vlp2)) { 938 if (vlp1 != NULL) 939 mtx_unlock(vlp1); 940 return (EAGAIN); 941 } 942 943 return (0); 944 } 945 946 static void 947 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 948 { 949 950 MPASS(vlp1 != NULL || vlp2 != NULL); 951 MPASS(vlp1 <= vlp2); 952 953 if (vlp1 != NULL) 954 mtx_lock(vlp1); 955 if (vlp2 != NULL) 956 mtx_lock(vlp2); 957 } 958 959 static void 960 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 961 { 962 963 MPASS(vlp1 != NULL || vlp2 != NULL); 964 965 if (vlp1 != NULL) 966 mtx_unlock(vlp1); 967 if (vlp2 != NULL) 968 mtx_unlock(vlp2); 969 } 970 971 static int 972 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 973 { 974 struct nchstats snap; 975 976 if (req->oldptr == NULL) 977 return (SYSCTL_OUT(req, 0, sizeof(snap))); 978 979 snap = nchstats; 980 snap.ncs_goodhits = 
counter_u64_fetch(numposhits); 981 snap.ncs_neghits = counter_u64_fetch(numneghits); 982 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 983 counter_u64_fetch(numnegzaps); 984 snap.ncs_miss = counter_u64_fetch(nummisszap) + 985 counter_u64_fetch(nummiss); 986 987 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 988 } 989 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 990 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 991 "VFS cache effectiveness statistics"); 992 993 static void 994 cache_recalc_neg_min(u_int val) 995 { 996 997 neg_min = (ncsize * val) / 100; 998 } 999 1000 static int 1001 sysctl_negminpct(SYSCTL_HANDLER_ARGS) 1002 { 1003 u_int val; 1004 int error; 1005 1006 val = ncnegminpct; 1007 error = sysctl_handle_int(oidp, &val, 0, req); 1008 if (error != 0 || req->newptr == NULL) 1009 return (error); 1010 1011 if (val == ncnegminpct) 1012 return (0); 1013 if (val < 0 || val > 99) 1014 return (EINVAL); 1015 ncnegminpct = val; 1016 cache_recalc_neg_min(val); 1017 return (0); 1018 } 1019 1020 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct, 1021 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct, 1022 "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed"); 1023 1024 #ifdef DEBUG_CACHE 1025 /* 1026 * Grab an atomic snapshot of the name cache hash chain lengths 1027 */ 1028 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 1029 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 1030 "hash table stats"); 1031 1032 static int 1033 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 1034 { 1035 struct nchashhead *ncpp; 1036 struct namecache *ncp; 1037 int i, error, n_nchash, *cntbuf; 1038 1039 retry: 1040 n_nchash = nchash + 1; /* nchash is max index, not count */ 1041 if (req->oldptr == NULL) 1042 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 1043 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 1044 cache_lock_all_buckets(); 1045 if (n_nchash != nchash + 1) { 1046 cache_unlock_all_buckets(); 1047 free(cntbuf, M_TEMP); 1048 goto retry; 1049 } 1050 /* Scan hash tables counting entries */ 1051 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 1052 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) 1053 cntbuf[i]++; 1054 cache_unlock_all_buckets(); 1055 for (error = 0, i = 0; i < n_nchash; i++) 1056 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 1057 break; 1058 free(cntbuf, M_TEMP); 1059 return (error); 1060 } 1061 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 1062 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 1063 "nchash chain lengths"); 1064 1065 static int 1066 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 1067 { 1068 int error; 1069 struct nchashhead *ncpp; 1070 struct namecache *ncp; 1071 int n_nchash; 1072 int count, maxlength, used, pct; 1073 1074 if (!req->oldptr) 1075 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 1076 1077 cache_lock_all_buckets(); 1078 n_nchash = nchash + 1; /* nchash is max index, not count */ 1079 used = 0; 1080 maxlength = 0; 1081 1082 /* Scan hash tables for applicable entries */ 1083 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 1084 count = 0; 1085 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { 1086 count++; 1087 } 1088 if (count) 1089 used++; 1090 if (maxlength < count) 1091 maxlength = count; 1092 } 1093 n_nchash = nchash + 1; 1094 cache_unlock_all_buckets(); 1095 pct = (used * 100) / (n_nchash / 100); 1096 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 1097 if (error) 1098 return (error); 1099 error = 
SYSCTL_OUT(req, &used, sizeof(used)); 1100 if (error) 1101 return (error); 1102 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 1103 if (error) 1104 return (error); 1105 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 1106 if (error) 1107 return (error); 1108 return (0); 1109 } 1110 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 1111 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 1112 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 1113 #endif 1114 1115 /* 1116 * Negative entries management 1117 * 1118 * Various workloads create plenty of negative entries and barely use them 1119 * afterwards. Moreover malicious users can keep performing bogus lookups 1120 * adding even more entries. For example "make tinderbox" as of writing this 1121 * comment ends up with 2.6M namecache entries in total, 1.2M of which are 1122 * negative. 1123 * 1124 * As such, a rather aggressive eviction method is needed. The currently 1125 * employed method is a placeholder. 1126 * 1127 * Entries are split over numneglists separate lists, each of which is further 1128 * split into hot and cold entries. Entries get promoted after getting a hit. 1129 * Eviction happens on addition of new entry. 1130 */ 1131 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1132 "Name cache negative entry statistics"); 1133 1134 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0, 1135 "Number of negative cache entries"); 1136 1137 static COUNTER_U64_DEFINE_EARLY(neg_created); 1138 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created, 1139 "Number of created negative entries"); 1140 1141 static COUNTER_U64_DEFINE_EARLY(neg_evicted); 1142 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted, 1143 "Number of evicted negative entries"); 1144 1145 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty); 1146 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD, 1147 &neg_evict_skipped_empty, 1148 "Number of times evicting failed due to lack of entries"); 1149 1150 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed); 1151 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD, 1152 &neg_evict_skipped_missed, 1153 "Number of times evicting failed due to target entry disappearing"); 1154 1155 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended); 1156 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD, 1157 &neg_evict_skipped_contended, 1158 "Number of times evicting failed due to contention"); 1159 1160 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits, 1161 "Number of cache hits (negative)"); 1162 1163 static int 1164 sysctl_neg_hot(SYSCTL_HANDLER_ARGS) 1165 { 1166 int i, out; 1167 1168 out = 0; 1169 for (i = 0; i < numneglists; i++) 1170 out += neglists[i].nl_hotnum; 1171 1172 return (SYSCTL_OUT(req, &out, sizeof(out))); 1173 } 1174 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD | 1175 CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I", 1176 "Number of hot negative entries"); 1177 1178 static void 1179 cache_neg_init(struct namecache *ncp) 1180 { 1181 struct negstate *ns; 1182 1183 ncp->nc_flag |= NCF_NEGATIVE; 1184 ns = NCP2NEGSTATE(ncp); 1185 ns->neg_flag = 0; 1186 ns->neg_hit = 0; 1187 counter_u64_add(neg_created, 1); 1188 } 1189 1190 #define CACHE_NEG_PROMOTION_THRESH 2 1191 1192 static bool 1193 cache_neg_hit_prep(struct namecache 
*ncp) 1194 { 1195 struct negstate *ns; 1196 u_char n; 1197 1198 ns = NCP2NEGSTATE(ncp); 1199 n = atomic_load_char(&ns->neg_hit); 1200 for (;;) { 1201 if (n >= CACHE_NEG_PROMOTION_THRESH) 1202 return (false); 1203 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1)) 1204 break; 1205 } 1206 return (n + 1 == CACHE_NEG_PROMOTION_THRESH); 1207 } 1208 1209 /* 1210 * Nothing to do here but it is provided for completeness as some 1211 * cache_neg_hit_prep callers may end up returning without even 1212 * trying to promote. 1213 */ 1214 #define cache_neg_hit_abort(ncp) do { } while (0) 1215 1216 static void 1217 cache_neg_hit_finish(struct namecache *ncp) 1218 { 1219 1220 SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name); 1221 counter_u64_add(numneghits, 1); 1222 } 1223 1224 /* 1225 * Move a negative entry to the hot list. 1226 */ 1227 static void 1228 cache_neg_promote_locked(struct namecache *ncp) 1229 { 1230 struct neglist *nl; 1231 struct negstate *ns; 1232 1233 ns = NCP2NEGSTATE(ncp); 1234 nl = NCP2NEGLIST(ncp); 1235 mtx_assert(&nl->nl_lock, MA_OWNED); 1236 if ((ns->neg_flag & NEG_HOT) == 0) { 1237 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 1238 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst); 1239 nl->nl_hotnum++; 1240 ns->neg_flag |= NEG_HOT; 1241 } 1242 } 1243 1244 /* 1245 * Move a hot negative entry to the cold list. 1246 */ 1247 static void 1248 cache_neg_demote_locked(struct namecache *ncp) 1249 { 1250 struct neglist *nl; 1251 struct negstate *ns; 1252 1253 ns = NCP2NEGSTATE(ncp); 1254 nl = NCP2NEGLIST(ncp); 1255 mtx_assert(&nl->nl_lock, MA_OWNED); 1256 MPASS(ns->neg_flag & NEG_HOT); 1257 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 1258 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 1259 nl->nl_hotnum--; 1260 ns->neg_flag &= ~NEG_HOT; 1261 atomic_store_char(&ns->neg_hit, 0); 1262 } 1263 1264 /* 1265 * Move a negative entry to the hot list if it matches the lookup. 1266 * 1267 * We have to take locks, but they may be contended and in the worst 1268 * case we may need to go off CPU. We don't want to spin within the 1269 * smr section and we can't block with it. Exiting the section means 1270 * the found entry could have been evicted. We are going to look it 1271 * up again. 1272 */ 1273 static bool 1274 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp, 1275 struct namecache *oncp, uint32_t hash) 1276 { 1277 struct namecache *ncp; 1278 struct neglist *nl; 1279 u_char nc_flag; 1280 1281 nl = NCP2NEGLIST(oncp); 1282 1283 mtx_lock(&nl->nl_lock); 1284 /* 1285 * For hash iteration. 1286 */ 1287 vfs_smr_enter(); 1288 1289 /* 1290 * Avoid all surprises by only succeeding if we got the same entry and 1291 * bailing completely otherwise. 1292 * XXX There are no provisions to keep the vnode around, meaning we may 1293 * end up promoting a negative entry for a *new* vnode and returning 1294 * ENOENT on its account. This is the error we want to return anyway 1295 * and promotion is harmless. 1296 * 1297 * In particular at this point there can be a new ncp which matches the 1298 * search but hashes to a different neglist. 1299 */ 1300 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1301 if (ncp == oncp) 1302 break; 1303 } 1304 1305 /* 1306 * No match to begin with. 1307 */ 1308 if (__predict_false(ncp == NULL)) { 1309 goto out_abort; 1310 } 1311 1312 /* 1313 * The newly found entry may be something different... 
1314 */ 1315 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1316 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 1317 goto out_abort; 1318 } 1319 1320 /* 1321 * ... and not even negative. 1322 */ 1323 nc_flag = atomic_load_char(&ncp->nc_flag); 1324 if ((nc_flag & NCF_NEGATIVE) == 0) { 1325 goto out_abort; 1326 } 1327 1328 if (!cache_ncp_canuse(ncp)) { 1329 goto out_abort; 1330 } 1331 1332 cache_neg_promote_locked(ncp); 1333 cache_neg_hit_finish(ncp); 1334 vfs_smr_exit(); 1335 mtx_unlock(&nl->nl_lock); 1336 return (true); 1337 out_abort: 1338 vfs_smr_exit(); 1339 mtx_unlock(&nl->nl_lock); 1340 return (false); 1341 } 1342 1343 static void 1344 cache_neg_promote(struct namecache *ncp) 1345 { 1346 struct neglist *nl; 1347 1348 nl = NCP2NEGLIST(ncp); 1349 mtx_lock(&nl->nl_lock); 1350 cache_neg_promote_locked(ncp); 1351 mtx_unlock(&nl->nl_lock); 1352 } 1353 1354 static void 1355 cache_neg_insert(struct namecache *ncp) 1356 { 1357 struct neglist *nl; 1358 1359 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1360 cache_assert_bucket_locked(ncp); 1361 nl = NCP2NEGLIST(ncp); 1362 mtx_lock(&nl->nl_lock); 1363 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 1364 mtx_unlock(&nl->nl_lock); 1365 atomic_add_long(&numneg, 1); 1366 } 1367 1368 static void 1369 cache_neg_remove(struct namecache *ncp) 1370 { 1371 struct neglist *nl; 1372 struct negstate *ns; 1373 1374 cache_assert_bucket_locked(ncp); 1375 nl = NCP2NEGLIST(ncp); 1376 ns = NCP2NEGSTATE(ncp); 1377 mtx_lock(&nl->nl_lock); 1378 if ((ns->neg_flag & NEG_HOT) != 0) { 1379 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 1380 nl->nl_hotnum--; 1381 } else { 1382 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 1383 } 1384 mtx_unlock(&nl->nl_lock); 1385 atomic_subtract_long(&numneg, 1); 1386 } 1387 1388 static struct neglist * 1389 cache_neg_evict_select_list(void) 1390 { 1391 struct neglist *nl; 1392 u_int c; 1393 1394 c = atomic_fetchadd_int(&neg_cycle, 1) + 1; 1395 nl = &neglists[c % numneglists]; 1396 if (!mtx_trylock(&nl->nl_evict_lock)) { 1397 counter_u64_add(neg_evict_skipped_contended, 1); 1398 return (NULL); 1399 } 1400 return (nl); 1401 } 1402 1403 static struct namecache * 1404 cache_neg_evict_select_entry(struct neglist *nl) 1405 { 1406 struct namecache *ncp, *lncp; 1407 struct negstate *ns, *lns; 1408 int i; 1409 1410 mtx_assert(&nl->nl_evict_lock, MA_OWNED); 1411 mtx_assert(&nl->nl_lock, MA_OWNED); 1412 ncp = TAILQ_FIRST(&nl->nl_list); 1413 if (ncp == NULL) 1414 return (NULL); 1415 lncp = ncp; 1416 lns = NCP2NEGSTATE(lncp); 1417 for (i = 1; i < 4; i++) { 1418 ncp = TAILQ_NEXT(ncp, nc_dst); 1419 if (ncp == NULL) 1420 break; 1421 ns = NCP2NEGSTATE(ncp); 1422 if (ns->neg_hit < lns->neg_hit) { 1423 lncp = ncp; 1424 lns = ns; 1425 } 1426 } 1427 return (lncp); 1428 } 1429 1430 static bool 1431 cache_neg_evict(void) 1432 { 1433 struct namecache *ncp, *ncp2; 1434 struct neglist *nl; 1435 struct vnode *dvp; 1436 struct mtx *dvlp; 1437 struct mtx *blp; 1438 uint32_t hash; 1439 u_char nlen; 1440 bool evicted; 1441 1442 nl = cache_neg_evict_select_list(); 1443 if (nl == NULL) { 1444 return (false); 1445 } 1446 1447 mtx_lock(&nl->nl_lock); 1448 ncp = TAILQ_FIRST(&nl->nl_hotlist); 1449 if (ncp != NULL) { 1450 cache_neg_demote_locked(ncp); 1451 } 1452 ncp = cache_neg_evict_select_entry(nl); 1453 if (ncp == NULL) { 1454 counter_u64_add(neg_evict_skipped_empty, 1); 1455 mtx_unlock(&nl->nl_lock); 1456 mtx_unlock(&nl->nl_evict_lock); 1457 return (false); 1458 } 1459 nlen = ncp->nc_nlen; 1460 dvp = ncp->nc_dvp; 1461 hash = cache_get_hash(ncp->nc_name, nlen, dvp); 
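	/*
	 * The fields needed to re-find the entry were snapshotted above. The
	 * vnode and bucket locks are only taken after the neglist locks are
	 * dropped below, as the lock order does not allow acquiring them while
	 * the list locks are held.
	 */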
1462 dvlp = VP2VNODELOCK(dvp); 1463 blp = HASH2BUCKETLOCK(hash); 1464 mtx_unlock(&nl->nl_lock); 1465 mtx_unlock(&nl->nl_evict_lock); 1466 mtx_lock(dvlp); 1467 mtx_lock(blp); 1468 /* 1469 * Note that since all locks were dropped above, the entry may be 1470 * gone or reallocated to be something else. 1471 */ 1472 CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) { 1473 if (ncp2 == ncp && ncp2->nc_dvp == dvp && 1474 ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0) 1475 break; 1476 } 1477 if (ncp2 == NULL) { 1478 counter_u64_add(neg_evict_skipped_missed, 1); 1479 ncp = NULL; 1480 evicted = false; 1481 } else { 1482 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp)); 1483 MPASS(blp == NCP2BUCKETLOCK(ncp)); 1484 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp, 1485 ncp->nc_name); 1486 cache_zap_locked(ncp); 1487 counter_u64_add(neg_evicted, 1); 1488 evicted = true; 1489 } 1490 mtx_unlock(blp); 1491 mtx_unlock(dvlp); 1492 if (ncp != NULL) 1493 cache_free(ncp); 1494 return (evicted); 1495 } 1496 1497 /* 1498 * Maybe evict a negative entry to create more room. 1499 * 1500 * The ncnegfactor parameter limits what fraction of the total count 1501 * can comprise of negative entries. However, if the cache is just 1502 * warming up this leads to excessive evictions. As such, ncnegminpct 1503 * (recomputed to neg_min) dictates whether the above should be 1504 * applied. 1505 * 1506 * Try evicting if the cache is close to full capacity regardless of 1507 * other considerations. 1508 */ 1509 static bool 1510 cache_neg_evict_cond(u_long lnumcache) 1511 { 1512 u_long lnumneg; 1513 1514 if (ncsize - 1000 < lnumcache) 1515 goto out_evict; 1516 lnumneg = atomic_load_long(&numneg); 1517 if (lnumneg < neg_min) 1518 return (false); 1519 if (lnumneg * ncnegfactor < lnumcache) 1520 return (false); 1521 out_evict: 1522 return (cache_neg_evict()); 1523 } 1524 1525 /* 1526 * cache_zap_locked(): 1527 * 1528 * Removes a namecache entry from cache, whether it contains an actual 1529 * pointer to a vnode or if it is just a negative cache entry. 
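 *
 *   The bucket lock, the vnode lock for the directory and (for positive
 *   entries) the vnode lock for the target vnode must be held by the caller.
 *   The entry is only unlinked here; callers free it with cache_free() after
 *   the locks are dropped.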
1530 */ 1531 static void 1532 cache_zap_locked(struct namecache *ncp) 1533 { 1534 struct nchashhead *ncpp; 1535 struct vnode *dvp, *vp; 1536 1537 dvp = ncp->nc_dvp; 1538 vp = ncp->nc_vp; 1539 1540 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1541 cache_assert_vnode_locked(vp); 1542 cache_assert_vnode_locked(dvp); 1543 cache_assert_bucket_locked(ncp); 1544 1545 cache_ncp_invalidate(ncp); 1546 1547 ncpp = NCP2BUCKET(ncp); 1548 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); 1549 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 1550 SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp); 1551 TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst); 1552 if (ncp == vp->v_cache_dd) { 1553 atomic_store_ptr(&vp->v_cache_dd, NULL); 1554 } 1555 } else { 1556 SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name); 1557 cache_neg_remove(ncp); 1558 } 1559 if (ncp->nc_flag & NCF_ISDOTDOT) { 1560 if (ncp == dvp->v_cache_dd) { 1561 atomic_store_ptr(&dvp->v_cache_dd, NULL); 1562 } 1563 } else { 1564 LIST_REMOVE(ncp, nc_src); 1565 if (LIST_EMPTY(&dvp->v_cache_src)) { 1566 ncp->nc_flag |= NCF_DVDROP; 1567 } 1568 } 1569 } 1570 1571 static void 1572 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 1573 { 1574 struct mtx *blp; 1575 1576 MPASS(ncp->nc_dvp == vp); 1577 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1578 cache_assert_vnode_locked(vp); 1579 1580 blp = NCP2BUCKETLOCK(ncp); 1581 mtx_lock(blp); 1582 cache_zap_locked(ncp); 1583 mtx_unlock(blp); 1584 } 1585 1586 static bool 1587 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 1588 struct mtx **vlpp) 1589 { 1590 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1591 struct mtx *blp; 1592 1593 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1594 cache_assert_vnode_locked(vp); 1595 1596 if (ncp->nc_flag & NCF_NEGATIVE) { 1597 if (*vlpp != NULL) { 1598 mtx_unlock(*vlpp); 1599 *vlpp = NULL; 1600 } 1601 cache_zap_negative_locked_vnode_kl(ncp, vp); 1602 return (true); 1603 } 1604 1605 pvlp = VP2VNODELOCK(vp); 1606 blp = NCP2BUCKETLOCK(ncp); 1607 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1608 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1609 1610 if (*vlpp == vlp1 || *vlpp == vlp2) { 1611 to_unlock = *vlpp; 1612 *vlpp = NULL; 1613 } else { 1614 if (*vlpp != NULL) { 1615 mtx_unlock(*vlpp); 1616 *vlpp = NULL; 1617 } 1618 cache_sort_vnodes(&vlp1, &vlp2); 1619 if (vlp1 == pvlp) { 1620 mtx_lock(vlp2); 1621 to_unlock = vlp2; 1622 } else { 1623 if (!mtx_trylock(vlp1)) 1624 goto out_relock; 1625 to_unlock = vlp1; 1626 } 1627 } 1628 mtx_lock(blp); 1629 cache_zap_locked(ncp); 1630 mtx_unlock(blp); 1631 if (to_unlock != NULL) 1632 mtx_unlock(to_unlock); 1633 return (true); 1634 1635 out_relock: 1636 mtx_unlock(vlp2); 1637 mtx_lock(vlp1); 1638 mtx_lock(vlp2); 1639 MPASS(*vlpp == NULL); 1640 *vlpp = vlp1; 1641 return (false); 1642 } 1643 1644 /* 1645 * If trylocking failed we can get here. We know enough to take all needed locks 1646 * in the right order and re-lookup the entry. 
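 *
 * Returns 0 if the entry was found again and zapped, or EAGAIN if it was gone
 * or no longer matched once the locks were reacquired; the caller is expected
 * to retry in the latter case.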
1647 */ 1648 static int 1649 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1650 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1651 struct mtx *blp) 1652 { 1653 struct namecache *rncp; 1654 1655 cache_assert_bucket_unlocked(ncp); 1656 1657 cache_sort_vnodes(&dvlp, &vlp); 1658 cache_lock_vnodes(dvlp, vlp); 1659 mtx_lock(blp); 1660 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1661 if (rncp == ncp && rncp->nc_dvp == dvp && 1662 rncp->nc_nlen == cnp->cn_namelen && 1663 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1664 break; 1665 } 1666 if (rncp != NULL) { 1667 cache_zap_locked(rncp); 1668 mtx_unlock(blp); 1669 cache_unlock_vnodes(dvlp, vlp); 1670 counter_u64_add(zap_bucket_relock_success, 1); 1671 return (0); 1672 } 1673 1674 mtx_unlock(blp); 1675 cache_unlock_vnodes(dvlp, vlp); 1676 return (EAGAIN); 1677 } 1678 1679 static int __noinline 1680 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp, 1681 uint32_t hash, struct mtx *blp) 1682 { 1683 struct mtx *dvlp, *vlp; 1684 struct vnode *dvp; 1685 1686 cache_assert_bucket_locked(ncp); 1687 1688 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1689 vlp = NULL; 1690 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1691 vlp = VP2VNODELOCK(ncp->nc_vp); 1692 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1693 cache_zap_locked(ncp); 1694 mtx_unlock(blp); 1695 cache_unlock_vnodes(dvlp, vlp); 1696 return (0); 1697 } 1698 1699 dvp = ncp->nc_dvp; 1700 mtx_unlock(blp); 1701 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1702 } 1703 1704 static __noinline int 1705 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) 1706 { 1707 struct namecache *ncp; 1708 struct mtx *blp; 1709 struct mtx *dvlp, *dvlp2; 1710 uint32_t hash; 1711 int error; 1712 1713 if (cnp->cn_namelen == 2 && 1714 cnp->cn_nameptr[0] == '.' 
&& cnp->cn_nameptr[1] == '.') { 1715 dvlp = VP2VNODELOCK(dvp); 1716 dvlp2 = NULL; 1717 mtx_lock(dvlp); 1718 retry_dotdot: 1719 ncp = dvp->v_cache_dd; 1720 if (ncp == NULL) { 1721 mtx_unlock(dvlp); 1722 if (dvlp2 != NULL) 1723 mtx_unlock(dvlp2); 1724 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1725 return (0); 1726 } 1727 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1728 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) 1729 goto retry_dotdot; 1730 MPASS(dvp->v_cache_dd == NULL); 1731 mtx_unlock(dvlp); 1732 if (dvlp2 != NULL) 1733 mtx_unlock(dvlp2); 1734 cache_free(ncp); 1735 } else { 1736 atomic_store_ptr(&dvp->v_cache_dd, NULL); 1737 mtx_unlock(dvlp); 1738 if (dvlp2 != NULL) 1739 mtx_unlock(dvlp2); 1740 } 1741 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1742 return (1); 1743 } 1744 1745 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1746 blp = HASH2BUCKETLOCK(hash); 1747 retry: 1748 if (CK_SLIST_EMPTY(NCHHASH(hash))) 1749 goto out_no_entry; 1750 1751 mtx_lock(blp); 1752 1753 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1754 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1755 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1756 break; 1757 } 1758 1759 if (ncp == NULL) { 1760 mtx_unlock(blp); 1761 goto out_no_entry; 1762 } 1763 1764 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1765 if (__predict_false(error != 0)) { 1766 zap_bucket_fail++; 1767 goto retry; 1768 } 1769 counter_u64_add(numposzaps, 1); 1770 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1771 cache_free(ncp); 1772 return (1); 1773 out_no_entry: 1774 counter_u64_add(nummisszap, 1); 1775 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1776 return (0); 1777 } 1778 1779 static int __noinline 1780 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1781 struct timespec *tsp, int *ticksp) 1782 { 1783 int ltype; 1784 1785 *vpp = dvp; 1786 counter_u64_add(dothits, 1); 1787 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1788 if (tsp != NULL) 1789 timespecclear(tsp); 1790 if (ticksp != NULL) 1791 *ticksp = ticks; 1792 vrefact(*vpp); 1793 /* 1794 * When we lookup "." we still can be asked to lock it 1795 * differently... 
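	 * The vnode is the same as dvp and is already locked by the caller, so
	 * at most an upgrade or a downgrade is performed here.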
1796 */ 1797 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1798 if (ltype != VOP_ISLOCKED(*vpp)) { 1799 if (ltype == LK_EXCLUSIVE) { 1800 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1801 if (VN_IS_DOOMED((*vpp))) { 1802 /* forced unmount */ 1803 vrele(*vpp); 1804 *vpp = NULL; 1805 return (ENOENT); 1806 } 1807 } else 1808 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1809 } 1810 return (-1); 1811 } 1812 1813 static int __noinline 1814 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1815 struct timespec *tsp, int *ticksp) 1816 { 1817 struct namecache_ts *ncp_ts; 1818 struct namecache *ncp; 1819 struct mtx *dvlp; 1820 enum vgetstate vs; 1821 int error, ltype; 1822 bool whiteout; 1823 1824 MPASS((cnp->cn_flags & ISDOTDOT) != 0); 1825 1826 if ((cnp->cn_flags & MAKEENTRY) == 0) { 1827 cache_remove_cnp(dvp, cnp); 1828 return (0); 1829 } 1830 1831 counter_u64_add(dotdothits, 1); 1832 retry: 1833 dvlp = VP2VNODELOCK(dvp); 1834 mtx_lock(dvlp); 1835 ncp = dvp->v_cache_dd; 1836 if (ncp == NULL) { 1837 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, ".."); 1838 mtx_unlock(dvlp); 1839 return (0); 1840 } 1841 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1842 if (ncp->nc_flag & NCF_NEGATIVE) 1843 *vpp = NULL; 1844 else 1845 *vpp = ncp->nc_vp; 1846 } else 1847 *vpp = ncp->nc_dvp; 1848 if (*vpp == NULL) 1849 goto negative_success; 1850 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); 1851 cache_out_ts(ncp, tsp, ticksp); 1852 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1853 NCF_DTS && tsp != NULL) { 1854 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1855 *tsp = ncp_ts->nc_dotdottime; 1856 } 1857 1858 MPASS(dvp != *vpp); 1859 ltype = VOP_ISLOCKED(dvp); 1860 VOP_UNLOCK(dvp); 1861 vs = vget_prep(*vpp); 1862 mtx_unlock(dvlp); 1863 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1864 vn_lock(dvp, ltype | LK_RETRY); 1865 if (VN_IS_DOOMED(dvp)) { 1866 if (error == 0) 1867 vput(*vpp); 1868 *vpp = NULL; 1869 return (ENOENT); 1870 } 1871 if (error) { 1872 *vpp = NULL; 1873 goto retry; 1874 } 1875 return (-1); 1876 negative_success: 1877 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1878 if (cnp->cn_flags & ISLASTCN) { 1879 counter_u64_add(numnegzaps, 1); 1880 cache_zap_negative_locked_vnode_kl(ncp, dvp); 1881 mtx_unlock(dvlp); 1882 cache_free(ncp); 1883 return (0); 1884 } 1885 } 1886 1887 whiteout = (ncp->nc_flag & NCF_WHITE); 1888 cache_out_ts(ncp, tsp, ticksp); 1889 if (cache_neg_hit_prep(ncp)) 1890 cache_neg_promote(ncp); 1891 else 1892 cache_neg_hit_finish(ncp); 1893 mtx_unlock(dvlp); 1894 if (whiteout) 1895 cnp->cn_flags |= ISWHITEOUT; 1896 return (ENOENT); 1897 } 1898 1899 /** 1900 * Lookup a name in the name cache 1901 * 1902 * # Arguments 1903 * 1904 * - dvp: Parent directory in which to search. 1905 * - vpp: Return argument. Will contain desired vnode on cache hit. 1906 * - cnp: Parameters of the name search. The most interesting bits of 1907 * the cn_flags field have the following meanings: 1908 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1909 * it up. 1910 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1911 * - tsp: Return storage for cache timestamp. On a successful (positive 1912 * or negative) lookup, tsp will be filled with any timespec that 1913 * was stored when this cache entry was created. However, it will 1914 * be clear for "." entries. 1915 * - ticks: Return storage for alternate cache timestamp. 
On a successful 1916 * (positive or negative) lookup, it will contain the ticks value 1917 * that was current when the cache entry was created, unless cnp 1918 * was ".". 1919 * 1920 * Either both tsp and ticks have to be provided or neither of them. 1921 * 1922 * # Returns 1923 * 1924 * - -1: A positive cache hit. vpp will contain the desired vnode. 1925 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1926 * to a forced unmount. vpp will not be modified. If the entry 1927 * is a whiteout, then the ISWHITEOUT flag will be set in 1928 * cnp->cn_flags. 1929 * - 0: A cache miss. vpp will not be modified. 1930 * 1931 * # Locking 1932 * 1933 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1934 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1935 * lock is not recursively acquired. 1936 */ 1937 static int __noinline 1938 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1939 struct timespec *tsp, int *ticksp) 1940 { 1941 struct namecache *ncp; 1942 struct mtx *blp; 1943 uint32_t hash; 1944 enum vgetstate vs; 1945 int error; 1946 bool whiteout; 1947 1948 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 1949 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0); 1950 1951 retry: 1952 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1953 blp = HASH2BUCKETLOCK(hash); 1954 mtx_lock(blp); 1955 1956 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1957 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1958 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1959 break; 1960 } 1961 1962 if (__predict_false(ncp == NULL)) { 1963 mtx_unlock(blp); 1964 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); 1965 counter_u64_add(nummiss, 1); 1966 return (0); 1967 } 1968 1969 if (ncp->nc_flag & NCF_NEGATIVE) 1970 goto negative_success; 1971 1972 counter_u64_add(numposhits, 1); 1973 *vpp = ncp->nc_vp; 1974 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1975 cache_out_ts(ncp, tsp, ticksp); 1976 MPASS(dvp != *vpp); 1977 vs = vget_prep(*vpp); 1978 mtx_unlock(blp); 1979 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1980 if (error) { 1981 *vpp = NULL; 1982 goto retry; 1983 } 1984 return (-1); 1985 negative_success: 1986 /* 1987 * We don't get here with regular lookup apart from corner cases. 
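 * (e.g. the lockless variant in cache_lookup bailed on this entry, or this is
 * a CREATE of the last path component and the negative entry is about to be
 * zapped)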
1988 */ 1989 if (__predict_true(cnp->cn_nameiop == CREATE)) { 1990 if (cnp->cn_flags & ISLASTCN) { 1991 counter_u64_add(numnegzaps, 1); 1992 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1993 if (__predict_false(error != 0)) { 1994 zap_bucket_fail2++; 1995 goto retry; 1996 } 1997 cache_free(ncp); 1998 return (0); 1999 } 2000 } 2001 2002 whiteout = (ncp->nc_flag & NCF_WHITE); 2003 cache_out_ts(ncp, tsp, ticksp); 2004 if (cache_neg_hit_prep(ncp)) 2005 cache_neg_promote(ncp); 2006 else 2007 cache_neg_hit_finish(ncp); 2008 mtx_unlock(blp); 2009 if (whiteout) 2010 cnp->cn_flags |= ISWHITEOUT; 2011 return (ENOENT); 2012 } 2013 2014 int 2015 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 2016 struct timespec *tsp, int *ticksp) 2017 { 2018 struct namecache *ncp; 2019 uint32_t hash; 2020 enum vgetstate vs; 2021 int error; 2022 bool whiteout, neg_promote; 2023 u_short nc_flag; 2024 2025 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); 2026 2027 #ifdef DEBUG_CACHE 2028 if (__predict_false(!doingcache)) { 2029 cnp->cn_flags &= ~MAKEENTRY; 2030 return (0); 2031 } 2032 #endif 2033 2034 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2035 if (cnp->cn_namelen == 1) 2036 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 2037 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') 2038 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); 2039 } 2040 2041 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 2042 2043 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { 2044 cache_remove_cnp(dvp, cnp); 2045 return (0); 2046 } 2047 2048 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2049 vfs_smr_enter(); 2050 2051 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 2052 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 2053 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 2054 break; 2055 } 2056 2057 if (__predict_false(ncp == NULL)) { 2058 vfs_smr_exit(); 2059 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); 2060 counter_u64_add(nummiss, 1); 2061 return (0); 2062 } 2063 2064 nc_flag = atomic_load_char(&ncp->nc_flag); 2065 if (nc_flag & NCF_NEGATIVE) 2066 goto negative_success; 2067 2068 counter_u64_add(numposhits, 1); 2069 *vpp = ncp->nc_vp; 2070 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 2071 cache_out_ts(ncp, tsp, ticksp); 2072 MPASS(dvp != *vpp); 2073 if (!cache_ncp_canuse(ncp)) { 2074 vfs_smr_exit(); 2075 *vpp = NULL; 2076 goto out_fallback; 2077 } 2078 vs = vget_prep_smr(*vpp); 2079 vfs_smr_exit(); 2080 if (__predict_false(vs == VGET_NONE)) { 2081 *vpp = NULL; 2082 goto out_fallback; 2083 } 2084 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 2085 if (error) { 2086 *vpp = NULL; 2087 goto out_fallback; 2088 } 2089 return (-1); 2090 negative_success: 2091 if (cnp->cn_nameiop == CREATE) { 2092 if (cnp->cn_flags & ISLASTCN) { 2093 vfs_smr_exit(); 2094 goto out_fallback; 2095 } 2096 } 2097 2098 cache_out_ts(ncp, tsp, ticksp); 2099 whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE); 2100 neg_promote = cache_neg_hit_prep(ncp); 2101 if (!cache_ncp_canuse(ncp)) { 2102 cache_neg_hit_abort(ncp); 2103 vfs_smr_exit(); 2104 goto out_fallback; 2105 } 2106 if (neg_promote) { 2107 vfs_smr_exit(); 2108 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) 2109 goto out_fallback; 2110 } else { 2111 cache_neg_hit_finish(ncp); 2112 vfs_smr_exit(); 2113 } 2114 if (whiteout) 2115 cnp->cn_flags |= ISWHITEOUT; 2116 return (ENOENT); 2117 out_fallback: 2118 return (cache_lookup_fallback(dvp, 
vpp, cnp, tsp, ticksp)); 2119 } 2120 2121 struct celockstate { 2122 struct mtx *vlp[3]; 2123 struct mtx *blp[2]; 2124 }; 2125 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 2126 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 2127 2128 static inline void 2129 cache_celockstate_init(struct celockstate *cel) 2130 { 2131 2132 bzero(cel, sizeof(*cel)); 2133 } 2134 2135 static void 2136 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 2137 struct vnode *dvp) 2138 { 2139 struct mtx *vlp1, *vlp2; 2140 2141 MPASS(cel->vlp[0] == NULL); 2142 MPASS(cel->vlp[1] == NULL); 2143 MPASS(cel->vlp[2] == NULL); 2144 2145 MPASS(vp != NULL || dvp != NULL); 2146 2147 vlp1 = VP2VNODELOCK(vp); 2148 vlp2 = VP2VNODELOCK(dvp); 2149 cache_sort_vnodes(&vlp1, &vlp2); 2150 2151 if (vlp1 != NULL) { 2152 mtx_lock(vlp1); 2153 cel->vlp[0] = vlp1; 2154 } 2155 mtx_lock(vlp2); 2156 cel->vlp[1] = vlp2; 2157 } 2158 2159 static void 2160 cache_unlock_vnodes_cel(struct celockstate *cel) 2161 { 2162 2163 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 2164 2165 if (cel->vlp[0] != NULL) 2166 mtx_unlock(cel->vlp[0]); 2167 if (cel->vlp[1] != NULL) 2168 mtx_unlock(cel->vlp[1]); 2169 if (cel->vlp[2] != NULL) 2170 mtx_unlock(cel->vlp[2]); 2171 } 2172 2173 static bool 2174 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 2175 { 2176 struct mtx *vlp; 2177 bool ret; 2178 2179 cache_assert_vlp_locked(cel->vlp[0]); 2180 cache_assert_vlp_locked(cel->vlp[1]); 2181 MPASS(cel->vlp[2] == NULL); 2182 2183 MPASS(vp != NULL); 2184 vlp = VP2VNODELOCK(vp); 2185 2186 ret = true; 2187 if (vlp >= cel->vlp[1]) { 2188 mtx_lock(vlp); 2189 } else { 2190 if (mtx_trylock(vlp)) 2191 goto out; 2192 cache_lock_vnodes_cel_3_failures++; 2193 cache_unlock_vnodes_cel(cel); 2194 if (vlp < cel->vlp[0]) { 2195 mtx_lock(vlp); 2196 mtx_lock(cel->vlp[0]); 2197 mtx_lock(cel->vlp[1]); 2198 } else { 2199 if (cel->vlp[0] != NULL) 2200 mtx_lock(cel->vlp[0]); 2201 mtx_lock(vlp); 2202 mtx_lock(cel->vlp[1]); 2203 } 2204 ret = false; 2205 } 2206 out: 2207 cel->vlp[2] = vlp; 2208 return (ret); 2209 } 2210 2211 static void 2212 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 2213 struct mtx *blp2) 2214 { 2215 2216 MPASS(cel->blp[0] == NULL); 2217 MPASS(cel->blp[1] == NULL); 2218 2219 cache_sort_vnodes(&blp1, &blp2); 2220 2221 if (blp1 != NULL) { 2222 mtx_lock(blp1); 2223 cel->blp[0] = blp1; 2224 } 2225 mtx_lock(blp2); 2226 cel->blp[1] = blp2; 2227 } 2228 2229 static void 2230 cache_unlock_buckets_cel(struct celockstate *cel) 2231 { 2232 2233 if (cel->blp[0] != NULL) 2234 mtx_unlock(cel->blp[0]); 2235 mtx_unlock(cel->blp[1]); 2236 } 2237 2238 /* 2239 * Lock part of the cache affected by the insertion. 2240 * 2241 * This means vnodelocks for dvp, vp and the relevant bucketlock. 2242 * However, insertion can result in removal of an old entry. In this 2243 * case we have an additional vnode and bucketlock pair to lock. 2244 * 2245 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 2246 * preserving the locking order (smaller address first). 
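 *
 * For illustration (an assumed worst case, not a verbatim trace): adding a name
 * in dvp which resolves to a directory vp whose old ".." entry has to be
 * evicted means taking the vnode locks of dvp, vp and the old ".." target
 * (sorted by address), followed by the bucket lock of the new entry and the
 * bucket lock of the evicted one (also sorted).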
2247 */ 2248 static void 2249 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2250 uint32_t hash) 2251 { 2252 struct namecache *ncp; 2253 struct mtx *blps[2]; 2254 u_char nc_flag; 2255 2256 blps[0] = HASH2BUCKETLOCK(hash); 2257 for (;;) { 2258 blps[1] = NULL; 2259 cache_lock_vnodes_cel(cel, dvp, vp); 2260 if (vp == NULL || vp->v_type != VDIR) 2261 break; 2262 ncp = atomic_load_consume_ptr(&vp->v_cache_dd); 2263 if (ncp == NULL) 2264 break; 2265 nc_flag = atomic_load_char(&ncp->nc_flag); 2266 if ((nc_flag & NCF_ISDOTDOT) == 0) 2267 break; 2268 MPASS(ncp->nc_dvp == vp); 2269 blps[1] = NCP2BUCKETLOCK(ncp); 2270 if ((nc_flag & NCF_NEGATIVE) != 0) 2271 break; 2272 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2273 break; 2274 /* 2275 * All vnodes got re-locked. Re-validate the state and if 2276 * nothing changed we are done. Otherwise restart. 2277 */ 2278 if (ncp == vp->v_cache_dd && 2279 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2280 blps[1] == NCP2BUCKETLOCK(ncp) && 2281 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2282 break; 2283 cache_unlock_vnodes_cel(cel); 2284 cel->vlp[0] = NULL; 2285 cel->vlp[1] = NULL; 2286 cel->vlp[2] = NULL; 2287 } 2288 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2289 } 2290 2291 static void 2292 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2293 uint32_t hash) 2294 { 2295 struct namecache *ncp; 2296 struct mtx *blps[2]; 2297 u_char nc_flag; 2298 2299 blps[0] = HASH2BUCKETLOCK(hash); 2300 for (;;) { 2301 blps[1] = NULL; 2302 cache_lock_vnodes_cel(cel, dvp, vp); 2303 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); 2304 if (ncp == NULL) 2305 break; 2306 nc_flag = atomic_load_char(&ncp->nc_flag); 2307 if ((nc_flag & NCF_ISDOTDOT) == 0) 2308 break; 2309 MPASS(ncp->nc_dvp == dvp); 2310 blps[1] = NCP2BUCKETLOCK(ncp); 2311 if ((nc_flag & NCF_NEGATIVE) != 0) 2312 break; 2313 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2314 break; 2315 if (ncp == dvp->v_cache_dd && 2316 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2317 blps[1] == NCP2BUCKETLOCK(ncp) && 2318 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2319 break; 2320 cache_unlock_vnodes_cel(cel); 2321 cel->vlp[0] = NULL; 2322 cel->vlp[1] = NULL; 2323 cel->vlp[2] = NULL; 2324 } 2325 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2326 } 2327 2328 static void 2329 cache_enter_unlock(struct celockstate *cel) 2330 { 2331 2332 cache_unlock_buckets_cel(cel); 2333 cache_unlock_vnodes_cel(cel); 2334 } 2335 2336 static void __noinline 2337 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 2338 struct componentname *cnp) 2339 { 2340 struct celockstate cel; 2341 struct namecache *ncp; 2342 uint32_t hash; 2343 int len; 2344 2345 if (atomic_load_ptr(&dvp->v_cache_dd) == NULL) 2346 return; 2347 len = cnp->cn_namelen; 2348 cache_celockstate_init(&cel); 2349 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2350 cache_enter_lock_dd(&cel, dvp, vp, hash); 2351 ncp = dvp->v_cache_dd; 2352 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 2353 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 2354 cache_zap_locked(ncp); 2355 } else { 2356 ncp = NULL; 2357 } 2358 atomic_store_ptr(&dvp->v_cache_dd, NULL); 2359 cache_enter_unlock(&cel); 2360 if (ncp != NULL) 2361 cache_free(ncp); 2362 } 2363 2364 /* 2365 * Add an entry to the cache. 
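 *
 * A minimal usage sketch (assuming a ufs-like VOP_CACHEDLOOKUP implementation;
 * variable names are illustrative):
 *
 *	if (cnp->cn_flags & MAKEENTRY)
 *		cache_enter(vdp, *vpp, cnp);
 *
 * where cache_enter() is the convenience wrapper passing NULL timestamps to
 * cache_enter_time() below.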
2366 */ 2367 void 2368 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2369 struct timespec *tsp, struct timespec *dtsp) 2370 { 2371 struct celockstate cel; 2372 struct namecache *ncp, *n2, *ndd; 2373 struct namecache_ts *ncp_ts; 2374 struct nchashhead *ncpp; 2375 uint32_t hash; 2376 int flag; 2377 int len; 2378 2379 KASSERT(cnp->cn_namelen <= NAME_MAX, 2380 ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen, 2381 NAME_MAX)); 2382 VNPASS(!VN_IS_DOOMED(dvp), dvp); 2383 VNPASS(dvp->v_type != VNON, dvp); 2384 if (vp != NULL) { 2385 VNPASS(!VN_IS_DOOMED(vp), vp); 2386 VNPASS(vp->v_type != VNON, vp); 2387 } 2388 if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { 2389 KASSERT(dvp == vp, 2390 ("%s: different vnodes for dot entry (%p; %p)\n", __func__, 2391 dvp, vp)); 2392 } else { 2393 KASSERT(dvp != vp, 2394 ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__, 2395 cnp->cn_nameptr, dvp)); 2396 } 2397 2398 #ifdef DEBUG_CACHE 2399 if (__predict_false(!doingcache)) 2400 return; 2401 #endif 2402 2403 flag = 0; 2404 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2405 if (cnp->cn_namelen == 1) 2406 return; 2407 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 2408 cache_enter_dotdot_prep(dvp, vp, cnp); 2409 flag = NCF_ISDOTDOT; 2410 } 2411 } 2412 2413 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 2414 if (ncp == NULL) 2415 return; 2416 2417 cache_celockstate_init(&cel); 2418 ndd = NULL; 2419 ncp_ts = NULL; 2420 2421 /* 2422 * Calculate the hash key and setup as much of the new 2423 * namecache entry as possible before acquiring the lock. 2424 */ 2425 ncp->nc_flag = flag | NCF_WIP; 2426 ncp->nc_vp = vp; 2427 if (vp == NULL) 2428 cache_neg_init(ncp); 2429 ncp->nc_dvp = dvp; 2430 if (tsp != NULL) { 2431 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 2432 ncp_ts->nc_time = *tsp; 2433 ncp_ts->nc_ticks = ticks; 2434 ncp_ts->nc_nc.nc_flag |= NCF_TS; 2435 if (dtsp != NULL) { 2436 ncp_ts->nc_dotdottime = *dtsp; 2437 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 2438 } 2439 } 2440 len = ncp->nc_nlen = cnp->cn_namelen; 2441 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2442 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 2443 ncp->nc_name[len] = '\0'; 2444 cache_enter_lock(&cel, dvp, vp, hash); 2445 2446 /* 2447 * See if this vnode or negative entry is already in the cache 2448 * with this name. This can happen with concurrent lookups of 2449 * the same path name. 2450 */ 2451 ncpp = NCHHASH(hash); 2452 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 2453 if (n2->nc_dvp == dvp && 2454 n2->nc_nlen == cnp->cn_namelen && 2455 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 2456 MPASS(cache_ncp_canuse(n2)); 2457 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 2458 KASSERT(vp == NULL, 2459 ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]", 2460 __func__, NULL, vp, cnp->cn_nameptr)); 2461 else 2462 KASSERT(n2->nc_vp == vp, 2463 ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]", 2464 __func__, n2->nc_vp, vp, cnp->cn_nameptr)); 2465 /* 2466 * Entries are supposed to be immutable unless in the 2467 * process of getting destroyed. Accommodating for 2468 * changing timestamps is possible but not worth it. 2469 * This should be harmless in terms of correctness, in 2470 * the worst case resulting in an earlier expiration. 2471 * Alternatively, the found entry can be replaced 2472 * altogether. 
2473 */ 2474 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 2475 #if 0 2476 if (tsp != NULL) { 2477 KASSERT((n2->nc_flag & NCF_TS) != 0, 2478 ("no NCF_TS")); 2479 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 2480 n2_ts->nc_time = ncp_ts->nc_time; 2481 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2482 if (dtsp != NULL) { 2483 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2484 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2485 } 2486 } 2487 #endif 2488 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, 2489 vp); 2490 goto out_unlock_free; 2491 } 2492 } 2493 2494 if (flag == NCF_ISDOTDOT) { 2495 /* 2496 * See if we are trying to add .. entry, but some other lookup 2497 * has populated v_cache_dd pointer already. 2498 */ 2499 if (dvp->v_cache_dd != NULL) 2500 goto out_unlock_free; 2501 KASSERT(vp == NULL || vp->v_type == VDIR, 2502 ("wrong vnode type %p", vp)); 2503 atomic_thread_fence_rel(); 2504 atomic_store_ptr(&dvp->v_cache_dd, ncp); 2505 } 2506 2507 if (vp != NULL) { 2508 if (flag != NCF_ISDOTDOT) { 2509 /* 2510 * For this case, the cache entry maps both the 2511 * directory name in it and the name ".." for the 2512 * directory's parent. 2513 */ 2514 if ((ndd = vp->v_cache_dd) != NULL) { 2515 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2516 cache_zap_locked(ndd); 2517 else 2518 ndd = NULL; 2519 } 2520 atomic_thread_fence_rel(); 2521 atomic_store_ptr(&vp->v_cache_dd, ncp); 2522 } else if (vp->v_type != VDIR) { 2523 if (vp->v_cache_dd != NULL) { 2524 atomic_store_ptr(&vp->v_cache_dd, NULL); 2525 } 2526 } 2527 } 2528 2529 if (flag != NCF_ISDOTDOT) { 2530 if (LIST_EMPTY(&dvp->v_cache_src)) { 2531 cache_hold_vnode(dvp); 2532 } 2533 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2534 } 2535 2536 /* 2537 * If the entry is "negative", we place it into the 2538 * "negative" cache queue, otherwise, we place it into the 2539 * destination vnode's cache entries queue. 2540 */ 2541 if (vp != NULL) { 2542 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2543 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2544 vp); 2545 } else { 2546 if (cnp->cn_flags & ISWHITEOUT) 2547 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE); 2548 cache_neg_insert(ncp); 2549 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2550 ncp->nc_name); 2551 } 2552 2553 /* 2554 * Insert the new namecache entry into the appropriate chain 2555 * within the cache entries table. 2556 */ 2557 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2558 2559 atomic_thread_fence_rel(); 2560 /* 2561 * Mark the entry as fully constructed. 2562 * It is immutable past this point until its removal. 2563 */ 2564 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2565 2566 cache_enter_unlock(&cel); 2567 if (ndd != NULL) 2568 cache_free(ndd); 2569 return; 2570 out_unlock_free: 2571 cache_enter_unlock(&cel); 2572 cache_free(ncp); 2573 return; 2574 } 2575 2576 /* 2577 * A variant of the above accepting flags. 2578 * 2579 * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it. 2580 * 2581 * TODO: this routine is a hack. It blindly removes the old entry, even if it 2582 * happens to match and it is doing it in an inefficient manner. It was added 2583 * to accommodate NFS which runs into a case where the target for a given name 2584 * may change from under it. Note this does nothing to solve the following 2585 * race: 2 callers of cache_enter_time_flags pass a different target vnode for 2586 * the same [dvp, cnp]. It may be argued that code doing this is broken. 
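 *
 * Hedged usage example (modeled on the NFS case mentioned above; the timestamp
 * variable is illustrative):
 *
 *	cache_enter_time_flags(dvp, newvp, cnp, &ts, NULL, VFS_CACHE_DROPOLD);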
2587 */ 2588 void 2589 cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2590 struct timespec *tsp, struct timespec *dtsp, int flags) 2591 { 2592 2593 MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0); 2594 2595 if (flags & VFS_CACHE_DROPOLD) 2596 cache_remove_cnp(dvp, cnp); 2597 cache_enter_time(dvp, vp, cnp, tsp, dtsp); 2598 } 2599 2600 static u_long 2601 cache_roundup_2(u_long val) 2602 { 2603 u_long res; 2604 2605 for (res = 1; res <= val; res <<= 1) 2606 continue; 2607 2608 return (res); 2609 } 2610 2611 static struct nchashhead * 2612 nchinittbl(u_long elements, u_long *hashmask) 2613 { 2614 struct nchashhead *hashtbl; 2615 u_long hashsize, i; 2616 2617 hashsize = cache_roundup_2(elements) / 2; 2618 2619 hashtbl = malloc(hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2620 for (i = 0; i < hashsize; i++) 2621 CK_SLIST_INIT(&hashtbl[i]); 2622 *hashmask = hashsize - 1; 2623 return (hashtbl); 2624 } 2625 2626 static void 2627 ncfreetbl(struct nchashhead *hashtbl) 2628 { 2629 2630 free(hashtbl, M_VFSCACHE); 2631 } 2632 2633 /* 2634 * Name cache initialization, from vfs_init() when we are booting 2635 */ 2636 static void 2637 nchinit(void *dummy __unused) 2638 { 2639 u_int i; 2640 2641 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2642 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2643 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2644 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2645 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2646 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2647 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2648 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2649 2650 VFS_SMR_ZONE_SET(cache_zone_small); 2651 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2652 VFS_SMR_ZONE_SET(cache_zone_large); 2653 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2654 2655 ncsize = desiredvnodes * ncsizefactor; 2656 cache_recalc_neg_min(ncnegminpct); 2657 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2658 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2659 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2660 ncbuckethash = 7; 2661 if (ncbuckethash > nchash) 2662 ncbuckethash = nchash; 2663 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2664 M_WAITOK | M_ZERO); 2665 for (i = 0; i < numbucketlocks; i++) 2666 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2667 ncvnodehash = ncbuckethash; 2668 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2669 M_WAITOK | M_ZERO); 2670 for (i = 0; i < numvnodelocks; i++) 2671 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2672 2673 for (i = 0; i < numneglists; i++) { 2674 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); 2675 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2676 TAILQ_INIT(&neglists[i].nl_list); 2677 TAILQ_INIT(&neglists[i].nl_hotlist); 2678 } 2679 } 2680 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2681 2682 void 2683 cache_vnode_init(struct vnode *vp) 2684 { 2685 2686 LIST_INIT(&vp->v_cache_src); 2687 TAILQ_INIT(&vp->v_cache_dst); 2688 vp->v_cache_dd = NULL; 2689 cache_prehash(vp); 2690 } 2691 2692 /* 2693 * Induce transient cache misses for lockless operation in cache_lookup() by 2694 * using a temporary hash table. 2695 * 2696 * This will force a fs lookup. 
2697 * 2698 * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time 2699 * to observe all CPUs not performing the lookup. 2700 */ 2701 static void 2702 cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash) 2703 { 2704 2705 MPASS(temphash < nchash); 2706 /* 2707 * Change the size. The new size is smaller and can safely be used 2708 * against the existing table. All lookups which now hash wrong will 2709 * result in a cache miss, which all callers are supposed to know how 2710 * to handle. 2711 */ 2712 atomic_store_long(&nchash, temphash); 2713 atomic_thread_fence_rel(); 2714 vfs_smr_synchronize(); 2715 /* 2716 * At this point everyone sees the updated hash value, but they still 2717 * see the old table. 2718 */ 2719 atomic_store_ptr(&nchashtbl, temptbl); 2720 atomic_thread_fence_rel(); 2721 vfs_smr_synchronize(); 2722 /* 2723 * At this point everyone sees the updated table pointer and size pair. 2724 */ 2725 } 2726 2727 /* 2728 * Set the new hash table. 2729 * 2730 * Similarly to cache_changesize_set_temp(), this has to synchronize against 2731 * lockless operation in cache_lookup(). 2732 */ 2733 static void 2734 cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash) 2735 { 2736 2737 MPASS(nchash < new_hash); 2738 /* 2739 * Change the pointer first. This wont result in out of bounds access 2740 * since the temporary table is guaranteed to be smaller. 2741 */ 2742 atomic_store_ptr(&nchashtbl, new_tbl); 2743 atomic_thread_fence_rel(); 2744 vfs_smr_synchronize(); 2745 /* 2746 * At this point everyone sees the updated pointer value, but they 2747 * still see the old size. 2748 */ 2749 atomic_store_long(&nchash, new_hash); 2750 atomic_thread_fence_rel(); 2751 vfs_smr_synchronize(); 2752 /* 2753 * At this point everyone sees the updated table pointer and size pair. 2754 */ 2755 } 2756 2757 void 2758 cache_changesize(u_long newmaxvnodes) 2759 { 2760 struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl; 2761 u_long new_nchash, old_nchash, temphash; 2762 struct namecache *ncp; 2763 uint32_t hash; 2764 u_long newncsize; 2765 u_long i; 2766 2767 newncsize = newmaxvnodes * ncsizefactor; 2768 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2769 if (newmaxvnodes < numbucketlocks) 2770 newmaxvnodes = numbucketlocks; 2771 2772 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2773 /* If same hash table size, nothing to do */ 2774 if (nchash == new_nchash) { 2775 ncfreetbl(new_nchashtbl); 2776 return; 2777 } 2778 2779 temptbl = nchinittbl(1, &temphash); 2780 2781 /* 2782 * Move everything from the old hash table to the new table. 2783 * None of the namecache entries in the table can be removed 2784 * because to do so, they have to be removed from the hash table. 
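 * All vnode locks and all bucket locks are held across the move, and lockless
 * lookups are diverted to the temporary table first, so concurrent lookups can
 * only miss while this runs.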
2785 */ 2786 cache_lock_all_vnodes(); 2787 cache_lock_all_buckets(); 2788 old_nchashtbl = nchashtbl; 2789 old_nchash = nchash; 2790 cache_changesize_set_temp(temptbl, temphash); 2791 for (i = 0; i <= old_nchash; i++) { 2792 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2793 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2794 ncp->nc_dvp); 2795 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2796 CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash); 2797 } 2798 } 2799 ncsize = newncsize; 2800 cache_recalc_neg_min(ncnegminpct); 2801 cache_changesize_set_new(new_nchashtbl, new_nchash); 2802 cache_unlock_all_buckets(); 2803 cache_unlock_all_vnodes(); 2804 ncfreetbl(old_nchashtbl); 2805 ncfreetbl(temptbl); 2806 } 2807 2808 /* 2809 * Remove all entries from and to a particular vnode. 2810 */ 2811 static void 2812 cache_purge_impl(struct vnode *vp) 2813 { 2814 struct cache_freebatch batch; 2815 struct namecache *ncp; 2816 struct mtx *vlp, *vlp2; 2817 2818 TAILQ_INIT(&batch); 2819 vlp = VP2VNODELOCK(vp); 2820 vlp2 = NULL; 2821 mtx_lock(vlp); 2822 retry: 2823 while (!LIST_EMPTY(&vp->v_cache_src)) { 2824 ncp = LIST_FIRST(&vp->v_cache_src); 2825 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2826 goto retry; 2827 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2828 } 2829 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2830 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2831 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2832 goto retry; 2833 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2834 } 2835 ncp = vp->v_cache_dd; 2836 if (ncp != NULL) { 2837 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2838 ("lost dotdot link")); 2839 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2840 goto retry; 2841 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2842 } 2843 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2844 mtx_unlock(vlp); 2845 if (vlp2 != NULL) 2846 mtx_unlock(vlp2); 2847 cache_free_batch(&batch); 2848 } 2849 2850 /* 2851 * Opportunistic check to see if there is anything to do. 2852 */ 2853 static bool 2854 cache_has_entries(struct vnode *vp) 2855 { 2856 2857 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2858 atomic_load_ptr(&vp->v_cache_dd) == NULL) 2859 return (false); 2860 return (true); 2861 } 2862 2863 void 2864 cache_purge(struct vnode *vp) 2865 { 2866 2867 SDT_PROBE1(vfs, namecache, purge, done, vp); 2868 if (!cache_has_entries(vp)) 2869 return; 2870 cache_purge_impl(vp); 2871 } 2872 2873 /* 2874 * Only to be used by vgone. 2875 */ 2876 void 2877 cache_purge_vgone(struct vnode *vp) 2878 { 2879 struct mtx *vlp; 2880 2881 VNPASS(VN_IS_DOOMED(vp), vp); 2882 if (cache_has_entries(vp)) { 2883 cache_purge_impl(vp); 2884 return; 2885 } 2886 2887 /* 2888 * Serialize against a potential thread doing cache_purge. 2889 */ 2890 vlp = VP2VNODELOCK(vp); 2891 mtx_wait_unlocked(vlp); 2892 if (cache_has_entries(vp)) { 2893 cache_purge_impl(vp); 2894 return; 2895 } 2896 return; 2897 } 2898 2899 /* 2900 * Remove all negative entries for a particular directory vnode. 
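 * (e.g. a filesystem may call this after modifying the directory in a way that
 * could leave stale negative entries behind, instead of tracking the affected
 * names individually)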
2901 */ 2902 void 2903 cache_purge_negative(struct vnode *vp) 2904 { 2905 struct cache_freebatch batch; 2906 struct namecache *ncp, *nnp; 2907 struct mtx *vlp; 2908 2909 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2910 if (LIST_EMPTY(&vp->v_cache_src)) 2911 return; 2912 TAILQ_INIT(&batch); 2913 vlp = VP2VNODELOCK(vp); 2914 mtx_lock(vlp); 2915 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2916 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2917 continue; 2918 cache_zap_negative_locked_vnode_kl(ncp, vp); 2919 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2920 } 2921 mtx_unlock(vlp); 2922 cache_free_batch(&batch); 2923 } 2924 2925 /* 2926 * Entry points for modifying VOP operations. 2927 */ 2928 void 2929 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2930 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2931 { 2932 2933 ASSERT_VOP_IN_SEQC(fdvp); 2934 ASSERT_VOP_IN_SEQC(fvp); 2935 ASSERT_VOP_IN_SEQC(tdvp); 2936 if (tvp != NULL) 2937 ASSERT_VOP_IN_SEQC(tvp); 2938 2939 cache_purge(fvp); 2940 if (tvp != NULL) { 2941 cache_purge(tvp); 2942 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2943 ("%s: lingering negative entry", __func__)); 2944 } else { 2945 cache_remove_cnp(tdvp, tcnp); 2946 } 2947 2948 /* 2949 * TODO 2950 * 2951 * Historically renaming always purged all relevant entries, 2952 * but that's quite wasteful. In particular it turns out that in many cases 2953 * the target file is immediately accessed after rename, inducing a cache 2954 * miss. 2955 * 2956 * Recode this to reduce relocking and reuse the existing entry (if any) 2957 * instead of just removing it above and allocating a new one here. 2958 */ 2959 cache_enter(tdvp, fvp, tcnp); 2960 } 2961 2962 void 2963 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) 2964 { 2965 2966 ASSERT_VOP_IN_SEQC(dvp); 2967 ASSERT_VOP_IN_SEQC(vp); 2968 cache_purge(vp); 2969 } 2970 2971 #ifdef INVARIANTS 2972 /* 2973 * Validate that if an entry exists it matches. 2974 */ 2975 void 2976 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) 2977 { 2978 struct namecache *ncp; 2979 struct mtx *blp; 2980 uint32_t hash; 2981 2982 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2983 if (CK_SLIST_EMPTY(NCHHASH(hash))) 2984 return; 2985 blp = HASH2BUCKETLOCK(hash); 2986 mtx_lock(blp); 2987 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 2988 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 2989 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) { 2990 if (ncp->nc_vp != vp) 2991 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n", 2992 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp); 2993 } 2994 } 2995 mtx_unlock(blp); 2996 } 2997 2998 void 2999 cache_assert_no_entries(struct vnode *vp) 3000 { 3001 3002 VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp); 3003 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); 3004 VNPASS(vp->v_cache_dd == NULL, vp); 3005 } 3006 #endif 3007 3008 /* 3009 * Flush all entries referencing a particular filesystem. 3010 */ 3011 void 3012 cache_purgevfs(struct mount *mp) 3013 { 3014 struct vnode *vp, *mvp; 3015 size_t visited __sdt_used, purged __sdt_used; 3016 3017 visited = purged = 0; 3018 /* 3019 * Somewhat wasteful iteration over all vnodes. Would be better to 3020 * support filtering and avoid the interlock to begin with.
3021 */ 3022 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 3023 visited++; 3024 if (!cache_has_entries(vp)) { 3025 VI_UNLOCK(vp); 3026 continue; 3027 } 3028 vholdl(vp); 3029 VI_UNLOCK(vp); 3030 cache_purge(vp); 3031 purged++; 3032 vdrop(vp); 3033 } 3034 3035 SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged); 3036 } 3037 3038 /* 3039 * Perform canonical checks and cache lookup and pass on to filesystem 3040 * through the vop_cachedlookup only if needed. 3041 */ 3042 3043 int 3044 vfs_cache_lookup(struct vop_lookup_args *ap) 3045 { 3046 struct vnode *dvp; 3047 int error; 3048 struct vnode **vpp = ap->a_vpp; 3049 struct componentname *cnp = ap->a_cnp; 3050 int flags = cnp->cn_flags; 3051 3052 *vpp = NULL; 3053 dvp = ap->a_dvp; 3054 3055 if (dvp->v_type != VDIR) 3056 return (ENOTDIR); 3057 3058 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 3059 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 3060 return (EROFS); 3061 3062 error = vn_dir_check_exec(dvp, cnp); 3063 if (error != 0) 3064 return (error); 3065 3066 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 3067 if (error == 0) 3068 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 3069 if (error == -1) 3070 return (0); 3071 return (error); 3072 } 3073 3074 /* Implementation of the getcwd syscall. */ 3075 int 3076 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 3077 { 3078 char *buf, *retbuf; 3079 size_t buflen; 3080 int error; 3081 3082 buflen = uap->buflen; 3083 if (__predict_false(buflen < 2)) 3084 return (EINVAL); 3085 if (buflen > MAXPATHLEN) 3086 buflen = MAXPATHLEN; 3087 3088 buf = uma_zalloc(namei_zone, M_WAITOK); 3089 error = vn_getcwd(buf, &retbuf, &buflen); 3090 if (error == 0) 3091 error = copyout(retbuf, uap->buf, buflen); 3092 uma_zfree(namei_zone, buf); 3093 return (error); 3094 } 3095 3096 int 3097 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 3098 { 3099 struct pwd *pwd; 3100 int error; 3101 3102 vfs_smr_enter(); 3103 pwd = pwd_get_smr(); 3104 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 3105 buflen, 0); 3106 VFS_SMR_ASSERT_NOT_ENTERED(); 3107 if (error < 0) { 3108 pwd = pwd_hold(curthread); 3109 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 3110 retbuf, buflen); 3111 pwd_drop(pwd); 3112 } 3113 3114 #ifdef KTRACE 3115 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 3116 ktrnamei(*retbuf); 3117 #endif 3118 return (error); 3119 } 3120 3121 /* 3122 * Canonicalize a path by walking it forward and back. 3123 * 3124 * BUGS: 3125 * - Nothing guarantees the integrity of the entire chain. Consider the case 3126 * where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of 3127 * "foo" into "quux" during the backwards walk. The result will be 3128 * "quux/bar/baz/qux", which could not have been obtained by an incremental 3129 * walk in userspace. Moreover, the path we return is inaccessible if the 3130 * calling thread lacks permission to traverse "quux". 
3131 */ 3132 static int 3133 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 3134 size_t size, int flags, enum uio_seg pathseg) 3135 { 3136 struct nameidata nd; 3137 char *retbuf, *freebuf; 3138 int error; 3139 3140 if (flags != 0) 3141 return (EINVAL); 3142 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1, 3143 pathseg, path, fd, &cap_fstat_rights); 3144 if ((error = namei(&nd)) != 0) 3145 return (error); 3146 3147 if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR && 3148 (nd.ni_vp->v_vflag & VV_ROOT) != 0) { 3149 /* 3150 * This happens if vp is a file mount. The call to 3151 * vn_fullpath_hardlink can panic if path resolution can't be 3152 * handled without the directory. 3153 * 3154 * To resolve this, we find the vnode which was mounted on - 3155 * this should have a unique global path since we disallow 3156 * mounting on linked files. 3157 */ 3158 struct vnode *covered_vp; 3159 error = vn_lock(nd.ni_vp, LK_SHARED); 3160 if (error != 0) 3161 goto out; 3162 covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered; 3163 vref(covered_vp); 3164 VOP_UNLOCK(nd.ni_vp); 3165 error = vn_fullpath(covered_vp, &retbuf, &freebuf); 3166 vrele(covered_vp); 3167 } else { 3168 error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, nd.ni_cnd.cn_nameptr, 3169 nd.ni_cnd.cn_namelen, &retbuf, &freebuf, &size); 3170 } 3171 if (error == 0) { 3172 error = copyout(retbuf, buf, size); 3173 free(freebuf, M_TEMP); 3174 } 3175 out: 3176 vrele(nd.ni_vp); 3177 vrele(nd.ni_dvp); 3178 NDFREE_PNBUF(&nd); 3179 return (error); 3180 } 3181 3182 int 3183 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 3184 { 3185 3186 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 3187 uap->flags, UIO_USERSPACE)); 3188 } 3189 3190 /* 3191 * Retrieve the full filesystem path that correspond to a vnode from the name 3192 * cache (if available) 3193 */ 3194 int 3195 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 3196 { 3197 struct pwd *pwd; 3198 char *buf; 3199 size_t buflen; 3200 int error; 3201 3202 if (__predict_false(vp == NULL)) 3203 return (EINVAL); 3204 3205 buflen = MAXPATHLEN; 3206 buf = malloc(buflen, M_TEMP, M_WAITOK); 3207 vfs_smr_enter(); 3208 pwd = pwd_get_smr(); 3209 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); 3210 VFS_SMR_ASSERT_NOT_ENTERED(); 3211 if (error < 0) { 3212 pwd = pwd_hold(curthread); 3213 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); 3214 pwd_drop(pwd); 3215 } 3216 if (error == 0) 3217 *freebuf = buf; 3218 else 3219 free(buf, M_TEMP); 3220 return (error); 3221 } 3222 3223 /* 3224 * This function is similar to vn_fullpath, but it attempts to lookup the 3225 * pathname relative to the global root mount point. This is required for the 3226 * auditing sub-system, as audited pathnames must be absolute, relative to the 3227 * global root mount point. 
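 *
 * Illustrative contrast: for a vnode reachable from within a jail or chroot,
 * vn_fullpath() stops the walk at the process root directory (pwd_rdir), while
 * this routine keeps going up to rootvnode, yielding a path valid from the
 * host's point of view.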
3228 */ 3229 int 3230 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 3231 { 3232 char *buf; 3233 size_t buflen; 3234 int error; 3235 3236 if (__predict_false(vp == NULL)) 3237 return (EINVAL); 3238 buflen = MAXPATHLEN; 3239 buf = malloc(buflen, M_TEMP, M_WAITOK); 3240 vfs_smr_enter(); 3241 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); 3242 VFS_SMR_ASSERT_NOT_ENTERED(); 3243 if (error < 0) { 3244 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 3245 } 3246 if (error == 0) 3247 *freebuf = buf; 3248 else 3249 free(buf, M_TEMP); 3250 return (error); 3251 } 3252 3253 static struct namecache * 3254 vn_dd_from_dst(struct vnode *vp) 3255 { 3256 struct namecache *ncp; 3257 3258 cache_assert_vnode_locked(vp); 3259 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 3260 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3261 return (ncp); 3262 } 3263 return (NULL); 3264 } 3265 3266 int 3267 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) 3268 { 3269 struct vnode *dvp; 3270 struct namecache *ncp; 3271 struct mtx *vlp; 3272 int error; 3273 3274 vlp = VP2VNODELOCK(*vp); 3275 mtx_lock(vlp); 3276 ncp = (*vp)->v_cache_dd; 3277 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 3278 KASSERT(ncp == vn_dd_from_dst(*vp), 3279 ("%s: mismatch for dd entry (%p != %p)", __func__, 3280 ncp, vn_dd_from_dst(*vp))); 3281 } else { 3282 ncp = vn_dd_from_dst(*vp); 3283 } 3284 if (ncp != NULL) { 3285 if (*buflen < ncp->nc_nlen) { 3286 mtx_unlock(vlp); 3287 vrele(*vp); 3288 counter_u64_add(numfullpathfail4, 1); 3289 error = ENOMEM; 3290 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3291 vp, NULL); 3292 return (error); 3293 } 3294 *buflen -= ncp->nc_nlen; 3295 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3296 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 3297 ncp->nc_name, vp); 3298 dvp = *vp; 3299 *vp = ncp->nc_dvp; 3300 vref(*vp); 3301 mtx_unlock(vlp); 3302 vrele(dvp); 3303 return (0); 3304 } 3305 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 3306 3307 mtx_unlock(vlp); 3308 vn_lock(*vp, LK_SHARED | LK_RETRY); 3309 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); 3310 vput(*vp); 3311 if (error) { 3312 counter_u64_add(numfullpathfail2, 1); 3313 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3314 return (error); 3315 } 3316 3317 *vp = dvp; 3318 if (VN_IS_DOOMED(dvp)) { 3319 /* forced unmount */ 3320 vrele(dvp); 3321 error = ENOENT; 3322 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3323 return (error); 3324 } 3325 /* 3326 * *vp has its use count incremented still. 3327 */ 3328 3329 return (0); 3330 } 3331 3332 /* 3333 * Resolve a directory to a pathname. 3334 * 3335 * The name of the directory can always be found in the namecache or fetched 3336 * from the filesystem. There is also guaranteed to be only one parent, meaning 3337 * we can just follow vnodes up until we find the root. 3338 * 3339 * The vnode must be referenced. 
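 * The reference is consumed: the vnode is vrele()d (or vput() where it was
 * locked) on every return path, whether the resolution succeeds or fails.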
3340 */ 3341 static int 3342 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3343 size_t *len, size_t addend) 3344 { 3345 #ifdef KDTRACE_HOOKS 3346 struct vnode *startvp = vp; 3347 #endif 3348 struct vnode *vp1; 3349 size_t buflen; 3350 int error; 3351 bool slash_prefixed; 3352 3353 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 3354 VNPASS(vp->v_usecount > 0, vp); 3355 3356 buflen = *len; 3357 3358 slash_prefixed = true; 3359 if (addend == 0) { 3360 MPASS(*len >= 2); 3361 buflen--; 3362 buf[buflen] = '\0'; 3363 slash_prefixed = false; 3364 } 3365 3366 error = 0; 3367 3368 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 3369 counter_u64_add(numfullpathcalls, 1); 3370 while (vp != rdir && vp != rootvnode) { 3371 /* 3372 * The vp vnode must be already fully constructed, 3373 * since it is either found in namecache or obtained 3374 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 3375 * without obtaining the vnode lock. 3376 */ 3377 if ((vp->v_vflag & VV_ROOT) != 0) { 3378 vn_lock(vp, LK_RETRY | LK_SHARED); 3379 3380 /* 3381 * With the vnode locked, check for races with 3382 * unmount, forced or not. Note that we 3383 * already verified that vp is not equal to 3384 * the root vnode, which means that 3385 * mnt_vnodecovered can be NULL only for the 3386 * case of unmount. 3387 */ 3388 if (VN_IS_DOOMED(vp) || 3389 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 3390 vp1->v_mountedhere != vp->v_mount) { 3391 vput(vp); 3392 error = ENOENT; 3393 SDT_PROBE3(vfs, namecache, fullpath, return, 3394 error, vp, NULL); 3395 break; 3396 } 3397 3398 vref(vp1); 3399 vput(vp); 3400 vp = vp1; 3401 continue; 3402 } 3403 if (vp->v_type != VDIR) { 3404 vrele(vp); 3405 counter_u64_add(numfullpathfail1, 1); 3406 error = ENOTDIR; 3407 SDT_PROBE3(vfs, namecache, fullpath, return, 3408 error, vp, NULL); 3409 break; 3410 } 3411 error = vn_vptocnp(&vp, buf, &buflen); 3412 if (error) 3413 break; 3414 if (buflen == 0) { 3415 vrele(vp); 3416 error = ENOMEM; 3417 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3418 startvp, NULL); 3419 break; 3420 } 3421 buf[--buflen] = '/'; 3422 slash_prefixed = true; 3423 } 3424 if (error) 3425 return (error); 3426 if (!slash_prefixed) { 3427 if (buflen == 0) { 3428 vrele(vp); 3429 counter_u64_add(numfullpathfail4, 1); 3430 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 3431 startvp, NULL); 3432 return (ENOMEM); 3433 } 3434 buf[--buflen] = '/'; 3435 } 3436 counter_u64_add(numfullpathfound, 1); 3437 vrele(vp); 3438 3439 *retbuf = buf + buflen; 3440 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 3441 *len -= buflen; 3442 *len += addend; 3443 return (0); 3444 } 3445 3446 /* 3447 * Resolve an arbitrary vnode to a pathname. 
3448 * 3449 * Note 2 caveats: 3450 * - hardlinks are not tracked, thus if the vnode is not a directory this can 3451 * resolve to a different path than the one used to find it 3452 * - namecache is not mandatory, meaning names are not guaranteed to be added 3453 * (in which case resolving fails) 3454 */ 3455 static void __inline 3456 cache_rev_failed_impl(int *reason, int line) 3457 { 3458 3459 *reason = line; 3460 } 3461 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 3462 3463 static int 3464 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 3465 char **retbuf, size_t *buflen, size_t addend) 3466 { 3467 #ifdef KDTRACE_HOOKS 3468 struct vnode *startvp = vp; 3469 #endif 3470 struct vnode *tvp; 3471 struct mount *mp; 3472 struct namecache *ncp; 3473 size_t orig_buflen; 3474 int reason; 3475 int error; 3476 #ifdef KDTRACE_HOOKS 3477 int i; 3478 #endif 3479 seqc_t vp_seqc, tvp_seqc; 3480 u_char nc_flag; 3481 3482 VFS_SMR_ASSERT_ENTERED(); 3483 3484 if (!atomic_load_char(&cache_fast_lookup_enabled)) { 3485 vfs_smr_exit(); 3486 return (-1); 3487 } 3488 3489 orig_buflen = *buflen; 3490 3491 if (addend == 0) { 3492 MPASS(*buflen >= 2); 3493 *buflen -= 1; 3494 buf[*buflen] = '\0'; 3495 } 3496 3497 if (vp == rdir || vp == rootvnode) { 3498 if (addend == 0) { 3499 *buflen -= 1; 3500 buf[*buflen] = '/'; 3501 } 3502 goto out_ok; 3503 } 3504 3505 #ifdef KDTRACE_HOOKS 3506 i = 0; 3507 #endif 3508 error = -1; 3509 ncp = NULL; /* for sdt probe down below */ 3510 vp_seqc = vn_seqc_read_any(vp); 3511 if (seqc_in_modify(vp_seqc)) { 3512 cache_rev_failed(&reason); 3513 goto out_abort; 3514 } 3515 3516 for (;;) { 3517 #ifdef KDTRACE_HOOKS 3518 i++; 3519 #endif 3520 if ((vp->v_vflag & VV_ROOT) != 0) { 3521 mp = atomic_load_ptr(&vp->v_mount); 3522 if (mp == NULL) { 3523 cache_rev_failed(&reason); 3524 goto out_abort; 3525 } 3526 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 3527 tvp_seqc = vn_seqc_read_any(tvp); 3528 if (seqc_in_modify(tvp_seqc)) { 3529 cache_rev_failed(&reason); 3530 goto out_abort; 3531 } 3532 if (!vn_seqc_consistent(vp, vp_seqc)) { 3533 cache_rev_failed(&reason); 3534 goto out_abort; 3535 } 3536 vp = tvp; 3537 vp_seqc = tvp_seqc; 3538 continue; 3539 } 3540 ncp = atomic_load_consume_ptr(&vp->v_cache_dd); 3541 if (ncp == NULL) { 3542 cache_rev_failed(&reason); 3543 goto out_abort; 3544 } 3545 nc_flag = atomic_load_char(&ncp->nc_flag); 3546 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3547 cache_rev_failed(&reason); 3548 goto out_abort; 3549 } 3550 if (ncp->nc_nlen >= *buflen) { 3551 cache_rev_failed(&reason); 3552 error = ENOMEM; 3553 goto out_abort; 3554 } 3555 *buflen -= ncp->nc_nlen; 3556 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3557 *buflen -= 1; 3558 buf[*buflen] = '/'; 3559 tvp = ncp->nc_dvp; 3560 tvp_seqc = vn_seqc_read_any(tvp); 3561 if (seqc_in_modify(tvp_seqc)) { 3562 cache_rev_failed(&reason); 3563 goto out_abort; 3564 } 3565 if (!vn_seqc_consistent(vp, vp_seqc)) { 3566 cache_rev_failed(&reason); 3567 goto out_abort; 3568 } 3569 /* 3570 * Acquire fence provided by vn_seqc_read_any above. 
3571 */ 3572 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) { 3573 cache_rev_failed(&reason); 3574 goto out_abort; 3575 } 3576 if (!cache_ncp_canuse(ncp)) { 3577 cache_rev_failed(&reason); 3578 goto out_abort; 3579 } 3580 vp = tvp; 3581 vp_seqc = tvp_seqc; 3582 if (vp == rdir || vp == rootvnode) 3583 break; 3584 } 3585 out_ok: 3586 vfs_smr_exit(); 3587 *retbuf = buf + *buflen; 3588 *buflen = orig_buflen - *buflen + addend; 3589 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 3590 return (0); 3591 3592 out_abort: 3593 *buflen = orig_buflen; 3594 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 3595 vfs_smr_exit(); 3596 return (error); 3597 } 3598 3599 static int 3600 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3601 size_t *buflen) 3602 { 3603 size_t orig_buflen, addend; 3604 int error; 3605 3606 if (*buflen < 2) 3607 return (EINVAL); 3608 3609 orig_buflen = *buflen; 3610 3611 vref(vp); 3612 addend = 0; 3613 if (vp->v_type != VDIR) { 3614 *buflen -= 1; 3615 buf[*buflen] = '\0'; 3616 error = vn_vptocnp(&vp, buf, buflen); 3617 if (error) 3618 return (error); 3619 if (*buflen == 0) { 3620 vrele(vp); 3621 return (ENOMEM); 3622 } 3623 *buflen -= 1; 3624 buf[*buflen] = '/'; 3625 addend = orig_buflen - *buflen; 3626 } 3627 3628 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); 3629 } 3630 3631 /* 3632 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 3633 * 3634 * Since the namecache does not track hardlinks, the caller is expected to 3635 * first look up the target vnode with WANTPARENT flag passed to namei to get 3636 * dvp and vp. 3637 * 3638 * Then we have 2 cases: 3639 * - if the found vnode is a directory, the path can be constructed just by 3640 * following names up the chain 3641 * - otherwise we populate the buffer with the saved name and start resolving 3642 * from the parent 3643 */ 3644 int 3645 vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp, 3646 const char *hrdl_name, size_t hrdl_name_length, 3647 char **retbuf, char **freebuf, size_t *buflen) 3648 { 3649 char *buf, *tmpbuf; 3650 struct pwd *pwd; 3651 size_t addend; 3652 int error; 3653 __enum_uint8(vtype) type; 3654 3655 if (*buflen < 2) 3656 return (EINVAL); 3657 if (*buflen > MAXPATHLEN) 3658 *buflen = MAXPATHLEN; 3659 3660 buf = malloc(*buflen, M_TEMP, M_WAITOK); 3661 3662 addend = 0; 3663 3664 /* 3665 * Check for VBAD to work around the vp_crossmp bug in lookup(). 3666 * 3667 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 3668 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 3669 * If the type is VDIR (like in this very case) we can skip looking 3670 * at ni_dvp in the first place. However, since vnodes get passed here 3671 * unlocked the target may transition to doomed state (type == VBAD) 3672 * before we get to evaluate the condition. If this happens, we will 3673 * populate part of the buffer and descend to vn_fullpath_dir with 3674 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 
3675 */ 3676 type = atomic_load_8(&vp->v_type); 3677 if (type == VBAD) { 3678 error = ENOENT; 3679 goto out_bad; 3680 } 3681 if (type != VDIR) { 3682 addend = hrdl_name_length + 2; 3683 if (*buflen < addend) { 3684 error = ENOMEM; 3685 goto out_bad; 3686 } 3687 *buflen -= addend; 3688 tmpbuf = buf + *buflen; 3689 tmpbuf[0] = '/'; 3690 memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length); 3691 tmpbuf[addend - 1] = '\0'; 3692 vp = dvp; 3693 } 3694 3695 vfs_smr_enter(); 3696 pwd = pwd_get_smr(); 3697 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3698 addend); 3699 VFS_SMR_ASSERT_NOT_ENTERED(); 3700 if (error < 0) { 3701 pwd = pwd_hold(curthread); 3702 vref(vp); 3703 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3704 addend); 3705 pwd_drop(pwd); 3706 } 3707 if (error != 0) 3708 goto out_bad; 3709 3710 *freebuf = buf; 3711 3712 return (0); 3713 out_bad: 3714 free(buf, M_TEMP); 3715 return (error); 3716 } 3717 3718 struct vnode * 3719 vn_dir_dd_ino(struct vnode *vp) 3720 { 3721 struct namecache *ncp; 3722 struct vnode *ddvp; 3723 struct mtx *vlp; 3724 enum vgetstate vs; 3725 3726 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3727 vlp = VP2VNODELOCK(vp); 3728 mtx_lock(vlp); 3729 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3730 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3731 continue; 3732 ddvp = ncp->nc_dvp; 3733 vs = vget_prep(ddvp); 3734 mtx_unlock(vlp); 3735 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3736 return (NULL); 3737 return (ddvp); 3738 } 3739 mtx_unlock(vlp); 3740 return (NULL); 3741 } 3742 3743 int 3744 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3745 { 3746 struct namecache *ncp; 3747 struct mtx *vlp; 3748 int l; 3749 3750 vlp = VP2VNODELOCK(vp); 3751 mtx_lock(vlp); 3752 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3753 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3754 break; 3755 if (ncp == NULL) { 3756 mtx_unlock(vlp); 3757 return (ENOENT); 3758 } 3759 l = min(ncp->nc_nlen, buflen - 1); 3760 memcpy(buf, ncp->nc_name, l); 3761 mtx_unlock(vlp); 3762 buf[l] = '\0'; 3763 return (0); 3764 } 3765 3766 /* 3767 * This function updates path string to vnode's full global path 3768 * and checks the size of the new path string against the pathlen argument. 3769 * 3770 * Requires a locked, referenced vnode. 3771 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3772 * 3773 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3774 * because it falls back to the ".." lookup if the namecache lookup fails. 3775 */ 3776 int 3777 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3778 u_int pathlen) 3779 { 3780 struct nameidata nd; 3781 struct vnode *vp1; 3782 char *rpath, *fbuf; 3783 int error; 3784 3785 ASSERT_VOP_ELOCKED(vp, __func__); 3786 3787 /* Construct global filesystem path from vp. */ 3788 VOP_UNLOCK(vp); 3789 error = vn_fullpath_global(vp, &rpath, &fbuf); 3790 3791 if (error != 0) { 3792 vrele(vp); 3793 return (error); 3794 } 3795 3796 if (strlen(rpath) >= pathlen) { 3797 vrele(vp); 3798 error = ENAMETOOLONG; 3799 goto out; 3800 } 3801 3802 /* 3803 * Re-lookup the vnode by path to detect a possible rename. 3804 * As a side effect, the vnode is relocked. 3805 * If vnode was renamed, return ENOENT. 
3806 */ 3807 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path); 3808 error = namei(&nd); 3809 if (error != 0) { 3810 vrele(vp); 3811 goto out; 3812 } 3813 NDFREE_PNBUF(&nd); 3814 vp1 = nd.ni_vp; 3815 vrele(vp); 3816 if (vp1 == vp) 3817 strcpy(path, rpath); 3818 else { 3819 vput(vp1); 3820 error = ENOENT; 3821 } 3822 3823 out: 3824 free(fbuf, M_TEMP); 3825 return (error); 3826 } 3827 3828 /* 3829 * This is similar to vn_path_to_global_path but allows for regular 3830 * files which may not be present in the cache. 3831 * 3832 * Requires a locked, referenced vnode. 3833 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3834 */ 3835 int 3836 vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp, 3837 struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name, 3838 size_t leaf_length) 3839 { 3840 struct nameidata nd; 3841 struct vnode *vp1; 3842 char *rpath, *fbuf; 3843 size_t len; 3844 int error; 3845 3846 ASSERT_VOP_ELOCKED(vp, __func__); 3847 3848 /* 3849 * Construct global filesystem path from dvp, vp and leaf 3850 * name. 3851 */ 3852 VOP_UNLOCK(vp); 3853 len = pathlen; 3854 error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length, 3855 &rpath, &fbuf, &len); 3856 3857 if (error != 0) { 3858 vrele(vp); 3859 return (error); 3860 } 3861 3862 if (strlen(rpath) >= pathlen) { 3863 vrele(vp); 3864 error = ENAMETOOLONG; 3865 goto out; 3866 } 3867 3868 /* 3869 * Re-lookup the vnode by path to detect a possible rename. 3870 * As a side effect, the vnode is relocked. 3871 * If vnode was renamed, return ENOENT. 3872 */ 3873 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path); 3874 error = namei(&nd); 3875 if (error != 0) { 3876 vrele(vp); 3877 goto out; 3878 } 3879 NDFREE_PNBUF(&nd); 3880 vp1 = nd.ni_vp; 3881 vrele(vp); 3882 if (vp1 == vp) 3883 strcpy(path, rpath); 3884 else { 3885 vput(vp1); 3886 error = ENOENT; 3887 } 3888 3889 out: 3890 free(fbuf, M_TEMP); 3891 return (error); 3892 } 3893 3894 #ifdef DDB 3895 static void 3896 db_print_vpath(struct vnode *vp) 3897 { 3898 3899 while (vp != NULL) { 3900 db_printf("%p: ", vp); 3901 if (vp == rootvnode) { 3902 db_printf("/"); 3903 vp = NULL; 3904 } else { 3905 if (vp->v_vflag & VV_ROOT) { 3906 db_printf("<mount point>"); 3907 vp = vp->v_mount->mnt_vnodecovered; 3908 } else { 3909 struct namecache *ncp; 3910 char *ncn; 3911 int i; 3912 3913 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3914 if (ncp != NULL) { 3915 ncn = ncp->nc_name; 3916 for (i = 0; i < ncp->nc_nlen; i++) 3917 db_printf("%c", *ncn++); 3918 vp = ncp->nc_dvp; 3919 } else { 3920 vp = NULL; 3921 } 3922 } 3923 } 3924 db_printf("\n"); 3925 } 3926 3927 return; 3928 } 3929 3930 DB_SHOW_COMMAND(vpath, db_show_vpath) 3931 { 3932 struct vnode *vp; 3933 3934 if (!have_addr) { 3935 db_printf("usage: show vpath <struct vnode *>\n"); 3936 return; 3937 } 3938 3939 vp = (struct vnode *)addr; 3940 db_print_vpath(vp); 3941 } 3942 3943 #endif 3944 3945 static int cache_fast_lookup = 1; 3946 3947 #define CACHE_FPL_FAILED -2020 3948 3949 static int 3950 cache_vop_bad_vexec(struct vop_fplookup_vexec_args *v) 3951 { 3952 vn_printf(v->a_vp, "no proper vop_fplookup_vexec\n"); 3953 panic("no proper vop_fplookup_vexec"); 3954 } 3955 3956 static int 3957 cache_vop_bad_symlink(struct vop_fplookup_symlink_args *v) 3958 { 3959 vn_printf(v->a_vp, "no proper vop_fplookup_symlink\n"); 3960 panic("no proper vop_fplookup_symlink"); 3961 } 3962 3963 void 3964 cache_vop_vector_register(struct vop_vector *v) 3965 { 3966 size_t ops; 3967 3968 ops = 0; 
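	/*
	 * Count how many of the two lockless lookup VOPs the vector provides.
	 * Either both or neither must be present; vectors providing neither get
	 * panicking stubs so that a filesystem wrongly used with MNTK_FPLOOKUP
	 * is caught loudly.
	 */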
3969 if (v->vop_fplookup_vexec != NULL) { 3970 ops++; 3971 } 3972 if (v->vop_fplookup_symlink != NULL) { 3973 ops++; 3974 } 3975 3976 if (ops == 2) { 3977 return; 3978 } 3979 3980 if (ops == 0) { 3981 v->vop_fplookup_vexec = cache_vop_bad_vexec; 3982 v->vop_fplookup_symlink = cache_vop_bad_symlink; 3983 return; 3984 } 3985 3986 printf("%s: invalid vop vector %p -- either all or none fplookup vops " 3987 "need to be provided", __func__, v); 3988 if (v->vop_fplookup_vexec == NULL) { 3989 printf("%s: missing vop_fplookup_vexec\n", __func__); 3990 } 3991 if (v->vop_fplookup_symlink == NULL) { 3992 printf("%s: missing vop_fplookup_symlink\n", __func__); 3993 } 3994 panic("bad vop vector %p", v); 3995 } 3996 3997 #ifdef INVARIANTS 3998 void 3999 cache_validate_vop_vector(struct mount *mp, struct vop_vector *vops) 4000 { 4001 if (mp == NULL) 4002 return; 4003 4004 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 4005 return; 4006 4007 if (vops->vop_fplookup_vexec == NULL || 4008 vops->vop_fplookup_vexec == cache_vop_bad_vexec) 4009 panic("bad vop_fplookup_vexec on vector %p for filesystem %s", 4010 vops, mp->mnt_vfc->vfc_name); 4011 4012 if (vops->vop_fplookup_symlink == NULL || 4013 vops->vop_fplookup_symlink == cache_vop_bad_symlink) 4014 panic("bad vop_fplookup_symlink on vector %p for filesystem %s", 4015 vops, mp->mnt_vfc->vfc_name); 4016 } 4017 #endif 4018 4019 void 4020 cache_fast_lookup_enabled_recalc(void) 4021 { 4022 int lookup_flag; 4023 int mac_on; 4024 4025 #ifdef MAC 4026 mac_on = mac_vnode_check_lookup_enabled(); 4027 mac_on |= mac_vnode_check_readlink_enabled(); 4028 #else 4029 mac_on = 0; 4030 #endif 4031 4032 lookup_flag = atomic_load_int(&cache_fast_lookup); 4033 if (lookup_flag && !mac_on) { 4034 atomic_store_char(&cache_fast_lookup_enabled, true); 4035 } else { 4036 atomic_store_char(&cache_fast_lookup_enabled, false); 4037 } 4038 } 4039 4040 static int 4041 syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS) 4042 { 4043 int error, old; 4044 4045 old = atomic_load_int(&cache_fast_lookup); 4046 error = sysctl_handle_int(oidp, arg1, arg2, req); 4047 if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup)) 4048 cache_fast_lookup_enabled_recalc(); 4049 return (error); 4050 } 4051 SYSCTL_PROC(_vfs, OID_AUTO, cache_fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE, 4052 &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", ""); 4053 4054 /* 4055 * Components of nameidata (or objects it can point to) which may 4056 * need restoring in case fast path lookup fails. 
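 *
 * These fields are saved up front by cache_fpl_checkpoint_outer() and put
 * back by cache_fpl_restore_abort() when the lookup is aborted and handed
 * over to the regular code (see cache_fpl_aborted_impl()).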
4057 */ 4058 struct nameidata_outer { 4059 size_t ni_pathlen; 4060 int cn_flags; 4061 }; 4062 4063 struct nameidata_saved { 4064 #ifdef INVARIANTS 4065 char *cn_nameptr; 4066 size_t ni_pathlen; 4067 #endif 4068 }; 4069 4070 #ifdef INVARIANTS 4071 struct cache_fpl_debug { 4072 size_t ni_pathlen; 4073 }; 4074 #endif 4075 4076 struct cache_fpl { 4077 struct nameidata *ndp; 4078 struct componentname *cnp; 4079 char *nulchar; 4080 struct vnode *dvp; 4081 struct vnode *tvp; 4082 seqc_t dvp_seqc; 4083 seqc_t tvp_seqc; 4084 uint32_t hash; 4085 struct nameidata_saved snd; 4086 struct nameidata_outer snd_outer; 4087 int line; 4088 enum cache_fpl_status status:8; 4089 bool in_smr; 4090 bool fsearch; 4091 struct pwd **pwd; 4092 #ifdef INVARIANTS 4093 struct cache_fpl_debug debug; 4094 #endif 4095 }; 4096 4097 static bool cache_fplookup_mp_supported(struct mount *mp); 4098 static bool cache_fplookup_is_mp(struct cache_fpl *fpl); 4099 static int cache_fplookup_cross_mount(struct cache_fpl *fpl); 4100 static int cache_fplookup_partial_setup(struct cache_fpl *fpl); 4101 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl); 4102 static int cache_fplookup_trailingslash(struct cache_fpl *fpl); 4103 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl); 4104 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl); 4105 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n); 4106 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n); 4107 4108 static void 4109 cache_fpl_cleanup_cnp(struct componentname *cnp) 4110 { 4111 4112 uma_zfree(namei_zone, cnp->cn_pnbuf); 4113 cnp->cn_pnbuf = NULL; 4114 cnp->cn_nameptr = NULL; 4115 } 4116 4117 static struct vnode * 4118 cache_fpl_handle_root(struct cache_fpl *fpl) 4119 { 4120 struct nameidata *ndp; 4121 struct componentname *cnp; 4122 4123 ndp = fpl->ndp; 4124 cnp = fpl->cnp; 4125 4126 MPASS(*(cnp->cn_nameptr) == '/'); 4127 cnp->cn_nameptr++; 4128 cache_fpl_pathlen_dec(fpl); 4129 4130 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 4131 do { 4132 cnp->cn_nameptr++; 4133 cache_fpl_pathlen_dec(fpl); 4134 } while (*(cnp->cn_nameptr) == '/'); 4135 } 4136 4137 return (ndp->ni_rootdir); 4138 } 4139 4140 static void 4141 cache_fpl_checkpoint_outer(struct cache_fpl *fpl) 4142 { 4143 4144 fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen; 4145 fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags; 4146 } 4147 4148 static void 4149 cache_fpl_checkpoint(struct cache_fpl *fpl) 4150 { 4151 4152 #ifdef INVARIANTS 4153 fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 4154 fpl->snd.ni_pathlen = fpl->debug.ni_pathlen; 4155 #endif 4156 } 4157 4158 static void 4159 cache_fpl_restore_partial(struct cache_fpl *fpl) 4160 { 4161 4162 fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags; 4163 #ifdef INVARIANTS 4164 fpl->debug.ni_pathlen = fpl->snd.ni_pathlen; 4165 #endif 4166 } 4167 4168 static void 4169 cache_fpl_restore_abort(struct cache_fpl *fpl) 4170 { 4171 4172 cache_fpl_restore_partial(fpl); 4173 /* 4174 * It is 0 on entry by API contract. 
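	 * The lookup may have set NIRES_ABS or NIRES_EMPTYPATH in the
	 * meantime; clearing the field lets the regular lookup recompute it.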
4175 */ 4176 fpl->ndp->ni_resflags = 0; 4177 fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf; 4178 fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen; 4179 } 4180 4181 #ifdef INVARIANTS 4182 #define cache_fpl_smr_assert_entered(fpl) ({ \ 4183 struct cache_fpl *_fpl = (fpl); \ 4184 MPASS(_fpl->in_smr == true); \ 4185 VFS_SMR_ASSERT_ENTERED(); \ 4186 }) 4187 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 4188 struct cache_fpl *_fpl = (fpl); \ 4189 MPASS(_fpl->in_smr == false); \ 4190 VFS_SMR_ASSERT_NOT_ENTERED(); \ 4191 }) 4192 static void 4193 cache_fpl_assert_status(struct cache_fpl *fpl) 4194 { 4195 4196 switch (fpl->status) { 4197 case CACHE_FPL_STATUS_UNSET: 4198 __assert_unreachable(); 4199 break; 4200 case CACHE_FPL_STATUS_DESTROYED: 4201 case CACHE_FPL_STATUS_ABORTED: 4202 case CACHE_FPL_STATUS_PARTIAL: 4203 case CACHE_FPL_STATUS_HANDLED: 4204 break; 4205 } 4206 } 4207 #else 4208 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 4209 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 4210 #define cache_fpl_assert_status(fpl) do { } while (0) 4211 #endif 4212 4213 #define cache_fpl_smr_enter_initial(fpl) ({ \ 4214 struct cache_fpl *_fpl = (fpl); \ 4215 vfs_smr_enter(); \ 4216 _fpl->in_smr = true; \ 4217 }) 4218 4219 #define cache_fpl_smr_enter(fpl) ({ \ 4220 struct cache_fpl *_fpl = (fpl); \ 4221 MPASS(_fpl->in_smr == false); \ 4222 vfs_smr_enter(); \ 4223 _fpl->in_smr = true; \ 4224 }) 4225 4226 #define cache_fpl_smr_exit(fpl) ({ \ 4227 struct cache_fpl *_fpl = (fpl); \ 4228 MPASS(_fpl->in_smr == true); \ 4229 vfs_smr_exit(); \ 4230 _fpl->in_smr = false; \ 4231 }) 4232 4233 static int 4234 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line) 4235 { 4236 4237 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 4238 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 4239 ("%s: converting to abort from %d at %d, set at %d\n", 4240 __func__, fpl->status, line, fpl->line)); 4241 } 4242 cache_fpl_smr_assert_not_entered(fpl); 4243 fpl->status = CACHE_FPL_STATUS_ABORTED; 4244 fpl->line = line; 4245 return (CACHE_FPL_FAILED); 4246 } 4247 4248 #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__) 4249 4250 static int __noinline 4251 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 4252 { 4253 struct nameidata *ndp; 4254 struct componentname *cnp; 4255 4256 ndp = fpl->ndp; 4257 cnp = fpl->cnp; 4258 4259 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 4260 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 4261 ("%s: converting to abort from %d at %d, set at %d\n", 4262 __func__, fpl->status, line, fpl->line)); 4263 } 4264 fpl->status = CACHE_FPL_STATUS_ABORTED; 4265 fpl->line = line; 4266 if (fpl->in_smr) 4267 cache_fpl_smr_exit(fpl); 4268 cache_fpl_restore_abort(fpl); 4269 /* 4270 * Resolving symlinks overwrites data passed by the caller. 4271 * Let namei know. 
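	 * The pathname buffer is no longer usable for a retry, hence the
	 * CACHE_FPL_STATUS_DESTROYED status and the freeing of cn_pnbuf below.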
4272 */ 4273 if (ndp->ni_loopcnt > 0) { 4274 fpl->status = CACHE_FPL_STATUS_DESTROYED; 4275 cache_fpl_cleanup_cnp(cnp); 4276 } 4277 return (CACHE_FPL_FAILED); 4278 } 4279 4280 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 4281 4282 static int __noinline 4283 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 4284 { 4285 4286 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 4287 ("%s: setting to partial at %d, but already set to %d at %d\n", 4288 __func__, line, fpl->status, fpl->line)); 4289 cache_fpl_smr_assert_entered(fpl); 4290 fpl->status = CACHE_FPL_STATUS_PARTIAL; 4291 fpl->line = line; 4292 return (cache_fplookup_partial_setup(fpl)); 4293 } 4294 4295 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 4296 4297 static int 4298 cache_fpl_handled_impl(struct cache_fpl *fpl, int line) 4299 { 4300 4301 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 4302 ("%s: setting to handled at %d, but already set to %d at %d\n", 4303 __func__, line, fpl->status, fpl->line)); 4304 cache_fpl_smr_assert_not_entered(fpl); 4305 fpl->status = CACHE_FPL_STATUS_HANDLED; 4306 fpl->line = line; 4307 return (0); 4308 } 4309 4310 #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__) 4311 4312 static int 4313 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line) 4314 { 4315 4316 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 4317 ("%s: setting to handled at %d, but already set to %d at %d\n", 4318 __func__, line, fpl->status, fpl->line)); 4319 MPASS(error != 0); 4320 MPASS(error != CACHE_FPL_FAILED); 4321 cache_fpl_smr_assert_not_entered(fpl); 4322 fpl->status = CACHE_FPL_STATUS_HANDLED; 4323 fpl->line = line; 4324 fpl->dvp = NULL; 4325 fpl->tvp = NULL; 4326 return (error); 4327 } 4328 4329 #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__) 4330 4331 static bool 4332 cache_fpl_terminated(struct cache_fpl *fpl) 4333 { 4334 4335 return (fpl->status != CACHE_FPL_STATUS_UNSET); 4336 } 4337 4338 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 4339 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ 4340 FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | ISRESTARTED | WILLBEDIR | \ 4341 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \ 4342 OPENWRITE | WANTIOCTLCAPS) 4343 4344 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 4345 (ISDOTDOT | MAKEENTRY | ISLASTCN) 4346 4347 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 4348 "supported and internal flags overlap"); 4349 4350 static bool 4351 cache_fpl_islastcn(struct nameidata *ndp) 4352 { 4353 4354 return (*ndp->ni_next == 0); 4355 } 4356 4357 static bool 4358 cache_fpl_istrailingslash(struct cache_fpl *fpl) 4359 { 4360 4361 MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf); 4362 return (*(fpl->nulchar - 1) == '/'); 4363 } 4364 4365 static bool 4366 cache_fpl_isdotdot(struct componentname *cnp) 4367 { 4368 4369 if (cnp->cn_namelen == 2 && 4370 cnp->cn_nameptr[1] == '.' 
&& cnp->cn_nameptr[0] == '.') 4371 return (true); 4372 return (false); 4373 } 4374 4375 static bool 4376 cache_can_fplookup(struct cache_fpl *fpl) 4377 { 4378 struct nameidata *ndp; 4379 struct componentname *cnp; 4380 struct thread *td; 4381 4382 ndp = fpl->ndp; 4383 cnp = fpl->cnp; 4384 td = curthread; 4385 4386 if (!atomic_load_char(&cache_fast_lookup_enabled)) { 4387 cache_fpl_aborted_early(fpl); 4388 return (false); 4389 } 4390 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 4391 cache_fpl_aborted_early(fpl); 4392 return (false); 4393 } 4394 if (IN_CAPABILITY_MODE(td)) { 4395 cache_fpl_aborted_early(fpl); 4396 return (false); 4397 } 4398 if (AUDITING_TD(td)) { 4399 cache_fpl_aborted_early(fpl); 4400 return (false); 4401 } 4402 if (ndp->ni_startdir != NULL) { 4403 cache_fpl_aborted_early(fpl); 4404 return (false); 4405 } 4406 return (true); 4407 } 4408 4409 static int __noinline 4410 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 4411 { 4412 struct nameidata *ndp; 4413 struct componentname *cnp; 4414 int error; 4415 bool fsearch; 4416 4417 ndp = fpl->ndp; 4418 cnp = fpl->cnp; 4419 4420 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); 4421 if (__predict_false(error != 0)) { 4422 return (cache_fpl_aborted(fpl)); 4423 } 4424 fpl->fsearch = fsearch; 4425 if ((*vpp)->v_type != VDIR) { 4426 if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) { 4427 cache_fpl_smr_exit(fpl); 4428 return (cache_fpl_handled_error(fpl, ENOTDIR)); 4429 } 4430 } 4431 return (0); 4432 } 4433 4434 static int __noinline 4435 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 4436 uint32_t hash) 4437 { 4438 struct componentname *cnp; 4439 struct vnode *dvp; 4440 4441 cnp = fpl->cnp; 4442 dvp = fpl->dvp; 4443 4444 cache_fpl_smr_exit(fpl); 4445 if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) 4446 return (cache_fpl_handled_error(fpl, ENOENT)); 4447 else 4448 return (cache_fpl_aborted(fpl)); 4449 } 4450 4451 /* 4452 * The target vnode is not supported, prepare for the slow path to take over. 4453 */ 4454 static int __noinline 4455 cache_fplookup_partial_setup(struct cache_fpl *fpl) 4456 { 4457 struct nameidata *ndp; 4458 struct componentname *cnp; 4459 enum vgetstate dvs; 4460 struct vnode *dvp; 4461 struct pwd *pwd; 4462 seqc_t dvp_seqc; 4463 4464 ndp = fpl->ndp; 4465 cnp = fpl->cnp; 4466 pwd = *(fpl->pwd); 4467 dvp = fpl->dvp; 4468 dvp_seqc = fpl->dvp_seqc; 4469 4470 if (!pwd_hold_smr(pwd)) { 4471 return (cache_fpl_aborted(fpl)); 4472 } 4473 4474 /* 4475 * Note that seqc is checked before the vnode is locked, so by 4476 * the time regular lookup gets to it it may have moved. 4477 * 4478 * Ultimately this does not affect correctness, any lookup errors 4479 * are userspace racing with itself. It is guaranteed that any 4480 * path which ultimately gets found could also have been found 4481 * by regular lookup going all the way in absence of concurrent 4482 * modifications. 
4483 */ 4484 dvs = vget_prep_smr(dvp); 4485 cache_fpl_smr_exit(fpl); 4486 if (__predict_false(dvs == VGET_NONE)) { 4487 pwd_drop(pwd); 4488 return (cache_fpl_aborted(fpl)); 4489 } 4490 4491 vget_finish_ref(dvp, dvs); 4492 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4493 vrele(dvp); 4494 pwd_drop(pwd); 4495 return (cache_fpl_aborted(fpl)); 4496 } 4497 4498 cache_fpl_restore_partial(fpl); 4499 #ifdef INVARIANTS 4500 if (cnp->cn_nameptr != fpl->snd.cn_nameptr) { 4501 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__, 4502 cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf); 4503 } 4504 #endif 4505 4506 ndp->ni_startdir = dvp; 4507 cnp->cn_flags |= MAKEENTRY; 4508 if (cache_fpl_islastcn(ndp)) 4509 cnp->cn_flags |= ISLASTCN; 4510 if (cache_fpl_isdotdot(cnp)) 4511 cnp->cn_flags |= ISDOTDOT; 4512 4513 /* 4514 * Skip potential extra slashes parsing did not take care of. 4515 * cache_fplookup_skip_slashes explains the mechanism. 4516 */ 4517 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 4518 do { 4519 cnp->cn_nameptr++; 4520 cache_fpl_pathlen_dec(fpl); 4521 } while (*(cnp->cn_nameptr) == '/'); 4522 } 4523 4524 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; 4525 #ifdef INVARIANTS 4526 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 4527 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 4528 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 4529 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 4530 } 4531 #endif 4532 return (0); 4533 } 4534 4535 static int 4536 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 4537 { 4538 struct componentname *cnp; 4539 struct vnode *tvp; 4540 seqc_t tvp_seqc; 4541 int error, lkflags; 4542 4543 cnp = fpl->cnp; 4544 tvp = fpl->tvp; 4545 tvp_seqc = fpl->tvp_seqc; 4546 4547 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4548 lkflags = LK_SHARED; 4549 if ((cnp->cn_flags & LOCKSHARED) == 0) 4550 lkflags = LK_EXCLUSIVE; 4551 error = vget_finish(tvp, lkflags, tvs); 4552 if (__predict_false(error != 0)) { 4553 return (cache_fpl_aborted(fpl)); 4554 } 4555 } else { 4556 vget_finish_ref(tvp, tvs); 4557 } 4558 4559 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 4560 if ((cnp->cn_flags & LOCKLEAF) != 0) 4561 vput(tvp); 4562 else 4563 vrele(tvp); 4564 return (cache_fpl_aborted(fpl)); 4565 } 4566 4567 return (cache_fpl_handled(fpl)); 4568 } 4569 4570 /* 4571 * They want to possibly modify the state of the namecache. 4572 */ 4573 static int __noinline 4574 cache_fplookup_final_modifying(struct cache_fpl *fpl) 4575 { 4576 struct nameidata *ndp __diagused; 4577 struct componentname *cnp; 4578 enum vgetstate dvs; 4579 struct vnode *dvp, *tvp; 4580 struct mount *mp; 4581 seqc_t dvp_seqc; 4582 int error; 4583 bool docache; 4584 4585 ndp = fpl->ndp; 4586 cnp = fpl->cnp; 4587 dvp = fpl->dvp; 4588 dvp_seqc = fpl->dvp_seqc; 4589 4590 MPASS(*(cnp->cn_nameptr) != '/'); 4591 MPASS(cache_fpl_islastcn(ndp)); 4592 if ((cnp->cn_flags & LOCKPARENT) == 0) 4593 MPASS((cnp->cn_flags & WANTPARENT) != 0); 4594 MPASS((cnp->cn_flags & TRAILINGSLASH) == 0); 4595 MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE || 4596 cnp->cn_nameiop == RENAME); 4597 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 4598 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 4599 4600 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; 4601 if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) 4602 docache = false; 4603 4604 /* 4605 * Regular lookup nulifies the slash, which we don't do here. 
4606 * Don't take chances with filesystem routines seeing it for 4607 * the last entry. 4608 */ 4609 if (cache_fpl_istrailingslash(fpl)) { 4610 return (cache_fpl_partial(fpl)); 4611 } 4612 4613 mp = atomic_load_ptr(&dvp->v_mount); 4614 if (__predict_false(mp == NULL)) { 4615 return (cache_fpl_aborted(fpl)); 4616 } 4617 4618 if (__predict_false(mp->mnt_flag & MNT_RDONLY)) { 4619 cache_fpl_smr_exit(fpl); 4620 /* 4621 * Original code keeps not checking for CREATE which 4622 * might be a bug. For now let the old lookup decide. 4623 */ 4624 if (cnp->cn_nameiop == CREATE) { 4625 return (cache_fpl_aborted(fpl)); 4626 } 4627 return (cache_fpl_handled_error(fpl, EROFS)); 4628 } 4629 4630 if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) { 4631 cache_fpl_smr_exit(fpl); 4632 return (cache_fpl_handled_error(fpl, EEXIST)); 4633 } 4634 4635 /* 4636 * Secure access to dvp; check cache_fplookup_partial_setup for 4637 * reasoning. 4638 * 4639 * XXX At least UFS requires its lookup routine to be called for 4640 * the last path component, which leads to some level of complication 4641 * and inefficiency: 4642 * - the target routine always locks the target vnode, but our caller 4643 * may not need it locked 4644 * - some of the VOP machinery asserts that the parent is locked, which 4645 * once more may be not required 4646 * 4647 * TODO: add a flag for filesystems which don't need this. 4648 */ 4649 dvs = vget_prep_smr(dvp); 4650 cache_fpl_smr_exit(fpl); 4651 if (__predict_false(dvs == VGET_NONE)) { 4652 return (cache_fpl_aborted(fpl)); 4653 } 4654 4655 vget_finish_ref(dvp, dvs); 4656 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4657 vrele(dvp); 4658 return (cache_fpl_aborted(fpl)); 4659 } 4660 4661 error = vn_lock(dvp, LK_EXCLUSIVE); 4662 if (__predict_false(error != 0)) { 4663 vrele(dvp); 4664 return (cache_fpl_aborted(fpl)); 4665 } 4666 4667 tvp = NULL; 4668 cnp->cn_flags |= ISLASTCN; 4669 if (docache) 4670 cnp->cn_flags |= MAKEENTRY; 4671 if (cache_fpl_isdotdot(cnp)) 4672 cnp->cn_flags |= ISDOTDOT; 4673 cnp->cn_lkflags = LK_EXCLUSIVE; 4674 error = VOP_LOOKUP(dvp, &tvp, cnp); 4675 switch (error) { 4676 case EJUSTRETURN: 4677 case 0: 4678 break; 4679 case ENOTDIR: 4680 case ENOENT: 4681 vput(dvp); 4682 return (cache_fpl_handled_error(fpl, error)); 4683 default: 4684 vput(dvp); 4685 return (cache_fpl_aborted(fpl)); 4686 } 4687 4688 fpl->tvp = tvp; 4689 4690 if (tvp == NULL) { 4691 MPASS(error == EJUSTRETURN); 4692 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4693 VOP_UNLOCK(dvp); 4694 } 4695 return (cache_fpl_handled(fpl)); 4696 } 4697 4698 /* 4699 * There are very hairy corner cases concerning various flag combinations 4700 * and locking state. In particular here we only hold one lock instead of 4701 * two. 4702 * 4703 * Skip the complexity as it is of no significance for normal workloads. 4704 */ 4705 if (__predict_false(tvp == dvp)) { 4706 vput(dvp); 4707 vrele(tvp); 4708 return (cache_fpl_aborted(fpl)); 4709 } 4710 4711 /* 4712 * If they want the symlink itself we are fine, but if they want to 4713 * follow it regular lookup has to be engaged. 4714 */ 4715 if (tvp->v_type == VLNK) { 4716 if ((cnp->cn_flags & FOLLOW) != 0) { 4717 vput(dvp); 4718 vput(tvp); 4719 return (cache_fpl_aborted(fpl)); 4720 } 4721 } 4722 4723 /* 4724 * Since we expect this to be the terminal vnode it should almost never 4725 * be a mount point. 
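	 * If it happens to be one anyway, just give up and let the locked
	 * lookup deal with the crossing.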
4726 */ 4727 if (__predict_false(cache_fplookup_is_mp(fpl))) { 4728 vput(dvp); 4729 vput(tvp); 4730 return (cache_fpl_aborted(fpl)); 4731 } 4732 4733 if ((cnp->cn_flags & FAILIFEXISTS) != 0) { 4734 vput(dvp); 4735 vput(tvp); 4736 return (cache_fpl_handled_error(fpl, EEXIST)); 4737 } 4738 4739 if ((cnp->cn_flags & LOCKLEAF) == 0) { 4740 VOP_UNLOCK(tvp); 4741 } 4742 4743 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4744 VOP_UNLOCK(dvp); 4745 } 4746 4747 return (cache_fpl_handled(fpl)); 4748 } 4749 4750 static int __noinline 4751 cache_fplookup_modifying(struct cache_fpl *fpl) 4752 { 4753 struct nameidata *ndp; 4754 4755 ndp = fpl->ndp; 4756 4757 if (!cache_fpl_islastcn(ndp)) { 4758 return (cache_fpl_partial(fpl)); 4759 } 4760 return (cache_fplookup_final_modifying(fpl)); 4761 } 4762 4763 static int __noinline 4764 cache_fplookup_final_withparent(struct cache_fpl *fpl) 4765 { 4766 struct componentname *cnp; 4767 enum vgetstate dvs, tvs; 4768 struct vnode *dvp, *tvp; 4769 seqc_t dvp_seqc; 4770 int error; 4771 4772 cnp = fpl->cnp; 4773 dvp = fpl->dvp; 4774 dvp_seqc = fpl->dvp_seqc; 4775 tvp = fpl->tvp; 4776 4777 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 4778 4779 /* 4780 * This is less efficient than it can be for simplicity. 4781 */ 4782 dvs = vget_prep_smr(dvp); 4783 if (__predict_false(dvs == VGET_NONE)) { 4784 return (cache_fpl_aborted(fpl)); 4785 } 4786 tvs = vget_prep_smr(tvp); 4787 if (__predict_false(tvs == VGET_NONE)) { 4788 cache_fpl_smr_exit(fpl); 4789 vget_abort(dvp, dvs); 4790 return (cache_fpl_aborted(fpl)); 4791 } 4792 4793 cache_fpl_smr_exit(fpl); 4794 4795 if ((cnp->cn_flags & LOCKPARENT) != 0) { 4796 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 4797 if (__predict_false(error != 0)) { 4798 vget_abort(tvp, tvs); 4799 return (cache_fpl_aborted(fpl)); 4800 } 4801 } else { 4802 vget_finish_ref(dvp, dvs); 4803 } 4804 4805 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4806 vget_abort(tvp, tvs); 4807 if ((cnp->cn_flags & LOCKPARENT) != 0) 4808 vput(dvp); 4809 else 4810 vrele(dvp); 4811 return (cache_fpl_aborted(fpl)); 4812 } 4813 4814 error = cache_fplookup_final_child(fpl, tvs); 4815 if (__predict_false(error != 0)) { 4816 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED || 4817 fpl->status == CACHE_FPL_STATUS_DESTROYED); 4818 if ((cnp->cn_flags & LOCKPARENT) != 0) 4819 vput(dvp); 4820 else 4821 vrele(dvp); 4822 return (error); 4823 } 4824 4825 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 4826 return (0); 4827 } 4828 4829 static int 4830 cache_fplookup_final(struct cache_fpl *fpl) 4831 { 4832 struct componentname *cnp; 4833 enum vgetstate tvs; 4834 struct vnode *dvp, *tvp; 4835 seqc_t dvp_seqc; 4836 4837 cnp = fpl->cnp; 4838 dvp = fpl->dvp; 4839 dvp_seqc = fpl->dvp_seqc; 4840 tvp = fpl->tvp; 4841 4842 MPASS(*(cnp->cn_nameptr) != '/'); 4843 4844 if (cnp->cn_nameiop != LOOKUP) { 4845 return (cache_fplookup_final_modifying(fpl)); 4846 } 4847 4848 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 4849 return (cache_fplookup_final_withparent(fpl)); 4850 4851 tvs = vget_prep_smr(tvp); 4852 if (__predict_false(tvs == VGET_NONE)) { 4853 return (cache_fpl_partial(fpl)); 4854 } 4855 4856 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4857 cache_fpl_smr_exit(fpl); 4858 vget_abort(tvp, tvs); 4859 return (cache_fpl_aborted(fpl)); 4860 } 4861 4862 cache_fpl_smr_exit(fpl); 4863 return (cache_fplookup_final_child(fpl, tvs)); 4864 } 4865 4866 /* 4867 * Comment from locked lookup: 4868 * Check for degenerate name (e.g. / or "") which is a way of talking about a 4869 * directory, e.g. like "/." 
or ".". 4870 */ 4871 static int __noinline 4872 cache_fplookup_degenerate(struct cache_fpl *fpl) 4873 { 4874 struct componentname *cnp; 4875 struct vnode *dvp; 4876 enum vgetstate dvs; 4877 int error, lkflags; 4878 #ifdef INVARIANTS 4879 char *cp; 4880 #endif 4881 4882 fpl->tvp = fpl->dvp; 4883 fpl->tvp_seqc = fpl->dvp_seqc; 4884 4885 cnp = fpl->cnp; 4886 dvp = fpl->dvp; 4887 4888 #ifdef INVARIANTS 4889 for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) { 4890 KASSERT(*cp == '/', 4891 ("%s: encountered non-slash; string [%s]\n", __func__, 4892 cnp->cn_pnbuf)); 4893 } 4894 #endif 4895 4896 if (__predict_false(cnp->cn_nameiop != LOOKUP)) { 4897 cache_fpl_smr_exit(fpl); 4898 return (cache_fpl_handled_error(fpl, EISDIR)); 4899 } 4900 4901 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) { 4902 return (cache_fplookup_final_withparent(fpl)); 4903 } 4904 4905 dvs = vget_prep_smr(dvp); 4906 cache_fpl_smr_exit(fpl); 4907 if (__predict_false(dvs == VGET_NONE)) { 4908 return (cache_fpl_aborted(fpl)); 4909 } 4910 4911 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4912 lkflags = LK_SHARED; 4913 if ((cnp->cn_flags & LOCKSHARED) == 0) 4914 lkflags = LK_EXCLUSIVE; 4915 error = vget_finish(dvp, lkflags, dvs); 4916 if (__predict_false(error != 0)) { 4917 return (cache_fpl_aborted(fpl)); 4918 } 4919 } else { 4920 vget_finish_ref(dvp, dvs); 4921 } 4922 return (cache_fpl_handled(fpl)); 4923 } 4924 4925 static int __noinline 4926 cache_fplookup_emptypath(struct cache_fpl *fpl) 4927 { 4928 struct nameidata *ndp; 4929 struct componentname *cnp; 4930 enum vgetstate tvs; 4931 struct vnode *tvp; 4932 int error, lkflags; 4933 4934 fpl->tvp = fpl->dvp; 4935 fpl->tvp_seqc = fpl->dvp_seqc; 4936 4937 ndp = fpl->ndp; 4938 cnp = fpl->cnp; 4939 tvp = fpl->tvp; 4940 4941 MPASS(*cnp->cn_pnbuf == '\0'); 4942 4943 if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) { 4944 cache_fpl_smr_exit(fpl); 4945 return (cache_fpl_handled_error(fpl, ENOENT)); 4946 } 4947 4948 MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0); 4949 4950 tvs = vget_prep_smr(tvp); 4951 cache_fpl_smr_exit(fpl); 4952 if (__predict_false(tvs == VGET_NONE)) { 4953 return (cache_fpl_aborted(fpl)); 4954 } 4955 4956 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4957 lkflags = LK_SHARED; 4958 if ((cnp->cn_flags & LOCKSHARED) == 0) 4959 lkflags = LK_EXCLUSIVE; 4960 error = vget_finish(tvp, lkflags, tvs); 4961 if (__predict_false(error != 0)) { 4962 return (cache_fpl_aborted(fpl)); 4963 } 4964 } else { 4965 vget_finish_ref(tvp, tvs); 4966 } 4967 4968 ndp->ni_resflags |= NIRES_EMPTYPATH; 4969 return (cache_fpl_handled(fpl)); 4970 } 4971 4972 static int __noinline 4973 cache_fplookup_noentry(struct cache_fpl *fpl) 4974 { 4975 struct nameidata *ndp; 4976 struct componentname *cnp; 4977 enum vgetstate dvs; 4978 struct vnode *dvp, *tvp; 4979 seqc_t dvp_seqc; 4980 int error; 4981 4982 ndp = fpl->ndp; 4983 cnp = fpl->cnp; 4984 dvp = fpl->dvp; 4985 dvp_seqc = fpl->dvp_seqc; 4986 4987 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 4988 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 4989 if (cnp->cn_nameiop == LOOKUP) 4990 MPASS((cnp->cn_flags & NOCACHE) == 0); 4991 MPASS(!cache_fpl_isdotdot(cnp)); 4992 4993 /* 4994 * Hack: delayed name len checking. 
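	 * The parser computes the hash and the length without bounding the
	 * latter; see the comment above the NAME_MAX check in
	 * cache_fplookup_parse for why this can be deferred to the (rare)
	 * spots which act on an uncached name.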
4995 */ 4996 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 4997 cache_fpl_smr_exit(fpl); 4998 return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); 4999 } 5000 5001 if (cnp->cn_nameptr[0] == '/') { 5002 return (cache_fplookup_skip_slashes(fpl)); 5003 } 5004 5005 if (cnp->cn_pnbuf[0] == '\0') { 5006 return (cache_fplookup_emptypath(fpl)); 5007 } 5008 5009 if (cnp->cn_nameptr[0] == '\0') { 5010 if (fpl->tvp == NULL) { 5011 return (cache_fplookup_degenerate(fpl)); 5012 } 5013 return (cache_fplookup_trailingslash(fpl)); 5014 } 5015 5016 if (cnp->cn_nameiop != LOOKUP) { 5017 fpl->tvp = NULL; 5018 return (cache_fplookup_modifying(fpl)); 5019 } 5020 5021 /* 5022 * Only try to fill in the component if it is the last one, 5023 * otherwise not only there may be several to handle but the 5024 * walk may be complicated. 5025 */ 5026 if (!cache_fpl_islastcn(ndp)) { 5027 return (cache_fpl_partial(fpl)); 5028 } 5029 5030 /* 5031 * Regular lookup nulifies the slash, which we don't do here. 5032 * Don't take chances with filesystem routines seeing it for 5033 * the last entry. 5034 */ 5035 if (cache_fpl_istrailingslash(fpl)) { 5036 return (cache_fpl_partial(fpl)); 5037 } 5038 5039 /* 5040 * Secure access to dvp; check cache_fplookup_partial_setup for 5041 * reasoning. 5042 */ 5043 dvs = vget_prep_smr(dvp); 5044 cache_fpl_smr_exit(fpl); 5045 if (__predict_false(dvs == VGET_NONE)) { 5046 return (cache_fpl_aborted(fpl)); 5047 } 5048 5049 vget_finish_ref(dvp, dvs); 5050 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 5051 vrele(dvp); 5052 return (cache_fpl_aborted(fpl)); 5053 } 5054 5055 error = vn_lock(dvp, LK_SHARED); 5056 if (__predict_false(error != 0)) { 5057 vrele(dvp); 5058 return (cache_fpl_aborted(fpl)); 5059 } 5060 5061 tvp = NULL; 5062 /* 5063 * TODO: provide variants which don't require locking either vnode. 5064 */ 5065 cnp->cn_flags |= ISLASTCN | MAKEENTRY; 5066 cnp->cn_lkflags = LK_SHARED; 5067 if ((cnp->cn_flags & LOCKSHARED) == 0) { 5068 cnp->cn_lkflags = LK_EXCLUSIVE; 5069 } 5070 error = VOP_LOOKUP(dvp, &tvp, cnp); 5071 switch (error) { 5072 case EJUSTRETURN: 5073 case 0: 5074 break; 5075 case ENOTDIR: 5076 case ENOENT: 5077 vput(dvp); 5078 return (cache_fpl_handled_error(fpl, error)); 5079 default: 5080 vput(dvp); 5081 return (cache_fpl_aborted(fpl)); 5082 } 5083 5084 fpl->tvp = tvp; 5085 5086 if (tvp == NULL) { 5087 MPASS(error == EJUSTRETURN); 5088 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 5089 vput(dvp); 5090 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 5091 VOP_UNLOCK(dvp); 5092 } 5093 return (cache_fpl_handled(fpl)); 5094 } 5095 5096 if (tvp->v_type == VLNK) { 5097 if ((cnp->cn_flags & FOLLOW) != 0) { 5098 vput(dvp); 5099 vput(tvp); 5100 return (cache_fpl_aborted(fpl)); 5101 } 5102 } 5103 5104 if (__predict_false(cache_fplookup_is_mp(fpl))) { 5105 vput(dvp); 5106 vput(tvp); 5107 return (cache_fpl_aborted(fpl)); 5108 } 5109 5110 if ((cnp->cn_flags & LOCKLEAF) == 0) { 5111 VOP_UNLOCK(tvp); 5112 } 5113 5114 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 5115 vput(dvp); 5116 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 5117 VOP_UNLOCK(dvp); 5118 } 5119 return (cache_fpl_handled(fpl)); 5120 } 5121 5122 static int __noinline 5123 cache_fplookup_dot(struct cache_fpl *fpl) 5124 { 5125 int error; 5126 5127 MPASS(!seqc_in_modify(fpl->dvp_seqc)); 5128 5129 if (__predict_false(fpl->dvp->v_type != VDIR)) { 5130 cache_fpl_smr_exit(fpl); 5131 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5132 } 5133 5134 /* 5135 * Just re-assign the value. 
seqc will be checked later for the first 5136 * non-dot path component in line and/or before deciding to return the 5137 * vnode. 5138 */ 5139 fpl->tvp = fpl->dvp; 5140 fpl->tvp_seqc = fpl->dvp_seqc; 5141 5142 counter_u64_add(dothits, 1); 5143 SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp); 5144 5145 error = 0; 5146 if (cache_fplookup_is_mp(fpl)) { 5147 error = cache_fplookup_cross_mount(fpl); 5148 } 5149 return (error); 5150 } 5151 5152 static int __noinline 5153 cache_fplookup_dotdot(struct cache_fpl *fpl) 5154 { 5155 struct nameidata *ndp; 5156 struct componentname *cnp; 5157 struct namecache *ncp; 5158 struct vnode *dvp; 5159 struct prison *pr; 5160 u_char nc_flag; 5161 5162 ndp = fpl->ndp; 5163 cnp = fpl->cnp; 5164 dvp = fpl->dvp; 5165 5166 MPASS(cache_fpl_isdotdot(cnp)); 5167 5168 /* 5169 * XXX this is racy the same way regular lookup is 5170 */ 5171 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 5172 pr = pr->pr_parent) 5173 if (dvp == pr->pr_root) 5174 break; 5175 5176 if (dvp == ndp->ni_rootdir || 5177 dvp == ndp->ni_topdir || 5178 dvp == rootvnode || 5179 pr != NULL) { 5180 fpl->tvp = dvp; 5181 fpl->tvp_seqc = vn_seqc_read_any(dvp); 5182 if (seqc_in_modify(fpl->tvp_seqc)) { 5183 return (cache_fpl_aborted(fpl)); 5184 } 5185 return (0); 5186 } 5187 5188 if ((dvp->v_vflag & VV_ROOT) != 0) { 5189 /* 5190 * TODO 5191 * The opposite of climb mount is needed here. 5192 */ 5193 return (cache_fpl_partial(fpl)); 5194 } 5195 5196 if (__predict_false(dvp->v_type != VDIR)) { 5197 cache_fpl_smr_exit(fpl); 5198 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5199 } 5200 5201 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); 5202 if (ncp == NULL) { 5203 return (cache_fpl_aborted(fpl)); 5204 } 5205 5206 nc_flag = atomic_load_char(&ncp->nc_flag); 5207 if ((nc_flag & NCF_ISDOTDOT) != 0) { 5208 if ((nc_flag & NCF_NEGATIVE) != 0) 5209 return (cache_fpl_aborted(fpl)); 5210 fpl->tvp = ncp->nc_vp; 5211 } else { 5212 fpl->tvp = ncp->nc_dvp; 5213 } 5214 5215 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 5216 if (seqc_in_modify(fpl->tvp_seqc)) { 5217 return (cache_fpl_partial(fpl)); 5218 } 5219 5220 /* 5221 * Acquire fence provided by vn_seqc_read_any above. 5222 */ 5223 if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) { 5224 return (cache_fpl_aborted(fpl)); 5225 } 5226 5227 if (!cache_ncp_canuse(ncp)) { 5228 return (cache_fpl_aborted(fpl)); 5229 } 5230 5231 counter_u64_add(dotdothits, 1); 5232 return (0); 5233 } 5234 5235 static int __noinline 5236 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) 5237 { 5238 u_char nc_flag __diagused; 5239 bool neg_promote; 5240 5241 #ifdef INVARIANTS 5242 nc_flag = atomic_load_char(&ncp->nc_flag); 5243 MPASS((nc_flag & NCF_NEGATIVE) != 0); 5244 #endif 5245 /* 5246 * If they want to create an entry we need to replace this one. 5247 */ 5248 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 5249 fpl->tvp = NULL; 5250 return (cache_fplookup_modifying(fpl)); 5251 } 5252 neg_promote = cache_neg_hit_prep(ncp); 5253 if (!cache_fpl_neg_ncp_canuse(ncp)) { 5254 cache_neg_hit_abort(ncp); 5255 return (cache_fpl_partial(fpl)); 5256 } 5257 if (neg_promote) { 5258 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 5259 } 5260 cache_neg_hit_finish(ncp); 5261 cache_fpl_smr_exit(fpl); 5262 return (cache_fpl_handled_error(fpl, ENOENT)); 5263 } 5264 5265 /* 5266 * Resolve a symlink. Called by filesystem-specific routines. 5267 * 5268 * Code flow is: 5269 * ... 
-> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve 5270 */ 5271 int 5272 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len) 5273 { 5274 struct nameidata *ndp; 5275 struct componentname *cnp; 5276 size_t adjust; 5277 5278 ndp = fpl->ndp; 5279 cnp = fpl->cnp; 5280 5281 if (__predict_false(len == 0)) { 5282 return (ENOENT); 5283 } 5284 5285 if (__predict_false(len > MAXPATHLEN - 2)) { 5286 if (cache_fpl_istrailingslash(fpl)) { 5287 return (EAGAIN); 5288 } 5289 } 5290 5291 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1; 5292 #ifdef INVARIANTS 5293 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 5294 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 5295 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 5296 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 5297 } 5298 #endif 5299 5300 if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) { 5301 return (ENAMETOOLONG); 5302 } 5303 5304 if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) { 5305 return (ELOOP); 5306 } 5307 5308 adjust = len; 5309 if (ndp->ni_pathlen > 1) { 5310 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen); 5311 } else { 5312 if (cache_fpl_istrailingslash(fpl)) { 5313 adjust = len + 1; 5314 cnp->cn_pnbuf[len] = '/'; 5315 cnp->cn_pnbuf[len + 1] = '\0'; 5316 } else { 5317 cnp->cn_pnbuf[len] = '\0'; 5318 } 5319 } 5320 bcopy(string, cnp->cn_pnbuf, len); 5321 5322 ndp->ni_pathlen += adjust; 5323 cache_fpl_pathlen_add(fpl, adjust); 5324 cnp->cn_nameptr = cnp->cn_pnbuf; 5325 fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; 5326 fpl->tvp = NULL; 5327 return (0); 5328 } 5329 5330 static int __noinline 5331 cache_fplookup_symlink(struct cache_fpl *fpl) 5332 { 5333 struct mount *mp; 5334 struct nameidata *ndp; 5335 struct componentname *cnp; 5336 struct vnode *dvp, *tvp; 5337 int error; 5338 5339 ndp = fpl->ndp; 5340 cnp = fpl->cnp; 5341 dvp = fpl->dvp; 5342 tvp = fpl->tvp; 5343 5344 if (cache_fpl_islastcn(ndp)) { 5345 if ((cnp->cn_flags & FOLLOW) == 0) { 5346 return (cache_fplookup_final(fpl)); 5347 } 5348 } 5349 5350 mp = atomic_load_ptr(&dvp->v_mount); 5351 if (__predict_false(mp == NULL)) { 5352 return (cache_fpl_aborted(fpl)); 5353 } 5354 5355 /* 5356 * Note this check races against setting the flag just like regular 5357 * lookup. 5358 */ 5359 if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) { 5360 cache_fpl_smr_exit(fpl); 5361 return (cache_fpl_handled_error(fpl, EACCES)); 5362 } 5363 5364 error = VOP_FPLOOKUP_SYMLINK(tvp, fpl); 5365 if (__predict_false(error != 0)) { 5366 switch (error) { 5367 case EAGAIN: 5368 return (cache_fpl_partial(fpl)); 5369 case ENOENT: 5370 case ENAMETOOLONG: 5371 case ELOOP: 5372 cache_fpl_smr_exit(fpl); 5373 return (cache_fpl_handled_error(fpl, error)); 5374 default: 5375 return (cache_fpl_aborted(fpl)); 5376 } 5377 } 5378 5379 if (*(cnp->cn_nameptr) == '/') { 5380 fpl->dvp = cache_fpl_handle_root(fpl); 5381 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 5382 if (seqc_in_modify(fpl->dvp_seqc)) { 5383 return (cache_fpl_aborted(fpl)); 5384 } 5385 /* 5386 * The main loop assumes that ->dvp points to a vnode belonging 5387 * to a filesystem which can do lockless lookup, but the absolute 5388 * symlink can be wandering off to one which does not. 
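		 * The new mount point is therefore re-checked with
		 * cache_fplookup_mp_supported() below before the walk continues.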
5389 */ 5390 mp = atomic_load_ptr(&fpl->dvp->v_mount); 5391 if (__predict_false(mp == NULL)) { 5392 return (cache_fpl_aborted(fpl)); 5393 } 5394 if (!cache_fplookup_mp_supported(mp)) { 5395 cache_fpl_checkpoint(fpl); 5396 return (cache_fpl_partial(fpl)); 5397 } 5398 } 5399 return (0); 5400 } 5401 5402 static int 5403 cache_fplookup_next(struct cache_fpl *fpl) 5404 { 5405 struct componentname *cnp; 5406 struct namecache *ncp; 5407 struct vnode *dvp, *tvp; 5408 u_char nc_flag; 5409 uint32_t hash; 5410 int error; 5411 5412 cnp = fpl->cnp; 5413 dvp = fpl->dvp; 5414 hash = fpl->hash; 5415 5416 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 5417 if (cnp->cn_namelen == 1) { 5418 return (cache_fplookup_dot(fpl)); 5419 } 5420 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 5421 return (cache_fplookup_dotdot(fpl)); 5422 } 5423 } 5424 5425 MPASS(!cache_fpl_isdotdot(cnp)); 5426 5427 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 5428 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 5429 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 5430 break; 5431 } 5432 5433 if (__predict_false(ncp == NULL)) { 5434 return (cache_fplookup_noentry(fpl)); 5435 } 5436 5437 tvp = atomic_load_ptr(&ncp->nc_vp); 5438 nc_flag = atomic_load_char(&ncp->nc_flag); 5439 if ((nc_flag & NCF_NEGATIVE) != 0) { 5440 return (cache_fplookup_neg(fpl, ncp, hash)); 5441 } 5442 5443 if (!cache_ncp_canuse(ncp)) { 5444 return (cache_fpl_partial(fpl)); 5445 } 5446 5447 fpl->tvp = tvp; 5448 fpl->tvp_seqc = vn_seqc_read_any(tvp); 5449 if (seqc_in_modify(fpl->tvp_seqc)) { 5450 return (cache_fpl_partial(fpl)); 5451 } 5452 5453 counter_u64_add(numposhits, 1); 5454 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 5455 5456 error = 0; 5457 if (cache_fplookup_is_mp(fpl)) { 5458 error = cache_fplookup_cross_mount(fpl); 5459 } 5460 return (error); 5461 } 5462 5463 static bool 5464 cache_fplookup_mp_supported(struct mount *mp) 5465 { 5466 5467 MPASS(mp != NULL); 5468 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 5469 return (false); 5470 return (true); 5471 } 5472 5473 /* 5474 * Walk up the mount stack (if any). 5475 * 5476 * Correctness is provided in the following ways: 5477 * - all vnodes are protected from freeing with SMR 5478 * - struct mount objects are type stable making them always safe to access 5479 * - stability of the particular mount is provided by busying it 5480 * - relationship between the vnode which is mounted on and the mount is 5481 * verified with the vnode sequence counter after busying 5482 * - association between root vnode of the mount and the mount is protected 5483 * by busy 5484 * 5485 * From that point on we can read the sequence counter of the root vnode 5486 * and get the next mount on the stack (if any) using the same protection. 5487 * 5488 * By the end of successful walk we are guaranteed the reached state was 5489 * indeed present at least at some point which matches the regular lookup. 
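 *
 * To illustrate (example setup, not taken from the code): with tmpfs
 * mounted on /mnt and nullfs then mounted over the same /mnt, the covered
 * vnode carries a stack of two mounts and the loop below has to busy and
 * traverse both of them before arriving at the root vnode the caller is
 * actually supposed to see.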
5490 */ 5491 static int __noinline 5492 cache_fplookup_climb_mount(struct cache_fpl *fpl) 5493 { 5494 struct mount *mp, *prev_mp; 5495 struct mount_pcpu *mpcpu, *prev_mpcpu; 5496 struct vnode *vp; 5497 seqc_t vp_seqc; 5498 5499 vp = fpl->tvp; 5500 vp_seqc = fpl->tvp_seqc; 5501 5502 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp); 5503 mp = atomic_load_ptr(&vp->v_mountedhere); 5504 if (__predict_false(mp == NULL)) { 5505 return (0); 5506 } 5507 5508 prev_mp = NULL; 5509 for (;;) { 5510 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 5511 if (prev_mp != NULL) 5512 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5513 return (cache_fpl_partial(fpl)); 5514 } 5515 if (prev_mp != NULL) 5516 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5517 if (!vn_seqc_consistent(vp, vp_seqc)) { 5518 vfs_op_thread_exit_crit(mp, mpcpu); 5519 return (cache_fpl_partial(fpl)); 5520 } 5521 if (!cache_fplookup_mp_supported(mp)) { 5522 vfs_op_thread_exit_crit(mp, mpcpu); 5523 return (cache_fpl_partial(fpl)); 5524 } 5525 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5526 if (vp == NULL) { 5527 vfs_op_thread_exit_crit(mp, mpcpu); 5528 return (cache_fpl_partial(fpl)); 5529 } 5530 vp_seqc = vn_seqc_read_any(vp); 5531 if (seqc_in_modify(vp_seqc)) { 5532 vfs_op_thread_exit_crit(mp, mpcpu); 5533 return (cache_fpl_partial(fpl)); 5534 } 5535 prev_mp = mp; 5536 prev_mpcpu = mpcpu; 5537 mp = atomic_load_ptr(&vp->v_mountedhere); 5538 if (mp == NULL) 5539 break; 5540 } 5541 5542 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5543 fpl->tvp = vp; 5544 fpl->tvp_seqc = vp_seqc; 5545 return (0); 5546 } 5547 5548 static int __noinline 5549 cache_fplookup_cross_mount(struct cache_fpl *fpl) 5550 { 5551 struct mount *mp; 5552 struct mount_pcpu *mpcpu; 5553 struct vnode *vp; 5554 seqc_t vp_seqc; 5555 5556 vp = fpl->tvp; 5557 vp_seqc = fpl->tvp_seqc; 5558 5559 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp); 5560 mp = atomic_load_ptr(&vp->v_mountedhere); 5561 if (__predict_false(mp == NULL)) { 5562 return (0); 5563 } 5564 5565 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 5566 return (cache_fpl_partial(fpl)); 5567 } 5568 if (!vn_seqc_consistent(vp, vp_seqc)) { 5569 vfs_op_thread_exit_crit(mp, mpcpu); 5570 return (cache_fpl_partial(fpl)); 5571 } 5572 if (!cache_fplookup_mp_supported(mp)) { 5573 vfs_op_thread_exit_crit(mp, mpcpu); 5574 return (cache_fpl_partial(fpl)); 5575 } 5576 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5577 if (__predict_false(vp == NULL)) { 5578 vfs_op_thread_exit_crit(mp, mpcpu); 5579 return (cache_fpl_partial(fpl)); 5580 } 5581 vp_seqc = vn_seqc_read_any(vp); 5582 vfs_op_thread_exit_crit(mp, mpcpu); 5583 if (seqc_in_modify(vp_seqc)) { 5584 return (cache_fpl_partial(fpl)); 5585 } 5586 mp = atomic_load_ptr(&vp->v_mountedhere); 5587 if (__predict_false(mp != NULL)) { 5588 /* 5589 * There are possibly more mount points on top. 5590 * Normally this does not happen so for simplicity just start 5591 * over. 5592 */ 5593 return (cache_fplookup_climb_mount(fpl)); 5594 } 5595 5596 fpl->tvp = vp; 5597 fpl->tvp_seqc = vp_seqc; 5598 return (0); 5599 } 5600 5601 /* 5602 * Check if a vnode is mounted on. 5603 */ 5604 static bool 5605 cache_fplookup_is_mp(struct cache_fpl *fpl) 5606 { 5607 struct vnode *vp; 5608 5609 vp = fpl->tvp; 5610 return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0); 5611 } 5612 5613 /* 5614 * Parse the path. 5615 * 5616 * The code was originally copy-pasted from regular lookup and despite 5617 * clean ups leaves performance on the table. 
Any modifications here 5618 * must take into account that in case off fallback the resulting 5619 * nameidata state has to be compatible with the original. 5620 */ 5621 5622 /* 5623 * Debug ni_pathlen tracking. 5624 */ 5625 #ifdef INVARIANTS 5626 static void 5627 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) 5628 { 5629 5630 fpl->debug.ni_pathlen += n; 5631 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, 5632 ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen)); 5633 } 5634 5635 static void 5636 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) 5637 { 5638 5639 fpl->debug.ni_pathlen -= n; 5640 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, 5641 ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen)); 5642 } 5643 5644 static void 5645 cache_fpl_pathlen_inc(struct cache_fpl *fpl) 5646 { 5647 5648 cache_fpl_pathlen_add(fpl, 1); 5649 } 5650 5651 static void 5652 cache_fpl_pathlen_dec(struct cache_fpl *fpl) 5653 { 5654 5655 cache_fpl_pathlen_sub(fpl, 1); 5656 } 5657 #else 5658 static void 5659 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) 5660 { 5661 } 5662 5663 static void 5664 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) 5665 { 5666 } 5667 5668 static void 5669 cache_fpl_pathlen_inc(struct cache_fpl *fpl) 5670 { 5671 } 5672 5673 static void 5674 cache_fpl_pathlen_dec(struct cache_fpl *fpl) 5675 { 5676 } 5677 #endif 5678 5679 static void 5680 cache_fplookup_parse(struct cache_fpl *fpl) 5681 { 5682 struct nameidata *ndp; 5683 struct componentname *cnp; 5684 struct vnode *dvp; 5685 char *cp; 5686 uint32_t hash; 5687 5688 ndp = fpl->ndp; 5689 cnp = fpl->cnp; 5690 dvp = fpl->dvp; 5691 5692 /* 5693 * Find the end of this path component, it is either / or nul. 5694 * 5695 * Store / as a temporary sentinel so that we only have one character 5696 * to test for. Pathnames tend to be short so this should not be 5697 * resulting in cache misses. 5698 * 5699 * TODO: fix this to be word-sized. 5700 */ 5701 MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf); 5702 KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar, 5703 ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n", 5704 __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1], 5705 fpl->nulchar, cnp->cn_pnbuf)); 5706 KASSERT(*fpl->nulchar == '\0', 5707 ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar, 5708 cnp->cn_pnbuf)); 5709 hash = cache_get_hash_iter_start(dvp); 5710 *fpl->nulchar = '/'; 5711 for (cp = cnp->cn_nameptr; *cp != '/'; cp++) { 5712 KASSERT(*cp != '\0', 5713 ("%s: encountered unexpected nul; string [%s]\n", __func__, 5714 cnp->cn_nameptr)); 5715 hash = cache_get_hash_iter(*cp, hash); 5716 continue; 5717 } 5718 *fpl->nulchar = '\0'; 5719 fpl->hash = cache_get_hash_iter_finish(hash); 5720 5721 cnp->cn_namelen = cp - cnp->cn_nameptr; 5722 cache_fpl_pathlen_sub(fpl, cnp->cn_namelen); 5723 5724 #ifdef INVARIANTS 5725 /* 5726 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since 5727 * we are going to fail this lookup with ENAMETOOLONG (see below). 5728 */ 5729 if (cnp->cn_namelen <= NAME_MAX) { 5730 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) { 5731 panic("%s: mismatched hash for [%s] len %ld", __func__, 5732 cnp->cn_nameptr, cnp->cn_namelen); 5733 } 5734 } 5735 #endif 5736 5737 /* 5738 * Hack: we have to check if the found path component's length exceeds 5739 * NAME_MAX. 
However, the condition is very rarely true and the check can
5740  * be elided in the common case -- if an entry was found in the cache,
5741  * then it could not have been too long to begin with.
5742  */
5743 	ndp->ni_next = cp;
5744 }
5745 
5746 static void
5747 cache_fplookup_parse_advance(struct cache_fpl *fpl)
5748 {
5749 	struct nameidata *ndp;
5750 	struct componentname *cnp;
5751 
5752 	ndp = fpl->ndp;
5753 	cnp = fpl->cnp;
5754 
5755 	cnp->cn_nameptr = ndp->ni_next;
5756 	KASSERT(*(cnp->cn_nameptr) == '/',
5757 	    ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
5758 	    cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
5759 	cnp->cn_nameptr++;
5760 	cache_fpl_pathlen_dec(fpl);
5761 }
5762 
5763 /*
5764  * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
5765  *
5766  * Lockless lookup elides checking for spurious slashes; should any be
5767  * present, it is guaranteed to fail to find an entry. In that case the
5768  * caller must check whether the name starts with a slash and call this
5769  * routine, which fast-forwards across the spurious slashes and sets the
5770  * state up for a retry.
5771  */
5772 static int __noinline
5773 cache_fplookup_skip_slashes(struct cache_fpl *fpl)
5774 {
5775 	struct nameidata *ndp;
5776 	struct componentname *cnp;
5777 
5778 	ndp = fpl->ndp;
5779 	cnp = fpl->cnp;
5780 
5781 	MPASS(*(cnp->cn_nameptr) == '/');
5782 	do {
5783 		cnp->cn_nameptr++;
5784 		cache_fpl_pathlen_dec(fpl);
5785 	} while (*(cnp->cn_nameptr) == '/');
5786 
5787 	/*
5788 	 * Go back to one slash so that cache_fplookup_parse_advance has
5789 	 * something to skip.
5790 	 */
5791 	cnp->cn_nameptr--;
5792 	cache_fpl_pathlen_inc(fpl);
5793 
5794 	/*
5795 	 * cache_fplookup_parse_advance starts from ndp->ni_next.
5796 	 */
5797 	ndp->ni_next = cnp->cn_nameptr;
5798 
5799 	/*
5800 	 * See cache_fplookup_dot.
5801 	 */
5802 	fpl->tvp = fpl->dvp;
5803 	fpl->tvp_seqc = fpl->dvp_seqc;
5804 
5805 	return (0);
5806 }
5807 
5808 /*
5809  * Handle trailing slashes (e.g., "foo/").
5810  *
5811  * If a trailing slash is found the terminal vnode must be a directory.
5812  * Regular lookup shortens the path by nullifying the first trailing slash and
5813  * sets the TRAILINGSLASH flag to denote this took place. There are several
5814  * checks on it performed later.
5815  *
5816  * Similarly to spurious slashes, lockless lookup handles this in a speculative
5817  * manner, relying on the invariant that a non-directory vnode will get a miss.
5818  * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
5819  *
5820  * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
5821  * and denotes this is the last path component, which avoids looping back.
5822  *
5823  * Only plain lookups are supported for now to limit the corner cases.
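 *
 * Worked example (illustrative): for cn_pnbuf "foo/bar/" the routine is
 * entered with cn_nameptr pointing at the terminating nul. It backs up over
 * the trailing slash and then to the start of the component, leaving
 * cn_nameptr at "bar/", cn_namelen at 3 and ni_next at the nul, so the
 * final lookup is performed for "bar" as the last component.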
5824 */ 5825 static int __noinline 5826 cache_fplookup_trailingslash(struct cache_fpl *fpl) 5827 { 5828 #ifdef INVARIANTS 5829 size_t ni_pathlen; 5830 #endif 5831 struct nameidata *ndp; 5832 struct componentname *cnp; 5833 struct namecache *ncp; 5834 struct vnode *tvp; 5835 char *cn_nameptr_orig, *cn_nameptr_slash; 5836 seqc_t tvp_seqc; 5837 u_char nc_flag; 5838 5839 ndp = fpl->ndp; 5840 cnp = fpl->cnp; 5841 tvp = fpl->tvp; 5842 tvp_seqc = fpl->tvp_seqc; 5843 5844 MPASS(fpl->dvp == fpl->tvp); 5845 KASSERT(cache_fpl_istrailingslash(fpl), 5846 ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1, 5847 cnp->cn_pnbuf)); 5848 KASSERT(cnp->cn_nameptr[0] == '\0', 5849 ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0], 5850 cnp->cn_pnbuf)); 5851 KASSERT(cnp->cn_namelen == 0, 5852 ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen, 5853 cnp->cn_pnbuf)); 5854 MPASS(cnp->cn_nameptr > cnp->cn_pnbuf); 5855 5856 if (cnp->cn_nameiop != LOOKUP) { 5857 return (cache_fpl_aborted(fpl)); 5858 } 5859 5860 if (__predict_false(tvp->v_type != VDIR)) { 5861 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 5862 return (cache_fpl_aborted(fpl)); 5863 } 5864 cache_fpl_smr_exit(fpl); 5865 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5866 } 5867 5868 /* 5869 * Denote the last component. 5870 */ 5871 ndp->ni_next = &cnp->cn_nameptr[0]; 5872 MPASS(cache_fpl_islastcn(ndp)); 5873 5874 /* 5875 * Unwind trailing slashes. 5876 */ 5877 cn_nameptr_orig = cnp->cn_nameptr; 5878 while (cnp->cn_nameptr >= cnp->cn_pnbuf) { 5879 cnp->cn_nameptr--; 5880 if (cnp->cn_nameptr[0] != '/') { 5881 break; 5882 } 5883 } 5884 5885 /* 5886 * Unwind to the beginning of the path component. 5887 * 5888 * Note the path may or may not have started with a slash. 5889 */ 5890 cn_nameptr_slash = cnp->cn_nameptr; 5891 while (cnp->cn_nameptr > cnp->cn_pnbuf) { 5892 cnp->cn_nameptr--; 5893 if (cnp->cn_nameptr[0] == '/') { 5894 break; 5895 } 5896 } 5897 if (cnp->cn_nameptr[0] == '/') { 5898 cnp->cn_nameptr++; 5899 } 5900 5901 cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1; 5902 cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr); 5903 cache_fpl_checkpoint(fpl); 5904 5905 #ifdef INVARIANTS 5906 ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; 5907 if (ni_pathlen != fpl->debug.ni_pathlen) { 5908 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 5909 __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 5910 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 5911 } 5912 #endif 5913 5914 /* 5915 * If this was a "./" lookup the parent directory is already correct. 5916 */ 5917 if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) { 5918 return (0); 5919 } 5920 5921 /* 5922 * Otherwise we need to look it up. 5923 */ 5924 tvp = fpl->tvp; 5925 ncp = atomic_load_consume_ptr(&tvp->v_cache_dd); 5926 if (__predict_false(ncp == NULL)) { 5927 return (cache_fpl_aborted(fpl)); 5928 } 5929 nc_flag = atomic_load_char(&ncp->nc_flag); 5930 if ((nc_flag & NCF_ISDOTDOT) != 0) { 5931 return (cache_fpl_aborted(fpl)); 5932 } 5933 fpl->dvp = ncp->nc_dvp; 5934 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 5935 if (seqc_in_modify(fpl->dvp_seqc)) { 5936 return (cache_fpl_aborted(fpl)); 5937 } 5938 return (0); 5939 } 5940 5941 /* 5942 * See the API contract for VOP_FPLOOKUP_VEXEC. 
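 *
 * For reference, a conforming vop_fplookup_vexec routine returns 0 on
 * success, an errno on failure, or EAGAIN when the check could not be
 * performed; a minimal sketch for a filesystem with plain POSIX permissions
 * is given below ("foo_node" and its fields are made up for illustration,
 * the remaining interfaces exist):
 *
 *	static int
 *	foo_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct vnode *vp = v->a_vp;
 *		struct foo_node *node;
 *
 *		// ->v_data may be concurrently cleared by VOP_RECLAIM
 *		node = atomic_load_ptr(&vp->v_data);
 *		if (__predict_false(node == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(node->foo_mode, node->foo_uid,
 *		    node->foo_gid, v->a_cred));
 *	}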
5943 */ 5944 static int __noinline 5945 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 5946 { 5947 struct componentname *cnp; 5948 struct vnode *dvp; 5949 seqc_t dvp_seqc; 5950 5951 cnp = fpl->cnp; 5952 dvp = fpl->dvp; 5953 dvp_seqc = fpl->dvp_seqc; 5954 5955 /* 5956 * Hack: delayed empty path checking. 5957 */ 5958 if (cnp->cn_pnbuf[0] == '\0') { 5959 return (cache_fplookup_emptypath(fpl)); 5960 } 5961 5962 /* 5963 * TODO: Due to ignoring trailing slashes lookup will perform a 5964 * permission check on the last dir when it should not be doing it. It 5965 * may fail, but said failure should be ignored. It is possible to fix 5966 * it up fully without resorting to regular lookup, but for now just 5967 * abort. 5968 */ 5969 if (cache_fpl_istrailingslash(fpl)) { 5970 return (cache_fpl_aborted(fpl)); 5971 } 5972 5973 /* 5974 * Hack: delayed degenerate path checking. 5975 */ 5976 if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) { 5977 return (cache_fplookup_degenerate(fpl)); 5978 } 5979 5980 /* 5981 * Hack: delayed name len checking. 5982 */ 5983 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 5984 cache_fpl_smr_exit(fpl); 5985 return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); 5986 } 5987 5988 /* 5989 * Hack: they may be looking up foo/bar, where foo is not a directory. 5990 * In such a case we need to return ENOTDIR, but we may happen to get 5991 * here with a different error. 5992 */ 5993 if (dvp->v_type != VDIR) { 5994 error = ENOTDIR; 5995 } 5996 5997 /* 5998 * Hack: handle O_SEARCH. 5999 * 6000 * Open Group Base Specifications Issue 7, 2018 edition states: 6001 * <quote> 6002 * If the access mode of the open file description associated with the 6003 * file descriptor is not O_SEARCH, the function shall check whether 6004 * directory searches are permitted using the current permissions of 6005 * the directory underlying the file descriptor. If the access mode is 6006 * O_SEARCH, the function shall not perform the check. 6007 * </quote> 6008 * 6009 * Regular lookup tests for the NOEXECCHECK flag for every path 6010 * component to decide whether to do the permission check. However, 6011 * since most lookups never have the flag (and when they do it is only 6012 * present for the first path component), lockless lookup only acts on 6013 * it if there is a permission problem. Here the flag is represented 6014 * with a boolean so that we don't have to clear it on the way out. 6015 * 6016 * For simplicity this always aborts. 6017 * TODO: check if this is the first lookup and ignore the permission 6018 * problem. Note the flag has to survive fallback (if it happens to be 6019 * performed). 6020 */ 6021 if (fpl->fsearch) { 6022 return (cache_fpl_aborted(fpl)); 6023 } 6024 6025 switch (error) { 6026 case EAGAIN: 6027 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 6028 error = cache_fpl_aborted(fpl); 6029 } else { 6030 cache_fpl_partial(fpl); 6031 } 6032 break; 6033 default: 6034 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 6035 error = cache_fpl_aborted(fpl); 6036 } else { 6037 cache_fpl_smr_exit(fpl); 6038 cache_fpl_handled_error(fpl, error); 6039 } 6040 break; 6041 } 6042 return (error); 6043 } 6044 6045 static int 6046 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 6047 { 6048 struct nameidata *ndp; 6049 struct componentname *cnp; 6050 struct mount *mp; 6051 int error; 6052 6053 ndp = fpl->ndp; 6054 cnp = fpl->cnp; 6055 6056 cache_fpl_checkpoint(fpl); 6057 6058 /* 6059 * The vnode at hand is almost always stable, skip checking for it. 
static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        struct mount *mp;
        int error;

        ndp = fpl->ndp;
        cnp = fpl->cnp;

        cache_fpl_checkpoint(fpl);

        /*
         * The vnode at hand is almost always stable, skip checking for it.
         * Worst case this postpones the check until the end of the current
         * iteration of the main loop.
         */
        fpl->dvp = dvp;
        fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);

        mp = atomic_load_ptr(&dvp->v_mount);
        if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
                return (cache_fpl_aborted(fpl));
        }

        MPASS(fpl->tvp == NULL);

        for (;;) {
                cache_fplookup_parse(fpl);

                error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
                if (__predict_false(error != 0)) {
                        error = cache_fplookup_failed_vexec(fpl, error);
                        break;
                }

                error = cache_fplookup_next(fpl);
                if (__predict_false(cache_fpl_terminated(fpl))) {
                        break;
                }

                VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

                if (fpl->tvp->v_type == VLNK) {
                        error = cache_fplookup_symlink(fpl);
                        if (cache_fpl_terminated(fpl)) {
                                break;
                        }
                } else {
                        if (cache_fpl_islastcn(ndp)) {
                                error = cache_fplookup_final(fpl);
                                break;
                        }

                        if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
                                error = cache_fpl_aborted(fpl);
                                break;
                        }

                        fpl->dvp = fpl->tvp;
                        fpl->dvp_seqc = fpl->tvp_seqc;
                        cache_fplookup_parse_advance(fpl);
                }

                cache_fpl_checkpoint(fpl);
        }

        return (error);
}

/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the
 * criteria outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 *      vn_lock(current);
 *      for (;;) {
 *              next = find();
 *              vn_lock(next);
 *              vn_unlock(current);
 *              current = next;
 *              if (last)
 *                      break;
 *      }
 *      return (current);
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding the respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
 * we made the jump. This includes things like permissions, mount points etc.
 * Counter modification is provided by enclosing relevant places in
 * vn_seqc_write_begin()/end() calls.
 *
 * Thus this translates to:
 *
 *      vfs_smr_enter();
 *      dvp_seqc = seqc_read_any(dvp);
 *      if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 *              abort();
 *      for (;;) {
 *              tvp = find();
 *              tvp_seqc = seqc_read_any(tvp);
 *              if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 *                      abort();
 *              if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 *                      abort();
 *              dvp = tvp; // we know nothing of importance has changed
 *              dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 *              if (last)
 *                      break;
 *      }
 *      vget(); // secure the vnode
 *      if (!seqc_consistent(tvp, tvp_seqc)) // final check
 *              abort();
 *      // at this point we know nothing has changed for any parent<->child pair
 *      // as they were crossed during the lookup, meaning we matched the guarantee
 *      // of the locked variant
 *      return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection, which they must never exit
 * - EAGAIN can be returned to denote checking could not be performed; it is
 *   always valid to return it
 * - if the sequence counter has not changed the result must be valid
 * - if the sequence counter has changed both false positives and false negatives
 *   are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning absent other means it
 *   should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 */
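
/*
 * For illustration, a minimal sketch of a VOP_FPLOOKUP_VEXEC routine
 * honouring the above contract, assuming a filesystem whose permission data
 * can be read without locks (the "xxx" names below are made up, not an
 * existing filesystem):
 *
 *      static int
 *      xxx_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *      {
 *              struct xxx_node *np;
 *
 *              np = atomic_load_ptr(&v->a_vp->v_data); // may be NULL, see caveats
 *              if (__predict_false(np == NULL))
 *                      return (EAGAIN);
 *              return (vaccess_vexec_smr(np->xn_mode, np->xn_uid, np->xn_gid,
 *                  v->a_cred));
 *      }
 */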
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
        struct cache_fpl fpl;
        struct pwd *pwd;
        struct vnode *dvp;
        struct componentname *cnp;
        int error;

        fpl.status = CACHE_FPL_STATUS_UNSET;
        fpl.in_smr = false;
        fpl.ndp = ndp;
        fpl.cnp = cnp = &ndp->ni_cnd;
        MPASS(ndp->ni_lcf == 0);
        KASSERT((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
            ("%s: internal flags found in cn_flags %" PRIx64, __func__,
            cnp->cn_flags));
        MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
        MPASS(ndp->ni_resflags == 0);

        if (__predict_false(!cache_can_fplookup(&fpl))) {
                *status = fpl.status;
                SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
                return (EOPNOTSUPP);
        }

        cache_fpl_checkpoint_outer(&fpl);

        cache_fpl_smr_enter_initial(&fpl);
#ifdef INVARIANTS
        fpl.debug.ni_pathlen = ndp->ni_pathlen;
#endif
        fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
        fpl.fsearch = false;
        fpl.tvp = NULL; /* for degenerate path handling */
        fpl.pwd = pwdp;
        pwd = pwd_get_smr();
        *(fpl.pwd) = pwd;
        namei_setup_rootdir(ndp, cnp, pwd);
        ndp->ni_topdir = pwd->pwd_jdir;

        if (cnp->cn_pnbuf[0] == '/') {
                dvp = cache_fpl_handle_root(&fpl);
                ndp->ni_resflags = NIRES_ABS;
        } else {
                if (ndp->ni_dirfd == AT_FDCWD) {
                        dvp = pwd->pwd_cdir;
                } else {
                        error = cache_fplookup_dirfd(&fpl, &dvp);
                        if (__predict_false(error != 0)) {
                                goto out;
                        }
                }
        }

        SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
        error = cache_fplookup_impl(dvp, &fpl);
out:
        cache_fpl_smr_assert_not_entered(&fpl);
        cache_fpl_assert_status(&fpl);
        *status = fpl.status;
        if (SDT_PROBES_ENABLED()) {
                SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
                if (fpl.status == CACHE_FPL_STATUS_HANDLED)
                        SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
                            ndp);
        }

        if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
                MPASS(error != CACHE_FPL_FAILED);
                if (error != 0) {
                        cache_fpl_cleanup_cnp(fpl.cnp);
                        MPASS(fpl.dvp == NULL);
                        MPASS(fpl.tvp == NULL);
                }
                ndp->ni_dvp = fpl.dvp;
                ndp->ni_vp = fpl.tvp;
        }
        return (error);
}
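
/*
 * A note on consuming the result: CACHE_FPL_STATUS_HANDLED means the lookup
 * was fully serviced by the fast path and the returned error (possibly 0)
 * is final. Any other status makes the caller, normally namei, retry the
 * operation with the regular locked lookup.
 */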