1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include "opt_ddb.h" 36 #include "opt_ktrace.h" 37 38 #include <sys/param.h> 39 #include <sys/systm.h> 40 #include <sys/capsicum.h> 41 #include <sys/counter.h> 42 #include <sys/filedesc.h> 43 #include <sys/fnv_hash.h> 44 #include <sys/kernel.h> 45 #include <sys/ktr.h> 46 #include <sys/lock.h> 47 #include <sys/malloc.h> 48 #include <sys/fcntl.h> 49 #include <sys/jail.h> 50 #include <sys/mount.h> 51 #include <sys/namei.h> 52 #include <sys/proc.h> 53 #include <sys/seqc.h> 54 #include <sys/sdt.h> 55 #include <sys/smr.h> 56 #include <sys/smp.h> 57 #include <sys/syscallsubr.h> 58 #include <sys/sysctl.h> 59 #include <sys/sysproto.h> 60 #include <sys/vnode.h> 61 #include <ck_queue.h> 62 #ifdef KTRACE 63 #include <sys/ktrace.h> 64 #endif 65 #ifdef INVARIANTS 66 #include <machine/_inttypes.h> 67 #endif 68 69 #include <security/audit/audit.h> 70 #include <security/mac/mac_framework.h> 71 72 #ifdef DDB 73 #include <ddb/ddb.h> 74 #endif 75 76 #include <vm/uma.h> 77 78 /* 79 * High level overview of name caching in the VFS layer. 80 * 81 * Originally caching was implemented as part of UFS, later extracted to allow 82 * use by other filesystems. A decision was made to make it optional and 83 * completely detached from the rest of the kernel, which comes with limitations 84 * outlined near the end of this comment block. 85 * 86 * This fundamental choice needs to be revisited. In the meantime, the current 87 * state is described below. Significance of all notable routines is explained 88 * in comments placed above their implementation. Scattered thoroughout the 89 * file are TODO comments indicating shortcomings which can be fixed without 90 * reworking everything (most of the fixes will likely be reusable). 
Various 91 * details are omitted from this explanation to not clutter the overview, they 92 * have to be checked by reading the code and associated commentary. 93 * 94 * Keep in mind that it's individual path components which are cached, not full 95 * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries, 96 * one for each name. 97 * 98 * I. Data organization 99 * 100 * Entries are described by "struct namecache" objects and stored in a hash 101 * table. See cache_get_hash for more information. 102 * 103 * "struct vnode" contains pointers to source entries (names which can be found 104 * when traversing through said vnode), destination entries (names of that 105 * vnode (see "Limitations" for a breakdown on the subject) and a pointer to 106 * the parent vnode. 107 * 108 * The (directory vnode; name) tuple reliably determines the target entry if 109 * it exists. 110 * 111 * Since there are no small locks at this time (all are 32 bytes in size on 112 * LP64), the code works around the problem by introducing lock arrays to 113 * protect hash buckets and vnode lists. 114 * 115 * II. Filesystem integration 116 * 117 * Filesystems participating in name caching do the following: 118 * - set vop_lookup routine to vfs_cache_lookup 119 * - set vop_cachedlookup to whatever can perform the lookup if the above fails 120 * - if they support lockless lookup (see below), vop_fplookup_vexec and 121 * vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the 122 * mount point 123 * - call cache_purge or cache_vop_* routines to eliminate stale entries as 124 * applicable 125 * - call cache_enter to add entries depending on the MAKEENTRY flag 126 * 127 * With the above in mind, there are 2 entry points when doing lookups: 128 * - ... -> namei -> cache_fplookup -- this is the default 129 * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei 130 * should the above fail 131 * 132 * Example code flow how an entry is added: 133 * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP -> 134 * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter 135 * 136 * III. Performance considerations 137 * 138 * For lockless case forward lookup avoids any writes to shared areas apart 139 * from the terminal path component. In other words non-modifying lookups of 140 * different files don't suffer any scalability problems in the namecache. 141 * Looking up the same file is limited by VFS and goes beyond the scope of this 142 * file. 143 * 144 * At least on amd64 the single-threaded bottleneck for long paths is hashing 145 * (see cache_get_hash). There are cases where the code issues acquire fence 146 * multiple times, they can be combined on architectures which suffer from it. 147 * 148 * For locked case each encountered vnode has to be referenced and locked in 149 * order to be handed out to the caller (normally that's namei). This 150 * introduces significant hit single-threaded and serialization multi-threaded. 151 * 152 * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached -- 153 * avoids any writes to shared areas to any components. 154 * 155 * Unrelated insertions are partially serialized on updating the global entry 156 * counter and possibly serialized on colliding bucket or vnode locks. 157 * 158 * IV. Observability 159 * 160 * Note not everything has an explicit dtrace probe nor it should have, thus 161 * some of the one-liners below depend on implementation details. 
162 * 163 * Examples: 164 * 165 * # Check what lookups failed to be handled in a lockless manner. Column 1 is 166 * # line number, column 2 is status code (see cache_fpl_status) 167 * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }' 168 * 169 * # Lengths of names added by binary name 170 * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }' 171 * 172 * # Same as above but only those which exceed 64 characters 173 * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }' 174 * 175 * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what 176 * # path is it 177 * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }' 178 * 179 * V. Limitations and implementation defects 180 * 181 * - since it is possible there is no entry for an open file, tools like 182 * "procstat" may fail to resolve fd -> vnode -> path to anything 183 * - even if a filesystem adds an entry, it may get purged (e.g., due to memory 184 * shortage) in which case the above problem applies 185 * - hardlinks are not tracked, thus if a vnode is reachable in more than one 186 * way, resolving a name may return a different path than the one used to 187 * open it (even if said path is still valid) 188 * - by default entries are not added for newly created files 189 * - adding an entry may need to evict negative entry first, which happens in 2 190 * distinct places (evicting on lookup, adding in a later VOP) making it 191 * impossible to simply reuse it 192 * - there is a simple scheme to evict negative entries as the cache is approaching 193 * its capacity, but it is very unclear if doing so is a good idea to begin with 194 * - vnodes are subject to being recycled even if target inode is left in memory, 195 * which loses the name cache entries when it perhaps should not. in case of tmpfs 196 * names get duplicated -- kept by filesystem itself and namecache separately 197 * - struct namecache has a fixed size and comes in 2 variants, often wasting 198 * space. now hard to replace with malloc due to dependence on SMR, which 199 * requires UMA zones to opt in 200 * - lack of better integration with the kernel also turns nullfs into a layered 201 * filesystem instead of something which can take advantage of caching 202 * 203 * Appendix A: where is the time lost, expanding on paragraph III 204 * 205 * While some care went into optimizing lookups, there is still plenty of 206 * performance left on the table, most notably from single-threaded standpoint. 207 * Below is a woefully incomplete list of changes which can help. Ideas are 208 * mostly sketched out, no claim is made all kinks or prerequisites are laid 209 * out. 210 * 211 * Note there is performance lost all over VFS. 212 * 213 * === SMR-only lookup 214 * 215 * For commonly used ops like stat(2), when the terminal vnode *is* cached, 216 * lockless lookup could refrain from refing/locking the found vnode and 217 * instead return while within the SMR section. Then a call to, say, 218 * vop_stat_smr could do the work (or fail with EAGAIN), finally the result 219 * would be validated with seqc not changing. This would be faster 220 * single-threaded as it dodges atomics and would provide full scalability for 221 * multicore uses. 
This would *not* work for open(2) or other calls which need 222 * the vnode to hang around for the long haul, but would work for aforementioned 223 * stat(2) but also access(2), readlink(2), realpathat(2) and probably more. 224 * 225 * === hotpatching for sdt probes 226 * 227 * They result in *tons* of branches all over with rather regrettable codegen 228 * at times. Removing sdt probes altogether gives over 2% boost in lookup rate. 229 * Reworking the code to patch itself at runtime with asm goto would solve it. 230 * asm goto is fully supported by gcc and clang. 231 * 232 * === copyinstr 233 * 234 * On all architectures it operates one byte at a time, while it could be 235 * word-sized instead thanks to the Mycroft trick. 236 * 237 * API itself is rather pessimal for path lookup, accepting arbitrary sizes and 238 * *optionally* filling in the length parameter. 239 * 240 * Instead a new routine (copyinpath?) could be introduced, demanding a buffer 241 * size which is a multiply of the word (and never zero), with the length 242 * always returned. On top of it the routine could be allowed to transform the 243 * buffer in arbitrary ways, most notably writing past the found length (not to 244 * be confused with writing past buffer size) -- this would allow word-sized 245 * movs while checking for '\0' later. 246 * 247 * === detour through namei 248 * 249 * Currently one suffers being called from namei, which then has to check if 250 * things worked out locklessly. Instead the lockless lookup could be the 251 * actual entry point which calls what is currently namei as a fallback. 252 * 253 * === avoidable branches in cache_can_fplookup 254 * 255 * The cache_fast_lookup_enabled flag check could be hotpatchable (in fact if 256 * this is off, none of fplookup code should execute). 257 * 258 * Both audit and capsicum branches can be combined into one, but it requires 259 * paying off a lot of tech debt first. 260 * 261 * ni_startdir could be indicated with a flag in cn_flags, eliminating the 262 * branch. 263 * 264 * === mount stacks 265 * 266 * Crossing a mount requires checking if perhaps something is mounted on top. 267 * Instead, an additional entry could be added to struct mount with a pointer 268 * to the final mount on the stack. This would be recalculated on each 269 * mount/unmount. 270 * 271 * === root vnodes 272 * 273 * It could become part of the API contract to *always* have a rootvnode set in 274 * mnt_rootvnode. Such vnodes are annotated with VV_ROOT and vnlru would have 275 * to be modified to always skip them. 276 * 277 * === inactive on v_usecount reaching 0 278 * 279 * VOP_NEED_INACTIVE should not exist. Filesystems would indicate need for such 280 * processing with a bit in usecount. 281 * 282 * === v_holdcnt 283 * 284 * Hold count should probably get eliminated, but one can argue it is a useful 285 * feature. Even if so, handling of v_usecount could be decoupled from it -- 286 * vnlru et al would consider the vnode not-freeable if has either hold or 287 * usecount on it. 288 * 289 * This would eliminate 2 atomics. 
290 */ 291 292 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 293 "Name cache"); 294 295 SDT_PROVIDER_DECLARE(vfs); 296 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 297 "struct vnode *"); 298 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *", 299 "struct vnode *"); 300 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 301 "char *"); 302 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *", 303 "const char *"); 304 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *", 305 "struct namecache *", "int", "int"); 306 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 307 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 308 "char *", "struct vnode *"); 309 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 310 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 311 "struct vnode *", "char *"); 312 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 313 "struct vnode *"); 314 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 315 "struct vnode *", "char *"); 316 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 317 "char *"); 318 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", 319 "struct componentname *"); 320 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", 321 "struct componentname *"); 322 SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t"); 323 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int"); 324 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 325 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 326 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 327 "struct vnode *"); 328 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 329 "char *"); 330 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *", 331 "char *"); 332 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t"); 333 334 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); 335 SDT_PROBE_DECLARE(vfs, namei, lookup, entry); 336 SDT_PROBE_DECLARE(vfs, namei, lookup, return); 337 338 static char __read_frequently cache_fast_lookup_enabled = true; 339 340 /* 341 * This structure describes the elements in the cache of recent 342 * names looked up by namei. 343 */ 344 struct negstate { 345 u_char neg_flag; 346 u_char neg_hit; 347 }; 348 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), 349 "the state must fit in a union with a pointer without growing it"); 350 351 struct namecache { 352 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 353 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 354 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */ 355 struct vnode *nc_dvp; /* vnode of parent of name */ 356 union { 357 struct vnode *nu_vp; /* vnode the name refers to */ 358 struct negstate nu_neg;/* negative entry state */ 359 } n_un; 360 u_char nc_flag; /* flag bits */ 361 u_char nc_nlen; /* length of name */ 362 char nc_name[]; /* segment name + nul */ 363 }; 364 365 /* 366 * struct namecache_ts repeats struct namecache layout up to the 367 * nc_nlen member. 368 * struct namecache_ts is used in place of struct namecache when time(s) need 369 * to be stored. 
The nc_dotdottime field is used when a cache entry is mapping 370 * both a non-dotdot directory name plus dotdot for the directory's 371 * parent. 372 * 373 * See below for alignment requirement. 374 */ 375 struct namecache_ts { 376 struct timespec nc_time; /* timespec provided by fs */ 377 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 378 int nc_ticks; /* ticks value when entry was added */ 379 int nc_pad; 380 struct namecache nc_nc; 381 }; 382 383 TAILQ_HEAD(cache_freebatch, namecache); 384 385 /* 386 * At least mips n32 performs 64-bit accesses to timespec as found 387 * in namecache_ts and requires them to be aligned. Since others 388 * may be in the same spot suffer a little bit and enforce the 389 * alignment for everyone. Note this is a nop for 64-bit platforms. 390 */ 391 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) 392 393 /* 394 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the 395 * 4.4 BSD codebase. Later on struct namecache was tweaked to become 396 * smaller and the value was bumped to retain the total size, but it 397 * was never re-evaluated for suitability. A simple test counting 398 * lengths during package building shows that the value of 45 covers 399 * about 86% of all added entries, reaching 99% at 65. 400 * 401 * Regardless of the above, use of dedicated zones instead of malloc may be 402 * inducing additional waste. This may be hard to address as said zones are 403 * tied to VFS SMR. Even if retaining them, the current split should be 404 * re-evaluated. 405 */ 406 #ifdef __LP64__ 407 #define CACHE_PATH_CUTOFF 45 408 #define CACHE_LARGE_PAD 6 409 #else 410 #define CACHE_PATH_CUTOFF 41 411 #define CACHE_LARGE_PAD 2 412 #endif 413 414 #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1) 415 #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE) 416 #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD) 417 #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE) 418 419 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 420 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 421 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 422 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 423 424 #define nc_vp n_un.nu_vp 425 #define nc_neg n_un.nu_neg 426 427 /* 428 * Flags in namecache.nc_flag 429 */ 430 #define NCF_WHITE 0x01 431 #define NCF_ISDOTDOT 0x02 432 #define NCF_TS 0x04 433 #define NCF_DTS 0x08 434 #define NCF_DVDROP 0x10 435 #define NCF_NEGATIVE 0x20 436 #define NCF_INVALID 0x40 437 #define NCF_WIP 0x80 438 439 /* 440 * Flags in negstate.neg_flag 441 */ 442 #define NEG_HOT 0x01 443 444 static bool cache_neg_evict_cond(u_long lnumcache); 445 446 /* 447 * Mark an entry as invalid. 448 * 449 * This is called before it starts getting deconstructed. 450 */ 451 static void 452 cache_ncp_invalidate(struct namecache *ncp) 453 { 454 455 KASSERT((ncp->nc_flag & NCF_INVALID) == 0, 456 ("%s: entry %p already invalid", __func__, ncp)); 457 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); 458 atomic_thread_fence_rel(); 459 } 460 461 /* 462 * Does this entry match the given directory and name? 
463 */ 464 static bool 465 cache_ncp_match(struct namecache *ncp, struct vnode *dvp, 466 struct componentname *cnp) 467 { 468 return (ncp->nc_dvp == dvp && 469 ncp->nc_nlen == cnp->cn_namelen && 470 bcmp(ncp->nc_name, cnp->cn_nameptr, cnp->cn_namelen) == 0); 471 } 472 473 /* 474 * Check whether the entry can be safely used. 475 * 476 * All places which elide locks are supposed to call this after they are 477 * done with reading from an entry. 478 */ 479 #define cache_ncp_canuse(ncp) ({ \ 480 struct namecache *_ncp = (ncp); \ 481 u_char _nc_flag; \ 482 \ 483 atomic_thread_fence_acq(); \ 484 _nc_flag = atomic_load_char(&_ncp->nc_flag); \ 485 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \ 486 }) 487 488 /* 489 * Like the above but also checks NCF_WHITE. 490 */ 491 #define cache_fpl_neg_ncp_canuse(ncp) ({ \ 492 struct namecache *_ncp = (ncp); \ 493 u_char _nc_flag; \ 494 \ 495 atomic_thread_fence_acq(); \ 496 _nc_flag = atomic_load_char(&_ncp->nc_flag); \ 497 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0); \ 498 }) 499 500 VFS_SMR_DECLARE; 501 502 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 503 "Name cache parameters"); 504 505 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ 506 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RD, &ncsize, 0, 507 "Total namecache capacity"); 508 509 u_int ncsizefactor = 2; 510 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0, 511 "Size factor for namecache"); 512 513 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ 514 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0, 515 "Ratio of negative namecache entries"); 516 517 /* 518 * Negative entry % of namecache capacity above which automatic eviction is allowed. 519 * 520 * Check cache_neg_evict_cond for details. 521 */ 522 static u_int ncnegminpct = 3; 523 524 static u_int __read_mostly neg_min; /* the above recomputed against ncsize */ 525 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0, 526 "Negative entry count above which automatic eviction is allowed"); 527 528 /* 529 * Structures associated with name caching. 
530 */ 531 #define NCHHASH(hash) \ 532 (&nchashtbl[(hash) & nchash]) 533 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 534 static u_long __read_mostly nchash; /* size of hash table */ 535 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 536 "Size of namecache hash table"); 537 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 538 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 539 540 struct nchstats nchstats; /* cache effectiveness statistics */ 541 542 static u_int __exclusive_cache_line neg_cycle; 543 544 #define ncneghash 3 545 #define numneglists (ncneghash + 1) 546 547 struct neglist { 548 struct mtx nl_evict_lock; 549 struct mtx nl_lock __aligned(CACHE_LINE_SIZE); 550 TAILQ_HEAD(, namecache) nl_list; 551 TAILQ_HEAD(, namecache) nl_hotlist; 552 u_long nl_hotnum; 553 } __aligned(CACHE_LINE_SIZE); 554 555 static struct neglist neglists[numneglists]; 556 557 static inline struct neglist * 558 NCP2NEGLIST(struct namecache *ncp) 559 { 560 561 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 562 } 563 564 static inline struct negstate * 565 NCP2NEGSTATE(struct namecache *ncp) 566 { 567 568 MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE); 569 return (&ncp->nc_neg); 570 } 571 572 #define numbucketlocks (ncbuckethash + 1) 573 static u_int __read_mostly ncbuckethash; 574 static struct mtx_padalign __read_mostly *bucketlocks; 575 #define HASH2BUCKETLOCK(hash) \ 576 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)])) 577 578 #define numvnodelocks (ncvnodehash + 1) 579 static u_int __read_mostly ncvnodehash; 580 static struct mtx __read_mostly *vnodelocks; 581 static inline struct mtx * 582 VP2VNODELOCK(struct vnode *vp) 583 { 584 585 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 586 } 587 588 /* 589 * Search the hash table for a namecache entry. Either the corresponding bucket 590 * must be locked, or the caller must be in an SMR read section. 
591 */ 592 static struct namecache * 593 cache_ncp_find(struct vnode *dvp, struct componentname *cnp, uint32_t hash) 594 { 595 struct namecache *ncp; 596 597 KASSERT(mtx_owned(HASH2BUCKETLOCK(hash)) || VFS_SMR_ENTERED(), 598 ("%s: hash %u not locked", __func__, hash)); 599 CK_SLIST_FOREACH(ncp, NCHHASH(hash), nc_hash) { 600 if (cache_ncp_match(ncp, dvp, cnp)) 601 break; 602 } 603 return (ncp); 604 } 605 606 static void 607 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 608 { 609 struct namecache_ts *ncp_ts; 610 611 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 612 (tsp == NULL && ticksp == NULL), 613 ("No NCF_TS")); 614 615 if (tsp == NULL) 616 return; 617 618 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 619 *tsp = ncp_ts->nc_time; 620 *ticksp = ncp_ts->nc_ticks; 621 } 622 623 #ifdef DEBUG_CACHE 624 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 625 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 626 "VFS namecache enabled"); 627 #endif 628 629 /* Export size information to userland */ 630 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 631 sizeof(struct namecache), "sizeof(struct namecache)"); 632 633 /* 634 * The new name cache statistics 635 */ 636 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 637 "Name cache statistics"); 638 639 #define STATNODE_ULONG(name, varname, descr) \ 640 SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 641 #define STATNODE_COUNTER(name, varname, descr) \ 642 static COUNTER_U64_DEFINE_EARLY(varname); \ 643 SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \ 644 descr); 645 STATNODE_ULONG(neg, numneg, "Number of negative cache entries"); 646 STATNODE_ULONG(count, numcache, "Number of cache entries"); 647 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held"); 648 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit"); 649 STATNODE_COUNTER(miss, nummiss, "Number of cache misses"); 650 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache"); 651 STATNODE_COUNTER(poszaps, numposzaps, 652 "Number of cache hits (positive) we do not want to cache"); 653 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)"); 654 STATNODE_COUNTER(negzaps, numnegzaps, 655 "Number of cache hits (negative) we do not want to cache"); 656 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)"); 657 /* These count for vn_getcwd(), too. */ 658 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls"); 659 STATNODE_COUNTER(fullpathfail2, numfullpathfail2, 660 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 661 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 662 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls"); 663 STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache"); 664 665 /* 666 * Debug or developer statistics. 
667 */ 668 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 669 "Name cache debugging"); 670 #define DEBUGNODE_ULONG(name, varname, descr) \ 671 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 672 static u_long zap_bucket_relock_success; 673 DEBUGNODE_ULONG(zap_bucket_relock_success, zap_bucket_relock_success, 674 "Number of successful removals after relocking"); 675 static u_long zap_bucket_fail; 676 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, ""); 677 static u_long zap_bucket_fail2; 678 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, ""); 679 static u_long cache_lock_vnodes_cel_3_failures; 680 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures, 681 "Number of times 3-way vnode locking failed"); 682 683 static void cache_zap_locked(struct namecache *ncp); 684 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 685 char **retbuf, size_t *buflen, size_t addend); 686 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 687 char **retbuf, size_t *buflen); 688 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 689 char **retbuf, size_t *len, size_t addend); 690 691 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 692 693 static inline void 694 cache_assert_vlp_locked(struct mtx *vlp) 695 { 696 697 if (vlp != NULL) 698 mtx_assert(vlp, MA_OWNED); 699 } 700 701 static inline void 702 cache_assert_vnode_locked(struct vnode *vp) 703 { 704 struct mtx *vlp; 705 706 vlp = VP2VNODELOCK(vp); 707 cache_assert_vlp_locked(vlp); 708 } 709 710 /* 711 * Directory vnodes with entries are held for two reasons: 712 * 1. make them less of a target for reclamation in vnlru 713 * 2. suffer smaller performance penalty in locked lookup as requeieing is avoided 714 * 715 * It will be feasible to stop doing it altogether if all filesystems start 716 * supporting lockless lookup. 717 */ 718 static void 719 cache_hold_vnode(struct vnode *vp) 720 { 721 722 cache_assert_vnode_locked(vp); 723 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); 724 vhold(vp); 725 counter_u64_add(numcachehv, 1); 726 } 727 728 static void 729 cache_drop_vnode(struct vnode *vp) 730 { 731 732 /* 733 * Called after all locks are dropped, meaning we can't assert 734 * on the state of v_cache_src. 735 */ 736 vdrop(vp); 737 counter_u64_add(numcachehv, -1); 738 } 739 740 /* 741 * UMA zones. 
742 */ 743 static uma_zone_t __read_mostly cache_zone_small; 744 static uma_zone_t __read_mostly cache_zone_small_ts; 745 static uma_zone_t __read_mostly cache_zone_large; 746 static uma_zone_t __read_mostly cache_zone_large_ts; 747 748 char * 749 cache_symlink_alloc(size_t size, int flags) 750 { 751 752 if (size < CACHE_ZONE_SMALL_SIZE) { 753 return (uma_zalloc_smr(cache_zone_small, flags)); 754 } 755 if (size < CACHE_ZONE_LARGE_SIZE) { 756 return (uma_zalloc_smr(cache_zone_large, flags)); 757 } 758 counter_u64_add(symlinktoobig, 1); 759 SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size); 760 return (NULL); 761 } 762 763 void 764 cache_symlink_free(char *string, size_t size) 765 { 766 767 MPASS(string != NULL); 768 KASSERT(size < CACHE_ZONE_LARGE_SIZE, 769 ("%s: size %zu too big", __func__, size)); 770 771 if (size < CACHE_ZONE_SMALL_SIZE) { 772 uma_zfree_smr(cache_zone_small, string); 773 return; 774 } 775 if (size < CACHE_ZONE_LARGE_SIZE) { 776 uma_zfree_smr(cache_zone_large, string); 777 return; 778 } 779 __assert_unreachable(); 780 } 781 782 static struct namecache * 783 cache_alloc_uma(int len, bool ts) 784 { 785 struct namecache_ts *ncp_ts; 786 struct namecache *ncp; 787 788 if (__predict_false(ts)) { 789 if (len <= CACHE_PATH_CUTOFF) 790 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); 791 else 792 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); 793 ncp = &ncp_ts->nc_nc; 794 } else { 795 if (len <= CACHE_PATH_CUTOFF) 796 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); 797 else 798 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); 799 } 800 return (ncp); 801 } 802 803 static void 804 cache_free_uma(struct namecache *ncp) 805 { 806 struct namecache_ts *ncp_ts; 807 808 if (__predict_false(ncp->nc_flag & NCF_TS)) { 809 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 810 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 811 uma_zfree_smr(cache_zone_small_ts, ncp_ts); 812 else 813 uma_zfree_smr(cache_zone_large_ts, ncp_ts); 814 } else { 815 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 816 uma_zfree_smr(cache_zone_small, ncp); 817 else 818 uma_zfree_smr(cache_zone_large, ncp); 819 } 820 } 821 822 static struct namecache * 823 cache_alloc(int len, bool ts) 824 { 825 u_long lnumcache; 826 827 /* 828 * Avoid blowout in namecache entries. 829 * 830 * Bugs: 831 * 1. filesystems may end up trying to add an already existing entry 832 * (for example this can happen after a cache miss during concurrent 833 * lookup), in which case we will call cache_neg_evict despite not 834 * adding anything. 835 * 2. the routine may fail to free anything and no provisions are made 836 * to make it try harder (see the inside for failure modes) 837 * 3. it only ever looks at negative entries. 
838 */ 839 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 840 if (cache_neg_evict_cond(lnumcache)) { 841 lnumcache = atomic_load_long(&numcache); 842 } 843 if (__predict_false(lnumcache >= ncsize)) { 844 atomic_subtract_long(&numcache, 1); 845 counter_u64_add(numdrops, 1); 846 return (NULL); 847 } 848 return (cache_alloc_uma(len, ts)); 849 } 850 851 static void 852 cache_free(struct namecache *ncp) 853 { 854 855 MPASS(ncp != NULL); 856 if ((ncp->nc_flag & NCF_DVDROP) != 0) { 857 cache_drop_vnode(ncp->nc_dvp); 858 } 859 cache_free_uma(ncp); 860 atomic_subtract_long(&numcache, 1); 861 } 862 863 static void 864 cache_free_batch(struct cache_freebatch *batch) 865 { 866 struct namecache *ncp, *nnp; 867 int i; 868 869 i = 0; 870 if (TAILQ_EMPTY(batch)) 871 goto out; 872 TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) { 873 if ((ncp->nc_flag & NCF_DVDROP) != 0) { 874 cache_drop_vnode(ncp->nc_dvp); 875 } 876 cache_free_uma(ncp); 877 i++; 878 } 879 atomic_subtract_long(&numcache, i); 880 out: 881 SDT_PROBE1(vfs, namecache, purge, batch, i); 882 } 883 884 /* 885 * Hashing. 886 * 887 * The code was made to use FNV in 2001 and this choice needs to be revisited. 888 * 889 * Short summary of the difficulty: 890 * The longest name which can be inserted is NAME_MAX characters in length (or 891 * 255 at the time of writing this comment), while majority of names used in 892 * practice are significantly shorter (mostly below 10). More importantly 893 * majority of lookups performed find names are even shorter than that. 894 * 895 * This poses a problem where hashes which do better than FNV past word size 896 * (or so) tend to come with additional overhead when finalizing the result, 897 * making them noticeably slower for the most commonly used range. 898 * 899 * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c 900 * 901 * When looking it up the most time consuming part by a large margin (at least 902 * on amd64) is hashing. Replacing FNV with something which pessimizes short 903 * input would make the slowest part stand out even more. 904 */ 905 906 /* 907 * TODO: With the value stored we can do better than computing the hash based 908 * on the address. 
909 */ 910 static void 911 cache_prehash(struct vnode *vp) 912 { 913 914 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); 915 } 916 917 static uint32_t 918 cache_get_hash(char *name, u_char len, struct vnode *dvp) 919 { 920 921 return (fnv_32_buf(name, len, dvp->v_nchash)); 922 } 923 924 static uint32_t 925 cache_get_hash_iter_start(struct vnode *dvp) 926 { 927 928 return (dvp->v_nchash); 929 } 930 931 static uint32_t 932 cache_get_hash_iter(char c, uint32_t hash) 933 { 934 935 return (fnv_32_buf(&c, 1, hash)); 936 } 937 938 static uint32_t 939 cache_get_hash_iter_finish(uint32_t hash) 940 { 941 942 return (hash); 943 } 944 945 static inline struct nchashhead * 946 NCP2BUCKET(struct namecache *ncp) 947 { 948 uint32_t hash; 949 950 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 951 return (NCHHASH(hash)); 952 } 953 954 static inline struct mtx * 955 NCP2BUCKETLOCK(struct namecache *ncp) 956 { 957 uint32_t hash; 958 959 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 960 return (HASH2BUCKETLOCK(hash)); 961 } 962 963 #ifdef INVARIANTS 964 static void 965 cache_assert_bucket_locked(struct namecache *ncp) 966 { 967 struct mtx *blp; 968 969 blp = NCP2BUCKETLOCK(ncp); 970 mtx_assert(blp, MA_OWNED); 971 } 972 973 static void 974 cache_assert_bucket_unlocked(struct namecache *ncp) 975 { 976 struct mtx *blp; 977 978 blp = NCP2BUCKETLOCK(ncp); 979 mtx_assert(blp, MA_NOTOWNED); 980 } 981 #else 982 #define cache_assert_bucket_locked(x) do { } while (0) 983 #define cache_assert_bucket_unlocked(x) do { } while (0) 984 #endif 985 986 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 987 static void 988 _cache_sort_vnodes(void **p1, void **p2) 989 { 990 void *tmp; 991 992 MPASS(*p1 != NULL || *p2 != NULL); 993 994 if (*p1 > *p2) { 995 tmp = *p2; 996 *p2 = *p1; 997 *p1 = tmp; 998 } 999 } 1000 1001 static void 1002 cache_lock_all_buckets(void) 1003 { 1004 u_int i; 1005 1006 for (i = 0; i < numbucketlocks; i++) 1007 mtx_lock(&bucketlocks[i]); 1008 } 1009 1010 static void 1011 cache_unlock_all_buckets(void) 1012 { 1013 u_int i; 1014 1015 for (i = 0; i < numbucketlocks; i++) 1016 mtx_unlock(&bucketlocks[i]); 1017 } 1018 1019 static void 1020 cache_lock_all_vnodes(void) 1021 { 1022 u_int i; 1023 1024 for (i = 0; i < numvnodelocks; i++) 1025 mtx_lock(&vnodelocks[i]); 1026 } 1027 1028 static void 1029 cache_unlock_all_vnodes(void) 1030 { 1031 u_int i; 1032 1033 for (i = 0; i < numvnodelocks; i++) 1034 mtx_unlock(&vnodelocks[i]); 1035 } 1036 1037 static int 1038 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 1039 { 1040 1041 cache_sort_vnodes(&vlp1, &vlp2); 1042 1043 if (vlp1 != NULL) { 1044 if (!mtx_trylock(vlp1)) 1045 return (EAGAIN); 1046 } 1047 if (!mtx_trylock(vlp2)) { 1048 if (vlp1 != NULL) 1049 mtx_unlock(vlp1); 1050 return (EAGAIN); 1051 } 1052 1053 return (0); 1054 } 1055 1056 static void 1057 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 1058 { 1059 1060 MPASS(vlp1 != NULL || vlp2 != NULL); 1061 MPASS(vlp1 <= vlp2); 1062 1063 if (vlp1 != NULL) 1064 mtx_lock(vlp1); 1065 if (vlp2 != NULL) 1066 mtx_lock(vlp2); 1067 } 1068 1069 static void 1070 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 1071 { 1072 1073 MPASS(vlp1 != NULL || vlp2 != NULL); 1074 1075 if (vlp1 != NULL) 1076 mtx_unlock(vlp1); 1077 if (vlp2 != NULL) 1078 mtx_unlock(vlp2); 1079 } 1080 1081 static int 1082 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 1083 { 1084 struct nchstats snap; 1085 1086 if (req->oldptr == NULL) 1087 return (SYSCTL_OUT(req, 0, 
sizeof(snap))); 1088 1089 snap = nchstats; 1090 snap.ncs_goodhits = counter_u64_fetch(numposhits); 1091 snap.ncs_neghits = counter_u64_fetch(numneghits); 1092 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 1093 counter_u64_fetch(numnegzaps); 1094 snap.ncs_miss = counter_u64_fetch(nummisszap) + 1095 counter_u64_fetch(nummiss); 1096 1097 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 1098 } 1099 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 1100 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 1101 "VFS cache effectiveness statistics"); 1102 1103 static int 1104 sysctl_hitpct(SYSCTL_HANDLER_ARGS) 1105 { 1106 long poshits, neghits, miss, total; 1107 long pct; 1108 1109 poshits = counter_u64_fetch(numposhits); 1110 neghits = counter_u64_fetch(numneghits); 1111 miss = counter_u64_fetch(nummiss); 1112 total = poshits + neghits + miss; 1113 1114 pct = 0; 1115 if (total != 0) 1116 pct = ((poshits + neghits) * 100) / total; 1117 return (sysctl_handle_int(oidp, 0, pct, req)); 1118 } 1119 SYSCTL_PROC(_vfs_cache_stats, OID_AUTO, hitpct, 1120 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_hitpct, 1121 "I", "Percentage of hits"); 1122 1123 static void 1124 cache_recalc_neg_min(void) 1125 { 1126 1127 neg_min = (ncsize * ncnegminpct) / 100; 1128 } 1129 1130 static int 1131 sysctl_negminpct(SYSCTL_HANDLER_ARGS) 1132 { 1133 u_int val; 1134 int error; 1135 1136 val = ncnegminpct; 1137 error = sysctl_handle_int(oidp, &val, 0, req); 1138 if (error != 0 || req->newptr == NULL) 1139 return (error); 1140 1141 if (val == ncnegminpct) 1142 return (0); 1143 if (val < 0 || val > 99) 1144 return (EINVAL); 1145 ncnegminpct = val; 1146 cache_recalc_neg_min(); 1147 return (0); 1148 } 1149 1150 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct, 1151 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct, 1152 "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed"); 1153 1154 #ifdef DEBUG_CACHE 1155 /* 1156 * Grab an atomic snapshot of the name cache hash chain lengths 1157 */ 1158 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 1159 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 1160 "hash table stats"); 1161 1162 static int 1163 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 1164 { 1165 struct nchashhead *ncpp; 1166 struct namecache *ncp; 1167 int i, error, n_nchash, *cntbuf; 1168 1169 retry: 1170 n_nchash = nchash + 1; /* nchash is max index, not count */ 1171 if (req->oldptr == NULL) 1172 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 1173 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 1174 cache_lock_all_buckets(); 1175 if (n_nchash != nchash + 1) { 1176 cache_unlock_all_buckets(); 1177 free(cntbuf, M_TEMP); 1178 goto retry; 1179 } 1180 /* Scan hash tables counting entries */ 1181 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 1182 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) 1183 cntbuf[i]++; 1184 cache_unlock_all_buckets(); 1185 for (error = 0, i = 0; i < n_nchash; i++) 1186 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 1187 break; 1188 free(cntbuf, M_TEMP); 1189 return (error); 1190 } 1191 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 1192 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 1193 "nchash chain lengths"); 1194 1195 static int 1196 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 1197 { 1198 int error; 1199 struct nchashhead *ncpp; 1200 struct namecache *ncp; 1201 int n_nchash; 1202 int count, maxlength, used, pct; 1203 1204 if 
(!req->oldptr) 1205 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 1206 1207 cache_lock_all_buckets(); 1208 n_nchash = nchash + 1; /* nchash is max index, not count */ 1209 used = 0; 1210 maxlength = 0; 1211 1212 /* Scan hash tables for applicable entries */ 1213 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 1214 count = 0; 1215 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { 1216 count++; 1217 } 1218 if (count) 1219 used++; 1220 if (maxlength < count) 1221 maxlength = count; 1222 } 1223 n_nchash = nchash + 1; 1224 cache_unlock_all_buckets(); 1225 pct = (used * 100) / (n_nchash / 100); 1226 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 1227 if (error) 1228 return (error); 1229 error = SYSCTL_OUT(req, &used, sizeof(used)); 1230 if (error) 1231 return (error); 1232 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 1233 if (error) 1234 return (error); 1235 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 1236 if (error) 1237 return (error); 1238 return (0); 1239 } 1240 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 1241 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 1242 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 1243 #endif 1244 1245 /* 1246 * Negative entries management 1247 * 1248 * Various workloads create plenty of negative entries and barely use them 1249 * afterwards. Moreover malicious users can keep performing bogus lookups 1250 * adding even more entries. For example "make tinderbox" as of writing this 1251 * comment ends up with 2.6M namecache entries in total, 1.2M of which are 1252 * negative. 1253 * 1254 * As such, a rather aggressive eviction method is needed. The currently 1255 * employed method is a placeholder. 1256 * 1257 * Entries are split over numneglists separate lists, each of which is further 1258 * split into hot and cold entries. Entries get promoted after getting a hit. 1259 * Eviction happens on addition of new entry. 
1260 */ 1261 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1262 "Name cache negative entry statistics"); 1263 1264 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0, 1265 "Number of negative cache entries"); 1266 1267 static COUNTER_U64_DEFINE_EARLY(neg_created); 1268 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created, 1269 "Number of created negative entries"); 1270 1271 static COUNTER_U64_DEFINE_EARLY(neg_evicted); 1272 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted, 1273 "Number of evicted negative entries"); 1274 1275 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty); 1276 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD, 1277 &neg_evict_skipped_empty, 1278 "Number of times evicting failed due to lack of entries"); 1279 1280 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed); 1281 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD, 1282 &neg_evict_skipped_missed, 1283 "Number of times evicting failed due to target entry disappearing"); 1284 1285 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended); 1286 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD, 1287 &neg_evict_skipped_contended, 1288 "Number of times evicting failed due to contention"); 1289 1290 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits, 1291 "Number of cache hits (negative)"); 1292 1293 static int 1294 sysctl_neg_hot(SYSCTL_HANDLER_ARGS) 1295 { 1296 int i, out; 1297 1298 out = 0; 1299 for (i = 0; i < numneglists; i++) 1300 out += neglists[i].nl_hotnum; 1301 1302 return (SYSCTL_OUT(req, &out, sizeof(out))); 1303 } 1304 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD | 1305 CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I", 1306 "Number of hot negative entries"); 1307 1308 static void 1309 cache_neg_init(struct namecache *ncp) 1310 { 1311 struct negstate *ns; 1312 1313 ncp->nc_flag |= NCF_NEGATIVE; 1314 ns = NCP2NEGSTATE(ncp); 1315 ns->neg_flag = 0; 1316 ns->neg_hit = 0; 1317 counter_u64_add(neg_created, 1); 1318 } 1319 1320 #define CACHE_NEG_PROMOTION_THRESH 2 1321 1322 static bool 1323 cache_neg_hit_prep(struct namecache *ncp) 1324 { 1325 struct negstate *ns; 1326 u_char n; 1327 1328 ns = NCP2NEGSTATE(ncp); 1329 n = atomic_load_char(&ns->neg_hit); 1330 for (;;) { 1331 if (n >= CACHE_NEG_PROMOTION_THRESH) 1332 return (false); 1333 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1)) 1334 break; 1335 } 1336 return (n + 1 == CACHE_NEG_PROMOTION_THRESH); 1337 } 1338 1339 /* 1340 * Nothing to do here but it is provided for completeness as some 1341 * cache_neg_hit_prep callers may end up returning without even 1342 * trying to promote. 1343 */ 1344 #define cache_neg_hit_abort(ncp) do { } while (0) 1345 1346 static void 1347 cache_neg_hit_finish(struct namecache *ncp) 1348 { 1349 1350 SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name); 1351 counter_u64_add(numneghits, 1); 1352 } 1353 1354 /* 1355 * Move a negative entry to the hot list. 
1356 */ 1357 static void 1358 cache_neg_promote_locked(struct namecache *ncp) 1359 { 1360 struct neglist *nl; 1361 struct negstate *ns; 1362 1363 ns = NCP2NEGSTATE(ncp); 1364 nl = NCP2NEGLIST(ncp); 1365 mtx_assert(&nl->nl_lock, MA_OWNED); 1366 if ((ns->neg_flag & NEG_HOT) == 0) { 1367 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 1368 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst); 1369 nl->nl_hotnum++; 1370 ns->neg_flag |= NEG_HOT; 1371 } 1372 } 1373 1374 /* 1375 * Move a hot negative entry to the cold list. 1376 */ 1377 static void 1378 cache_neg_demote_locked(struct namecache *ncp) 1379 { 1380 struct neglist *nl; 1381 struct negstate *ns; 1382 1383 ns = NCP2NEGSTATE(ncp); 1384 nl = NCP2NEGLIST(ncp); 1385 mtx_assert(&nl->nl_lock, MA_OWNED); 1386 MPASS(ns->neg_flag & NEG_HOT); 1387 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 1388 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 1389 nl->nl_hotnum--; 1390 ns->neg_flag &= ~NEG_HOT; 1391 atomic_store_char(&ns->neg_hit, 0); 1392 } 1393 1394 /* 1395 * Move a negative entry to the hot list if it matches the lookup. 1396 * 1397 * We have to take locks, but they may be contended and in the worst 1398 * case we may need to go off CPU. We don't want to spin within the 1399 * smr section and we can't block with it. Exiting the section means 1400 * the found entry could have been evicted. We are going to look it 1401 * up again. 1402 */ 1403 static bool 1404 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp, 1405 struct namecache *oncp, uint32_t hash) 1406 { 1407 struct namecache *ncp; 1408 struct neglist *nl; 1409 u_char nc_flag; 1410 1411 nl = NCP2NEGLIST(oncp); 1412 1413 mtx_lock(&nl->nl_lock); 1414 /* 1415 * For hash iteration. 1416 */ 1417 vfs_smr_enter(); 1418 1419 /* 1420 * Avoid all surprises by only succeeding if we got the same entry and 1421 * bailing completely otherwise. 1422 * XXX There are no provisions to keep the vnode around, meaning we may 1423 * end up promoting a negative entry for a *new* vnode and returning 1424 * ENOENT on its account. This is the error we want to return anyway 1425 * and promotion is harmless. 1426 * 1427 * In particular at this point there can be a new ncp which matches the 1428 * search but hashes to a different neglist. 1429 */ 1430 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1431 if (ncp == oncp) 1432 break; 1433 } 1434 1435 /* 1436 * No match to begin with. 1437 */ 1438 if (__predict_false(ncp == NULL)) { 1439 goto out_abort; 1440 } 1441 1442 /* 1443 * The newly found entry may be something different... 1444 */ 1445 if (!cache_ncp_match(ncp, dvp, cnp)) { 1446 goto out_abort; 1447 } 1448 1449 /* 1450 * ... and not even negative. 
1451 */ 1452 nc_flag = atomic_load_char(&ncp->nc_flag); 1453 if ((nc_flag & NCF_NEGATIVE) == 0) { 1454 goto out_abort; 1455 } 1456 1457 if (!cache_ncp_canuse(ncp)) { 1458 goto out_abort; 1459 } 1460 1461 cache_neg_promote_locked(ncp); 1462 cache_neg_hit_finish(ncp); 1463 vfs_smr_exit(); 1464 mtx_unlock(&nl->nl_lock); 1465 return (true); 1466 out_abort: 1467 vfs_smr_exit(); 1468 mtx_unlock(&nl->nl_lock); 1469 return (false); 1470 } 1471 1472 static void 1473 cache_neg_promote(struct namecache *ncp) 1474 { 1475 struct neglist *nl; 1476 1477 nl = NCP2NEGLIST(ncp); 1478 mtx_lock(&nl->nl_lock); 1479 cache_neg_promote_locked(ncp); 1480 mtx_unlock(&nl->nl_lock); 1481 } 1482 1483 static void 1484 cache_neg_insert(struct namecache *ncp) 1485 { 1486 struct neglist *nl; 1487 1488 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1489 cache_assert_bucket_locked(ncp); 1490 nl = NCP2NEGLIST(ncp); 1491 mtx_lock(&nl->nl_lock); 1492 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 1493 mtx_unlock(&nl->nl_lock); 1494 atomic_add_long(&numneg, 1); 1495 } 1496 1497 static void 1498 cache_neg_remove(struct namecache *ncp) 1499 { 1500 struct neglist *nl; 1501 struct negstate *ns; 1502 1503 cache_assert_bucket_locked(ncp); 1504 nl = NCP2NEGLIST(ncp); 1505 ns = NCP2NEGSTATE(ncp); 1506 mtx_lock(&nl->nl_lock); 1507 if ((ns->neg_flag & NEG_HOT) != 0) { 1508 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 1509 nl->nl_hotnum--; 1510 } else { 1511 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 1512 } 1513 mtx_unlock(&nl->nl_lock); 1514 atomic_subtract_long(&numneg, 1); 1515 } 1516 1517 static struct neglist * 1518 cache_neg_evict_select_list(void) 1519 { 1520 struct neglist *nl; 1521 u_int c; 1522 1523 c = atomic_fetchadd_int(&neg_cycle, 1) + 1; 1524 nl = &neglists[c % numneglists]; 1525 if (!mtx_trylock(&nl->nl_evict_lock)) { 1526 counter_u64_add(neg_evict_skipped_contended, 1); 1527 return (NULL); 1528 } 1529 return (nl); 1530 } 1531 1532 static struct namecache * 1533 cache_neg_evict_select_entry(struct neglist *nl) 1534 { 1535 struct namecache *ncp, *lncp; 1536 struct negstate *ns, *lns; 1537 int i; 1538 1539 mtx_assert(&nl->nl_evict_lock, MA_OWNED); 1540 mtx_assert(&nl->nl_lock, MA_OWNED); 1541 ncp = TAILQ_FIRST(&nl->nl_list); 1542 if (ncp == NULL) 1543 return (NULL); 1544 lncp = ncp; 1545 lns = NCP2NEGSTATE(lncp); 1546 for (i = 1; i < 4; i++) { 1547 ncp = TAILQ_NEXT(ncp, nc_dst); 1548 if (ncp == NULL) 1549 break; 1550 ns = NCP2NEGSTATE(ncp); 1551 if (ns->neg_hit < lns->neg_hit) { 1552 lncp = ncp; 1553 lns = ns; 1554 } 1555 } 1556 return (lncp); 1557 } 1558 1559 static bool 1560 cache_neg_evict(void) 1561 { 1562 struct namecache *ncp, *ncp2; 1563 struct neglist *nl; 1564 struct vnode *dvp; 1565 struct mtx *dvlp; 1566 struct mtx *blp; 1567 uint32_t hash; 1568 u_char nlen; 1569 bool evicted; 1570 1571 nl = cache_neg_evict_select_list(); 1572 if (nl == NULL) { 1573 return (false); 1574 } 1575 1576 mtx_lock(&nl->nl_lock); 1577 ncp = TAILQ_FIRST(&nl->nl_hotlist); 1578 if (ncp != NULL) { 1579 cache_neg_demote_locked(ncp); 1580 } 1581 ncp = cache_neg_evict_select_entry(nl); 1582 if (ncp == NULL) { 1583 counter_u64_add(neg_evict_skipped_empty, 1); 1584 mtx_unlock(&nl->nl_lock); 1585 mtx_unlock(&nl->nl_evict_lock); 1586 return (false); 1587 } 1588 nlen = ncp->nc_nlen; 1589 dvp = ncp->nc_dvp; 1590 hash = cache_get_hash(ncp->nc_name, nlen, dvp); 1591 dvlp = VP2VNODELOCK(dvp); 1592 blp = HASH2BUCKETLOCK(hash); 1593 mtx_unlock(&nl->nl_lock); 1594 mtx_unlock(&nl->nl_evict_lock); 1595 mtx_lock(dvlp); 1596 mtx_lock(blp); 1597 /* 1598 * Note that since all 
locks were dropped above, the entry may be 1599 * gone or reallocated to be something else. 1600 */ 1601 CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) { 1602 if (ncp2 == ncp && ncp2->nc_dvp == dvp && 1603 ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0) 1604 break; 1605 } 1606 if (ncp2 == NULL) { 1607 counter_u64_add(neg_evict_skipped_missed, 1); 1608 ncp = NULL; 1609 evicted = false; 1610 } else { 1611 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp)); 1612 MPASS(blp == NCP2BUCKETLOCK(ncp)); 1613 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp, 1614 ncp->nc_name); 1615 cache_zap_locked(ncp); 1616 counter_u64_add(neg_evicted, 1); 1617 evicted = true; 1618 } 1619 mtx_unlock(blp); 1620 mtx_unlock(dvlp); 1621 if (ncp != NULL) 1622 cache_free(ncp); 1623 return (evicted); 1624 } 1625 1626 /* 1627 * Maybe evict a negative entry to create more room. 1628 * 1629 * The ncnegfactor parameter limits what fraction of the total count 1630 * can comprise of negative entries. However, if the cache is just 1631 * warming up this leads to excessive evictions. As such, ncnegminpct 1632 * (recomputed to neg_min) dictates whether the above should be 1633 * applied. 1634 * 1635 * Try evicting if the cache is close to full capacity regardless of 1636 * other considerations. 1637 */ 1638 static bool 1639 cache_neg_evict_cond(u_long lnumcache) 1640 { 1641 u_long lnumneg; 1642 1643 if (ncsize - 1000 < lnumcache) 1644 goto out_evict; 1645 lnumneg = atomic_load_long(&numneg); 1646 if (lnumneg < neg_min) 1647 return (false); 1648 if (lnumneg * ncnegfactor < lnumcache) 1649 return (false); 1650 out_evict: 1651 return (cache_neg_evict()); 1652 } 1653 1654 /* 1655 * cache_zap_locked(): 1656 * 1657 * Removes a namecache entry from cache, whether it contains an actual 1658 * pointer to a vnode or if it is just a negative cache entry. 
1659 */ 1660 static void 1661 cache_zap_locked(struct namecache *ncp) 1662 { 1663 struct nchashhead *ncpp; 1664 struct vnode *dvp, *vp; 1665 1666 dvp = ncp->nc_dvp; 1667 vp = ncp->nc_vp; 1668 1669 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1670 cache_assert_vnode_locked(vp); 1671 cache_assert_vnode_locked(dvp); 1672 cache_assert_bucket_locked(ncp); 1673 1674 cache_ncp_invalidate(ncp); 1675 1676 ncpp = NCP2BUCKET(ncp); 1677 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); 1678 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 1679 SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp); 1680 TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst); 1681 if (ncp == vp->v_cache_dd) { 1682 atomic_store_ptr(&vp->v_cache_dd, NULL); 1683 } 1684 } else { 1685 SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name); 1686 cache_neg_remove(ncp); 1687 } 1688 if (ncp->nc_flag & NCF_ISDOTDOT) { 1689 if (ncp == dvp->v_cache_dd) { 1690 atomic_store_ptr(&dvp->v_cache_dd, NULL); 1691 } 1692 } else { 1693 LIST_REMOVE(ncp, nc_src); 1694 if (LIST_EMPTY(&dvp->v_cache_src)) { 1695 ncp->nc_flag |= NCF_DVDROP; 1696 } 1697 } 1698 } 1699 1700 static void 1701 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 1702 { 1703 struct mtx *blp; 1704 1705 MPASS(ncp->nc_dvp == vp); 1706 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1707 cache_assert_vnode_locked(vp); 1708 1709 blp = NCP2BUCKETLOCK(ncp); 1710 mtx_lock(blp); 1711 cache_zap_locked(ncp); 1712 mtx_unlock(blp); 1713 } 1714 1715 static bool 1716 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 1717 struct mtx **vlpp) 1718 { 1719 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1720 struct mtx *blp; 1721 1722 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1723 cache_assert_vnode_locked(vp); 1724 1725 if (ncp->nc_flag & NCF_NEGATIVE) { 1726 if (*vlpp != NULL) { 1727 mtx_unlock(*vlpp); 1728 *vlpp = NULL; 1729 } 1730 cache_zap_negative_locked_vnode_kl(ncp, vp); 1731 return (true); 1732 } 1733 1734 pvlp = VP2VNODELOCK(vp); 1735 blp = NCP2BUCKETLOCK(ncp); 1736 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1737 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1738 1739 if (*vlpp == vlp1 || *vlpp == vlp2) { 1740 to_unlock = *vlpp; 1741 *vlpp = NULL; 1742 } else { 1743 if (*vlpp != NULL) { 1744 mtx_unlock(*vlpp); 1745 *vlpp = NULL; 1746 } 1747 cache_sort_vnodes(&vlp1, &vlp2); 1748 if (vlp1 == pvlp) { 1749 mtx_lock(vlp2); 1750 to_unlock = vlp2; 1751 } else { 1752 if (!mtx_trylock(vlp1)) 1753 goto out_relock; 1754 to_unlock = vlp1; 1755 } 1756 } 1757 mtx_lock(blp); 1758 cache_zap_locked(ncp); 1759 mtx_unlock(blp); 1760 if (to_unlock != NULL) 1761 mtx_unlock(to_unlock); 1762 return (true); 1763 1764 out_relock: 1765 mtx_unlock(vlp2); 1766 mtx_lock(vlp1); 1767 mtx_lock(vlp2); 1768 MPASS(*vlpp == NULL); 1769 *vlpp = vlp1; 1770 return (false); 1771 } 1772 1773 /* 1774 * If trylocking failed we can get here. We know enough to take all needed locks 1775 * in the right order and re-lookup the entry. 
1776 */ 1777 static int 1778 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1779 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1780 struct mtx *blp) 1781 { 1782 struct namecache *rncp; 1783 struct mtx *rvlp; 1784 1785 cache_assert_bucket_unlocked(ncp); 1786 1787 cache_sort_vnodes(&dvlp, &vlp); 1788 cache_lock_vnodes(dvlp, vlp); 1789 mtx_lock(blp); 1790 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1791 if (rncp == ncp && cache_ncp_match(rncp, dvp, cnp)) 1792 break; 1793 } 1794 if (rncp == NULL) 1795 goto out_mismatch; 1796 1797 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1798 rvlp = VP2VNODELOCK(rncp->nc_vp); 1799 else 1800 rvlp = NULL; 1801 if (rvlp != vlp) 1802 goto out_mismatch; 1803 1804 cache_zap_locked(rncp); 1805 mtx_unlock(blp); 1806 cache_unlock_vnodes(dvlp, vlp); 1807 atomic_add_long(&zap_bucket_relock_success, 1); 1808 return (0); 1809 1810 out_mismatch: 1811 mtx_unlock(blp); 1812 cache_unlock_vnodes(dvlp, vlp); 1813 return (EAGAIN); 1814 } 1815 1816 static int __noinline 1817 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp, 1818 uint32_t hash, struct mtx *blp) 1819 { 1820 struct mtx *dvlp, *vlp; 1821 struct vnode *dvp; 1822 1823 cache_assert_bucket_locked(ncp); 1824 1825 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1826 vlp = NULL; 1827 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1828 vlp = VP2VNODELOCK(ncp->nc_vp); 1829 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1830 cache_zap_locked(ncp); 1831 mtx_unlock(blp); 1832 cache_unlock_vnodes(dvlp, vlp); 1833 return (0); 1834 } 1835 1836 dvp = ncp->nc_dvp; 1837 mtx_unlock(blp); 1838 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1839 } 1840 1841 static __noinline int 1842 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) 1843 { 1844 struct namecache *ncp; 1845 struct mtx *blp; 1846 struct mtx *dvlp, *dvlp2; 1847 uint32_t hash; 1848 int error; 1849 1850 if (cnp->cn_namelen == 2 && 1851 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1852 dvlp = VP2VNODELOCK(dvp); 1853 dvlp2 = NULL; 1854 mtx_lock(dvlp); 1855 retry_dotdot: 1856 ncp = dvp->v_cache_dd; 1857 if (ncp == NULL) { 1858 mtx_unlock(dvlp); 1859 if (dvlp2 != NULL) 1860 mtx_unlock(dvlp2); 1861 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1862 return (0); 1863 } 1864 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1865 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) 1866 goto retry_dotdot; 1867 MPASS(dvp->v_cache_dd == NULL); 1868 mtx_unlock(dvlp); 1869 if (dvlp2 != NULL) 1870 mtx_unlock(dvlp2); 1871 cache_free(ncp); 1872 } else { 1873 atomic_store_ptr(&dvp->v_cache_dd, NULL); 1874 mtx_unlock(dvlp); 1875 if (dvlp2 != NULL) 1876 mtx_unlock(dvlp2); 1877 } 1878 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1879 return (1); 1880 } 1881 1882 /* 1883 * XXX note that access here is completely unlocked with no provisions 1884 * to keep the hash allocated. If one is sufficiently unlucky a 1885 * parallel cache resize can reallocate the hash, unmap backing pages 1886 * and cause the empty check below to fault. 1887 * 1888 * Fixing this has epsilon priority, but can be done with no overhead 1889 * for this codepath with sufficient effort. 
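 *
 * Note the CK_SLIST_EMPTY check below is merely an optimization to avoid
 * taking the bucket lock for chains with nothing in them.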
1890 */ 1891 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1892 blp = HASH2BUCKETLOCK(hash); 1893 retry: 1894 if (CK_SLIST_EMPTY(NCHHASH(hash))) 1895 goto out_no_entry; 1896 1897 mtx_lock(blp); 1898 ncp = cache_ncp_find(dvp, cnp, hash); 1899 if (ncp == NULL) { 1900 mtx_unlock(blp); 1901 goto out_no_entry; 1902 } 1903 1904 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1905 if (__predict_false(error != 0)) { 1906 atomic_add_long(&zap_bucket_fail, 1); 1907 goto retry; 1908 } 1909 counter_u64_add(numposzaps, 1); 1910 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1911 cache_free(ncp); 1912 return (1); 1913 out_no_entry: 1914 counter_u64_add(nummisszap, 1); 1915 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1916 return (0); 1917 } 1918 1919 static int __noinline 1920 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1921 struct timespec *tsp, int *ticksp) 1922 { 1923 int ltype; 1924 1925 *vpp = dvp; 1926 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1927 if (tsp != NULL) 1928 timespecclear(tsp); 1929 if (ticksp != NULL) 1930 *ticksp = ticks; 1931 vrefact(*vpp); 1932 /* 1933 * When we lookup "." we still can be asked to lock it 1934 * differently... 1935 */ 1936 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1937 if (ltype != VOP_ISLOCKED(*vpp)) { 1938 if (ltype == LK_EXCLUSIVE) { 1939 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1940 if (VN_IS_DOOMED((*vpp))) { 1941 /* forced unmount */ 1942 vrele(*vpp); 1943 *vpp = NULL; 1944 return (ENOENT); 1945 } 1946 } else 1947 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1948 } 1949 return (-1); 1950 } 1951 1952 static int __noinline 1953 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1954 struct timespec *tsp, int *ticksp) 1955 { 1956 struct namecache_ts *ncp_ts; 1957 struct namecache *ncp; 1958 struct mtx *dvlp; 1959 enum vgetstate vs; 1960 int error, ltype; 1961 bool whiteout; 1962 1963 MPASS((cnp->cn_flags & ISDOTDOT) != 0); 1964 1965 if ((cnp->cn_flags & MAKEENTRY) == 0) { 1966 cache_remove_cnp(dvp, cnp); 1967 return (0); 1968 } 1969 1970 retry: 1971 dvlp = VP2VNODELOCK(dvp); 1972 mtx_lock(dvlp); 1973 ncp = dvp->v_cache_dd; 1974 if (ncp == NULL) { 1975 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, ".."); 1976 mtx_unlock(dvlp); 1977 return (0); 1978 } 1979 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1980 if (ncp->nc_flag & NCF_NEGATIVE) 1981 *vpp = NULL; 1982 else 1983 *vpp = ncp->nc_vp; 1984 } else 1985 *vpp = ncp->nc_dvp; 1986 if (*vpp == NULL) 1987 goto negative_success; 1988 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); 1989 cache_out_ts(ncp, tsp, ticksp); 1990 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1991 NCF_DTS && tsp != NULL) { 1992 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1993 *tsp = ncp_ts->nc_dotdottime; 1994 } 1995 1996 MPASS(dvp != *vpp); 1997 ltype = VOP_ISLOCKED(dvp); 1998 VOP_UNLOCK(dvp); 1999 vs = vget_prep(*vpp); 2000 mtx_unlock(dvlp); 2001 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 2002 vn_lock(dvp, ltype | LK_RETRY); 2003 if (VN_IS_DOOMED(dvp)) { 2004 if (error == 0) 2005 vput(*vpp); 2006 *vpp = NULL; 2007 return (ENOENT); 2008 } 2009 if (error) { 2010 *vpp = NULL; 2011 goto retry; 2012 } 2013 return (-1); 2014 negative_success: 2015 if (__predict_false(cnp->cn_nameiop == CREATE)) { 2016 if (cnp->cn_flags & ISLASTCN) { 2017 counter_u64_add(numnegzaps, 1); 2018 cache_zap_negative_locked_vnode_kl(ncp, dvp); 2019 mtx_unlock(dvlp); 2020 cache_free(ncp); 2021 return (0); 2022 } 2023 } 2024 2025 
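	/*
	 * Negative hit.  Latch the whiteout status and sort out hot list
	 * promotion while the vnode lock is still held; ISWHITEOUT is
	 * applied to cn_flags only after the lock is dropped.
	 */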
whiteout = (ncp->nc_flag & NCF_WHITE); 2026 cache_out_ts(ncp, tsp, ticksp); 2027 if (cache_neg_hit_prep(ncp)) 2028 cache_neg_promote(ncp); 2029 else 2030 cache_neg_hit_finish(ncp); 2031 mtx_unlock(dvlp); 2032 if (whiteout) 2033 cnp->cn_flags |= ISWHITEOUT; 2034 return (ENOENT); 2035 } 2036 2037 /** 2038 * Lookup a name in the name cache 2039 * 2040 * # Arguments 2041 * 2042 * - dvp: Parent directory in which to search. 2043 * - vpp: Return argument. Will contain desired vnode on cache hit. 2044 * - cnp: Parameters of the name search. The most interesting bits of 2045 * the cn_flags field have the following meanings: 2046 * - MAKEENTRY: If clear, free an entry from the cache rather than look 2047 * it up. 2048 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 2049 * - tsp: Return storage for cache timestamp. On a successful (positive 2050 * or negative) lookup, tsp will be filled with any timespec that 2051 * was stored when this cache entry was created. However, it will 2052 * be clear for "." entries. 2053 * - ticks: Return storage for alternate cache timestamp. On a successful 2054 * (positive or negative) lookup, it will contain the ticks value 2055 * that was current when the cache entry was created, unless cnp 2056 * was ".". 2057 * 2058 * Either both tsp and ticks have to be provided or neither of them. 2059 * 2060 * # Returns 2061 * 2062 * - -1: A positive cache hit. vpp will contain the desired vnode. 2063 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 2064 * to a forced unmount. vpp will not be modified. If the entry 2065 * is a whiteout, then the ISWHITEOUT flag will be set in 2066 * cnp->cn_flags. 2067 * - 0: A cache miss. vpp will not be modified. 2068 * 2069 * # Locking 2070 * 2071 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 2072 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 2073 * lock is not recursively acquired. 2074 */ 2075 static int __noinline 2076 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 2077 struct timespec *tsp, int *ticksp) 2078 { 2079 struct namecache *ncp; 2080 struct mtx *blp; 2081 uint32_t hash; 2082 enum vgetstate vs; 2083 int error; 2084 bool whiteout; 2085 2086 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 2087 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0); 2088 2089 retry: 2090 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2091 blp = HASH2BUCKETLOCK(hash); 2092 mtx_lock(blp); 2093 2094 ncp = cache_ncp_find(dvp, cnp, hash); 2095 if (__predict_false(ncp == NULL)) { 2096 mtx_unlock(blp); 2097 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); 2098 counter_u64_add(nummiss, 1); 2099 return (0); 2100 } 2101 2102 if (ncp->nc_flag & NCF_NEGATIVE) 2103 goto negative_success; 2104 2105 counter_u64_add(numposhits, 1); 2106 *vpp = ncp->nc_vp; 2107 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 2108 cache_out_ts(ncp, tsp, ticksp); 2109 MPASS(dvp != *vpp); 2110 vs = vget_prep(*vpp); 2111 mtx_unlock(blp); 2112 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 2113 if (error) { 2114 *vpp = NULL; 2115 goto retry; 2116 } 2117 return (-1); 2118 negative_success: 2119 /* 2120 * We don't get here with regular lookup apart from corner cases. 
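 * The lockless cache_lookup() handles negative hits on its own and only
 * punts here for cases like CREATE on the last component, where the
 * entry needs to be zapped under locks.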
2121 */ 2122 if (__predict_true(cnp->cn_nameiop == CREATE)) { 2123 if (cnp->cn_flags & ISLASTCN) { 2124 counter_u64_add(numnegzaps, 1); 2125 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 2126 if (__predict_false(error != 0)) { 2127 atomic_add_long(&zap_bucket_fail2, 1); 2128 goto retry; 2129 } 2130 cache_free(ncp); 2131 return (0); 2132 } 2133 } 2134 2135 whiteout = (ncp->nc_flag & NCF_WHITE); 2136 cache_out_ts(ncp, tsp, ticksp); 2137 if (cache_neg_hit_prep(ncp)) 2138 cache_neg_promote(ncp); 2139 else 2140 cache_neg_hit_finish(ncp); 2141 mtx_unlock(blp); 2142 if (whiteout) 2143 cnp->cn_flags |= ISWHITEOUT; 2144 return (ENOENT); 2145 } 2146 2147 int 2148 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 2149 struct timespec *tsp, int *ticksp) 2150 { 2151 struct namecache *ncp; 2152 uint32_t hash; 2153 enum vgetstate vs; 2154 int error; 2155 bool whiteout, neg_promote; 2156 u_short nc_flag; 2157 2158 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); 2159 2160 #ifdef DEBUG_CACHE 2161 if (__predict_false(!doingcache)) { 2162 cnp->cn_flags &= ~MAKEENTRY; 2163 return (0); 2164 } 2165 #endif 2166 2167 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2168 if (cnp->cn_namelen == 1) 2169 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 2170 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') 2171 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); 2172 } 2173 2174 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 2175 2176 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { 2177 cache_remove_cnp(dvp, cnp); 2178 return (0); 2179 } 2180 2181 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2182 vfs_smr_enter(); 2183 2184 ncp = cache_ncp_find(dvp, cnp, hash); 2185 if (__predict_false(ncp == NULL)) { 2186 vfs_smr_exit(); 2187 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); 2188 counter_u64_add(nummiss, 1); 2189 return (0); 2190 } 2191 2192 nc_flag = atomic_load_char(&ncp->nc_flag); 2193 if (nc_flag & NCF_NEGATIVE) 2194 goto negative_success; 2195 2196 counter_u64_add(numposhits, 1); 2197 *vpp = ncp->nc_vp; 2198 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 2199 cache_out_ts(ncp, tsp, ticksp); 2200 MPASS(dvp != *vpp); 2201 if (!cache_ncp_canuse(ncp)) { 2202 vfs_smr_exit(); 2203 *vpp = NULL; 2204 goto out_fallback; 2205 } 2206 vs = vget_prep_smr(*vpp); 2207 vfs_smr_exit(); 2208 if (__predict_false(vs == VGET_NONE)) { 2209 *vpp = NULL; 2210 goto out_fallback; 2211 } 2212 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 2213 if (error) { 2214 *vpp = NULL; 2215 goto out_fallback; 2216 } 2217 return (-1); 2218 negative_success: 2219 if (cnp->cn_nameiop == CREATE) { 2220 if (cnp->cn_flags & ISLASTCN) { 2221 vfs_smr_exit(); 2222 goto out_fallback; 2223 } 2224 } 2225 2226 cache_out_ts(ncp, tsp, ticksp); 2227 whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE); 2228 neg_promote = cache_neg_hit_prep(ncp); 2229 if (!cache_ncp_canuse(ncp)) { 2230 cache_neg_hit_abort(ncp); 2231 vfs_smr_exit(); 2232 goto out_fallback; 2233 } 2234 if (neg_promote) { 2235 vfs_smr_exit(); 2236 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) 2237 goto out_fallback; 2238 } else { 2239 cache_neg_hit_finish(ncp); 2240 vfs_smr_exit(); 2241 } 2242 if (whiteout) 2243 cnp->cn_flags |= ISWHITEOUT; 2244 return (ENOENT); 2245 out_fallback: 2246 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp)); 2247 } 2248 2249 struct celockstate { 2250 struct mtx *vlp[3]; 2251 struct mtx *blp[2]; 2252 }; 2253 
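/*
 * Scratch state for the cache_enter*() locking helpers: up to 3 vnode
 * locks and 2 bucket locks are taken for an insertion and released in
 * one go by cache_enter_unlock().  The asserts below pin these
 * worst-case counts.
 */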
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 2254 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 2255 2256 static inline void 2257 cache_celockstate_init(struct celockstate *cel) 2258 { 2259 2260 bzero(cel, sizeof(*cel)); 2261 } 2262 2263 static void 2264 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 2265 struct vnode *dvp) 2266 { 2267 struct mtx *vlp1, *vlp2; 2268 2269 MPASS(cel->vlp[0] == NULL); 2270 MPASS(cel->vlp[1] == NULL); 2271 MPASS(cel->vlp[2] == NULL); 2272 2273 MPASS(vp != NULL || dvp != NULL); 2274 2275 vlp1 = VP2VNODELOCK(vp); 2276 vlp2 = VP2VNODELOCK(dvp); 2277 cache_sort_vnodes(&vlp1, &vlp2); 2278 2279 if (vlp1 != NULL) { 2280 mtx_lock(vlp1); 2281 cel->vlp[0] = vlp1; 2282 } 2283 mtx_lock(vlp2); 2284 cel->vlp[1] = vlp2; 2285 } 2286 2287 static void 2288 cache_unlock_vnodes_cel(struct celockstate *cel) 2289 { 2290 2291 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 2292 2293 if (cel->vlp[0] != NULL) 2294 mtx_unlock(cel->vlp[0]); 2295 if (cel->vlp[1] != NULL) 2296 mtx_unlock(cel->vlp[1]); 2297 if (cel->vlp[2] != NULL) 2298 mtx_unlock(cel->vlp[2]); 2299 } 2300 2301 static bool 2302 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 2303 { 2304 struct mtx *vlp; 2305 bool ret; 2306 2307 cache_assert_vlp_locked(cel->vlp[0]); 2308 cache_assert_vlp_locked(cel->vlp[1]); 2309 MPASS(cel->vlp[2] == NULL); 2310 2311 MPASS(vp != NULL); 2312 vlp = VP2VNODELOCK(vp); 2313 2314 ret = true; 2315 if (vlp >= cel->vlp[1]) { 2316 mtx_lock(vlp); 2317 } else { 2318 if (mtx_trylock(vlp)) 2319 goto out; 2320 cache_unlock_vnodes_cel(cel); 2321 atomic_add_long(&cache_lock_vnodes_cel_3_failures, 1); 2322 if (vlp < cel->vlp[0]) { 2323 mtx_lock(vlp); 2324 mtx_lock(cel->vlp[0]); 2325 mtx_lock(cel->vlp[1]); 2326 } else { 2327 if (cel->vlp[0] != NULL) 2328 mtx_lock(cel->vlp[0]); 2329 mtx_lock(vlp); 2330 mtx_lock(cel->vlp[1]); 2331 } 2332 ret = false; 2333 } 2334 out: 2335 cel->vlp[2] = vlp; 2336 return (ret); 2337 } 2338 2339 static void 2340 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 2341 struct mtx *blp2) 2342 { 2343 2344 MPASS(cel->blp[0] == NULL); 2345 MPASS(cel->blp[1] == NULL); 2346 2347 cache_sort_vnodes(&blp1, &blp2); 2348 2349 if (blp1 != NULL) { 2350 mtx_lock(blp1); 2351 cel->blp[0] = blp1; 2352 } 2353 mtx_lock(blp2); 2354 cel->blp[1] = blp2; 2355 } 2356 2357 static void 2358 cache_unlock_buckets_cel(struct celockstate *cel) 2359 { 2360 2361 if (cel->blp[0] != NULL) 2362 mtx_unlock(cel->blp[0]); 2363 mtx_unlock(cel->blp[1]); 2364 } 2365 2366 /* 2367 * Lock part of the cache affected by the insertion. 2368 * 2369 * This means vnodelocks for dvp, vp and the relevant bucketlock. 2370 * However, insertion can result in removal of an old entry. In this 2371 * case we have an additional vnode and bucketlock pair to lock. 2372 * 2373 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 2374 * preserving the locking order (smaller address first). 
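 *
 * Concretely: the vnode locks for dvp, vp and the target of the old ".."
 * entry hanging off vp (if any), plus the bucket locks for the new entry
 * and for the one being removed.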
2375 */ 2376 static void 2377 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2378 uint32_t hash) 2379 { 2380 struct namecache *ncp; 2381 struct mtx *blps[2]; 2382 u_char nc_flag; 2383 2384 blps[0] = HASH2BUCKETLOCK(hash); 2385 for (;;) { 2386 blps[1] = NULL; 2387 cache_lock_vnodes_cel(cel, dvp, vp); 2388 if (vp == NULL || vp->v_type != VDIR) 2389 break; 2390 ncp = atomic_load_consume_ptr(&vp->v_cache_dd); 2391 if (ncp == NULL) 2392 break; 2393 nc_flag = atomic_load_char(&ncp->nc_flag); 2394 if ((nc_flag & NCF_ISDOTDOT) == 0) 2395 break; 2396 MPASS(ncp->nc_dvp == vp); 2397 blps[1] = NCP2BUCKETLOCK(ncp); 2398 if ((nc_flag & NCF_NEGATIVE) != 0) 2399 break; 2400 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2401 break; 2402 /* 2403 * All vnodes got re-locked. Re-validate the state and if 2404 * nothing changed we are done. Otherwise restart. 2405 */ 2406 if (ncp == vp->v_cache_dd && 2407 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2408 blps[1] == NCP2BUCKETLOCK(ncp) && 2409 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2410 break; 2411 cache_unlock_vnodes_cel(cel); 2412 cel->vlp[0] = NULL; 2413 cel->vlp[1] = NULL; 2414 cel->vlp[2] = NULL; 2415 } 2416 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2417 } 2418 2419 static void 2420 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2421 uint32_t hash) 2422 { 2423 struct namecache *ncp; 2424 struct mtx *blps[2]; 2425 u_char nc_flag; 2426 2427 blps[0] = HASH2BUCKETLOCK(hash); 2428 for (;;) { 2429 blps[1] = NULL; 2430 cache_lock_vnodes_cel(cel, dvp, vp); 2431 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); 2432 if (ncp == NULL) 2433 break; 2434 nc_flag = atomic_load_char(&ncp->nc_flag); 2435 if ((nc_flag & NCF_ISDOTDOT) == 0) 2436 break; 2437 MPASS(ncp->nc_dvp == dvp); 2438 blps[1] = NCP2BUCKETLOCK(ncp); 2439 if ((nc_flag & NCF_NEGATIVE) != 0) 2440 break; 2441 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2442 break; 2443 if (ncp == dvp->v_cache_dd && 2444 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2445 blps[1] == NCP2BUCKETLOCK(ncp) && 2446 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2447 break; 2448 cache_unlock_vnodes_cel(cel); 2449 cel->vlp[0] = NULL; 2450 cel->vlp[1] = NULL; 2451 cel->vlp[2] = NULL; 2452 } 2453 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2454 } 2455 2456 static void 2457 cache_enter_unlock(struct celockstate *cel) 2458 { 2459 2460 cache_unlock_buckets_cel(cel); 2461 cache_unlock_vnodes_cel(cel); 2462 } 2463 2464 static void __noinline 2465 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 2466 struct componentname *cnp) 2467 { 2468 struct celockstate cel; 2469 struct namecache *ncp; 2470 uint32_t hash; 2471 int len; 2472 2473 if (atomic_load_ptr(&dvp->v_cache_dd) == NULL) 2474 return; 2475 len = cnp->cn_namelen; 2476 cache_celockstate_init(&cel); 2477 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2478 cache_enter_lock_dd(&cel, dvp, vp, hash); 2479 ncp = dvp->v_cache_dd; 2480 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 2481 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 2482 cache_zap_locked(ncp); 2483 } else { 2484 ncp = NULL; 2485 } 2486 atomic_store_ptr(&dvp->v_cache_dd, NULL); 2487 cache_enter_unlock(&cel); 2488 if (ncp != NULL) 2489 cache_free(ncp); 2490 } 2491 2492 /* 2493 * Add an entry to the cache. 
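 *
 * A NULL vp records a negative entry for the name.  Filesystems normally
 * reach this through the cache_enter() wrapper from their lookup routine,
 * roughly (sketch):
 *
 *	if (cnp->cn_flags & MAKEENTRY)
 *		cache_enter(dvp, vp, cnp);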
2494 */ 2495 void 2496 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2497 struct timespec *tsp, struct timespec *dtsp) 2498 { 2499 struct celockstate cel; 2500 struct namecache *ncp, *n2, *ndd; 2501 struct namecache_ts *ncp_ts; 2502 uint32_t hash; 2503 int flag; 2504 int len; 2505 2506 KASSERT(cnp->cn_namelen <= NAME_MAX, 2507 ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen, 2508 NAME_MAX)); 2509 VNPASS(!VN_IS_DOOMED(dvp), dvp); 2510 VNPASS(dvp->v_type != VNON, dvp); 2511 if (vp != NULL) { 2512 VNPASS(!VN_IS_DOOMED(vp), vp); 2513 VNPASS(vp->v_type != VNON, vp); 2514 } 2515 if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { 2516 KASSERT(dvp == vp, 2517 ("%s: different vnodes for dot entry (%p; %p)\n", __func__, 2518 dvp, vp)); 2519 } else { 2520 KASSERT(dvp != vp, 2521 ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__, 2522 cnp->cn_nameptr, dvp)); 2523 } 2524 2525 #ifdef DEBUG_CACHE 2526 if (__predict_false(!doingcache)) 2527 return; 2528 #endif 2529 2530 flag = 0; 2531 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2532 if (cnp->cn_namelen == 1) 2533 return; 2534 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 2535 cache_enter_dotdot_prep(dvp, vp, cnp); 2536 flag = NCF_ISDOTDOT; 2537 } 2538 } 2539 2540 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 2541 if (ncp == NULL) 2542 return; 2543 2544 cache_celockstate_init(&cel); 2545 ndd = NULL; 2546 ncp_ts = NULL; 2547 2548 /* 2549 * Calculate the hash key and setup as much of the new 2550 * namecache entry as possible before acquiring the lock. 2551 */ 2552 ncp->nc_flag = flag | NCF_WIP; 2553 ncp->nc_vp = vp; 2554 if (vp == NULL) 2555 cache_neg_init(ncp); 2556 ncp->nc_dvp = dvp; 2557 if (tsp != NULL) { 2558 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 2559 ncp_ts->nc_time = *tsp; 2560 ncp_ts->nc_ticks = ticks; 2561 ncp_ts->nc_nc.nc_flag |= NCF_TS; 2562 if (dtsp != NULL) { 2563 ncp_ts->nc_dotdottime = *dtsp; 2564 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 2565 } 2566 } 2567 len = ncp->nc_nlen = cnp->cn_namelen; 2568 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2569 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 2570 ncp->nc_name[len] = '\0'; 2571 cache_enter_lock(&cel, dvp, vp, hash); 2572 2573 /* 2574 * See if this vnode or negative entry is already in the cache 2575 * with this name. This can happen with concurrent lookups of 2576 * the same path name. 2577 */ 2578 n2 = cache_ncp_find(dvp, cnp, hash); 2579 if (n2 != NULL) { 2580 MPASS(cache_ncp_canuse(n2)); 2581 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 2582 KASSERT(vp == NULL, 2583 ("%s: found entry pointing to a different vnode " 2584 "(%p != %p); name [%s]", 2585 __func__, NULL, vp, cnp->cn_nameptr)); 2586 else 2587 KASSERT(n2->nc_vp == vp, 2588 ("%s: found entry pointing to a different vnode " 2589 "(%p != %p); name [%s]", 2590 __func__, n2->nc_vp, vp, cnp->cn_nameptr)); 2591 /* 2592 * Entries are supposed to be immutable unless in the 2593 * process of getting destroyed. Accommodating for 2594 * changing timestamps is possible but not worth it. 2595 * This should be harmless in terms of correctness, in 2596 * the worst case resulting in an earlier expiration. 2597 * Alternatively, the found entry can be replaced 2598 * altogether. 
2599 */ 2600 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == 2601 (ncp->nc_flag & (NCF_TS | NCF_DTS))); 2602 #if 0 2603 if (tsp != NULL) { 2604 KASSERT((n2->nc_flag & NCF_TS) != 0, 2605 ("no NCF_TS")); 2606 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 2607 n2_ts->nc_time = ncp_ts->nc_time; 2608 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2609 if (dtsp != NULL) { 2610 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2611 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2612 } 2613 } 2614 #endif 2615 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, 2616 vp); 2617 goto out_unlock_free; 2618 } 2619 2620 if (flag == NCF_ISDOTDOT) { 2621 /* 2622 * See if we are trying to add .. entry, but some other lookup 2623 * has populated v_cache_dd pointer already. 2624 */ 2625 if (dvp->v_cache_dd != NULL) 2626 goto out_unlock_free; 2627 KASSERT(vp == NULL || vp->v_type == VDIR, 2628 ("wrong vnode type %p", vp)); 2629 atomic_thread_fence_rel(); 2630 atomic_store_ptr(&dvp->v_cache_dd, ncp); 2631 } else if (vp != NULL) { 2632 /* 2633 * For this case, the cache entry maps both the 2634 * directory name in it and the name ".." for the 2635 * directory's parent. 2636 */ 2637 if ((ndd = vp->v_cache_dd) != NULL) { 2638 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2639 cache_zap_locked(ndd); 2640 else 2641 ndd = NULL; 2642 } 2643 atomic_thread_fence_rel(); 2644 atomic_store_ptr(&vp->v_cache_dd, ncp); 2645 } 2646 2647 if (flag != NCF_ISDOTDOT) { 2648 if (LIST_EMPTY(&dvp->v_cache_src)) { 2649 cache_hold_vnode(dvp); 2650 } 2651 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2652 } 2653 2654 /* 2655 * If the entry is "negative", we place it into the 2656 * "negative" cache queue, otherwise, we place it into the 2657 * destination vnode's cache entries queue. 2658 */ 2659 if (vp != NULL) { 2660 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2661 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2662 vp); 2663 } else { 2664 if (cnp->cn_flags & ISWHITEOUT) 2665 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE); 2666 cache_neg_insert(ncp); 2667 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2668 ncp->nc_name); 2669 } 2670 2671 /* 2672 * Insert the new namecache entry into the appropriate chain 2673 * within the cache entries table. 2674 */ 2675 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2676 2677 atomic_thread_fence_rel(); 2678 /* 2679 * Mark the entry as fully constructed. 2680 * It is immutable past this point until its removal. 2681 */ 2682 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2683 2684 cache_enter_unlock(&cel); 2685 if (ndd != NULL) 2686 cache_free(ndd); 2687 return; 2688 out_unlock_free: 2689 cache_enter_unlock(&cel); 2690 cache_free(ncp); 2691 return; 2692 } 2693 2694 /* 2695 * A variant of the above accepting flags. 2696 * 2697 * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it. 2698 * 2699 * TODO: this routine is a hack. It blindly removes the old entry, even if it 2700 * happens to match and it is doing it in an inefficient manner. It was added 2701 * to accommodate NFS which runs into a case where the target for a given name 2702 * may change from under it. Note this does nothing to solve the following 2703 * race: 2 callers of cache_enter_time_flags pass a different target vnode for 2704 * the same [dvp, cnp]. It may be argued that code doing this is broken. 
2705 */ 2706 void 2707 cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2708 struct timespec *tsp, struct timespec *dtsp, int flags) 2709 { 2710 2711 MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0); 2712 2713 if (flags & VFS_CACHE_DROPOLD) 2714 cache_remove_cnp(dvp, cnp); 2715 cache_enter_time(dvp, vp, cnp, tsp, dtsp); 2716 } 2717 2718 static u_long 2719 cache_roundup_2(u_long val) 2720 { 2721 u_long res; 2722 2723 for (res = 1; res <= val; res <<= 1) 2724 continue; 2725 2726 return (res); 2727 } 2728 2729 static struct nchashhead * 2730 nchinittbl(u_long elements, u_long *hashmask) 2731 { 2732 struct nchashhead *hashtbl; 2733 u_long hashsize, i; 2734 2735 hashsize = cache_roundup_2(elements) / 2; 2736 2737 hashtbl = malloc(hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2738 for (i = 0; i < hashsize; i++) 2739 CK_SLIST_INIT(&hashtbl[i]); 2740 *hashmask = hashsize - 1; 2741 return (hashtbl); 2742 } 2743 2744 static void 2745 ncfreetbl(struct nchashhead *hashtbl) 2746 { 2747 2748 free(hashtbl, M_VFSCACHE); 2749 } 2750 2751 /* 2752 * Name cache initialization, from vfs_init() when we are booting 2753 */ 2754 static void 2755 nchinit(void *dummy __unused) 2756 { 2757 u_int i; 2758 2759 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2760 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2761 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2762 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2763 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2764 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2765 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2766 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2767 2768 VFS_SMR_ZONE_SET(cache_zone_small); 2769 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2770 VFS_SMR_ZONE_SET(cache_zone_large); 2771 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2772 2773 ncsize = desiredvnodes * ncsizefactor; 2774 cache_recalc_neg_min(); 2775 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2776 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2777 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2778 ncbuckethash = 7; 2779 if (ncbuckethash > nchash) 2780 ncbuckethash = nchash; 2781 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2782 M_WAITOK | M_ZERO); 2783 for (i = 0; i < numbucketlocks; i++) 2784 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2785 ncvnodehash = ncbuckethash; 2786 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2787 M_WAITOK | M_ZERO); 2788 for (i = 0; i < numvnodelocks; i++) 2789 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2790 2791 for (i = 0; i < numneglists; i++) { 2792 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); 2793 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2794 TAILQ_INIT(&neglists[i].nl_list); 2795 TAILQ_INIT(&neglists[i].nl_hotlist); 2796 } 2797 } 2798 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2799 2800 void 2801 cache_vnode_init(struct vnode *vp) 2802 { 2803 2804 LIST_INIT(&vp->v_cache_src); 2805 TAILQ_INIT(&vp->v_cache_dst); 2806 vp->v_cache_dd = NULL; 2807 cache_prehash(vp); 2808 } 2809 2810 /* 2811 * Induce transient cache misses for lockless operation in cache_lookup() by 2812 * using a temporary hash table. 2813 * 2814 * This will force a fs lookup. 
2815 * 2816 * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time 2817 * to observe all CPUs not performing the lookup. 2818 */ 2819 static void 2820 cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash) 2821 { 2822 2823 MPASS(temphash < nchash); 2824 /* 2825 * Change the size. The new size is smaller and can safely be used 2826 * against the existing table. All lookups which now hash wrong will 2827 * result in a cache miss, which all callers are supposed to know how 2828 * to handle. 2829 */ 2830 atomic_store_long(&nchash, temphash); 2831 atomic_thread_fence_rel(); 2832 vfs_smr_synchronize(); 2833 /* 2834 * At this point everyone sees the updated hash value, but they still 2835 * see the old table. 2836 */ 2837 atomic_store_ptr(&nchashtbl, temptbl); 2838 atomic_thread_fence_rel(); 2839 vfs_smr_synchronize(); 2840 /* 2841 * At this point everyone sees the updated table pointer and size pair. 2842 */ 2843 } 2844 2845 /* 2846 * Set the new hash table. 2847 * 2848 * Similarly to cache_changesize_set_temp(), this has to synchronize against 2849 * lockless operation in cache_lookup(). 2850 */ 2851 static void 2852 cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash) 2853 { 2854 2855 MPASS(nchash < new_hash); 2856 /* 2857 * Change the pointer first. This wont result in out of bounds access 2858 * since the temporary table is guaranteed to be smaller. 2859 */ 2860 atomic_store_ptr(&nchashtbl, new_tbl); 2861 atomic_thread_fence_rel(); 2862 vfs_smr_synchronize(); 2863 /* 2864 * At this point everyone sees the updated pointer value, but they 2865 * still see the old size. 2866 */ 2867 atomic_store_long(&nchash, new_hash); 2868 atomic_thread_fence_rel(); 2869 vfs_smr_synchronize(); 2870 /* 2871 * At this point everyone sees the updated table pointer and size pair. 2872 */ 2873 } 2874 2875 void 2876 cache_changesize(u_long newmaxvnodes) 2877 { 2878 struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl; 2879 u_long new_nchash, old_nchash, temphash; 2880 struct namecache *ncp; 2881 uint32_t hash; 2882 u_long newncsize; 2883 u_long i; 2884 2885 newncsize = newmaxvnodes * ncsizefactor; 2886 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2887 if (newmaxvnodes < numbucketlocks) 2888 newmaxvnodes = numbucketlocks; 2889 2890 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2891 /* If same hash table size, nothing to do */ 2892 if (nchash == new_nchash) { 2893 ncfreetbl(new_nchashtbl); 2894 return; 2895 } 2896 2897 temptbl = nchinittbl(1, &temphash); 2898 2899 /* 2900 * Move everything from the old hash table to the new table. 2901 * None of the namecache entries in the table can be removed 2902 * because to do so, they have to be removed from the hash table. 
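 * All bucket and vnode locks are held for the duration of the move, so
 * no insertions or removals can race with it; lockless lookups are
 * diverted to the temporary table and simply miss.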
2903 */ 2904 cache_lock_all_vnodes(); 2905 cache_lock_all_buckets(); 2906 old_nchashtbl = nchashtbl; 2907 old_nchash = nchash; 2908 cache_changesize_set_temp(temptbl, temphash); 2909 for (i = 0; i <= old_nchash; i++) { 2910 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2911 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2912 ncp->nc_dvp); 2913 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2914 CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash); 2915 } 2916 } 2917 ncsize = newncsize; 2918 cache_recalc_neg_min(); 2919 cache_changesize_set_new(new_nchashtbl, new_nchash); 2920 cache_unlock_all_buckets(); 2921 cache_unlock_all_vnodes(); 2922 ncfreetbl(old_nchashtbl); 2923 ncfreetbl(temptbl); 2924 } 2925 2926 /* 2927 * Remove all entries from and to a particular vnode. 2928 */ 2929 static void 2930 cache_purge_impl(struct vnode *vp) 2931 { 2932 struct cache_freebatch batch; 2933 struct namecache *ncp; 2934 struct mtx *vlp, *vlp2; 2935 2936 TAILQ_INIT(&batch); 2937 vlp = VP2VNODELOCK(vp); 2938 vlp2 = NULL; 2939 mtx_lock(vlp); 2940 retry: 2941 while (!LIST_EMPTY(&vp->v_cache_src)) { 2942 ncp = LIST_FIRST(&vp->v_cache_src); 2943 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2944 goto retry; 2945 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2946 } 2947 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2948 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2949 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2950 goto retry; 2951 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2952 } 2953 ncp = vp->v_cache_dd; 2954 if (ncp != NULL) { 2955 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2956 ("lost dotdot link")); 2957 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2958 goto retry; 2959 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2960 } 2961 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2962 mtx_unlock(vlp); 2963 if (vlp2 != NULL) 2964 mtx_unlock(vlp2); 2965 cache_free_batch(&batch); 2966 } 2967 2968 /* 2969 * Opportunistic check to see if there is anything to do. 2970 */ 2971 static bool 2972 cache_has_entries(struct vnode *vp) 2973 { 2974 2975 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2976 atomic_load_ptr(&vp->v_cache_dd) == NULL) 2977 return (false); 2978 return (true); 2979 } 2980 2981 void 2982 cache_purge(struct vnode *vp) 2983 { 2984 2985 SDT_PROBE1(vfs, namecache, purge, done, vp); 2986 if (!cache_has_entries(vp)) 2987 return; 2988 cache_purge_impl(vp); 2989 } 2990 2991 /* 2992 * Only to be used by vgone. 2993 */ 2994 void 2995 cache_purge_vgone(struct vnode *vp) 2996 { 2997 struct mtx *vlp; 2998 2999 VNPASS(VN_IS_DOOMED(vp), vp); 3000 if (cache_has_entries(vp)) { 3001 cache_purge_impl(vp); 3002 return; 3003 } 3004 3005 /* 3006 * Serialize against a potential thread doing cache_purge. 3007 */ 3008 vlp = VP2VNODELOCK(vp); 3009 mtx_wait_unlocked(vlp); 3010 if (cache_has_entries(vp)) { 3011 cache_purge_impl(vp); 3012 return; 3013 } 3014 return; 3015 } 3016 3017 /* 3018 * Remove all negative entries for a particular directory vnode. 
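 *
 * Negative entries hang off the directory's v_cache_src list, so walking
 * it only requires the directory vnode lock; each removal additionally
 * takes the relevant bucket lock.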
3019 */ 3020 void 3021 cache_purge_negative(struct vnode *vp) 3022 { 3023 struct cache_freebatch batch; 3024 struct namecache *ncp, *nnp; 3025 struct mtx *vlp; 3026 3027 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 3028 if (LIST_EMPTY(&vp->v_cache_src)) 3029 return; 3030 TAILQ_INIT(&batch); 3031 vlp = VP2VNODELOCK(vp); 3032 mtx_lock(vlp); 3033 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 3034 if (!(ncp->nc_flag & NCF_NEGATIVE)) 3035 continue; 3036 cache_zap_negative_locked_vnode_kl(ncp, vp); 3037 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 3038 } 3039 mtx_unlock(vlp); 3040 cache_free_batch(&batch); 3041 } 3042 3043 /* 3044 * Entry points for modifying VOP operations. 3045 */ 3046 void 3047 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 3048 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 3049 { 3050 3051 ASSERT_VOP_IN_SEQC(fdvp); 3052 ASSERT_VOP_IN_SEQC(fvp); 3053 ASSERT_VOP_IN_SEQC(tdvp); 3054 if (tvp != NULL) 3055 ASSERT_VOP_IN_SEQC(tvp); 3056 3057 cache_purge(fvp); 3058 if (tvp != NULL) { 3059 cache_purge(tvp); 3060 KASSERT(!cache_remove_cnp(tdvp, tcnp), 3061 ("%s: lingering negative entry", __func__)); 3062 } else { 3063 cache_remove_cnp(tdvp, tcnp); 3064 } 3065 3066 /* 3067 * TODO 3068 * 3069 * Historically renaming was always purging all revelang entries, 3070 * but that's quite wasteful. In particular turns out that in many cases 3071 * the target file is immediately accessed after rename, inducing a cache 3072 * miss. 3073 * 3074 * Recode this to reduce relocking and reuse the existing entry (if any) 3075 * instead of just removing it above and allocating a new one here. 3076 */ 3077 cache_enter(tdvp, fvp, tcnp); 3078 } 3079 3080 void 3081 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) 3082 { 3083 3084 ASSERT_VOP_IN_SEQC(dvp); 3085 ASSERT_VOP_IN_SEQC(vp); 3086 cache_purge(vp); 3087 } 3088 3089 #ifdef INVARIANTS 3090 /* 3091 * Validate that if an entry exists it matches. 3092 */ 3093 void 3094 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) 3095 { 3096 struct namecache *ncp; 3097 struct mtx *blp; 3098 uint32_t hash; 3099 3100 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3101 if (CK_SLIST_EMPTY(NCHHASH(hash))) 3102 return; 3103 blp = HASH2BUCKETLOCK(hash); 3104 mtx_lock(blp); 3105 ncp = cache_ncp_find(dvp, cnp, hash); 3106 if (ncp != NULL && ncp->nc_vp != vp) { 3107 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n", 3108 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp); 3109 } 3110 mtx_unlock(blp); 3111 } 3112 3113 void 3114 cache_assert_no_entries(struct vnode *vp) 3115 { 3116 3117 VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp); 3118 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); 3119 VNPASS(vp->v_cache_dd == NULL, vp); 3120 } 3121 #endif 3122 3123 /* 3124 * Flush all entries referencing a particular filesystem. 3125 */ 3126 void 3127 cache_purgevfs(struct mount *mp) 3128 { 3129 struct vnode *vp, *mvp; 3130 size_t visited __sdt_used, purged __sdt_used; 3131 3132 visited = purged = 0; 3133 /* 3134 * Somewhat wasteful iteration over all vnodes. Would be better to 3135 * support filtering and avoid the interlock to begin with. 
3136 */ 3137 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 3138 visited++; 3139 if (!cache_has_entries(vp)) { 3140 VI_UNLOCK(vp); 3141 continue; 3142 } 3143 vholdl(vp); 3144 VI_UNLOCK(vp); 3145 cache_purge(vp); 3146 purged++; 3147 vdrop(vp); 3148 } 3149 3150 SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged); 3151 } 3152 3153 /* 3154 * Perform canonical checks and cache lookup and pass on to filesystem 3155 * through the vop_cachedlookup only if needed. 3156 */ 3157 3158 int 3159 vfs_cache_lookup(struct vop_lookup_args *ap) 3160 { 3161 struct vnode *dvp; 3162 int error; 3163 struct vnode **vpp = ap->a_vpp; 3164 struct componentname *cnp = ap->a_cnp; 3165 int flags = cnp->cn_flags; 3166 3167 *vpp = NULL; 3168 dvp = ap->a_dvp; 3169 3170 if (dvp->v_type != VDIR) 3171 return (ENOTDIR); 3172 3173 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 3174 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 3175 return (EROFS); 3176 3177 error = vn_dir_check_exec(dvp, cnp); 3178 if (error != 0) 3179 return (error); 3180 3181 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 3182 if (error == 0) 3183 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 3184 if (error == -1) 3185 return (0); 3186 return (error); 3187 } 3188 3189 /* Implementation of the getcwd syscall. */ 3190 int 3191 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 3192 { 3193 char *buf, *retbuf; 3194 size_t buflen; 3195 int error; 3196 3197 buflen = uap->buflen; 3198 if (__predict_false(buflen < 2)) 3199 return (EINVAL); 3200 if (buflen > MAXPATHLEN) 3201 buflen = MAXPATHLEN; 3202 3203 buf = uma_zalloc(namei_zone, M_WAITOK); 3204 error = vn_getcwd(buf, &retbuf, &buflen); 3205 if (error == 0) 3206 error = copyout(retbuf, uap->buf, buflen); 3207 uma_zfree(namei_zone, buf); 3208 return (error); 3209 } 3210 3211 int 3212 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 3213 { 3214 struct pwd *pwd; 3215 int error; 3216 3217 vfs_smr_enter(); 3218 pwd = pwd_get_smr(); 3219 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 3220 buflen, 0); 3221 VFS_SMR_ASSERT_NOT_ENTERED(); 3222 if (error < 0) { 3223 pwd = pwd_hold(curthread); 3224 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 3225 retbuf, buflen); 3226 pwd_drop(pwd); 3227 } 3228 3229 #ifdef KTRACE 3230 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 3231 ktrnamei(*retbuf); 3232 #endif 3233 return (error); 3234 } 3235 3236 /* 3237 * Canonicalize a path by walking it forward and back. 3238 * 3239 * BUGS: 3240 * - Nothing guarantees the integrity of the entire chain. Consider the case 3241 * where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of 3242 * "foo" into "quux" during the backwards walk. The result will be 3243 * "quux/bar/baz/qux", which could not have been obtained by an incremental 3244 * walk in userspace. Moreover, the path we return is inaccessible if the 3245 * calling thread lacks permission to traverse "quux". 
3246 */ 3247 static int 3248 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 3249 size_t size, int flags, enum uio_seg pathseg) 3250 { 3251 struct nameidata nd; 3252 char *retbuf, *freebuf; 3253 int error; 3254 3255 if (flags != 0) 3256 return (EINVAL); 3257 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1, 3258 pathseg, path, fd, &cap_fstat_rights); 3259 if ((error = namei(&nd)) != 0) 3260 return (error); 3261 3262 if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR && 3263 (nd.ni_vp->v_vflag & VV_ROOT) != 0) { 3264 struct vnode *covered_vp; 3265 3266 /* 3267 * This happens if vp is a file mount. The call to 3268 * vn_fullpath_hardlink can panic if path resolution can't be 3269 * handled without the directory. 3270 * 3271 * To resolve this, we find the vnode which was mounted on - 3272 * this should have a unique global path since we disallow 3273 * mounting on linked files. 3274 */ 3275 error = vn_lock(nd.ni_vp, LK_SHARED); 3276 if (error != 0) 3277 goto out; 3278 covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered; 3279 vref(covered_vp); 3280 VOP_UNLOCK(nd.ni_vp); 3281 error = vn_fullpath(covered_vp, &retbuf, &freebuf); 3282 vrele(covered_vp); 3283 } else { 3284 error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, 3285 nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, &retbuf, 3286 &freebuf, &size); 3287 } 3288 if (error == 0) { 3289 size_t len; 3290 3291 len = strlen(retbuf) + 1; 3292 if (size < len) 3293 error = ENAMETOOLONG; 3294 else if (pathseg == UIO_USERSPACE) 3295 error = copyout(retbuf, buf, len); 3296 else 3297 memcpy(buf, retbuf, len); 3298 free(freebuf, M_TEMP); 3299 } 3300 out: 3301 vrele(nd.ni_vp); 3302 vrele(nd.ni_dvp); 3303 NDFREE_PNBUF(&nd); 3304 return (error); 3305 } 3306 3307 int 3308 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 3309 { 3310 3311 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 3312 uap->flags, UIO_USERSPACE)); 3313 } 3314 3315 /* 3316 * Retrieve the full filesystem path that correspond to a vnode from the name 3317 * cache (if available) 3318 */ 3319 int 3320 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 3321 { 3322 struct pwd *pwd; 3323 char *buf; 3324 size_t buflen; 3325 int error; 3326 3327 if (__predict_false(vp == NULL)) 3328 return (EINVAL); 3329 3330 buflen = MAXPATHLEN; 3331 buf = malloc(buflen, M_TEMP, M_WAITOK); 3332 vfs_smr_enter(); 3333 pwd = pwd_get_smr(); 3334 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); 3335 VFS_SMR_ASSERT_NOT_ENTERED(); 3336 if (error < 0) { 3337 pwd = pwd_hold(curthread); 3338 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); 3339 pwd_drop(pwd); 3340 } 3341 if (error == 0) 3342 *freebuf = buf; 3343 else 3344 free(buf, M_TEMP); 3345 return (error); 3346 } 3347 3348 /* 3349 * This function is similar to vn_fullpath, but it attempts to lookup the 3350 * pathname relative to the global root mount point. This is required for the 3351 * auditing sub-system, as audited pathnames must be absolute, relative to the 3352 * global root mount point. 
3353 */ 3354 int 3355 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 3356 { 3357 char *buf; 3358 size_t buflen; 3359 int error; 3360 3361 if (__predict_false(vp == NULL)) 3362 return (EINVAL); 3363 buflen = MAXPATHLEN; 3364 buf = malloc(buflen, M_TEMP, M_WAITOK); 3365 vfs_smr_enter(); 3366 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); 3367 VFS_SMR_ASSERT_NOT_ENTERED(); 3368 if (error < 0) { 3369 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 3370 } 3371 if (error == 0) 3372 *freebuf = buf; 3373 else 3374 free(buf, M_TEMP); 3375 return (error); 3376 } 3377 3378 static struct namecache * 3379 vn_dd_from_dst(struct vnode *vp) 3380 { 3381 struct namecache *ncp; 3382 3383 cache_assert_vnode_locked(vp); 3384 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 3385 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3386 return (ncp); 3387 } 3388 return (NULL); 3389 } 3390 3391 int 3392 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) 3393 { 3394 struct vnode *dvp; 3395 struct namecache *ncp; 3396 struct mtx *vlp; 3397 int error; 3398 3399 vlp = VP2VNODELOCK(*vp); 3400 mtx_lock(vlp); 3401 ncp = (*vp)->v_cache_dd; 3402 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 3403 KASSERT(ncp == vn_dd_from_dst(*vp), 3404 ("%s: mismatch for dd entry (%p != %p)", __func__, 3405 ncp, vn_dd_from_dst(*vp))); 3406 } else { 3407 ncp = vn_dd_from_dst(*vp); 3408 } 3409 if (ncp != NULL) { 3410 if (*buflen < ncp->nc_nlen) { 3411 mtx_unlock(vlp); 3412 vrele(*vp); 3413 counter_u64_add(numfullpathfail4, 1); 3414 error = ENOMEM; 3415 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3416 vp, NULL); 3417 return (error); 3418 } 3419 *buflen -= ncp->nc_nlen; 3420 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3421 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 3422 ncp->nc_name, vp); 3423 dvp = *vp; 3424 *vp = ncp->nc_dvp; 3425 vref(*vp); 3426 mtx_unlock(vlp); 3427 vrele(dvp); 3428 return (0); 3429 } 3430 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 3431 3432 mtx_unlock(vlp); 3433 vn_lock(*vp, LK_SHARED | LK_RETRY); 3434 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); 3435 vput(*vp); 3436 if (error) { 3437 counter_u64_add(numfullpathfail2, 1); 3438 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3439 return (error); 3440 } 3441 3442 *vp = dvp; 3443 if (VN_IS_DOOMED(dvp)) { 3444 /* forced unmount */ 3445 vrele(dvp); 3446 error = ENOENT; 3447 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3448 return (error); 3449 } 3450 /* 3451 * *vp has its use count incremented still. 3452 */ 3453 3454 return (0); 3455 } 3456 3457 /* 3458 * Resolve a directory to a pathname. 3459 * 3460 * The name of the directory can always be found in the namecache or fetched 3461 * from the filesystem. There is also guaranteed to be only one parent, meaning 3462 * we can just follow vnodes up until we find the root. 3463 * 3464 * The vnode must be referenced. 
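 *
 * The path is assembled backwards from the end of the provided buffer;
 * on success *retbuf points into it at the start of the result.  The
 * vnode reference is consumed either way.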
3465 */ 3466 static int 3467 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3468 size_t *len, size_t addend) 3469 { 3470 #ifdef KDTRACE_HOOKS 3471 struct vnode *startvp = vp; 3472 #endif 3473 struct vnode *vp1; 3474 size_t buflen; 3475 int error; 3476 bool slash_prefixed; 3477 3478 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 3479 VNPASS(vp->v_usecount > 0, vp); 3480 3481 buflen = *len; 3482 3483 slash_prefixed = true; 3484 if (addend == 0) { 3485 MPASS(*len >= 2); 3486 buflen--; 3487 buf[buflen] = '\0'; 3488 slash_prefixed = false; 3489 } 3490 3491 error = 0; 3492 3493 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 3494 counter_u64_add(numfullpathcalls, 1); 3495 while (vp != rdir && vp != rootvnode) { 3496 /* 3497 * The vp vnode must be already fully constructed, 3498 * since it is either found in namecache or obtained 3499 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 3500 * without obtaining the vnode lock. 3501 */ 3502 if ((vp->v_vflag & VV_ROOT) != 0) { 3503 vn_lock(vp, LK_RETRY | LK_SHARED); 3504 3505 /* 3506 * With the vnode locked, check for races with 3507 * unmount, forced or not. Note that we 3508 * already verified that vp is not equal to 3509 * the root vnode, which means that 3510 * mnt_vnodecovered can be NULL only for the 3511 * case of unmount. 3512 */ 3513 if (VN_IS_DOOMED(vp) || 3514 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 3515 vp1->v_mountedhere != vp->v_mount) { 3516 vput(vp); 3517 error = ENOENT; 3518 SDT_PROBE3(vfs, namecache, fullpath, return, 3519 error, vp, NULL); 3520 break; 3521 } 3522 3523 vref(vp1); 3524 vput(vp); 3525 vp = vp1; 3526 continue; 3527 } 3528 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 3529 error = vn_vptocnp(&vp, buf, &buflen); 3530 if (error) 3531 break; 3532 if (buflen == 0) { 3533 vrele(vp); 3534 error = ENOMEM; 3535 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3536 startvp, NULL); 3537 break; 3538 } 3539 buf[--buflen] = '/'; 3540 slash_prefixed = true; 3541 } 3542 if (error) 3543 return (error); 3544 if (!slash_prefixed) { 3545 if (buflen == 0) { 3546 vrele(vp); 3547 counter_u64_add(numfullpathfail4, 1); 3548 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 3549 startvp, NULL); 3550 return (ENOMEM); 3551 } 3552 buf[--buflen] = '/'; 3553 } 3554 counter_u64_add(numfullpathfound, 1); 3555 vrele(vp); 3556 3557 *retbuf = buf + buflen; 3558 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 3559 *len -= buflen; 3560 *len += addend; 3561 return (0); 3562 } 3563 3564 /* 3565 * Resolve an arbitrary vnode to a pathname. 
3566 * 3567 * Note 2 caveats: 3568 * - hardlinks are not tracked, thus if the vnode is not a directory this can 3569 * resolve to a different path than the one used to find it 3570 * - namecache is not mandatory, meaning names are not guaranteed to be added 3571 * (in which case resolving fails) 3572 */ 3573 static void __inline 3574 cache_rev_failed_impl(int *reason, int line) 3575 { 3576 3577 *reason = line; 3578 } 3579 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 3580 3581 static int 3582 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 3583 char **retbuf, size_t *buflen, size_t addend) 3584 { 3585 #ifdef KDTRACE_HOOKS 3586 struct vnode *startvp = vp; 3587 #endif 3588 struct vnode *tvp; 3589 struct mount *mp; 3590 struct namecache *ncp; 3591 size_t orig_buflen; 3592 int reason; 3593 int error; 3594 #ifdef KDTRACE_HOOKS 3595 int i; 3596 #endif 3597 seqc_t vp_seqc, tvp_seqc; 3598 u_char nc_flag; 3599 3600 VFS_SMR_ASSERT_ENTERED(); 3601 3602 if (!atomic_load_char(&cache_fast_lookup_enabled)) { 3603 vfs_smr_exit(); 3604 return (-1); 3605 } 3606 3607 orig_buflen = *buflen; 3608 3609 if (addend == 0) { 3610 MPASS(*buflen >= 2); 3611 *buflen -= 1; 3612 buf[*buflen] = '\0'; 3613 } 3614 3615 if (vp == rdir || vp == rootvnode) { 3616 if (addend == 0) { 3617 *buflen -= 1; 3618 buf[*buflen] = '/'; 3619 } 3620 goto out_ok; 3621 } 3622 3623 #ifdef KDTRACE_HOOKS 3624 i = 0; 3625 #endif 3626 error = -1; 3627 ncp = NULL; /* for sdt probe down below */ 3628 vp_seqc = vn_seqc_read_any(vp); 3629 if (seqc_in_modify(vp_seqc)) { 3630 cache_rev_failed(&reason); 3631 goto out_abort; 3632 } 3633 3634 for (;;) { 3635 #ifdef KDTRACE_HOOKS 3636 i++; 3637 #endif 3638 if ((vp->v_vflag & VV_ROOT) != 0) { 3639 mp = atomic_load_ptr(&vp->v_mount); 3640 if (mp == NULL) { 3641 cache_rev_failed(&reason); 3642 goto out_abort; 3643 } 3644 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 3645 tvp_seqc = vn_seqc_read_any(tvp); 3646 if (seqc_in_modify(tvp_seqc)) { 3647 cache_rev_failed(&reason); 3648 goto out_abort; 3649 } 3650 if (!vn_seqc_consistent(vp, vp_seqc)) { 3651 cache_rev_failed(&reason); 3652 goto out_abort; 3653 } 3654 vp = tvp; 3655 vp_seqc = tvp_seqc; 3656 continue; 3657 } 3658 ncp = atomic_load_consume_ptr(&vp->v_cache_dd); 3659 if (ncp == NULL) { 3660 cache_rev_failed(&reason); 3661 goto out_abort; 3662 } 3663 nc_flag = atomic_load_char(&ncp->nc_flag); 3664 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3665 cache_rev_failed(&reason); 3666 goto out_abort; 3667 } 3668 if (ncp->nc_nlen >= *buflen) { 3669 cache_rev_failed(&reason); 3670 error = ENOMEM; 3671 goto out_abort; 3672 } 3673 *buflen -= ncp->nc_nlen; 3674 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3675 *buflen -= 1; 3676 buf[*buflen] = '/'; 3677 tvp = ncp->nc_dvp; 3678 tvp_seqc = vn_seqc_read_any(tvp); 3679 if (seqc_in_modify(tvp_seqc)) { 3680 cache_rev_failed(&reason); 3681 goto out_abort; 3682 } 3683 if (!vn_seqc_consistent(vp, vp_seqc)) { 3684 cache_rev_failed(&reason); 3685 goto out_abort; 3686 } 3687 /* 3688 * Acquire fence provided by vn_seqc_read_any above. 
3689 */ 3690 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) { 3691 cache_rev_failed(&reason); 3692 goto out_abort; 3693 } 3694 if (!cache_ncp_canuse(ncp)) { 3695 cache_rev_failed(&reason); 3696 goto out_abort; 3697 } 3698 vp = tvp; 3699 vp_seqc = tvp_seqc; 3700 if (vp == rdir || vp == rootvnode) 3701 break; 3702 } 3703 out_ok: 3704 vfs_smr_exit(); 3705 *retbuf = buf + *buflen; 3706 *buflen = orig_buflen - *buflen + addend; 3707 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 3708 return (0); 3709 3710 out_abort: 3711 *buflen = orig_buflen; 3712 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 3713 vfs_smr_exit(); 3714 return (error); 3715 } 3716 3717 static int 3718 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3719 size_t *buflen) 3720 { 3721 size_t orig_buflen, addend; 3722 int error; 3723 3724 if (*buflen < 2) 3725 return (EINVAL); 3726 3727 orig_buflen = *buflen; 3728 3729 vref(vp); 3730 addend = 0; 3731 if (vp->v_type != VDIR) { 3732 *buflen -= 1; 3733 buf[*buflen] = '\0'; 3734 error = vn_vptocnp(&vp, buf, buflen); 3735 if (error) 3736 return (error); 3737 if (*buflen == 0) { 3738 vrele(vp); 3739 return (ENOMEM); 3740 } 3741 *buflen -= 1; 3742 buf[*buflen] = '/'; 3743 addend = orig_buflen - *buflen; 3744 } 3745 3746 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); 3747 } 3748 3749 /* 3750 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 3751 * 3752 * Since the namecache does not track hardlinks, the caller is expected to 3753 * first look up the target vnode with WANTPARENT flag passed to namei to get 3754 * dvp and vp. 3755 * 3756 * Then we have 2 cases: 3757 * - if the found vnode is a directory, the path can be constructed just by 3758 * following names up the chain 3759 * - otherwise we populate the buffer with the saved name and start resolving 3760 * from the parent 3761 */ 3762 int 3763 vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp, 3764 const char *hrdl_name, size_t hrdl_name_length, 3765 char **retbuf, char **freebuf, size_t *buflen) 3766 { 3767 char *buf, *tmpbuf; 3768 struct pwd *pwd; 3769 size_t addend; 3770 int error; 3771 __enum_uint8(vtype) type; 3772 3773 if (*buflen < 2) 3774 return (EINVAL); 3775 if (*buflen > MAXPATHLEN) 3776 *buflen = MAXPATHLEN; 3777 3778 buf = malloc(*buflen, M_TEMP, M_WAITOK); 3779 3780 addend = 0; 3781 3782 /* 3783 * Check for VBAD to work around the vp_crossmp bug in lookup(). 3784 * 3785 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 3786 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 3787 * If the type is VDIR (like in this very case) we can skip looking 3788 * at ni_dvp in the first place. However, since vnodes get passed here 3789 * unlocked the target may transition to doomed state (type == VBAD) 3790 * before we get to evaluate the condition. If this happens, we will 3791 * populate part of the buffer and descend to vn_fullpath_dir with 3792 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 
3793 */ 3794 type = atomic_load_8(&vp->v_type); 3795 if (type == VBAD) { 3796 error = ENOENT; 3797 goto out_bad; 3798 } 3799 if (type != VDIR) { 3800 addend = hrdl_name_length + 2; 3801 if (*buflen < addend) { 3802 error = ENOMEM; 3803 goto out_bad; 3804 } 3805 *buflen -= addend; 3806 tmpbuf = buf + *buflen; 3807 tmpbuf[0] = '/'; 3808 memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length); 3809 tmpbuf[addend - 1] = '\0'; 3810 vp = dvp; 3811 } 3812 3813 vfs_smr_enter(); 3814 pwd = pwd_get_smr(); 3815 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3816 addend); 3817 VFS_SMR_ASSERT_NOT_ENTERED(); 3818 if (error < 0) { 3819 pwd = pwd_hold(curthread); 3820 vref(vp); 3821 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3822 addend); 3823 pwd_drop(pwd); 3824 } 3825 if (error != 0) 3826 goto out_bad; 3827 3828 *freebuf = buf; 3829 3830 return (0); 3831 out_bad: 3832 free(buf, M_TEMP); 3833 return (error); 3834 } 3835 3836 struct vnode * 3837 vn_dir_dd_ino(struct vnode *vp) 3838 { 3839 struct namecache *ncp; 3840 struct vnode *ddvp; 3841 struct mtx *vlp; 3842 enum vgetstate vs; 3843 3844 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3845 vlp = VP2VNODELOCK(vp); 3846 mtx_lock(vlp); 3847 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3848 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3849 continue; 3850 ddvp = ncp->nc_dvp; 3851 vs = vget_prep(ddvp); 3852 mtx_unlock(vlp); 3853 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3854 return (NULL); 3855 return (ddvp); 3856 } 3857 mtx_unlock(vlp); 3858 return (NULL); 3859 } 3860 3861 int 3862 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3863 { 3864 struct namecache *ncp; 3865 struct mtx *vlp; 3866 int l; 3867 3868 vlp = VP2VNODELOCK(vp); 3869 mtx_lock(vlp); 3870 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3871 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3872 break; 3873 if (ncp == NULL) { 3874 mtx_unlock(vlp); 3875 return (ENOENT); 3876 } 3877 l = min(ncp->nc_nlen, buflen - 1); 3878 memcpy(buf, ncp->nc_name, l); 3879 mtx_unlock(vlp); 3880 buf[l] = '\0'; 3881 return (0); 3882 } 3883 3884 /* 3885 * This function updates path string to vnode's full global path 3886 * and checks the size of the new path string against the pathlen argument. 3887 * 3888 * Requires a locked, referenced vnode. 3889 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3890 * 3891 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3892 * because it falls back to the ".." lookup if the namecache lookup fails. 3893 */ 3894 int 3895 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3896 u_int pathlen) 3897 { 3898 struct nameidata nd; 3899 struct vnode *vp1; 3900 char *rpath, *fbuf; 3901 int error; 3902 3903 ASSERT_VOP_ELOCKED(vp, __func__); 3904 3905 /* Construct global filesystem path from vp. */ 3906 VOP_UNLOCK(vp); 3907 error = vn_fullpath_global(vp, &rpath, &fbuf); 3908 3909 if (error != 0) { 3910 vrele(vp); 3911 return (error); 3912 } 3913 3914 if (strlen(rpath) >= pathlen) { 3915 vrele(vp); 3916 error = ENAMETOOLONG; 3917 goto out; 3918 } 3919 3920 /* 3921 * Re-lookup the vnode by path to detect a possible rename. 3922 * As a side effect, the vnode is relocked. 3923 * If vnode was renamed, return ENOENT. 
3924 */ 3925 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path); 3926 error = namei(&nd); 3927 if (error != 0) { 3928 vrele(vp); 3929 goto out; 3930 } 3931 NDFREE_PNBUF(&nd); 3932 vp1 = nd.ni_vp; 3933 vrele(vp); 3934 if (vp1 == vp) 3935 strcpy(path, rpath); 3936 else { 3937 vput(vp1); 3938 error = ENOENT; 3939 } 3940 3941 out: 3942 free(fbuf, M_TEMP); 3943 return (error); 3944 } 3945 3946 /* 3947 * This is similar to vn_path_to_global_path but allows for regular 3948 * files which may not be present in the cache. 3949 * 3950 * Requires a locked, referenced vnode. 3951 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3952 */ 3953 int 3954 vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp, 3955 struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name, 3956 size_t leaf_length) 3957 { 3958 struct nameidata nd; 3959 struct vnode *vp1; 3960 char *rpath, *fbuf; 3961 size_t len; 3962 int error; 3963 3964 ASSERT_VOP_ELOCKED(vp, __func__); 3965 3966 /* 3967 * Construct global filesystem path from dvp, vp and leaf 3968 * name. 3969 */ 3970 VOP_UNLOCK(vp); 3971 len = pathlen; 3972 error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length, 3973 &rpath, &fbuf, &len); 3974 3975 if (error != 0) { 3976 vrele(vp); 3977 return (error); 3978 } 3979 3980 if (strlen(rpath) >= pathlen) { 3981 vrele(vp); 3982 error = ENAMETOOLONG; 3983 goto out; 3984 } 3985 3986 /* 3987 * Re-lookup the vnode by path to detect a possible rename. 3988 * As a side effect, the vnode is relocked. 3989 * If vnode was renamed, return ENOENT. 3990 */ 3991 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path); 3992 error = namei(&nd); 3993 if (error != 0) { 3994 vrele(vp); 3995 goto out; 3996 } 3997 NDFREE_PNBUF(&nd); 3998 vp1 = nd.ni_vp; 3999 vrele(vp); 4000 if (vp1 == vp) 4001 strcpy(path, rpath); 4002 else { 4003 vput(vp1); 4004 error = ENOENT; 4005 } 4006 4007 out: 4008 free(fbuf, M_TEMP); 4009 return (error); 4010 } 4011 4012 #ifdef DDB 4013 static void 4014 db_print_vpath(struct vnode *vp) 4015 { 4016 4017 while (vp != NULL) { 4018 db_printf("%p: ", vp); 4019 if (vp == rootvnode) { 4020 db_printf("/"); 4021 vp = NULL; 4022 } else { 4023 if (vp->v_vflag & VV_ROOT) { 4024 db_printf("<mount point>"); 4025 vp = vp->v_mount->mnt_vnodecovered; 4026 } else { 4027 struct namecache *ncp; 4028 char *ncn; 4029 int i; 4030 4031 ncp = TAILQ_FIRST(&vp->v_cache_dst); 4032 if (ncp != NULL) { 4033 ncn = ncp->nc_name; 4034 for (i = 0; i < ncp->nc_nlen; i++) 4035 db_printf("%c", *ncn++); 4036 vp = ncp->nc_dvp; 4037 } else { 4038 vp = NULL; 4039 } 4040 } 4041 } 4042 db_printf("\n"); 4043 } 4044 4045 return; 4046 } 4047 4048 DB_SHOW_COMMAND(vpath, db_show_vpath) 4049 { 4050 struct vnode *vp; 4051 4052 if (!have_addr) { 4053 db_printf("usage: show vpath <struct vnode *>\n"); 4054 return; 4055 } 4056 4057 vp = (struct vnode *)addr; 4058 db_print_vpath(vp); 4059 } 4060 4061 #endif 4062 4063 static int cache_fast_lookup = 1; 4064 4065 #define CACHE_FPL_FAILED -2020 4066 4067 static int 4068 cache_vop_bad_vexec(struct vop_fplookup_vexec_args *v) 4069 { 4070 vn_printf(v->a_vp, "no proper vop_fplookup_vexec\n"); 4071 panic("no proper vop_fplookup_vexec"); 4072 } 4073 4074 static int 4075 cache_vop_bad_symlink(struct vop_fplookup_symlink_args *v) 4076 { 4077 vn_printf(v->a_vp, "no proper vop_fplookup_symlink\n"); 4078 panic("no proper vop_fplookup_symlink"); 4079 } 4080 4081 void 4082 cache_vop_vector_register(struct vop_vector *v) 4083 { 4084 size_t ops; 4085 4086 ops = 0; 
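	/*
	 * The lockless lookup ops have to come in pairs. Count how many of
	 * vop_fplookup_vexec and vop_fplookup_symlink were provided so that
	 * partially populated vectors can be reported below.
	 */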
	if (v->vop_fplookup_vexec != NULL) {
		ops++;
	}
	if (v->vop_fplookup_symlink != NULL) {
		ops++;
	}

	if (ops == 2) {
		return;
	}

	if (ops == 0) {
		v->vop_fplookup_vexec = cache_vop_bad_vexec;
		v->vop_fplookup_symlink = cache_vop_bad_symlink;
		return;
	}

	printf("%s: invalid vop vector %p -- either all or none fplookup vops "
	    "need to be provided\n", __func__, v);
	if (v->vop_fplookup_vexec == NULL) {
		printf("%s: missing vop_fplookup_vexec\n", __func__);
	}
	if (v->vop_fplookup_symlink == NULL) {
		printf("%s: missing vop_fplookup_symlink\n", __func__);
	}
	panic("bad vop vector %p", v);
}

#ifdef INVARIANTS
void
cache_validate_vop_vector(struct mount *mp, struct vop_vector *vops)
{
	if (mp == NULL)
		return;

	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
		return;

	if (vops->vop_fplookup_vexec == NULL ||
	    vops->vop_fplookup_vexec == cache_vop_bad_vexec)
		panic("bad vop_fplookup_vexec on vector %p for filesystem %s",
		    vops, mp->mnt_vfc->vfc_name);

	if (vops->vop_fplookup_symlink == NULL ||
	    vops->vop_fplookup_symlink == cache_vop_bad_symlink)
		panic("bad vop_fplookup_symlink on vector %p for filesystem %s",
		    vops, mp->mnt_vfc->vfc_name);
}
#endif

void
cache_fast_lookup_enabled_recalc(void)
{
	int lookup_flag;
	int mac_on;

#ifdef MAC
	mac_on = mac_vnode_check_lookup_enabled();
	mac_on |= mac_vnode_check_readlink_enabled();
#else
	mac_on = 0;
#endif

	lookup_flag = atomic_load_int(&cache_fast_lookup);
	if (lookup_flag && !mac_on) {
		atomic_store_char(&cache_fast_lookup_enabled, true);
	} else {
		atomic_store_char(&cache_fast_lookup_enabled, false);
	}
}

static int
sysctl_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
{
	int error, old;

	old = atomic_load_int(&cache_fast_lookup);
	error = sysctl_handle_int(oidp, arg1, arg2, req);
	if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup))
		cache_fast_lookup_enabled_recalc();
	return (error);
}
SYSCTL_PROC(_vfs_cache_param, OID_AUTO, fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
    &cache_fast_lookup, 0, sysctl_vfs_cache_fast_lookup, "IU", "");

/*
 * Components of nameidata (or objects it can point to) which may
 * need restoring in case fast path lookup fails.
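 *
 * These get saved in cache_fpl_checkpoint_outer() and brought back by
 * cache_fpl_restore_partial()/cache_fpl_restore_abort().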
4175 */ 4176 struct nameidata_outer { 4177 size_t ni_pathlen; 4178 int cn_flags; 4179 }; 4180 4181 struct nameidata_saved { 4182 #ifdef INVARIANTS 4183 char *cn_nameptr; 4184 size_t ni_pathlen; 4185 #endif 4186 }; 4187 4188 #ifdef INVARIANTS 4189 struct cache_fpl_debug { 4190 size_t ni_pathlen; 4191 }; 4192 #endif 4193 4194 struct cache_fpl { 4195 struct nameidata *ndp; 4196 struct componentname *cnp; 4197 char *nulchar; 4198 struct vnode *dvp; 4199 struct vnode *tvp; 4200 seqc_t dvp_seqc; 4201 seqc_t tvp_seqc; 4202 uint32_t hash; 4203 struct nameidata_saved snd; 4204 struct nameidata_outer snd_outer; 4205 int line; 4206 enum cache_fpl_status status:8; 4207 bool in_smr; 4208 bool fsearch; 4209 struct pwd **pwd; 4210 #ifdef INVARIANTS 4211 struct cache_fpl_debug debug; 4212 #endif 4213 }; 4214 4215 static bool cache_fplookup_mp_supported(struct mount *mp); 4216 static bool cache_fplookup_is_mp(struct cache_fpl *fpl); 4217 static int cache_fplookup_cross_mount(struct cache_fpl *fpl); 4218 static int cache_fplookup_partial_setup(struct cache_fpl *fpl); 4219 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl); 4220 static int cache_fplookup_trailingslash(struct cache_fpl *fpl); 4221 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl); 4222 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl); 4223 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n); 4224 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n); 4225 4226 static void 4227 cache_fpl_cleanup_cnp(struct componentname *cnp) 4228 { 4229 4230 uma_zfree(namei_zone, cnp->cn_pnbuf); 4231 cnp->cn_pnbuf = NULL; 4232 cnp->cn_nameptr = NULL; 4233 } 4234 4235 static struct vnode * 4236 cache_fpl_handle_root(struct cache_fpl *fpl) 4237 { 4238 struct nameidata *ndp; 4239 struct componentname *cnp; 4240 4241 ndp = fpl->ndp; 4242 cnp = fpl->cnp; 4243 4244 MPASS(*(cnp->cn_nameptr) == '/'); 4245 cnp->cn_nameptr++; 4246 cache_fpl_pathlen_dec(fpl); 4247 4248 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 4249 do { 4250 cnp->cn_nameptr++; 4251 cache_fpl_pathlen_dec(fpl); 4252 } while (*(cnp->cn_nameptr) == '/'); 4253 } 4254 4255 return (ndp->ni_rootdir); 4256 } 4257 4258 static void 4259 cache_fpl_checkpoint_outer(struct cache_fpl *fpl) 4260 { 4261 4262 fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen; 4263 fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags; 4264 } 4265 4266 static void 4267 cache_fpl_checkpoint(struct cache_fpl *fpl) 4268 { 4269 4270 #ifdef INVARIANTS 4271 fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 4272 fpl->snd.ni_pathlen = fpl->debug.ni_pathlen; 4273 #endif 4274 } 4275 4276 static void 4277 cache_fpl_restore_partial(struct cache_fpl *fpl) 4278 { 4279 4280 fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags; 4281 #ifdef INVARIANTS 4282 fpl->debug.ni_pathlen = fpl->snd.ni_pathlen; 4283 #endif 4284 } 4285 4286 static void 4287 cache_fpl_restore_abort(struct cache_fpl *fpl) 4288 { 4289 4290 cache_fpl_restore_partial(fpl); 4291 /* 4292 * It is 0 on entry by API contract. 
4293 */ 4294 fpl->ndp->ni_resflags = 0; 4295 fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf; 4296 fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen; 4297 } 4298 4299 #ifdef INVARIANTS 4300 #define cache_fpl_smr_assert_entered(fpl) ({ \ 4301 struct cache_fpl *_fpl = (fpl); \ 4302 MPASS(_fpl->in_smr == true); \ 4303 VFS_SMR_ASSERT_ENTERED(); \ 4304 }) 4305 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 4306 struct cache_fpl *_fpl = (fpl); \ 4307 MPASS(_fpl->in_smr == false); \ 4308 VFS_SMR_ASSERT_NOT_ENTERED(); \ 4309 }) 4310 static void 4311 cache_fpl_assert_status(struct cache_fpl *fpl) 4312 { 4313 4314 switch (fpl->status) { 4315 case CACHE_FPL_STATUS_UNSET: 4316 __assert_unreachable(); 4317 break; 4318 case CACHE_FPL_STATUS_DESTROYED: 4319 case CACHE_FPL_STATUS_ABORTED: 4320 case CACHE_FPL_STATUS_PARTIAL: 4321 case CACHE_FPL_STATUS_HANDLED: 4322 break; 4323 } 4324 } 4325 #else 4326 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 4327 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 4328 #define cache_fpl_assert_status(fpl) do { } while (0) 4329 #endif 4330 4331 #define cache_fpl_smr_enter_initial(fpl) ({ \ 4332 struct cache_fpl *_fpl = (fpl); \ 4333 vfs_smr_enter(); \ 4334 _fpl->in_smr = true; \ 4335 }) 4336 4337 #define cache_fpl_smr_enter(fpl) ({ \ 4338 struct cache_fpl *_fpl = (fpl); \ 4339 MPASS(_fpl->in_smr == false); \ 4340 vfs_smr_enter(); \ 4341 _fpl->in_smr = true; \ 4342 }) 4343 4344 #define cache_fpl_smr_exit(fpl) ({ \ 4345 struct cache_fpl *_fpl = (fpl); \ 4346 MPASS(_fpl->in_smr == true); \ 4347 vfs_smr_exit(); \ 4348 _fpl->in_smr = false; \ 4349 }) 4350 4351 static int 4352 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line) 4353 { 4354 4355 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 4356 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 4357 ("%s: converting to abort from %d at %d, set at %d\n", 4358 __func__, fpl->status, line, fpl->line)); 4359 } 4360 cache_fpl_smr_assert_not_entered(fpl); 4361 fpl->status = CACHE_FPL_STATUS_ABORTED; 4362 fpl->line = line; 4363 return (CACHE_FPL_FAILED); 4364 } 4365 4366 #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__) 4367 4368 static int __noinline 4369 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 4370 { 4371 struct nameidata *ndp; 4372 struct componentname *cnp; 4373 4374 ndp = fpl->ndp; 4375 cnp = fpl->cnp; 4376 4377 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 4378 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 4379 ("%s: converting to abort from %d at %d, set at %d\n", 4380 __func__, fpl->status, line, fpl->line)); 4381 } 4382 fpl->status = CACHE_FPL_STATUS_ABORTED; 4383 fpl->line = line; 4384 if (fpl->in_smr) 4385 cache_fpl_smr_exit(fpl); 4386 cache_fpl_restore_abort(fpl); 4387 /* 4388 * Resolving symlinks overwrites data passed by the caller. 4389 * Let namei know. 
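	 * The path buffer is freed below, hence the DESTROYED status rather
	 * than a plain abort.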
4390 */ 4391 if (ndp->ni_loopcnt > 0) { 4392 fpl->status = CACHE_FPL_STATUS_DESTROYED; 4393 cache_fpl_cleanup_cnp(cnp); 4394 } 4395 return (CACHE_FPL_FAILED); 4396 } 4397 4398 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 4399 4400 static int __noinline 4401 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 4402 { 4403 4404 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 4405 ("%s: setting to partial at %d, but already set to %d at %d\n", 4406 __func__, line, fpl->status, fpl->line)); 4407 cache_fpl_smr_assert_entered(fpl); 4408 fpl->status = CACHE_FPL_STATUS_PARTIAL; 4409 fpl->line = line; 4410 return (cache_fplookup_partial_setup(fpl)); 4411 } 4412 4413 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 4414 4415 static int 4416 cache_fpl_handled_impl(struct cache_fpl *fpl, int line) 4417 { 4418 4419 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 4420 ("%s: setting to handled at %d, but already set to %d at %d\n", 4421 __func__, line, fpl->status, fpl->line)); 4422 cache_fpl_smr_assert_not_entered(fpl); 4423 fpl->status = CACHE_FPL_STATUS_HANDLED; 4424 fpl->line = line; 4425 return (0); 4426 } 4427 4428 #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__) 4429 4430 static int 4431 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line) 4432 { 4433 4434 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 4435 ("%s: setting to handled at %d, but already set to %d at %d\n", 4436 __func__, line, fpl->status, fpl->line)); 4437 MPASS(error != 0); 4438 MPASS(error != CACHE_FPL_FAILED); 4439 cache_fpl_smr_assert_not_entered(fpl); 4440 fpl->status = CACHE_FPL_STATUS_HANDLED; 4441 fpl->line = line; 4442 fpl->dvp = NULL; 4443 fpl->tvp = NULL; 4444 return (error); 4445 } 4446 4447 #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__) 4448 4449 static bool 4450 cache_fpl_terminated(struct cache_fpl *fpl) 4451 { 4452 4453 return (fpl->status != CACHE_FPL_STATUS_UNSET); 4454 } 4455 4456 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 4457 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ 4458 FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | ISRESTARTED | WILLBEDIR | \ 4459 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \ 4460 OPENWRITE | WANTIOCTLCAPS | OPENNAMED) 4461 4462 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 4463 (ISDOTDOT | MAKEENTRY | ISLASTCN) 4464 4465 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 4466 "supported and internal flags overlap"); 4467 4468 static bool 4469 cache_fpl_islastcn(struct nameidata *ndp) 4470 { 4471 4472 return (*ndp->ni_next == 0); 4473 } 4474 4475 static bool 4476 cache_fpl_istrailingslash(struct cache_fpl *fpl) 4477 { 4478 4479 MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf); 4480 return (*(fpl->nulchar - 1) == '/'); 4481 } 4482 4483 static bool 4484 cache_fpl_isdotdot(struct componentname *cnp) 4485 { 4486 4487 if (cnp->cn_namelen == 2 && 4488 cnp->cn_nameptr[1] == '.' 
&& cnp->cn_nameptr[0] == '.') 4489 return (true); 4490 return (false); 4491 } 4492 4493 static bool 4494 cache_can_fplookup(struct cache_fpl *fpl) 4495 { 4496 struct nameidata *ndp; 4497 struct componentname *cnp; 4498 struct thread *td; 4499 4500 ndp = fpl->ndp; 4501 cnp = fpl->cnp; 4502 td = curthread; 4503 4504 if (!atomic_load_char(&cache_fast_lookup_enabled)) { 4505 cache_fpl_aborted_early(fpl); 4506 return (false); 4507 } 4508 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 4509 cache_fpl_aborted_early(fpl); 4510 return (false); 4511 } 4512 if (IN_CAPABILITY_MODE(td) || CAP_TRACING(td)) { 4513 cache_fpl_aborted_early(fpl); 4514 return (false); 4515 } 4516 if (AUDITING_TD(td)) { 4517 cache_fpl_aborted_early(fpl); 4518 return (false); 4519 } 4520 if (ndp->ni_startdir != NULL) { 4521 cache_fpl_aborted_early(fpl); 4522 return (false); 4523 } 4524 if ((cnp->cn_flags & OPENNAMED) != 0) { 4525 cache_fpl_aborted_early(fpl); 4526 return (false); 4527 } 4528 return (true); 4529 } 4530 4531 static int __noinline 4532 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 4533 { 4534 struct nameidata *ndp; 4535 struct componentname *cnp; 4536 int error; 4537 bool fsearch; 4538 4539 ndp = fpl->ndp; 4540 cnp = fpl->cnp; 4541 4542 error = fgetvp_lookup_smr(ndp, vpp, &fsearch); 4543 if (__predict_false(error != 0)) { 4544 return (cache_fpl_aborted(fpl)); 4545 } 4546 fpl->fsearch = fsearch; 4547 if ((*vpp)->v_type != VDIR) { 4548 if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) { 4549 cache_fpl_smr_exit(fpl); 4550 return (cache_fpl_handled_error(fpl, ENOTDIR)); 4551 } 4552 } 4553 return (0); 4554 } 4555 4556 static int __noinline 4557 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 4558 uint32_t hash) 4559 { 4560 struct componentname *cnp; 4561 struct vnode *dvp; 4562 4563 cnp = fpl->cnp; 4564 dvp = fpl->dvp; 4565 4566 cache_fpl_smr_exit(fpl); 4567 if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) 4568 return (cache_fpl_handled_error(fpl, ENOENT)); 4569 else 4570 return (cache_fpl_aborted(fpl)); 4571 } 4572 4573 /* 4574 * The target vnode is not supported, prepare for the slow path to take over. 4575 */ 4576 static int __noinline 4577 cache_fplookup_partial_setup(struct cache_fpl *fpl) 4578 { 4579 struct nameidata *ndp; 4580 struct componentname *cnp; 4581 enum vgetstate dvs; 4582 struct vnode *dvp; 4583 struct pwd *pwd; 4584 seqc_t dvp_seqc; 4585 4586 ndp = fpl->ndp; 4587 cnp = fpl->cnp; 4588 pwd = *(fpl->pwd); 4589 dvp = fpl->dvp; 4590 dvp_seqc = fpl->dvp_seqc; 4591 4592 if (!pwd_hold_smr(pwd)) { 4593 return (cache_fpl_aborted(fpl)); 4594 } 4595 4596 /* 4597 * Note that seqc is checked before the vnode is locked, so by 4598 * the time regular lookup gets to it it may have moved. 4599 * 4600 * Ultimately this does not affect correctness, any lookup errors 4601 * are userspace racing with itself. It is guaranteed that any 4602 * path which ultimately gets found could also have been found 4603 * by regular lookup going all the way in absence of concurrent 4604 * modifications. 
4605 */ 4606 dvs = vget_prep_smr(dvp); 4607 cache_fpl_smr_exit(fpl); 4608 if (__predict_false(dvs == VGET_NONE)) { 4609 pwd_drop(pwd); 4610 return (cache_fpl_aborted(fpl)); 4611 } 4612 4613 vget_finish_ref(dvp, dvs); 4614 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4615 vrele(dvp); 4616 pwd_drop(pwd); 4617 return (cache_fpl_aborted(fpl)); 4618 } 4619 4620 cache_fpl_restore_partial(fpl); 4621 #ifdef INVARIANTS 4622 if (cnp->cn_nameptr != fpl->snd.cn_nameptr) { 4623 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__, 4624 cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf); 4625 } 4626 #endif 4627 4628 ndp->ni_startdir = dvp; 4629 cnp->cn_flags |= MAKEENTRY; 4630 if (cache_fpl_islastcn(ndp)) 4631 cnp->cn_flags |= ISLASTCN; 4632 if (cache_fpl_isdotdot(cnp)) 4633 cnp->cn_flags |= ISDOTDOT; 4634 4635 /* 4636 * Skip potential extra slashes parsing did not take care of. 4637 * cache_fplookup_skip_slashes explains the mechanism. 4638 */ 4639 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 4640 do { 4641 cnp->cn_nameptr++; 4642 cache_fpl_pathlen_dec(fpl); 4643 } while (*(cnp->cn_nameptr) == '/'); 4644 } 4645 4646 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; 4647 #ifdef INVARIANTS 4648 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 4649 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 4650 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 4651 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 4652 } 4653 #endif 4654 return (0); 4655 } 4656 4657 static int 4658 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 4659 { 4660 struct componentname *cnp; 4661 struct vnode *tvp; 4662 seqc_t tvp_seqc; 4663 int error, lkflags; 4664 4665 cnp = fpl->cnp; 4666 tvp = fpl->tvp; 4667 tvp_seqc = fpl->tvp_seqc; 4668 4669 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4670 lkflags = LK_SHARED; 4671 if ((cnp->cn_flags & LOCKSHARED) == 0) 4672 lkflags = LK_EXCLUSIVE; 4673 error = vget_finish(tvp, lkflags, tvs); 4674 if (__predict_false(error != 0)) { 4675 return (cache_fpl_aborted(fpl)); 4676 } 4677 } else { 4678 vget_finish_ref(tvp, tvs); 4679 } 4680 4681 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 4682 if ((cnp->cn_flags & LOCKLEAF) != 0) 4683 vput(tvp); 4684 else 4685 vrele(tvp); 4686 return (cache_fpl_aborted(fpl)); 4687 } 4688 4689 return (cache_fpl_handled(fpl)); 4690 } 4691 4692 /* 4693 * They want to possibly modify the state of the namecache. 4694 */ 4695 static int __noinline 4696 cache_fplookup_final_modifying(struct cache_fpl *fpl) 4697 { 4698 struct nameidata *ndp __diagused; 4699 struct componentname *cnp; 4700 enum vgetstate dvs; 4701 struct vnode *dvp, *tvp; 4702 struct mount *mp; 4703 seqc_t dvp_seqc; 4704 int error; 4705 bool docache; 4706 4707 ndp = fpl->ndp; 4708 cnp = fpl->cnp; 4709 dvp = fpl->dvp; 4710 dvp_seqc = fpl->dvp_seqc; 4711 4712 MPASS(*(cnp->cn_nameptr) != '/'); 4713 MPASS(cache_fpl_islastcn(ndp)); 4714 if ((cnp->cn_flags & LOCKPARENT) == 0) 4715 MPASS((cnp->cn_flags & WANTPARENT) != 0); 4716 MPASS((cnp->cn_flags & TRAILINGSLASH) == 0); 4717 MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE || 4718 cnp->cn_nameiop == RENAME); 4719 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 4720 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 4721 4722 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; 4723 if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) 4724 docache = false; 4725 4726 /* 4727 * Regular lookup nulifies the slash, which we don't do here. 
4728 * Don't take chances with filesystem routines seeing it for 4729 * the last entry. 4730 */ 4731 if (cache_fpl_istrailingslash(fpl)) { 4732 return (cache_fpl_partial(fpl)); 4733 } 4734 4735 mp = atomic_load_ptr(&dvp->v_mount); 4736 if (__predict_false(mp == NULL)) { 4737 return (cache_fpl_aborted(fpl)); 4738 } 4739 4740 if (__predict_false(mp->mnt_flag & MNT_RDONLY)) { 4741 cache_fpl_smr_exit(fpl); 4742 /* 4743 * Original code keeps not checking for CREATE which 4744 * might be a bug. For now let the old lookup decide. 4745 */ 4746 if (cnp->cn_nameiop == CREATE) { 4747 return (cache_fpl_aborted(fpl)); 4748 } 4749 return (cache_fpl_handled_error(fpl, EROFS)); 4750 } 4751 4752 if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) { 4753 cache_fpl_smr_exit(fpl); 4754 return (cache_fpl_handled_error(fpl, EEXIST)); 4755 } 4756 4757 /* 4758 * Secure access to dvp; check cache_fplookup_partial_setup for 4759 * reasoning. 4760 * 4761 * XXX At least UFS requires its lookup routine to be called for 4762 * the last path component, which leads to some level of complication 4763 * and inefficiency: 4764 * - the target routine always locks the target vnode, but our caller 4765 * may not need it locked 4766 * - some of the VOP machinery asserts that the parent is locked, which 4767 * once more may be not required 4768 * 4769 * TODO: add a flag for filesystems which don't need this. 4770 */ 4771 dvs = vget_prep_smr(dvp); 4772 cache_fpl_smr_exit(fpl); 4773 if (__predict_false(dvs == VGET_NONE)) { 4774 return (cache_fpl_aborted(fpl)); 4775 } 4776 4777 vget_finish_ref(dvp, dvs); 4778 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4779 vrele(dvp); 4780 return (cache_fpl_aborted(fpl)); 4781 } 4782 4783 error = vn_lock(dvp, LK_EXCLUSIVE); 4784 if (__predict_false(error != 0)) { 4785 vrele(dvp); 4786 return (cache_fpl_aborted(fpl)); 4787 } 4788 4789 tvp = NULL; 4790 cnp->cn_flags |= ISLASTCN; 4791 if (docache) 4792 cnp->cn_flags |= MAKEENTRY; 4793 if (cache_fpl_isdotdot(cnp)) 4794 cnp->cn_flags |= ISDOTDOT; 4795 cnp->cn_lkflags = LK_EXCLUSIVE; 4796 error = VOP_LOOKUP(dvp, &tvp, cnp); 4797 switch (error) { 4798 case EJUSTRETURN: 4799 case 0: 4800 break; 4801 case ENOTDIR: 4802 case ENOENT: 4803 vput(dvp); 4804 return (cache_fpl_handled_error(fpl, error)); 4805 default: 4806 vput(dvp); 4807 return (cache_fpl_aborted(fpl)); 4808 } 4809 4810 fpl->tvp = tvp; 4811 4812 if (tvp == NULL) { 4813 MPASS(error == EJUSTRETURN); 4814 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4815 VOP_UNLOCK(dvp); 4816 } 4817 return (cache_fpl_handled(fpl)); 4818 } 4819 4820 /* 4821 * There are very hairy corner cases concerning various flag combinations 4822 * and locking state. In particular here we only hold one lock instead of 4823 * two. 4824 * 4825 * Skip the complexity as it is of no significance for normal workloads. 4826 */ 4827 if (__predict_false(tvp == dvp)) { 4828 vput(dvp); 4829 vrele(tvp); 4830 return (cache_fpl_aborted(fpl)); 4831 } 4832 4833 /* 4834 * If they want the symlink itself we are fine, but if they want to 4835 * follow it regular lookup has to be engaged. 4836 */ 4837 if (tvp->v_type == VLNK) { 4838 if ((cnp->cn_flags & FOLLOW) != 0) { 4839 vput(dvp); 4840 vput(tvp); 4841 return (cache_fpl_aborted(fpl)); 4842 } 4843 } 4844 4845 /* 4846 * Since we expect this to be the terminal vnode it should almost never 4847 * be a mount point. 
4848 */ 4849 if (__predict_false(cache_fplookup_is_mp(fpl))) { 4850 vput(dvp); 4851 vput(tvp); 4852 return (cache_fpl_aborted(fpl)); 4853 } 4854 4855 if ((cnp->cn_flags & FAILIFEXISTS) != 0) { 4856 vput(dvp); 4857 vput(tvp); 4858 return (cache_fpl_handled_error(fpl, EEXIST)); 4859 } 4860 4861 if ((cnp->cn_flags & LOCKLEAF) == 0) { 4862 VOP_UNLOCK(tvp); 4863 } 4864 4865 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4866 VOP_UNLOCK(dvp); 4867 } 4868 4869 return (cache_fpl_handled(fpl)); 4870 } 4871 4872 static int __noinline 4873 cache_fplookup_modifying(struct cache_fpl *fpl) 4874 { 4875 struct nameidata *ndp; 4876 4877 ndp = fpl->ndp; 4878 4879 if (!cache_fpl_islastcn(ndp)) { 4880 return (cache_fpl_partial(fpl)); 4881 } 4882 return (cache_fplookup_final_modifying(fpl)); 4883 } 4884 4885 static int __noinline 4886 cache_fplookup_final_withparent(struct cache_fpl *fpl) 4887 { 4888 struct componentname *cnp; 4889 enum vgetstate dvs, tvs; 4890 struct vnode *dvp, *tvp; 4891 seqc_t dvp_seqc; 4892 int error; 4893 4894 cnp = fpl->cnp; 4895 dvp = fpl->dvp; 4896 dvp_seqc = fpl->dvp_seqc; 4897 tvp = fpl->tvp; 4898 4899 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 4900 4901 /* 4902 * This is less efficient than it can be for simplicity. 4903 */ 4904 dvs = vget_prep_smr(dvp); 4905 if (__predict_false(dvs == VGET_NONE)) { 4906 return (cache_fpl_aborted(fpl)); 4907 } 4908 tvs = vget_prep_smr(tvp); 4909 if (__predict_false(tvs == VGET_NONE)) { 4910 cache_fpl_smr_exit(fpl); 4911 vget_abort(dvp, dvs); 4912 return (cache_fpl_aborted(fpl)); 4913 } 4914 4915 cache_fpl_smr_exit(fpl); 4916 4917 if ((cnp->cn_flags & LOCKPARENT) != 0) { 4918 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 4919 if (__predict_false(error != 0)) { 4920 vget_abort(tvp, tvs); 4921 return (cache_fpl_aborted(fpl)); 4922 } 4923 } else { 4924 vget_finish_ref(dvp, dvs); 4925 } 4926 4927 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4928 vget_abort(tvp, tvs); 4929 if ((cnp->cn_flags & LOCKPARENT) != 0) 4930 vput(dvp); 4931 else 4932 vrele(dvp); 4933 return (cache_fpl_aborted(fpl)); 4934 } 4935 4936 error = cache_fplookup_final_child(fpl, tvs); 4937 if (__predict_false(error != 0)) { 4938 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED || 4939 fpl->status == CACHE_FPL_STATUS_DESTROYED); 4940 if ((cnp->cn_flags & LOCKPARENT) != 0) 4941 vput(dvp); 4942 else 4943 vrele(dvp); 4944 return (error); 4945 } 4946 4947 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 4948 return (0); 4949 } 4950 4951 static int 4952 cache_fplookup_final(struct cache_fpl *fpl) 4953 { 4954 struct componentname *cnp; 4955 enum vgetstate tvs; 4956 struct vnode *dvp, *tvp; 4957 seqc_t dvp_seqc; 4958 4959 cnp = fpl->cnp; 4960 dvp = fpl->dvp; 4961 dvp_seqc = fpl->dvp_seqc; 4962 tvp = fpl->tvp; 4963 4964 MPASS(*(cnp->cn_nameptr) != '/'); 4965 4966 if (cnp->cn_nameiop != LOOKUP) { 4967 return (cache_fplookup_final_modifying(fpl)); 4968 } 4969 4970 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 4971 return (cache_fplookup_final_withparent(fpl)); 4972 4973 tvs = vget_prep_smr(tvp); 4974 if (__predict_false(tvs == VGET_NONE)) { 4975 return (cache_fpl_partial(fpl)); 4976 } 4977 4978 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4979 cache_fpl_smr_exit(fpl); 4980 vget_abort(tvp, tvs); 4981 return (cache_fpl_aborted(fpl)); 4982 } 4983 4984 cache_fpl_smr_exit(fpl); 4985 return (cache_fplookup_final_child(fpl, tvs)); 4986 } 4987 4988 /* 4989 * Comment from locked lookup: 4990 * Check for degenerate name (e.g. / or "") which is a way of talking about a 4991 * directory, e.g. like "/." 
or ".". 4992 */ 4993 static int __noinline 4994 cache_fplookup_degenerate(struct cache_fpl *fpl) 4995 { 4996 struct componentname *cnp; 4997 struct vnode *dvp; 4998 enum vgetstate dvs; 4999 int error, lkflags; 5000 #ifdef INVARIANTS 5001 char *cp; 5002 #endif 5003 5004 fpl->tvp = fpl->dvp; 5005 fpl->tvp_seqc = fpl->dvp_seqc; 5006 5007 cnp = fpl->cnp; 5008 dvp = fpl->dvp; 5009 5010 #ifdef INVARIANTS 5011 for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) { 5012 KASSERT(*cp == '/', 5013 ("%s: encountered non-slash; string [%s]\n", __func__, 5014 cnp->cn_pnbuf)); 5015 } 5016 #endif 5017 5018 if (__predict_false(cnp->cn_nameiop != LOOKUP)) { 5019 cache_fpl_smr_exit(fpl); 5020 return (cache_fpl_handled_error(fpl, EISDIR)); 5021 } 5022 5023 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) { 5024 return (cache_fplookup_final_withparent(fpl)); 5025 } 5026 5027 dvs = vget_prep_smr(dvp); 5028 cache_fpl_smr_exit(fpl); 5029 if (__predict_false(dvs == VGET_NONE)) { 5030 return (cache_fpl_aborted(fpl)); 5031 } 5032 5033 if ((cnp->cn_flags & LOCKLEAF) != 0) { 5034 lkflags = LK_SHARED; 5035 if ((cnp->cn_flags & LOCKSHARED) == 0) 5036 lkflags = LK_EXCLUSIVE; 5037 error = vget_finish(dvp, lkflags, dvs); 5038 if (__predict_false(error != 0)) { 5039 return (cache_fpl_aborted(fpl)); 5040 } 5041 } else { 5042 vget_finish_ref(dvp, dvs); 5043 } 5044 return (cache_fpl_handled(fpl)); 5045 } 5046 5047 static int __noinline 5048 cache_fplookup_emptypath(struct cache_fpl *fpl) 5049 { 5050 struct nameidata *ndp; 5051 struct componentname *cnp; 5052 enum vgetstate tvs; 5053 struct vnode *tvp; 5054 int error, lkflags; 5055 5056 fpl->tvp = fpl->dvp; 5057 fpl->tvp_seqc = fpl->dvp_seqc; 5058 5059 ndp = fpl->ndp; 5060 cnp = fpl->cnp; 5061 tvp = fpl->tvp; 5062 5063 MPASS(*cnp->cn_pnbuf == '\0'); 5064 5065 if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) { 5066 cache_fpl_smr_exit(fpl); 5067 return (cache_fpl_handled_error(fpl, ENOENT)); 5068 } 5069 5070 MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0); 5071 5072 tvs = vget_prep_smr(tvp); 5073 cache_fpl_smr_exit(fpl); 5074 if (__predict_false(tvs == VGET_NONE)) { 5075 return (cache_fpl_aborted(fpl)); 5076 } 5077 5078 if ((cnp->cn_flags & LOCKLEAF) != 0) { 5079 lkflags = LK_SHARED; 5080 if ((cnp->cn_flags & LOCKSHARED) == 0) 5081 lkflags = LK_EXCLUSIVE; 5082 error = vget_finish(tvp, lkflags, tvs); 5083 if (__predict_false(error != 0)) { 5084 return (cache_fpl_aborted(fpl)); 5085 } 5086 } else { 5087 vget_finish_ref(tvp, tvs); 5088 } 5089 5090 ndp->ni_resflags |= NIRES_EMPTYPATH; 5091 return (cache_fpl_handled(fpl)); 5092 } 5093 5094 static int __noinline 5095 cache_fplookup_noentry(struct cache_fpl *fpl) 5096 { 5097 struct nameidata *ndp; 5098 struct componentname *cnp; 5099 enum vgetstate dvs; 5100 struct vnode *dvp, *tvp; 5101 seqc_t dvp_seqc; 5102 int error; 5103 5104 ndp = fpl->ndp; 5105 cnp = fpl->cnp; 5106 dvp = fpl->dvp; 5107 dvp_seqc = fpl->dvp_seqc; 5108 5109 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 5110 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 5111 if (cnp->cn_nameiop == LOOKUP) 5112 MPASS((cnp->cn_flags & NOCACHE) == 0); 5113 MPASS(!cache_fpl_isdotdot(cnp)); 5114 5115 /* 5116 * Hack: delayed name len checking. 
5117 */ 5118 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 5119 cache_fpl_smr_exit(fpl); 5120 return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); 5121 } 5122 5123 if (cnp->cn_nameptr[0] == '/') { 5124 return (cache_fplookup_skip_slashes(fpl)); 5125 } 5126 5127 if (cnp->cn_pnbuf[0] == '\0') { 5128 return (cache_fplookup_emptypath(fpl)); 5129 } 5130 5131 if (cnp->cn_nameptr[0] == '\0') { 5132 if (fpl->tvp == NULL) { 5133 return (cache_fplookup_degenerate(fpl)); 5134 } 5135 return (cache_fplookup_trailingslash(fpl)); 5136 } 5137 5138 if (cnp->cn_nameiop != LOOKUP) { 5139 fpl->tvp = NULL; 5140 return (cache_fplookup_modifying(fpl)); 5141 } 5142 5143 /* 5144 * Only try to fill in the component if it is the last one, 5145 * otherwise not only there may be several to handle but the 5146 * walk may be complicated. 5147 */ 5148 if (!cache_fpl_islastcn(ndp)) { 5149 return (cache_fpl_partial(fpl)); 5150 } 5151 5152 /* 5153 * Regular lookup nulifies the slash, which we don't do here. 5154 * Don't take chances with filesystem routines seeing it for 5155 * the last entry. 5156 */ 5157 if (cache_fpl_istrailingslash(fpl)) { 5158 return (cache_fpl_partial(fpl)); 5159 } 5160 5161 /* 5162 * Secure access to dvp; check cache_fplookup_partial_setup for 5163 * reasoning. 5164 */ 5165 dvs = vget_prep_smr(dvp); 5166 cache_fpl_smr_exit(fpl); 5167 if (__predict_false(dvs == VGET_NONE)) { 5168 return (cache_fpl_aborted(fpl)); 5169 } 5170 5171 vget_finish_ref(dvp, dvs); 5172 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 5173 vrele(dvp); 5174 return (cache_fpl_aborted(fpl)); 5175 } 5176 5177 error = vn_lock(dvp, LK_SHARED); 5178 if (__predict_false(error != 0)) { 5179 vrele(dvp); 5180 return (cache_fpl_aborted(fpl)); 5181 } 5182 5183 tvp = NULL; 5184 /* 5185 * TODO: provide variants which don't require locking either vnode. 5186 */ 5187 cnp->cn_flags |= ISLASTCN | MAKEENTRY; 5188 cnp->cn_lkflags = LK_SHARED; 5189 if ((cnp->cn_flags & LOCKSHARED) == 0) { 5190 cnp->cn_lkflags = LK_EXCLUSIVE; 5191 } 5192 error = VOP_LOOKUP(dvp, &tvp, cnp); 5193 switch (error) { 5194 case EJUSTRETURN: 5195 case 0: 5196 break; 5197 case ENOTDIR: 5198 case ENOENT: 5199 vput(dvp); 5200 return (cache_fpl_handled_error(fpl, error)); 5201 default: 5202 vput(dvp); 5203 return (cache_fpl_aborted(fpl)); 5204 } 5205 5206 fpl->tvp = tvp; 5207 5208 if (tvp == NULL) { 5209 MPASS(error == EJUSTRETURN); 5210 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 5211 vput(dvp); 5212 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 5213 VOP_UNLOCK(dvp); 5214 } 5215 return (cache_fpl_handled(fpl)); 5216 } 5217 5218 if (tvp->v_type == VLNK) { 5219 if ((cnp->cn_flags & FOLLOW) != 0) { 5220 vput(dvp); 5221 vput(tvp); 5222 return (cache_fpl_aborted(fpl)); 5223 } 5224 } 5225 5226 if (__predict_false(cache_fplookup_is_mp(fpl))) { 5227 vput(dvp); 5228 vput(tvp); 5229 return (cache_fpl_aborted(fpl)); 5230 } 5231 5232 if ((cnp->cn_flags & LOCKLEAF) == 0) { 5233 VOP_UNLOCK(tvp); 5234 } 5235 5236 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 5237 vput(dvp); 5238 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 5239 VOP_UNLOCK(dvp); 5240 } 5241 return (cache_fpl_handled(fpl)); 5242 } 5243 5244 static int __noinline 5245 cache_fplookup_dot(struct cache_fpl *fpl) 5246 { 5247 int error; 5248 5249 MPASS(!seqc_in_modify(fpl->dvp_seqc)); 5250 5251 if (__predict_false(fpl->dvp->v_type != VDIR)) { 5252 cache_fpl_smr_exit(fpl); 5253 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5254 } 5255 5256 /* 5257 * Just re-assign the value. 
seqc will be checked later for the first 5258 * non-dot path component in line and/or before deciding to return the 5259 * vnode. 5260 */ 5261 fpl->tvp = fpl->dvp; 5262 fpl->tvp_seqc = fpl->dvp_seqc; 5263 5264 SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp); 5265 5266 error = 0; 5267 if (cache_fplookup_is_mp(fpl)) { 5268 error = cache_fplookup_cross_mount(fpl); 5269 } 5270 return (error); 5271 } 5272 5273 static int __noinline 5274 cache_fplookup_dotdot(struct cache_fpl *fpl) 5275 { 5276 struct nameidata *ndp; 5277 struct componentname *cnp; 5278 struct namecache *ncp; 5279 struct vnode *dvp; 5280 struct prison *pr; 5281 u_char nc_flag; 5282 5283 ndp = fpl->ndp; 5284 cnp = fpl->cnp; 5285 dvp = fpl->dvp; 5286 5287 MPASS(cache_fpl_isdotdot(cnp)); 5288 5289 /* 5290 * XXX this is racy the same way regular lookup is 5291 */ 5292 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 5293 pr = pr->pr_parent) 5294 if (dvp == pr->pr_root) 5295 break; 5296 5297 if (dvp == ndp->ni_rootdir || 5298 dvp == ndp->ni_topdir || 5299 dvp == rootvnode || 5300 pr != NULL) { 5301 fpl->tvp = dvp; 5302 fpl->tvp_seqc = vn_seqc_read_any(dvp); 5303 if (seqc_in_modify(fpl->tvp_seqc)) { 5304 return (cache_fpl_aborted(fpl)); 5305 } 5306 return (0); 5307 } 5308 5309 if ((dvp->v_vflag & VV_ROOT) != 0) { 5310 /* 5311 * TODO 5312 * The opposite of climb mount is needed here. 5313 */ 5314 return (cache_fpl_partial(fpl)); 5315 } 5316 5317 if (__predict_false(dvp->v_type != VDIR)) { 5318 cache_fpl_smr_exit(fpl); 5319 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5320 } 5321 5322 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); 5323 if (ncp == NULL) { 5324 return (cache_fpl_aborted(fpl)); 5325 } 5326 5327 nc_flag = atomic_load_char(&ncp->nc_flag); 5328 if ((nc_flag & NCF_ISDOTDOT) != 0) { 5329 if ((nc_flag & NCF_NEGATIVE) != 0) 5330 return (cache_fpl_aborted(fpl)); 5331 fpl->tvp = ncp->nc_vp; 5332 } else { 5333 fpl->tvp = ncp->nc_dvp; 5334 } 5335 5336 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 5337 if (seqc_in_modify(fpl->tvp_seqc)) { 5338 return (cache_fpl_partial(fpl)); 5339 } 5340 5341 /* 5342 * Acquire fence provided by vn_seqc_read_any above. 5343 */ 5344 if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) { 5345 return (cache_fpl_aborted(fpl)); 5346 } 5347 5348 if (!cache_ncp_canuse(ncp)) { 5349 return (cache_fpl_aborted(fpl)); 5350 } 5351 5352 return (0); 5353 } 5354 5355 static int __noinline 5356 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) 5357 { 5358 u_char nc_flag __diagused; 5359 bool neg_promote; 5360 5361 #ifdef INVARIANTS 5362 nc_flag = atomic_load_char(&ncp->nc_flag); 5363 MPASS((nc_flag & NCF_NEGATIVE) != 0); 5364 #endif 5365 /* 5366 * If they want to create an entry we need to replace this one. 5367 */ 5368 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 5369 fpl->tvp = NULL; 5370 return (cache_fplookup_modifying(fpl)); 5371 } 5372 neg_promote = cache_neg_hit_prep(ncp); 5373 if (!cache_fpl_neg_ncp_canuse(ncp)) { 5374 cache_neg_hit_abort(ncp); 5375 return (cache_fpl_partial(fpl)); 5376 } 5377 if (neg_promote) { 5378 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 5379 } 5380 cache_neg_hit_finish(ncp); 5381 cache_fpl_smr_exit(fpl); 5382 return (cache_fpl_handled_error(fpl, ENOENT)); 5383 } 5384 5385 /* 5386 * Resolve a symlink. Called by filesystem-specific routines. 5387 * 5388 * Code flow is: 5389 * ... 
-> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve 5390 */ 5391 int 5392 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len) 5393 { 5394 struct nameidata *ndp; 5395 struct componentname *cnp; 5396 size_t adjust; 5397 5398 ndp = fpl->ndp; 5399 cnp = fpl->cnp; 5400 5401 if (__predict_false(len == 0)) { 5402 return (ENOENT); 5403 } 5404 5405 if (__predict_false(len > MAXPATHLEN - 2)) { 5406 if (cache_fpl_istrailingslash(fpl)) { 5407 return (EAGAIN); 5408 } 5409 } 5410 5411 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1; 5412 #ifdef INVARIANTS 5413 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 5414 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 5415 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 5416 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 5417 } 5418 #endif 5419 5420 if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) { 5421 return (ENAMETOOLONG); 5422 } 5423 5424 if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) { 5425 return (ELOOP); 5426 } 5427 5428 adjust = len; 5429 if (ndp->ni_pathlen > 1) { 5430 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen); 5431 } else { 5432 if (cache_fpl_istrailingslash(fpl)) { 5433 adjust = len + 1; 5434 cnp->cn_pnbuf[len] = '/'; 5435 cnp->cn_pnbuf[len + 1] = '\0'; 5436 } else { 5437 cnp->cn_pnbuf[len] = '\0'; 5438 } 5439 } 5440 bcopy(string, cnp->cn_pnbuf, len); 5441 5442 ndp->ni_pathlen += adjust; 5443 cache_fpl_pathlen_add(fpl, adjust); 5444 cnp->cn_nameptr = cnp->cn_pnbuf; 5445 fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; 5446 fpl->tvp = NULL; 5447 return (0); 5448 } 5449 5450 static int __noinline 5451 cache_fplookup_symlink(struct cache_fpl *fpl) 5452 { 5453 struct mount *mp; 5454 struct nameidata *ndp; 5455 struct componentname *cnp; 5456 struct vnode *dvp, *tvp; 5457 struct pwd *pwd; 5458 int error; 5459 5460 ndp = fpl->ndp; 5461 cnp = fpl->cnp; 5462 dvp = fpl->dvp; 5463 tvp = fpl->tvp; 5464 pwd = *(fpl->pwd); 5465 5466 if (cache_fpl_islastcn(ndp)) { 5467 if ((cnp->cn_flags & FOLLOW) == 0) { 5468 return (cache_fplookup_final(fpl)); 5469 } 5470 } 5471 5472 mp = atomic_load_ptr(&dvp->v_mount); 5473 if (__predict_false(mp == NULL)) { 5474 return (cache_fpl_aborted(fpl)); 5475 } 5476 5477 /* 5478 * Note this check races against setting the flag just like regular 5479 * lookup. 5480 */ 5481 if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) { 5482 cache_fpl_smr_exit(fpl); 5483 return (cache_fpl_handled_error(fpl, EACCES)); 5484 } 5485 5486 error = VOP_FPLOOKUP_SYMLINK(tvp, fpl); 5487 if (__predict_false(error != 0)) { 5488 switch (error) { 5489 case EAGAIN: 5490 return (cache_fpl_partial(fpl)); 5491 case ENOENT: 5492 case ENAMETOOLONG: 5493 case ELOOP: 5494 cache_fpl_smr_exit(fpl); 5495 return (cache_fpl_handled_error(fpl, error)); 5496 default: 5497 return (cache_fpl_aborted(fpl)); 5498 } 5499 } 5500 5501 if (*(cnp->cn_nameptr) == '/') { 5502 fpl->dvp = cache_fpl_handle_root(fpl); 5503 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 5504 if (seqc_in_modify(fpl->dvp_seqc)) { 5505 return (cache_fpl_aborted(fpl)); 5506 } 5507 /* 5508 * The main loop assumes that ->dvp points to a vnode belonging 5509 * to a filesystem which can do lockless lookup, but the absolute 5510 * symlink can be wandering off to one which does not. 
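		 * (For instance, the link target may reside on a mount which
		 * does not have MNTK_FPLOOKUP set.)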
5511 */ 5512 mp = atomic_load_ptr(&fpl->dvp->v_mount); 5513 if (__predict_false(mp == NULL)) { 5514 return (cache_fpl_aborted(fpl)); 5515 } 5516 if (!cache_fplookup_mp_supported(mp)) { 5517 cache_fpl_checkpoint(fpl); 5518 return (cache_fpl_partial(fpl)); 5519 } 5520 if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir)) { 5521 return (cache_fpl_aborted(fpl)); 5522 } 5523 } 5524 return (0); 5525 } 5526 5527 static int 5528 cache_fplookup_next(struct cache_fpl *fpl) 5529 { 5530 struct componentname *cnp; 5531 struct namecache *ncp; 5532 struct vnode *dvp, *tvp; 5533 u_char nc_flag; 5534 uint32_t hash; 5535 int error; 5536 5537 cnp = fpl->cnp; 5538 dvp = fpl->dvp; 5539 hash = fpl->hash; 5540 5541 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 5542 if (cnp->cn_namelen == 1) { 5543 return (cache_fplookup_dot(fpl)); 5544 } 5545 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 5546 return (cache_fplookup_dotdot(fpl)); 5547 } 5548 } 5549 5550 MPASS(!cache_fpl_isdotdot(cnp)); 5551 5552 ncp = cache_ncp_find(dvp, cnp, hash); 5553 if (__predict_false(ncp == NULL)) { 5554 return (cache_fplookup_noentry(fpl)); 5555 } 5556 5557 tvp = atomic_load_ptr(&ncp->nc_vp); 5558 nc_flag = atomic_load_char(&ncp->nc_flag); 5559 if ((nc_flag & NCF_NEGATIVE) != 0) { 5560 return (cache_fplookup_neg(fpl, ncp, hash)); 5561 } 5562 5563 if (!cache_ncp_canuse(ncp)) { 5564 return (cache_fpl_partial(fpl)); 5565 } 5566 5567 fpl->tvp = tvp; 5568 fpl->tvp_seqc = vn_seqc_read_any(tvp); 5569 if (seqc_in_modify(fpl->tvp_seqc)) { 5570 return (cache_fpl_partial(fpl)); 5571 } 5572 5573 counter_u64_add(numposhits, 1); 5574 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 5575 5576 error = 0; 5577 if (cache_fplookup_is_mp(fpl)) { 5578 error = cache_fplookup_cross_mount(fpl); 5579 } 5580 return (error); 5581 } 5582 5583 static bool 5584 cache_fplookup_mp_supported(struct mount *mp) 5585 { 5586 5587 MPASS(mp != NULL); 5588 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 5589 return (false); 5590 return (true); 5591 } 5592 5593 /* 5594 * Walk up the mount stack (if any). 5595 * 5596 * Correctness is provided in the following ways: 5597 * - all vnodes are protected from freeing with SMR 5598 * - struct mount objects are type stable making them always safe to access 5599 * - stability of the particular mount is provided by busying it 5600 * - relationship between the vnode which is mounted on and the mount is 5601 * verified with the vnode sequence counter after busying 5602 * - association between root vnode of the mount and the mount is protected 5603 * by busy 5604 * 5605 * From that point on we can read the sequence counter of the root vnode 5606 * and get the next mount on the stack (if any) using the same protection. 5607 * 5608 * By the end of successful walk we are guaranteed the reached state was 5609 * indeed present at least at some point which matches the regular lookup. 
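 *
 * For example, with one filesystem mounted on /mnt and another mounted on
 * top of the first one's root vnode, the walk busies each mount in turn and
 * terminates with the root vnode of the topmost filesystem.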
5610 */ 5611 static int __noinline 5612 cache_fplookup_climb_mount(struct cache_fpl *fpl) 5613 { 5614 struct mount *mp, *prev_mp; 5615 struct mount_pcpu *mpcpu, *prev_mpcpu; 5616 struct vnode *vp; 5617 seqc_t vp_seqc; 5618 5619 vp = fpl->tvp; 5620 vp_seqc = fpl->tvp_seqc; 5621 5622 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp); 5623 mp = atomic_load_ptr(&vp->v_mountedhere); 5624 if (__predict_false(mp == NULL)) { 5625 return (0); 5626 } 5627 5628 prev_mp = NULL; 5629 for (;;) { 5630 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 5631 if (prev_mp != NULL) 5632 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5633 return (cache_fpl_partial(fpl)); 5634 } 5635 if (prev_mp != NULL) 5636 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5637 if (!vn_seqc_consistent(vp, vp_seqc)) { 5638 vfs_op_thread_exit_crit(mp, mpcpu); 5639 return (cache_fpl_partial(fpl)); 5640 } 5641 if (!cache_fplookup_mp_supported(mp)) { 5642 vfs_op_thread_exit_crit(mp, mpcpu); 5643 return (cache_fpl_partial(fpl)); 5644 } 5645 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5646 if (vp == NULL) { 5647 vfs_op_thread_exit_crit(mp, mpcpu); 5648 return (cache_fpl_partial(fpl)); 5649 } 5650 vp_seqc = vn_seqc_read_any(vp); 5651 if (seqc_in_modify(vp_seqc)) { 5652 vfs_op_thread_exit_crit(mp, mpcpu); 5653 return (cache_fpl_partial(fpl)); 5654 } 5655 prev_mp = mp; 5656 prev_mpcpu = mpcpu; 5657 mp = atomic_load_ptr(&vp->v_mountedhere); 5658 if (mp == NULL) 5659 break; 5660 } 5661 5662 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5663 fpl->tvp = vp; 5664 fpl->tvp_seqc = vp_seqc; 5665 return (0); 5666 } 5667 5668 static int __noinline 5669 cache_fplookup_cross_mount(struct cache_fpl *fpl) 5670 { 5671 struct mount *mp; 5672 struct mount_pcpu *mpcpu; 5673 struct vnode *vp; 5674 seqc_t vp_seqc; 5675 5676 vp = fpl->tvp; 5677 vp_seqc = fpl->tvp_seqc; 5678 5679 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp); 5680 mp = atomic_load_ptr(&vp->v_mountedhere); 5681 if (__predict_false(mp == NULL)) { 5682 return (0); 5683 } 5684 5685 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 5686 return (cache_fpl_partial(fpl)); 5687 } 5688 if (!vn_seqc_consistent(vp, vp_seqc)) { 5689 vfs_op_thread_exit_crit(mp, mpcpu); 5690 return (cache_fpl_partial(fpl)); 5691 } 5692 if (!cache_fplookup_mp_supported(mp)) { 5693 vfs_op_thread_exit_crit(mp, mpcpu); 5694 return (cache_fpl_partial(fpl)); 5695 } 5696 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5697 if (__predict_false(vp == NULL)) { 5698 vfs_op_thread_exit_crit(mp, mpcpu); 5699 return (cache_fpl_partial(fpl)); 5700 } 5701 vp_seqc = vn_seqc_read_any(vp); 5702 vfs_op_thread_exit_crit(mp, mpcpu); 5703 if (seqc_in_modify(vp_seqc)) { 5704 return (cache_fpl_partial(fpl)); 5705 } 5706 mp = atomic_load_ptr(&vp->v_mountedhere); 5707 if (__predict_false(mp != NULL)) { 5708 /* 5709 * There are possibly more mount points on top. 5710 * Normally this does not happen so for simplicity just start 5711 * over. 5712 */ 5713 return (cache_fplookup_climb_mount(fpl)); 5714 } 5715 5716 fpl->tvp = vp; 5717 fpl->tvp_seqc = vp_seqc; 5718 return (0); 5719 } 5720 5721 /* 5722 * Check if a vnode is mounted on. 5723 */ 5724 static bool 5725 cache_fplookup_is_mp(struct cache_fpl *fpl) 5726 { 5727 struct vnode *vp; 5728 5729 vp = fpl->tvp; 5730 return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0); 5731 } 5732 5733 /* 5734 * Parse the path. 5735 * 5736 * The code was originally copy-pasted from regular lookup and despite 5737 * clean ups leaves performance on the table. 
Any modifications here
 * must take into account that in case of fallback the resulting
 * nameidata state has to be compatible with the original.
 */

/*
 * Debug ni_pathlen tracking.
 */
#ifdef INVARIANTS
static void
cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
{

	fpl->debug.ni_pathlen += n;
	KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
	    ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
}

static void
cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
{

	fpl->debug.ni_pathlen -= n;
	KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
	    ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
}

static void
cache_fpl_pathlen_inc(struct cache_fpl *fpl)
{

	cache_fpl_pathlen_add(fpl, 1);
}

static void
cache_fpl_pathlen_dec(struct cache_fpl *fpl)
{

	cache_fpl_pathlen_sub(fpl, 1);
}
#else
static void
cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
{
}

static void
cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
{
}

static void
cache_fpl_pathlen_inc(struct cache_fpl *fpl)
{
}

static void
cache_fpl_pathlen_dec(struct cache_fpl *fpl)
{
}
#endif

static void
cache_fplookup_parse(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct vnode *dvp;
	char *cp;
	uint32_t hash;

	ndp = fpl->ndp;
	cnp = fpl->cnp;
	dvp = fpl->dvp;

	/*
	 * Find the end of this path component; it is either / or nul.
	 *
	 * Store / as a temporary sentinel so that we only have one character
	 * to test for. Pathnames tend to be short, so this should not result
	 * in cache misses.
	 *
	 * TODO: fix this to be word-sized.
	 */
	MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf);
	KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
	    ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
	    __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
	    fpl->nulchar, cnp->cn_pnbuf));
	KASSERT(*fpl->nulchar == '\0',
	    ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
	    cnp->cn_pnbuf));
	hash = cache_get_hash_iter_start(dvp);
	*fpl->nulchar = '/';
	for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
		KASSERT(*cp != '\0',
		    ("%s: encountered unexpected nul; string [%s]\n", __func__,
		    cnp->cn_nameptr));
		hash = cache_get_hash_iter(*cp, hash);
		continue;
	}
	*fpl->nulchar = '\0';
	fpl->hash = cache_get_hash_iter_finish(hash);

	cnp->cn_namelen = cp - cnp->cn_nameptr;
	cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);

#ifdef INVARIANTS
	/*
	 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since
	 * we are going to fail this lookup with ENAMETOOLONG (see below).
	 */
	if (cnp->cn_namelen <= NAME_MAX) {
		if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
			panic("%s: mismatched hash for [%s] len %ld", __func__,
			    cnp->cn_nameptr, cnp->cn_namelen);
		}
	}
#endif

	/*
	 * Hack: we have to check if the found path component's length exceeds
	 * NAME_MAX.
However, the condition is very rarely true and check can 5860 * be elided in the common case -- if an entry was found in the cache, 5861 * then it could not have been too long to begin with. 5862 */ 5863 ndp->ni_next = cp; 5864 } 5865 5866 static void 5867 cache_fplookup_parse_advance(struct cache_fpl *fpl) 5868 { 5869 struct nameidata *ndp; 5870 struct componentname *cnp; 5871 5872 ndp = fpl->ndp; 5873 cnp = fpl->cnp; 5874 5875 cnp->cn_nameptr = ndp->ni_next; 5876 KASSERT(*(cnp->cn_nameptr) == '/', 5877 ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__, 5878 cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf)); 5879 cnp->cn_nameptr++; 5880 cache_fpl_pathlen_dec(fpl); 5881 } 5882 5883 /* 5884 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry. 5885 * 5886 * Lockless lookup tries to elide checking for spurious slashes and should they 5887 * be present is guaranteed to fail to find an entry. In this case the caller 5888 * must check if the name starts with a slash and call this routine. It is 5889 * going to fast forward across the spurious slashes and set the state up for 5890 * retry. 5891 */ 5892 static int __noinline 5893 cache_fplookup_skip_slashes(struct cache_fpl *fpl) 5894 { 5895 struct nameidata *ndp; 5896 struct componentname *cnp; 5897 5898 ndp = fpl->ndp; 5899 cnp = fpl->cnp; 5900 5901 MPASS(*(cnp->cn_nameptr) == '/'); 5902 do { 5903 cnp->cn_nameptr++; 5904 cache_fpl_pathlen_dec(fpl); 5905 } while (*(cnp->cn_nameptr) == '/'); 5906 5907 /* 5908 * Go back to one slash so that cache_fplookup_parse_advance has 5909 * something to skip. 5910 */ 5911 cnp->cn_nameptr--; 5912 cache_fpl_pathlen_inc(fpl); 5913 5914 /* 5915 * cache_fplookup_parse_advance starts from ndp->ni_next 5916 */ 5917 ndp->ni_next = cnp->cn_nameptr; 5918 5919 /* 5920 * See cache_fplookup_dot. 5921 */ 5922 fpl->tvp = fpl->dvp; 5923 fpl->tvp_seqc = fpl->dvp_seqc; 5924 5925 return (0); 5926 } 5927 5928 /* 5929 * Handle trailing slashes (e.g., "foo/"). 5930 * 5931 * If a trailing slash is found the terminal vnode must be a directory. 5932 * Regular lookup shortens the path by nulifying the first trailing slash and 5933 * sets the TRAILINGSLASH flag to denote this took place. There are several 5934 * checks on it performed later. 5935 * 5936 * Similarly to spurious slashes, lockless lookup handles this in a speculative 5937 * manner relying on an invariant that a non-directory vnode will get a miss. 5938 * In this case cn_nameptr[0] == '\0' and cn_namelen == 0. 5939 * 5940 * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/" 5941 * and denotes this is the last path component, which avoids looping back. 5942 * 5943 * Only plain lookups are supported for now to restrict corner cases to handle. 
5944 */ 5945 static int __noinline 5946 cache_fplookup_trailingslash(struct cache_fpl *fpl) 5947 { 5948 #ifdef INVARIANTS 5949 size_t ni_pathlen; 5950 #endif 5951 struct nameidata *ndp; 5952 struct componentname *cnp; 5953 struct namecache *ncp; 5954 struct vnode *tvp; 5955 char *cn_nameptr_orig, *cn_nameptr_slash; 5956 seqc_t tvp_seqc; 5957 u_char nc_flag; 5958 5959 ndp = fpl->ndp; 5960 cnp = fpl->cnp; 5961 tvp = fpl->tvp; 5962 tvp_seqc = fpl->tvp_seqc; 5963 5964 MPASS(fpl->dvp == fpl->tvp); 5965 KASSERT(cache_fpl_istrailingslash(fpl), 5966 ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1, 5967 cnp->cn_pnbuf)); 5968 KASSERT(cnp->cn_nameptr[0] == '\0', 5969 ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0], 5970 cnp->cn_pnbuf)); 5971 KASSERT(cnp->cn_namelen == 0, 5972 ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen, 5973 cnp->cn_pnbuf)); 5974 MPASS(cnp->cn_nameptr > cnp->cn_pnbuf); 5975 5976 if (cnp->cn_nameiop != LOOKUP) { 5977 return (cache_fpl_aborted(fpl)); 5978 } 5979 5980 if (__predict_false(tvp->v_type != VDIR)) { 5981 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 5982 return (cache_fpl_aborted(fpl)); 5983 } 5984 cache_fpl_smr_exit(fpl); 5985 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5986 } 5987 5988 /* 5989 * Denote the last component. 5990 */ 5991 ndp->ni_next = &cnp->cn_nameptr[0]; 5992 MPASS(cache_fpl_islastcn(ndp)); 5993 5994 /* 5995 * Unwind trailing slashes. 5996 */ 5997 cn_nameptr_orig = cnp->cn_nameptr; 5998 while (cnp->cn_nameptr >= cnp->cn_pnbuf) { 5999 cnp->cn_nameptr--; 6000 if (cnp->cn_nameptr[0] != '/') { 6001 break; 6002 } 6003 } 6004 6005 /* 6006 * Unwind to the beginning of the path component. 6007 * 6008 * Note the path may or may not have started with a slash. 6009 */ 6010 cn_nameptr_slash = cnp->cn_nameptr; 6011 while (cnp->cn_nameptr > cnp->cn_pnbuf) { 6012 cnp->cn_nameptr--; 6013 if (cnp->cn_nameptr[0] == '/') { 6014 break; 6015 } 6016 } 6017 if (cnp->cn_nameptr[0] == '/') { 6018 cnp->cn_nameptr++; 6019 } 6020 6021 cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1; 6022 cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr); 6023 cache_fpl_checkpoint(fpl); 6024 6025 #ifdef INVARIANTS 6026 ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; 6027 if (ni_pathlen != fpl->debug.ni_pathlen) { 6028 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 6029 __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 6030 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 6031 } 6032 #endif 6033 6034 /* 6035 * If this was a "./" lookup the parent directory is already correct. 6036 */ 6037 if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) { 6038 return (0); 6039 } 6040 6041 /* 6042 * Otherwise we need to look it up. 6043 */ 6044 tvp = fpl->tvp; 6045 ncp = atomic_load_consume_ptr(&tvp->v_cache_dd); 6046 if (__predict_false(ncp == NULL)) { 6047 return (cache_fpl_aborted(fpl)); 6048 } 6049 nc_flag = atomic_load_char(&ncp->nc_flag); 6050 if ((nc_flag & NCF_ISDOTDOT) != 0) { 6051 return (cache_fpl_aborted(fpl)); 6052 } 6053 fpl->dvp = ncp->nc_dvp; 6054 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 6055 if (seqc_in_modify(fpl->dvp_seqc)) { 6056 return (cache_fpl_aborted(fpl)); 6057 } 6058 return (0); 6059 } 6060 6061 /* 6062 * See the API contract for VOP_FPLOOKUP_VEXEC. 
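 *
 * In short: EAGAIN means the filesystem could not make the determination
 * without blocking and results in a fallback to the slow path, while any
 * other error terminates the lookup, provided the seqc of the directory
 * vnode is still consistent.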
6063 */ 6064 static int __noinline 6065 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 6066 { 6067 struct componentname *cnp; 6068 struct vnode *dvp; 6069 seqc_t dvp_seqc; 6070 6071 cnp = fpl->cnp; 6072 dvp = fpl->dvp; 6073 dvp_seqc = fpl->dvp_seqc; 6074 6075 /* 6076 * Hack: delayed empty path checking. 6077 */ 6078 if (cnp->cn_pnbuf[0] == '\0') { 6079 return (cache_fplookup_emptypath(fpl)); 6080 } 6081 6082 /* 6083 * TODO: Due to ignoring trailing slashes lookup will perform a 6084 * permission check on the last dir when it should not be doing it. It 6085 * may fail, but said failure should be ignored. It is possible to fix 6086 * it up fully without resorting to regular lookup, but for now just 6087 * abort. 6088 */ 6089 if (cache_fpl_istrailingslash(fpl)) { 6090 return (cache_fpl_aborted(fpl)); 6091 } 6092 6093 /* 6094 * Hack: delayed degenerate path checking. 6095 */ 6096 if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) { 6097 return (cache_fplookup_degenerate(fpl)); 6098 } 6099 6100 /* 6101 * Hack: delayed name len checking. 6102 */ 6103 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 6104 cache_fpl_smr_exit(fpl); 6105 return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); 6106 } 6107 6108 /* 6109 * Hack: they may be looking up foo/bar, where foo is not a directory. 6110 * In such a case we need to return ENOTDIR, but we may happen to get 6111 * here with a different error. 6112 */ 6113 if (dvp->v_type != VDIR) { 6114 error = ENOTDIR; 6115 } 6116 6117 /* 6118 * Hack: handle O_SEARCH. 6119 * 6120 * Open Group Base Specifications Issue 7, 2018 edition states: 6121 * <quote> 6122 * If the access mode of the open file description associated with the 6123 * file descriptor is not O_SEARCH, the function shall check whether 6124 * directory searches are permitted using the current permissions of 6125 * the directory underlying the file descriptor. If the access mode is 6126 * O_SEARCH, the function shall not perform the check. 6127 * </quote> 6128 * 6129 * Regular lookup tests for the NOEXECCHECK flag for every path 6130 * component to decide whether to do the permission check. However, 6131 * since most lookups never have the flag (and when they do it is only 6132 * present for the first path component), lockless lookup only acts on 6133 * it if there is a permission problem. Here the flag is represented 6134 * with a boolean so that we don't have to clear it on the way out. 6135 * 6136 * For simplicity this always aborts. 6137 * TODO: check if this is the first lookup and ignore the permission 6138 * problem. Note the flag has to survive fallback (if it happens to be 6139 * performed). 6140 */ 6141 if (fpl->fsearch) { 6142 return (cache_fpl_aborted(fpl)); 6143 } 6144 6145 switch (error) { 6146 case EAGAIN: 6147 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 6148 error = cache_fpl_aborted(fpl); 6149 } else { 6150 cache_fpl_partial(fpl); 6151 } 6152 break; 6153 default: 6154 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 6155 error = cache_fpl_aborted(fpl); 6156 } else { 6157 cache_fpl_smr_exit(fpl); 6158 cache_fpl_handled_error(fpl, error); 6159 } 6160 break; 6161 } 6162 return (error); 6163 } 6164 6165 static int 6166 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 6167 { 6168 struct nameidata *ndp; 6169 struct componentname *cnp; 6170 struct mount *mp; 6171 int error; 6172 6173 ndp = fpl->ndp; 6174 cnp = fpl->cnp; 6175 6176 cache_fpl_checkpoint(fpl); 6177 6178 /* 6179 * The vnode at hand is almost always stable, skip checking for it. 
6180 * Worst case this postpones the check towards the end of the iteration 6181 * of the main loop. 6182 */ 6183 fpl->dvp = dvp; 6184 fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp); 6185 6186 mp = atomic_load_ptr(&dvp->v_mount); 6187 if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) { 6188 return (cache_fpl_aborted(fpl)); 6189 } 6190 6191 MPASS(fpl->tvp == NULL); 6192 6193 for (;;) { 6194 cache_fplookup_parse(fpl); 6195 6196 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); 6197 if (__predict_false(error != 0)) { 6198 error = cache_fplookup_failed_vexec(fpl, error); 6199 break; 6200 } 6201 6202 error = cache_fplookup_next(fpl); 6203 if (__predict_false(cache_fpl_terminated(fpl))) { 6204 break; 6205 } 6206 6207 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 6208 6209 if (fpl->tvp->v_type == VLNK) { 6210 error = cache_fplookup_symlink(fpl); 6211 if (cache_fpl_terminated(fpl)) { 6212 break; 6213 } 6214 } else { 6215 if (cache_fpl_islastcn(ndp)) { 6216 error = cache_fplookup_final(fpl); 6217 break; 6218 } 6219 6220 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 6221 error = cache_fpl_aborted(fpl); 6222 break; 6223 } 6224 6225 fpl->dvp = fpl->tvp; 6226 fpl->dvp_seqc = fpl->tvp_seqc; 6227 cache_fplookup_parse_advance(fpl); 6228 } 6229 6230 cache_fpl_checkpoint(fpl); 6231 } 6232 6233 return (error); 6234 } 6235 6236 /* 6237 * Fast path lookup protected with SMR and sequence counters. 6238 * 6239 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 6240 * 6241 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria 6242 * outlined below. 6243 * 6244 * Traditional vnode lookup conceptually looks like this: 6245 * 6246 * vn_lock(current); 6247 * for (;;) { 6248 * next = find(); 6249 * vn_lock(next); 6250 * vn_unlock(current); 6251 * current = next; 6252 * if (last) 6253 * break; 6254 * } 6255 * return (current); 6256 * 6257 * Each jump to the next vnode is safe memory-wise and atomic with respect to 6258 * any modifications thanks to holding respective locks. 6259 * 6260 * The same guarantee can be provided with a combination of safe memory 6261 * reclamation and sequence counters instead. If all operations which affect 6262 * the relationship between the current vnode and the one we are looking for 6263 * also modify the counter, we can verify whether all the conditions held as 6264 * we made the jump. This includes things like permissions, mount points etc. 6265 * Counter modification is provided by enclosing relevant places in 6266 * vn_seqc_write_begin()/end() calls. 
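 *
 * For illustration (a sketch, not any particular filesystem's code), a routine
 * changing something lookup-relevant brackets the update like so:
 *
 *	vn_seqc_write_begin(vp);
 *	... change permissions, reparent the vnode, cover it with a mount ...
 *	vn_seqc_write_end(vp);
 *
 * A lookup racing with such an update is caught by the seqc_in_modify or
 * seqc_consistent checks shown below and aborts or falls back to the locked
 * variant.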
6267 *
6268 * Thus this translates to:
6269 *
6270 * vfs_smr_enter();
6271 * dvp_seqc = seqc_read_any(dvp);
6272 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
6273 * abort();
6274 * for (;;) {
6275 * tvp = find();
6276 * tvp_seqc = seqc_read_any(tvp);
6277 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
6278 * abort();
6279 * if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
6280 * abort();
6281 * dvp = tvp; // we know nothing of importance has changed
6282 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
6283 * if (last)
6284 * break;
6285 * }
6286 * vget(); // secure the vnode
6287 * if (!seqc_consistent(tvp, tvp_seqc)) // final check
6288 * abort();
6289 * // at this point we know nothing has changed for any parent<->child pair
6290 * // as they were crossed during the lookup, meaning we matched the guarantee
6291 * // of the locked variant
6292 * return (tvp);
6293 *
6294 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
6295 * - they are called while within vfs_smr protection which they must never exit
6296 * - EAGAIN can be returned to denote checking could not be performed; it is
6297 * always valid to return it
6298 * - if the sequence counter has not changed, the result must be valid
6299 * - if the sequence counter has changed, both false positives and false negatives
6300 * are permitted (since the result will be rejected later)
6301 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
6302 *
6303 * Caveats to watch out for:
6304 * - vnodes are passed unlocked and unreferenced with nothing stopping
6305 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
6306 * to use atomic_load_ptr to fetch it.
6307 * - the aforementioned object can also get freed, meaning absent other means it
6308 * should be protected with vfs_smr
6309 * - either safely checking permissions as they are modified or guaranteeing
6310 * their stability is left to the routine
 *
 * An illustrative sketch of a conforming routine follows cache_fplookup below.
6311 */
6312 int
6313 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
6314 struct pwd **pwdp)
6315 {
6316 struct cache_fpl fpl;
6317 struct pwd *pwd;
6318 struct vnode *dvp;
6319 struct componentname *cnp;
6320 int error;
6321
6322 fpl.status = CACHE_FPL_STATUS_UNSET;
6323 fpl.in_smr = false;
6324 fpl.ndp = ndp;
6325 fpl.cnp = cnp = &ndp->ni_cnd;
6326 MPASS(ndp->ni_lcf == 0);
6327 KASSERT((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
6328 ("%s: internal flags found in cn_flags %" PRIx64, __func__,
6329 cnp->cn_flags));
6330 MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
6331 MPASS(ndp->ni_resflags == 0);
6332
6333 if (__predict_false(!cache_can_fplookup(&fpl))) {
6334 *status = fpl.status;
6335 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6336 return (EOPNOTSUPP);
6337 }
6338
6339 cache_fpl_checkpoint_outer(&fpl);
6340
6341 cache_fpl_smr_enter_initial(&fpl);
6342 #ifdef INVARIANTS
6343 fpl.debug.ni_pathlen = ndp->ni_pathlen;
6344 #endif
6345 fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
6346 fpl.fsearch = false;
6347 fpl.tvp = NULL; /* for degenerate path handling */
6348 fpl.pwd = pwdp;
6349 pwd = pwd_get_smr();
6350 *(fpl.pwd) = pwd;
6351 namei_setup_rootdir(ndp, cnp, pwd);
6352 ndp->ni_topdir = pwd->pwd_jdir;
6353
6354 if (cnp->cn_pnbuf[0] == '/') {
6355 dvp = cache_fpl_handle_root(&fpl);
6356 ndp->ni_resflags = NIRES_ABS;
6357 } else {
6358 if (ndp->ni_dirfd == AT_FDCWD) {
6359 dvp = pwd->pwd_cdir;
6360 } else {
6361 error =
cache_fplookup_dirfd(&fpl, &dvp); 6362 if (__predict_false(error != 0)) { 6363 goto out; 6364 } 6365 } 6366 } 6367 6368 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); 6369 error = cache_fplookup_impl(dvp, &fpl); 6370 out: 6371 cache_fpl_smr_assert_not_entered(&fpl); 6372 cache_fpl_assert_status(&fpl); 6373 *status = fpl.status; 6374 if (SDT_PROBES_ENABLED()) { 6375 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 6376 if (fpl.status == CACHE_FPL_STATUS_HANDLED) 6377 SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true, 6378 ndp); 6379 } 6380 6381 if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) { 6382 MPASS(error != CACHE_FPL_FAILED); 6383 if (error != 0) { 6384 cache_fpl_cleanup_cnp(fpl.cnp); 6385 MPASS(fpl.dvp == NULL); 6386 MPASS(fpl.tvp == NULL); 6387 } 6388 ndp->ni_dvp = fpl.dvp; 6389 ndp->ni_vp = fpl.tvp; 6390 } 6391 return (error); 6392 } 6393
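/*
 * For reference, a minimal VOP_FPLOOKUP_VEXEC implementation might look like
 * the sketch below. This is illustrative only -- "foofs" and its node layout
 * are made up and the sketch assumes foofs nodes are freed with a vfs_smr
 * delay (see the caveats above cache_fplookup):
 *
 *	static int
 *	foofs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct foofs_node *np;
 *
 *		// ->v_data may be cleared by a concurrent VOP_RECLAIM
 *		np = atomic_load_ptr(&v->a_vp->v_data);
 *		if (__predict_false(np == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(np->fn_mode, np->fn_uid, np->fn_gid,
 *		    v->a_cred));
 *	}
 *
 * The routine never exits vfs_smr, returns EAGAIN whenever it cannot make a
 * determination and defers the actual permission check to vaccess_vexec_smr,
 * matching the contract described above cache_fplookup.
 */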