1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include "opt_ddb.h" 36 #include "opt_ktrace.h" 37 38 #include <sys/param.h> 39 #include <sys/systm.h> 40 #include <sys/capsicum.h> 41 #include <sys/counter.h> 42 #include <sys/filedesc.h> 43 #include <sys/fnv_hash.h> 44 #include <sys/inotify.h> 45 #include <sys/kernel.h> 46 #include <sys/ktr.h> 47 #include <sys/lock.h> 48 #include <sys/malloc.h> 49 #include <sys/fcntl.h> 50 #include <sys/jail.h> 51 #include <sys/mount.h> 52 #include <sys/namei.h> 53 #include <sys/proc.h> 54 #include <sys/seqc.h> 55 #include <sys/sdt.h> 56 #include <sys/smr.h> 57 #include <sys/smp.h> 58 #include <sys/syscallsubr.h> 59 #include <sys/sysctl.h> 60 #include <sys/sysproto.h> 61 #include <sys/vnode.h> 62 #include <ck_queue.h> 63 #ifdef KTRACE 64 #include <sys/ktrace.h> 65 #endif 66 #ifdef INVARIANTS 67 #include <machine/_inttypes.h> 68 #endif 69 70 #include <security/audit/audit.h> 71 #include <security/mac/mac_framework.h> 72 73 #ifdef DDB 74 #include <ddb/ddb.h> 75 #endif 76 77 #include <vm/uma.h> 78 79 /* 80 * High level overview of name caching in the VFS layer. 81 * 82 * Originally caching was implemented as part of UFS, later extracted to allow 83 * use by other filesystems. A decision was made to make it optional and 84 * completely detached from the rest of the kernel, which comes with limitations 85 * outlined near the end of this comment block. 86 * 87 * This fundamental choice needs to be revisited. In the meantime, the current 88 * state is described below. Significance of all notable routines is explained 89 * in comments placed above their implementation. Scattered thoroughout the 90 * file are TODO comments indicating shortcomings which can be fixed without 91 * reworking everything (most of the fixes will likely be reusable). 
Various 92 * details are omitted from this explanation to not clutter the overview, they 93 * have to be checked by reading the code and associated commentary. 94 * 95 * Keep in mind that it's individual path components which are cached, not full 96 * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries, 97 * one for each name. 98 * 99 * I. Data organization 100 * 101 * Entries are described by "struct namecache" objects and stored in a hash 102 * table. See cache_get_hash for more information. 103 * 104 * "struct vnode" contains pointers to source entries (names which can be found 105 * when traversing through said vnode), destination entries (names of that 106 * vnode (see "Limitations" for a breakdown on the subject) and a pointer to 107 * the parent vnode. 108 * 109 * The (directory vnode; name) tuple reliably determines the target entry if 110 * it exists. 111 * 112 * Since there are no small locks at this time (all are 32 bytes in size on 113 * LP64), the code works around the problem by introducing lock arrays to 114 * protect hash buckets and vnode lists. 115 * 116 * II. Filesystem integration 117 * 118 * Filesystems participating in name caching do the following: 119 * - set vop_lookup routine to vfs_cache_lookup 120 * - set vop_cachedlookup to whatever can perform the lookup if the above fails 121 * - if they support lockless lookup (see below), vop_fplookup_vexec and 122 * vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the 123 * mount point 124 * - call cache_purge or cache_vop_* routines to eliminate stale entries as 125 * applicable 126 * - call cache_enter to add entries depending on the MAKEENTRY flag 127 * 128 * With the above in mind, there are 2 entry points when doing lookups: 129 * - ... -> namei -> cache_fplookup -- this is the default 130 * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei 131 * should the above fail 132 * 133 * Example code flow how an entry is added: 134 * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP -> 135 * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter 136 * 137 * III. Performance considerations 138 * 139 * For lockless case forward lookup avoids any writes to shared areas apart 140 * from the terminal path component. In other words non-modifying lookups of 141 * different files don't suffer any scalability problems in the namecache. 142 * Looking up the same file is limited by VFS and goes beyond the scope of this 143 * file. 144 * 145 * At least on amd64 the single-threaded bottleneck for long paths is hashing 146 * (see cache_get_hash). There are cases where the code issues acquire fence 147 * multiple times, they can be combined on architectures which suffer from it. 148 * 149 * For locked case each encountered vnode has to be referenced and locked in 150 * order to be handed out to the caller (normally that's namei). This 151 * introduces significant hit single-threaded and serialization multi-threaded. 152 * 153 * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached -- 154 * avoids any writes to shared areas to any components. 155 * 156 * Unrelated insertions are partially serialized on updating the global entry 157 * counter and possibly serialized on colliding bucket or vnode locks. 158 * 159 * IV. Observability 160 * 161 * Note not everything has an explicit dtrace probe nor it should have, thus 162 * some of the one-liners below depend on implementation details. 
163 * 164 * Examples: 165 * 166 * # Check what lookups failed to be handled in a lockless manner. Column 1 is 167 * # line number, column 2 is status code (see cache_fpl_status) 168 * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }' 169 * 170 * # Lengths of names added by binary name 171 * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }' 172 * 173 * # Same as above but only those which exceed 64 characters 174 * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }' 175 * 176 * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what 177 * # path is it 178 * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }' 179 * 180 * V. Limitations and implementation defects 181 * 182 * - since it is possible there is no entry for an open file, tools like 183 * "procstat" may fail to resolve fd -> vnode -> path to anything 184 * - even if a filesystem adds an entry, it may get purged (e.g., due to memory 185 * shortage) in which case the above problem applies 186 * - hardlinks are not tracked, thus if a vnode is reachable in more than one 187 * way, resolving a name may return a different path than the one used to 188 * open it (even if said path is still valid) 189 * - by default entries are not added for newly created files 190 * - adding an entry may need to evict negative entry first, which happens in 2 191 * distinct places (evicting on lookup, adding in a later VOP) making it 192 * impossible to simply reuse it 193 * - there is a simple scheme to evict negative entries as the cache is approaching 194 * its capacity, but it is very unclear if doing so is a good idea to begin with 195 * - vnodes are subject to being recycled even if target inode is left in memory, 196 * which loses the name cache entries when it perhaps should not. in case of tmpfs 197 * names get duplicated -- kept by filesystem itself and namecache separately 198 * - struct namecache has a fixed size and comes in 2 variants, often wasting 199 * space. now hard to replace with malloc due to dependence on SMR, which 200 * requires UMA zones to opt in 201 * - lack of better integration with the kernel also turns nullfs into a layered 202 * filesystem instead of something which can take advantage of caching 203 * 204 * Appendix A: where is the time lost, expanding on paragraph III 205 * 206 * While some care went into optimizing lookups, there is still plenty of 207 * performance left on the table, most notably from single-threaded standpoint. 208 * Below is a woefully incomplete list of changes which can help. Ideas are 209 * mostly sketched out, no claim is made all kinks or prerequisites are laid 210 * out. 211 * 212 * Note there is performance lost all over VFS. 213 * 214 * === SMR-only lookup 215 * 216 * For commonly used ops like stat(2), when the terminal vnode *is* cached, 217 * lockless lookup could refrain from refing/locking the found vnode and 218 * instead return while within the SMR section. Then a call to, say, 219 * vop_stat_smr could do the work (or fail with EAGAIN), finally the result 220 * would be validated with seqc not changing. This would be faster 221 * single-threaded as it dodges atomics and would provide full scalability for 222 * multicore uses. 
This would *not* work for open(2) or other calls which need 223 * the vnode to hang around for the long haul, but would work for aforementioned 224 * stat(2) but also access(2), readlink(2), realpathat(2) and probably more. 225 * 226 * === hotpatching for sdt probes 227 * 228 * They result in *tons* of branches all over with rather regrettable codegen 229 * at times. Removing sdt probes altogether gives over 2% boost in lookup rate. 230 * Reworking the code to patch itself at runtime with asm goto would solve it. 231 * asm goto is fully supported by gcc and clang. 232 * 233 * === copyinstr 234 * 235 * On all architectures it operates one byte at a time, while it could be 236 * word-sized instead thanks to the Mycroft trick. 237 * 238 * API itself is rather pessimal for path lookup, accepting arbitrary sizes and 239 * *optionally* filling in the length parameter. 240 * 241 * Instead a new routine (copyinpath?) could be introduced, demanding a buffer 242 * size which is a multiply of the word (and never zero), with the length 243 * always returned. On top of it the routine could be allowed to transform the 244 * buffer in arbitrary ways, most notably writing past the found length (not to 245 * be confused with writing past buffer size) -- this would allow word-sized 246 * movs while checking for '\0' later. 247 * 248 * === detour through namei 249 * 250 * Currently one suffers being called from namei, which then has to check if 251 * things worked out locklessly. Instead the lockless lookup could be the 252 * actual entry point which calls what is currently namei as a fallback. 253 * 254 * === avoidable branches in cache_can_fplookup 255 * 256 * The cache_fast_lookup_enabled flag check could be hotpatchable (in fact if 257 * this is off, none of fplookup code should execute). 258 * 259 * Both audit and capsicum branches can be combined into one, but it requires 260 * paying off a lot of tech debt first. 261 * 262 * ni_startdir could be indicated with a flag in cn_flags, eliminating the 263 * branch. 264 * 265 * === mount stacks 266 * 267 * Crossing a mount requires checking if perhaps something is mounted on top. 268 * Instead, an additional entry could be added to struct mount with a pointer 269 * to the final mount on the stack. This would be recalculated on each 270 * mount/unmount. 271 * 272 * === root vnodes 273 * 274 * It could become part of the API contract to *always* have a rootvnode set in 275 * mnt_rootvnode. Such vnodes are annotated with VV_ROOT and vnlru would have 276 * to be modified to always skip them. 277 * 278 * === inactive on v_usecount reaching 0 279 * 280 * VOP_NEED_INACTIVE should not exist. Filesystems would indicate need for such 281 * processing with a bit in usecount. 282 * 283 * === v_holdcnt 284 * 285 * Hold count should probably get eliminated, but one can argue it is a useful 286 * feature. Even if so, handling of v_usecount could be decoupled from it -- 287 * vnlru et al would consider the vnode not-freeable if has either hold or 288 * usecount on it. 289 * 290 * This would eliminate 2 atomics. 
291 */ 292 293 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 294 "Name cache"); 295 296 SDT_PROVIDER_DECLARE(vfs); 297 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 298 "struct vnode *"); 299 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *", 300 "struct vnode *"); 301 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 302 "char *"); 303 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *", 304 "const char *"); 305 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *", 306 "struct namecache *", "int", "int"); 307 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 308 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 309 "char *", "struct vnode *"); 310 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 311 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 312 "struct vnode *", "char *"); 313 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 314 "struct vnode *"); 315 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 316 "struct vnode *", "char *"); 317 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 318 "char *"); 319 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", 320 "struct componentname *"); 321 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", 322 "struct componentname *"); 323 SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t"); 324 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int"); 325 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 326 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 327 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 328 "struct vnode *"); 329 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 330 "char *"); 331 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *", 332 "char *"); 333 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t"); 334 335 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); 336 SDT_PROBE_DECLARE(vfs, namei, lookup, entry); 337 SDT_PROBE_DECLARE(vfs, namei, lookup, return); 338 339 static char __read_frequently cache_fast_lookup_enabled = true; 340 341 /* 342 * This structure describes the elements in the cache of recent 343 * names looked up by namei. 344 */ 345 struct negstate { 346 u_char neg_flag; 347 u_char neg_hit; 348 }; 349 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), 350 "the state must fit in a union with a pointer without growing it"); 351 352 struct namecache { 353 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 354 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 355 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */ 356 struct vnode *nc_dvp; /* vnode of parent of name */ 357 union { 358 struct vnode *nu_vp; /* vnode the name refers to */ 359 struct negstate nu_neg;/* negative entry state */ 360 } n_un; 361 u_char nc_flag; /* flag bits */ 362 u_char nc_nlen; /* length of name */ 363 char nc_name[]; /* segment name + nul */ 364 }; 365 366 /* 367 * struct namecache_ts repeats struct namecache layout up to the 368 * nc_nlen member. 369 * struct namecache_ts is used in place of struct namecache when time(s) need 370 * to be stored. 
The nc_dotdottime field is used when a cache entry is mapping 371 * both a non-dotdot directory name plus dotdot for the directory's 372 * parent. 373 * 374 * See below for alignment requirement. 375 */ 376 struct namecache_ts { 377 struct timespec nc_time; /* timespec provided by fs */ 378 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 379 int nc_ticks; /* ticks value when entry was added */ 380 int nc_pad; 381 struct namecache nc_nc; 382 }; 383 384 TAILQ_HEAD(cache_freebatch, namecache); 385 386 /* 387 * At least mips n32 performs 64-bit accesses to timespec as found 388 * in namecache_ts and requires them to be aligned. Since others 389 * may be in the same spot suffer a little bit and enforce the 390 * alignment for everyone. Note this is a nop for 64-bit platforms. 391 */ 392 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) 393 394 /* 395 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the 396 * 4.4 BSD codebase. Later on struct namecache was tweaked to become 397 * smaller and the value was bumped to retain the total size, but it 398 * was never re-evaluated for suitability. A simple test counting 399 * lengths during package building shows that the value of 45 covers 400 * about 86% of all added entries, reaching 99% at 65. 401 * 402 * Regardless of the above, use of dedicated zones instead of malloc may be 403 * inducing additional waste. This may be hard to address as said zones are 404 * tied to VFS SMR. Even if retaining them, the current split should be 405 * re-evaluated. 406 */ 407 #ifdef __LP64__ 408 #define CACHE_PATH_CUTOFF 45 409 #define CACHE_LARGE_PAD 6 410 #else 411 #define CACHE_PATH_CUTOFF 41 412 #define CACHE_LARGE_PAD 2 413 #endif 414 415 #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1) 416 #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE) 417 #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD) 418 #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE) 419 420 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 421 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 422 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 423 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 424 425 #define nc_vp n_un.nu_vp 426 #define nc_neg n_un.nu_neg 427 428 /* 429 * Flags in namecache.nc_flag 430 */ 431 #define NCF_WHITE 0x01 432 #define NCF_ISDOTDOT 0x02 433 #define NCF_TS 0x04 434 #define NCF_DTS 0x08 435 #define NCF_DVDROP 0x10 436 #define NCF_NEGATIVE 0x20 437 #define NCF_INVALID 0x40 438 #define NCF_WIP 0x80 439 440 /* 441 * Flags in negstate.neg_flag 442 */ 443 #define NEG_HOT 0x01 444 445 static bool cache_neg_evict_cond(u_long lnumcache); 446 447 /* 448 * Mark an entry as invalid. 449 * 450 * This is called before it starts getting deconstructed. 451 */ 452 static void 453 cache_ncp_invalidate(struct namecache *ncp) 454 { 455 456 KASSERT((ncp->nc_flag & NCF_INVALID) == 0, 457 ("%s: entry %p already invalid", __func__, ncp)); 458 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); 459 atomic_thread_fence_rel(); 460 } 461 462 /* 463 * Does this entry match the given directory and name? 
464 */ 465 static bool 466 cache_ncp_match(struct namecache *ncp, struct vnode *dvp, 467 struct componentname *cnp) 468 { 469 return (ncp->nc_dvp == dvp && 470 ncp->nc_nlen == cnp->cn_namelen && 471 bcmp(ncp->nc_name, cnp->cn_nameptr, cnp->cn_namelen) == 0); 472 } 473 474 /* 475 * Check whether the entry can be safely used. 476 * 477 * All places which elide locks are supposed to call this after they are 478 * done with reading from an entry. 479 */ 480 #define cache_ncp_canuse(ncp) ({ \ 481 struct namecache *_ncp = (ncp); \ 482 u_char _nc_flag; \ 483 \ 484 atomic_thread_fence_acq(); \ 485 _nc_flag = atomic_load_char(&_ncp->nc_flag); \ 486 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \ 487 }) 488 489 /* 490 * Like the above but also checks NCF_WHITE. 491 */ 492 #define cache_fpl_neg_ncp_canuse(ncp) ({ \ 493 struct namecache *_ncp = (ncp); \ 494 u_char _nc_flag; \ 495 \ 496 atomic_thread_fence_acq(); \ 497 _nc_flag = atomic_load_char(&_ncp->nc_flag); \ 498 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0); \ 499 }) 500 501 VFS_SMR_DECLARE; 502 503 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 504 "Name cache parameters"); 505 506 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ 507 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RD, &ncsize, 0, 508 "Total namecache capacity"); 509 510 u_int ncsizefactor = 2; 511 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0, 512 "Size factor for namecache"); 513 514 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ 515 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0, 516 "Ratio of negative namecache entries"); 517 518 /* 519 * Negative entry % of namecache capacity above which automatic eviction is allowed. 520 * 521 * Check cache_neg_evict_cond for details. 522 */ 523 static u_int ncnegminpct = 3; 524 525 static u_int __read_mostly neg_min; /* the above recomputed against ncsize */ 526 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0, 527 "Negative entry count above which automatic eviction is allowed"); 528 529 /* 530 * Structures associated with name caching. 
531 */ 532 #define NCHHASH(hash) \ 533 (&nchashtbl[(hash) & nchash]) 534 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 535 static u_long __read_mostly nchash; /* size of hash table */ 536 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 537 "Size of namecache hash table"); 538 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 539 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 540 541 struct nchstats nchstats; /* cache effectiveness statistics */ 542 543 static u_int __exclusive_cache_line neg_cycle; 544 545 #define ncneghash 3 546 #define numneglists (ncneghash + 1) 547 548 struct neglist { 549 struct mtx nl_evict_lock; 550 struct mtx nl_lock __aligned(CACHE_LINE_SIZE); 551 TAILQ_HEAD(, namecache) nl_list; 552 TAILQ_HEAD(, namecache) nl_hotlist; 553 u_long nl_hotnum; 554 } __aligned(CACHE_LINE_SIZE); 555 556 static struct neglist neglists[numneglists]; 557 558 static inline struct neglist * 559 NCP2NEGLIST(struct namecache *ncp) 560 { 561 562 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 563 } 564 565 static inline struct negstate * 566 NCP2NEGSTATE(struct namecache *ncp) 567 { 568 569 MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE); 570 return (&ncp->nc_neg); 571 } 572 573 #define numbucketlocks (ncbuckethash + 1) 574 static u_int __read_mostly ncbuckethash; 575 static struct mtx_padalign __read_mostly *bucketlocks; 576 #define HASH2BUCKETLOCK(hash) \ 577 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)])) 578 579 #define numvnodelocks (ncvnodehash + 1) 580 static u_int __read_mostly ncvnodehash; 581 static struct mtx __read_mostly *vnodelocks; 582 static inline struct mtx * 583 VP2VNODELOCK(struct vnode *vp) 584 { 585 586 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 587 } 588 589 /* 590 * Search the hash table for a namecache entry. Either the corresponding bucket 591 * must be locked, or the caller must be in an SMR read section. 
592 */ 593 static struct namecache * 594 cache_ncp_find(struct vnode *dvp, struct componentname *cnp, uint32_t hash) 595 { 596 struct namecache *ncp; 597 598 KASSERT(mtx_owned(HASH2BUCKETLOCK(hash)) || VFS_SMR_ENTERED(), 599 ("%s: hash %u not locked", __func__, hash)); 600 CK_SLIST_FOREACH(ncp, NCHHASH(hash), nc_hash) { 601 if (cache_ncp_match(ncp, dvp, cnp)) 602 break; 603 } 604 return (ncp); 605 } 606 607 static void 608 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 609 { 610 struct namecache_ts *ncp_ts; 611 612 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 613 (tsp == NULL && ticksp == NULL), 614 ("No NCF_TS")); 615 616 if (tsp == NULL) 617 return; 618 619 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 620 *tsp = ncp_ts->nc_time; 621 *ticksp = ncp_ts->nc_ticks; 622 } 623 624 #ifdef DEBUG_CACHE 625 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 626 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 627 "VFS namecache enabled"); 628 #endif 629 630 /* Export size information to userland */ 631 SYSCTL_SIZEOF_STRUCT(namecache); 632 633 /* 634 * The new name cache statistics 635 */ 636 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 637 "Name cache statistics"); 638 639 #define STATNODE_ULONG(name, varname, descr) \ 640 SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 641 #define STATNODE_COUNTER(name, varname, descr) \ 642 static COUNTER_U64_DEFINE_EARLY(varname); \ 643 SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \ 644 descr); 645 STATNODE_ULONG(neg, numneg, "Number of negative cache entries"); 646 STATNODE_ULONG(count, numcache, "Number of cache entries"); 647 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held"); 648 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit"); 649 STATNODE_COUNTER(miss, nummiss, "Number of cache misses"); 650 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache"); 651 STATNODE_COUNTER(poszaps, numposzaps, 652 "Number of cache hits (positive) we do not want to cache"); 653 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)"); 654 STATNODE_COUNTER(negzaps, numnegzaps, 655 "Number of cache hits (negative) we do not want to cache"); 656 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)"); 657 /* These count for vn_getcwd(), too. */ 658 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls"); 659 STATNODE_COUNTER(fullpathfail2, numfullpathfail2, 660 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 661 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 662 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls"); 663 STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache"); 664 665 /* 666 * Debug or developer statistics. 
667 */ 668 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 669 "Name cache debugging"); 670 #define DEBUGNODE_ULONG(name, varname, descr) \ 671 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 672 static u_long zap_bucket_relock_success; 673 DEBUGNODE_ULONG(zap_bucket_relock_success, zap_bucket_relock_success, 674 "Number of successful removals after relocking"); 675 static u_long zap_bucket_fail; 676 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, ""); 677 static u_long zap_bucket_fail2; 678 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, ""); 679 static u_long cache_lock_vnodes_cel_3_failures; 680 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures, 681 "Number of times 3-way vnode locking failed"); 682 683 static void cache_zap_locked(struct namecache *ncp); 684 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 685 char **retbuf, size_t *buflen, size_t addend); 686 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 687 char **retbuf, size_t *buflen); 688 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 689 char **retbuf, size_t *len, size_t addend); 690 691 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 692 693 static inline void 694 cache_assert_vlp_locked(struct mtx *vlp) 695 { 696 697 if (vlp != NULL) 698 mtx_assert(vlp, MA_OWNED); 699 } 700 701 static inline void 702 cache_assert_vnode_locked(struct vnode *vp) 703 { 704 struct mtx *vlp; 705 706 vlp = VP2VNODELOCK(vp); 707 cache_assert_vlp_locked(vlp); 708 } 709 710 /* 711 * Directory vnodes with entries are held for two reasons: 712 * 1. make them less of a target for reclamation in vnlru 713 * 2. suffer smaller performance penalty in locked lookup as requeieing is avoided 714 * 715 * It will be feasible to stop doing it altogether if all filesystems start 716 * supporting lockless lookup. 717 */ 718 static void 719 cache_hold_vnode(struct vnode *vp) 720 { 721 722 cache_assert_vnode_locked(vp); 723 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); 724 vhold(vp); 725 counter_u64_add(numcachehv, 1); 726 } 727 728 static void 729 cache_drop_vnode(struct vnode *vp) 730 { 731 732 /* 733 * Called after all locks are dropped, meaning we can't assert 734 * on the state of v_cache_src. 735 */ 736 vdrop(vp); 737 counter_u64_add(numcachehv, -1); 738 } 739 740 /* 741 * UMA zones. 
742 */ 743 static uma_zone_t __read_mostly cache_zone_small; 744 static uma_zone_t __read_mostly cache_zone_small_ts; 745 static uma_zone_t __read_mostly cache_zone_large; 746 static uma_zone_t __read_mostly cache_zone_large_ts; 747 748 char * 749 cache_symlink_alloc(size_t size, int flags) 750 { 751 752 if (size < CACHE_ZONE_SMALL_SIZE) { 753 return (uma_zalloc_smr(cache_zone_small, flags)); 754 } 755 if (size < CACHE_ZONE_LARGE_SIZE) { 756 return (uma_zalloc_smr(cache_zone_large, flags)); 757 } 758 counter_u64_add(symlinktoobig, 1); 759 SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size); 760 return (NULL); 761 } 762 763 void 764 cache_symlink_free(char *string, size_t size) 765 { 766 767 MPASS(string != NULL); 768 KASSERT(size < CACHE_ZONE_LARGE_SIZE, 769 ("%s: size %zu too big", __func__, size)); 770 771 if (size < CACHE_ZONE_SMALL_SIZE) { 772 uma_zfree_smr(cache_zone_small, string); 773 return; 774 } 775 if (size < CACHE_ZONE_LARGE_SIZE) { 776 uma_zfree_smr(cache_zone_large, string); 777 return; 778 } 779 __assert_unreachable(); 780 } 781 782 static struct namecache * 783 cache_alloc_uma(int len, bool ts) 784 { 785 struct namecache_ts *ncp_ts; 786 struct namecache *ncp; 787 788 if (__predict_false(ts)) { 789 if (len <= CACHE_PATH_CUTOFF) 790 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); 791 else 792 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); 793 ncp = &ncp_ts->nc_nc; 794 } else { 795 if (len <= CACHE_PATH_CUTOFF) 796 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); 797 else 798 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); 799 } 800 return (ncp); 801 } 802 803 static void 804 cache_free_uma(struct namecache *ncp) 805 { 806 struct namecache_ts *ncp_ts; 807 808 if (__predict_false(ncp->nc_flag & NCF_TS)) { 809 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 810 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 811 uma_zfree_smr(cache_zone_small_ts, ncp_ts); 812 else 813 uma_zfree_smr(cache_zone_large_ts, ncp_ts); 814 } else { 815 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 816 uma_zfree_smr(cache_zone_small, ncp); 817 else 818 uma_zfree_smr(cache_zone_large, ncp); 819 } 820 } 821 822 static struct namecache * 823 cache_alloc(int len, bool ts) 824 { 825 u_long lnumcache; 826 827 /* 828 * Avoid blowout in namecache entries. 829 * 830 * Bugs: 831 * 1. filesystems may end up trying to add an already existing entry 832 * (for example this can happen after a cache miss during concurrent 833 * lookup), in which case we will call cache_neg_evict despite not 834 * adding anything. 835 * 2. the routine may fail to free anything and no provisions are made 836 * to make it try harder (see the inside for failure modes) 837 * 3. it only ever looks at negative entries. 
838 */ 839 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 840 if (cache_neg_evict_cond(lnumcache)) { 841 lnumcache = atomic_load_long(&numcache); 842 } 843 if (__predict_false(lnumcache >= ncsize)) { 844 atomic_subtract_long(&numcache, 1); 845 counter_u64_add(numdrops, 1); 846 return (NULL); 847 } 848 return (cache_alloc_uma(len, ts)); 849 } 850 851 static void 852 cache_free(struct namecache *ncp) 853 { 854 855 MPASS(ncp != NULL); 856 if ((ncp->nc_flag & NCF_DVDROP) != 0) { 857 cache_drop_vnode(ncp->nc_dvp); 858 } 859 cache_free_uma(ncp); 860 atomic_subtract_long(&numcache, 1); 861 } 862 863 static void 864 cache_free_batch(struct cache_freebatch *batch) 865 { 866 struct namecache *ncp, *nnp; 867 int i; 868 869 i = 0; 870 if (TAILQ_EMPTY(batch)) 871 goto out; 872 TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) { 873 if ((ncp->nc_flag & NCF_DVDROP) != 0) { 874 cache_drop_vnode(ncp->nc_dvp); 875 } 876 cache_free_uma(ncp); 877 i++; 878 } 879 atomic_subtract_long(&numcache, i); 880 out: 881 SDT_PROBE1(vfs, namecache, purge, batch, i); 882 } 883 884 /* 885 * Hashing. 886 * 887 * The code was made to use FNV in 2001 and this choice needs to be revisited. 888 * 889 * Short summary of the difficulty: 890 * The longest name which can be inserted is NAME_MAX characters in length (or 891 * 255 at the time of writing this comment), while majority of names used in 892 * practice are significantly shorter (mostly below 10). More importantly 893 * majority of lookups performed find names are even shorter than that. 894 * 895 * This poses a problem where hashes which do better than FNV past word size 896 * (or so) tend to come with additional overhead when finalizing the result, 897 * making them noticeably slower for the most commonly used range. 898 * 899 * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c 900 * 901 * When looking it up the most time consuming part by a large margin (at least 902 * on amd64) is hashing. Replacing FNV with something which pessimizes short 903 * input would make the slowest part stand out even more. 904 */ 905 906 /* 907 * TODO: With the value stored we can do better than computing the hash based 908 * on the address. 
909 */ 910 static void 911 cache_prehash(struct vnode *vp) 912 { 913 914 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); 915 } 916 917 static uint32_t 918 cache_get_hash(char *name, u_char len, struct vnode *dvp) 919 { 920 921 return (fnv_32_buf(name, len, dvp->v_nchash)); 922 } 923 924 static uint32_t 925 cache_get_hash_iter_start(struct vnode *dvp) 926 { 927 928 return (dvp->v_nchash); 929 } 930 931 static uint32_t 932 cache_get_hash_iter(char c, uint32_t hash) 933 { 934 935 return (fnv_32_buf(&c, 1, hash)); 936 } 937 938 static uint32_t 939 cache_get_hash_iter_finish(uint32_t hash) 940 { 941 942 return (hash); 943 } 944 945 static inline struct nchashhead * 946 NCP2BUCKET(struct namecache *ncp) 947 { 948 uint32_t hash; 949 950 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 951 return (NCHHASH(hash)); 952 } 953 954 static inline struct mtx * 955 NCP2BUCKETLOCK(struct namecache *ncp) 956 { 957 uint32_t hash; 958 959 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 960 return (HASH2BUCKETLOCK(hash)); 961 } 962 963 #ifdef INVARIANTS 964 static void 965 cache_assert_bucket_locked(struct namecache *ncp) 966 { 967 struct mtx *blp; 968 969 blp = NCP2BUCKETLOCK(ncp); 970 mtx_assert(blp, MA_OWNED); 971 } 972 973 static void 974 cache_assert_bucket_unlocked(struct namecache *ncp) 975 { 976 struct mtx *blp; 977 978 blp = NCP2BUCKETLOCK(ncp); 979 mtx_assert(blp, MA_NOTOWNED); 980 } 981 #else 982 #define cache_assert_bucket_locked(x) do { } while (0) 983 #define cache_assert_bucket_unlocked(x) do { } while (0) 984 #endif 985 986 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 987 static void 988 _cache_sort_vnodes(void **p1, void **p2) 989 { 990 void *tmp; 991 992 MPASS(*p1 != NULL || *p2 != NULL); 993 994 if (*p1 > *p2) { 995 tmp = *p2; 996 *p2 = *p1; 997 *p1 = tmp; 998 } 999 } 1000 1001 static void 1002 cache_lock_all_buckets(void) 1003 { 1004 u_int i; 1005 1006 for (i = 0; i < numbucketlocks; i++) 1007 mtx_lock(&bucketlocks[i]); 1008 } 1009 1010 static void 1011 cache_unlock_all_buckets(void) 1012 { 1013 u_int i; 1014 1015 for (i = 0; i < numbucketlocks; i++) 1016 mtx_unlock(&bucketlocks[i]); 1017 } 1018 1019 static void 1020 cache_lock_all_vnodes(void) 1021 { 1022 u_int i; 1023 1024 for (i = 0; i < numvnodelocks; i++) 1025 mtx_lock(&vnodelocks[i]); 1026 } 1027 1028 static void 1029 cache_unlock_all_vnodes(void) 1030 { 1031 u_int i; 1032 1033 for (i = 0; i < numvnodelocks; i++) 1034 mtx_unlock(&vnodelocks[i]); 1035 } 1036 1037 static int 1038 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 1039 { 1040 1041 cache_sort_vnodes(&vlp1, &vlp2); 1042 1043 if (vlp1 != NULL) { 1044 if (!mtx_trylock(vlp1)) 1045 return (EAGAIN); 1046 } 1047 if (!mtx_trylock(vlp2)) { 1048 if (vlp1 != NULL) 1049 mtx_unlock(vlp1); 1050 return (EAGAIN); 1051 } 1052 1053 return (0); 1054 } 1055 1056 static void 1057 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 1058 { 1059 1060 MPASS(vlp1 != NULL || vlp2 != NULL); 1061 MPASS(vlp1 <= vlp2); 1062 1063 if (vlp1 != NULL) 1064 mtx_lock(vlp1); 1065 if (vlp2 != NULL) 1066 mtx_lock(vlp2); 1067 } 1068 1069 static void 1070 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 1071 { 1072 1073 MPASS(vlp1 != NULL || vlp2 != NULL); 1074 1075 if (vlp1 != NULL) 1076 mtx_unlock(vlp1); 1077 if (vlp2 != NULL) 1078 mtx_unlock(vlp2); 1079 } 1080 1081 static int 1082 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 1083 { 1084 struct nchstats snap; 1085 1086 if (req->oldptr == NULL) 1087 return (SYSCTL_OUT(req, 0, 
sizeof(snap))); 1088 1089 snap = nchstats; 1090 snap.ncs_goodhits = counter_u64_fetch(numposhits); 1091 snap.ncs_neghits = counter_u64_fetch(numneghits); 1092 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 1093 counter_u64_fetch(numnegzaps); 1094 snap.ncs_miss = counter_u64_fetch(nummisszap) + 1095 counter_u64_fetch(nummiss); 1096 1097 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 1098 } 1099 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 1100 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 1101 "VFS cache effectiveness statistics"); 1102 1103 static int 1104 sysctl_hitpct(SYSCTL_HANDLER_ARGS) 1105 { 1106 long poshits, neghits, miss, total; 1107 long pct; 1108 1109 poshits = counter_u64_fetch(numposhits); 1110 neghits = counter_u64_fetch(numneghits); 1111 miss = counter_u64_fetch(nummiss); 1112 total = poshits + neghits + miss; 1113 1114 pct = 0; 1115 if (total != 0) 1116 pct = ((poshits + neghits) * 100) / total; 1117 return (sysctl_handle_int(oidp, 0, pct, req)); 1118 } 1119 SYSCTL_PROC(_vfs_cache_stats, OID_AUTO, hitpct, 1120 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_hitpct, 1121 "I", "Percentage of hits"); 1122 1123 static void 1124 cache_recalc_neg_min(void) 1125 { 1126 1127 neg_min = (ncsize * ncnegminpct) / 100; 1128 } 1129 1130 static int 1131 sysctl_negminpct(SYSCTL_HANDLER_ARGS) 1132 { 1133 u_int val; 1134 int error; 1135 1136 val = ncnegminpct; 1137 error = sysctl_handle_int(oidp, &val, 0, req); 1138 if (error != 0 || req->newptr == NULL) 1139 return (error); 1140 1141 if (val == ncnegminpct) 1142 return (0); 1143 if (val < 0 || val > 99) 1144 return (EINVAL); 1145 ncnegminpct = val; 1146 cache_recalc_neg_min(); 1147 return (0); 1148 } 1149 1150 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct, 1151 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct, 1152 "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed"); 1153 1154 #ifdef DEBUG_CACHE 1155 /* 1156 * Grab an atomic snapshot of the name cache hash chain lengths 1157 */ 1158 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 1159 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 1160 "hash table stats"); 1161 1162 static int 1163 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 1164 { 1165 struct nchashhead *ncpp; 1166 struct namecache *ncp; 1167 int i, error, n_nchash, *cntbuf; 1168 1169 retry: 1170 n_nchash = nchash + 1; /* nchash is max index, not count */ 1171 if (req->oldptr == NULL) 1172 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 1173 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 1174 cache_lock_all_buckets(); 1175 if (n_nchash != nchash + 1) { 1176 cache_unlock_all_buckets(); 1177 free(cntbuf, M_TEMP); 1178 goto retry; 1179 } 1180 /* Scan hash tables counting entries */ 1181 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 1182 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) 1183 cntbuf[i]++; 1184 cache_unlock_all_buckets(); 1185 for (error = 0, i = 0; i < n_nchash; i++) 1186 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 1187 break; 1188 free(cntbuf, M_TEMP); 1189 return (error); 1190 } 1191 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 1192 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 1193 "nchash chain lengths"); 1194 1195 static int 1196 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 1197 { 1198 int error; 1199 struct nchashhead *ncpp; 1200 struct namecache *ncp; 1201 int n_nchash; 1202 int count, maxlength, used, pct; 1203 1204 if 
(!req->oldptr) 1205 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 1206 1207 cache_lock_all_buckets(); 1208 n_nchash = nchash + 1; /* nchash is max index, not count */ 1209 used = 0; 1210 maxlength = 0; 1211 1212 /* Scan hash tables for applicable entries */ 1213 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 1214 count = 0; 1215 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { 1216 count++; 1217 } 1218 if (count) 1219 used++; 1220 if (maxlength < count) 1221 maxlength = count; 1222 } 1223 n_nchash = nchash + 1; 1224 cache_unlock_all_buckets(); 1225 pct = (used * 100) / (n_nchash / 100); 1226 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 1227 if (error) 1228 return (error); 1229 error = SYSCTL_OUT(req, &used, sizeof(used)); 1230 if (error) 1231 return (error); 1232 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 1233 if (error) 1234 return (error); 1235 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 1236 if (error) 1237 return (error); 1238 return (0); 1239 } 1240 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 1241 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 1242 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 1243 #endif 1244 1245 /* 1246 * Negative entries management 1247 * 1248 * Various workloads create plenty of negative entries and barely use them 1249 * afterwards. Moreover malicious users can keep performing bogus lookups 1250 * adding even more entries. For example "make tinderbox" as of writing this 1251 * comment ends up with 2.6M namecache entries in total, 1.2M of which are 1252 * negative. 1253 * 1254 * As such, a rather aggressive eviction method is needed. The currently 1255 * employed method is a placeholder. 1256 * 1257 * Entries are split over numneglists separate lists, each of which is further 1258 * split into hot and cold entries. Entries get promoted after getting a hit. 1259 * Eviction happens on addition of new entry. 
1260 */ 1261 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1262 "Name cache negative entry statistics"); 1263 1264 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0, 1265 "Number of negative cache entries"); 1266 1267 static COUNTER_U64_DEFINE_EARLY(neg_created); 1268 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created, 1269 "Number of created negative entries"); 1270 1271 static COUNTER_U64_DEFINE_EARLY(neg_evicted); 1272 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted, 1273 "Number of evicted negative entries"); 1274 1275 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty); 1276 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD, 1277 &neg_evict_skipped_empty, 1278 "Number of times evicting failed due to lack of entries"); 1279 1280 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed); 1281 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD, 1282 &neg_evict_skipped_missed, 1283 "Number of times evicting failed due to target entry disappearing"); 1284 1285 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended); 1286 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD, 1287 &neg_evict_skipped_contended, 1288 "Number of times evicting failed due to contention"); 1289 1290 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits, 1291 "Number of cache hits (negative)"); 1292 1293 static int 1294 sysctl_neg_hot(SYSCTL_HANDLER_ARGS) 1295 { 1296 int i, out; 1297 1298 out = 0; 1299 for (i = 0; i < numneglists; i++) 1300 out += neglists[i].nl_hotnum; 1301 1302 return (SYSCTL_OUT(req, &out, sizeof(out))); 1303 } 1304 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD | 1305 CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I", 1306 "Number of hot negative entries"); 1307 1308 static void 1309 cache_neg_init(struct namecache *ncp) 1310 { 1311 struct negstate *ns; 1312 1313 ncp->nc_flag |= NCF_NEGATIVE; 1314 ns = NCP2NEGSTATE(ncp); 1315 ns->neg_flag = 0; 1316 ns->neg_hit = 0; 1317 counter_u64_add(neg_created, 1); 1318 } 1319 1320 #define CACHE_NEG_PROMOTION_THRESH 2 1321 1322 static bool 1323 cache_neg_hit_prep(struct namecache *ncp) 1324 { 1325 struct negstate *ns; 1326 u_char n; 1327 1328 ns = NCP2NEGSTATE(ncp); 1329 n = atomic_load_char(&ns->neg_hit); 1330 for (;;) { 1331 if (n >= CACHE_NEG_PROMOTION_THRESH) 1332 return (false); 1333 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1)) 1334 break; 1335 } 1336 return (n + 1 == CACHE_NEG_PROMOTION_THRESH); 1337 } 1338 1339 /* 1340 * Nothing to do here but it is provided for completeness as some 1341 * cache_neg_hit_prep callers may end up returning without even 1342 * trying to promote. 1343 */ 1344 #define cache_neg_hit_abort(ncp) do { } while (0) 1345 1346 static void 1347 cache_neg_hit_finish(struct namecache *ncp) 1348 { 1349 1350 SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name); 1351 counter_u64_add(numneghits, 1); 1352 } 1353 1354 /* 1355 * Move a negative entry to the hot list. 
1356 */ 1357 static void 1358 cache_neg_promote_locked(struct namecache *ncp) 1359 { 1360 struct neglist *nl; 1361 struct negstate *ns; 1362 1363 ns = NCP2NEGSTATE(ncp); 1364 nl = NCP2NEGLIST(ncp); 1365 mtx_assert(&nl->nl_lock, MA_OWNED); 1366 if ((ns->neg_flag & NEG_HOT) == 0) { 1367 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 1368 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst); 1369 nl->nl_hotnum++; 1370 ns->neg_flag |= NEG_HOT; 1371 } 1372 } 1373 1374 /* 1375 * Move a hot negative entry to the cold list. 1376 */ 1377 static void 1378 cache_neg_demote_locked(struct namecache *ncp) 1379 { 1380 struct neglist *nl; 1381 struct negstate *ns; 1382 1383 ns = NCP2NEGSTATE(ncp); 1384 nl = NCP2NEGLIST(ncp); 1385 mtx_assert(&nl->nl_lock, MA_OWNED); 1386 MPASS(ns->neg_flag & NEG_HOT); 1387 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 1388 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 1389 nl->nl_hotnum--; 1390 ns->neg_flag &= ~NEG_HOT; 1391 atomic_store_char(&ns->neg_hit, 0); 1392 } 1393 1394 /* 1395 * Move a negative entry to the hot list if it matches the lookup. 1396 * 1397 * We have to take locks, but they may be contended and in the worst 1398 * case we may need to go off CPU. We don't want to spin within the 1399 * smr section and we can't block with it. Exiting the section means 1400 * the found entry could have been evicted. We are going to look it 1401 * up again. 1402 */ 1403 static bool 1404 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp, 1405 struct namecache *oncp, uint32_t hash) 1406 { 1407 struct namecache *ncp; 1408 struct neglist *nl; 1409 u_char nc_flag; 1410 1411 nl = NCP2NEGLIST(oncp); 1412 1413 mtx_lock(&nl->nl_lock); 1414 /* 1415 * For hash iteration. 1416 */ 1417 vfs_smr_enter(); 1418 1419 /* 1420 * Avoid all surprises by only succeeding if we got the same entry and 1421 * bailing completely otherwise. 1422 * XXX There are no provisions to keep the vnode around, meaning we may 1423 * end up promoting a negative entry for a *new* vnode and returning 1424 * ENOENT on its account. This is the error we want to return anyway 1425 * and promotion is harmless. 1426 * 1427 * In particular at this point there can be a new ncp which matches the 1428 * search but hashes to a different neglist. 1429 */ 1430 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1431 if (ncp == oncp) 1432 break; 1433 } 1434 1435 /* 1436 * No match to begin with. 1437 */ 1438 if (__predict_false(ncp == NULL)) { 1439 goto out_abort; 1440 } 1441 1442 /* 1443 * The newly found entry may be something different... 1444 */ 1445 if (!cache_ncp_match(ncp, dvp, cnp)) { 1446 goto out_abort; 1447 } 1448 1449 /* 1450 * ... and not even negative. 
1451 */ 1452 nc_flag = atomic_load_char(&ncp->nc_flag); 1453 if ((nc_flag & NCF_NEGATIVE) == 0) { 1454 goto out_abort; 1455 } 1456 1457 if (!cache_ncp_canuse(ncp)) { 1458 goto out_abort; 1459 } 1460 1461 cache_neg_promote_locked(ncp); 1462 cache_neg_hit_finish(ncp); 1463 vfs_smr_exit(); 1464 mtx_unlock(&nl->nl_lock); 1465 return (true); 1466 out_abort: 1467 vfs_smr_exit(); 1468 mtx_unlock(&nl->nl_lock); 1469 return (false); 1470 } 1471 1472 static void 1473 cache_neg_promote(struct namecache *ncp) 1474 { 1475 struct neglist *nl; 1476 1477 nl = NCP2NEGLIST(ncp); 1478 mtx_lock(&nl->nl_lock); 1479 cache_neg_promote_locked(ncp); 1480 mtx_unlock(&nl->nl_lock); 1481 } 1482 1483 static void 1484 cache_neg_insert(struct namecache *ncp) 1485 { 1486 struct neglist *nl; 1487 1488 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1489 cache_assert_bucket_locked(ncp); 1490 nl = NCP2NEGLIST(ncp); 1491 mtx_lock(&nl->nl_lock); 1492 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 1493 mtx_unlock(&nl->nl_lock); 1494 atomic_add_long(&numneg, 1); 1495 } 1496 1497 static void 1498 cache_neg_remove(struct namecache *ncp) 1499 { 1500 struct neglist *nl; 1501 struct negstate *ns; 1502 1503 cache_assert_bucket_locked(ncp); 1504 nl = NCP2NEGLIST(ncp); 1505 ns = NCP2NEGSTATE(ncp); 1506 mtx_lock(&nl->nl_lock); 1507 if ((ns->neg_flag & NEG_HOT) != 0) { 1508 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 1509 nl->nl_hotnum--; 1510 } else { 1511 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 1512 } 1513 mtx_unlock(&nl->nl_lock); 1514 atomic_subtract_long(&numneg, 1); 1515 } 1516 1517 static struct neglist * 1518 cache_neg_evict_select_list(void) 1519 { 1520 struct neglist *nl; 1521 u_int c; 1522 1523 c = atomic_fetchadd_int(&neg_cycle, 1) + 1; 1524 nl = &neglists[c % numneglists]; 1525 if (!mtx_trylock(&nl->nl_evict_lock)) { 1526 counter_u64_add(neg_evict_skipped_contended, 1); 1527 return (NULL); 1528 } 1529 return (nl); 1530 } 1531 1532 static struct namecache * 1533 cache_neg_evict_select_entry(struct neglist *nl) 1534 { 1535 struct namecache *ncp, *lncp; 1536 struct negstate *ns, *lns; 1537 int i; 1538 1539 mtx_assert(&nl->nl_evict_lock, MA_OWNED); 1540 mtx_assert(&nl->nl_lock, MA_OWNED); 1541 ncp = TAILQ_FIRST(&nl->nl_list); 1542 if (ncp == NULL) 1543 return (NULL); 1544 lncp = ncp; 1545 lns = NCP2NEGSTATE(lncp); 1546 for (i = 1; i < 4; i++) { 1547 ncp = TAILQ_NEXT(ncp, nc_dst); 1548 if (ncp == NULL) 1549 break; 1550 ns = NCP2NEGSTATE(ncp); 1551 if (ns->neg_hit < lns->neg_hit) { 1552 lncp = ncp; 1553 lns = ns; 1554 } 1555 } 1556 return (lncp); 1557 } 1558 1559 static bool 1560 cache_neg_evict(void) 1561 { 1562 struct namecache *ncp, *ncp2; 1563 struct neglist *nl; 1564 struct vnode *dvp; 1565 struct mtx *dvlp; 1566 struct mtx *blp; 1567 uint32_t hash; 1568 u_char nlen; 1569 bool evicted; 1570 1571 nl = cache_neg_evict_select_list(); 1572 if (nl == NULL) { 1573 return (false); 1574 } 1575 1576 mtx_lock(&nl->nl_lock); 1577 ncp = TAILQ_FIRST(&nl->nl_hotlist); 1578 if (ncp != NULL) { 1579 cache_neg_demote_locked(ncp); 1580 } 1581 ncp = cache_neg_evict_select_entry(nl); 1582 if (ncp == NULL) { 1583 counter_u64_add(neg_evict_skipped_empty, 1); 1584 mtx_unlock(&nl->nl_lock); 1585 mtx_unlock(&nl->nl_evict_lock); 1586 return (false); 1587 } 1588 nlen = ncp->nc_nlen; 1589 dvp = ncp->nc_dvp; 1590 hash = cache_get_hash(ncp->nc_name, nlen, dvp); 1591 dvlp = VP2VNODELOCK(dvp); 1592 blp = HASH2BUCKETLOCK(hash); 1593 mtx_unlock(&nl->nl_lock); 1594 mtx_unlock(&nl->nl_evict_lock); 1595 mtx_lock(dvlp); 1596 mtx_lock(blp); 1597 /* 1598 * Note that since all 
locks were dropped above, the entry may be 1599 * gone or reallocated to be something else. 1600 */ 1601 CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) { 1602 if (ncp2 == ncp && ncp2->nc_dvp == dvp && 1603 ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0) 1604 break; 1605 } 1606 if (ncp2 == NULL) { 1607 counter_u64_add(neg_evict_skipped_missed, 1); 1608 ncp = NULL; 1609 evicted = false; 1610 } else { 1611 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp)); 1612 MPASS(blp == NCP2BUCKETLOCK(ncp)); 1613 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp, 1614 ncp->nc_name); 1615 cache_zap_locked(ncp); 1616 counter_u64_add(neg_evicted, 1); 1617 evicted = true; 1618 } 1619 mtx_unlock(blp); 1620 mtx_unlock(dvlp); 1621 if (ncp != NULL) 1622 cache_free(ncp); 1623 return (evicted); 1624 } 1625 1626 /* 1627 * Maybe evict a negative entry to create more room. 1628 * 1629 * The ncnegfactor parameter limits what fraction of the total count 1630 * can comprise of negative entries. However, if the cache is just 1631 * warming up this leads to excessive evictions. As such, ncnegminpct 1632 * (recomputed to neg_min) dictates whether the above should be 1633 * applied. 1634 * 1635 * Try evicting if the cache is close to full capacity regardless of 1636 * other considerations. 1637 */ 1638 static bool 1639 cache_neg_evict_cond(u_long lnumcache) 1640 { 1641 u_long lnumneg; 1642 1643 if (ncsize - 1000 < lnumcache) 1644 goto out_evict; 1645 lnumneg = atomic_load_long(&numneg); 1646 if (lnumneg < neg_min) 1647 return (false); 1648 if (lnumneg * ncnegfactor < lnumcache) 1649 return (false); 1650 out_evict: 1651 return (cache_neg_evict()); 1652 } 1653 1654 /* 1655 * cache_zap_locked(): 1656 * 1657 * Removes a namecache entry from cache, whether it contains an actual 1658 * pointer to a vnode or if it is just a negative cache entry. 
1659 */ 1660 static void 1661 cache_zap_locked(struct namecache *ncp) 1662 { 1663 struct nchashhead *ncpp; 1664 struct vnode *dvp, *vp; 1665 1666 dvp = ncp->nc_dvp; 1667 vp = ncp->nc_vp; 1668 1669 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1670 cache_assert_vnode_locked(vp); 1671 cache_assert_vnode_locked(dvp); 1672 cache_assert_bucket_locked(ncp); 1673 1674 cache_ncp_invalidate(ncp); 1675 1676 ncpp = NCP2BUCKET(ncp); 1677 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); 1678 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 1679 SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp); 1680 TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst); 1681 if (ncp == vp->v_cache_dd) { 1682 atomic_store_ptr(&vp->v_cache_dd, NULL); 1683 } 1684 } else { 1685 SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name); 1686 cache_neg_remove(ncp); 1687 } 1688 if (ncp->nc_flag & NCF_ISDOTDOT) { 1689 if (ncp == dvp->v_cache_dd) { 1690 atomic_store_ptr(&dvp->v_cache_dd, NULL); 1691 } 1692 } else { 1693 LIST_REMOVE(ncp, nc_src); 1694 if (LIST_EMPTY(&dvp->v_cache_src)) { 1695 ncp->nc_flag |= NCF_DVDROP; 1696 } 1697 } 1698 } 1699 1700 static void 1701 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 1702 { 1703 struct mtx *blp; 1704 1705 MPASS(ncp->nc_dvp == vp); 1706 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1707 cache_assert_vnode_locked(vp); 1708 1709 blp = NCP2BUCKETLOCK(ncp); 1710 mtx_lock(blp); 1711 cache_zap_locked(ncp); 1712 mtx_unlock(blp); 1713 } 1714 1715 static bool 1716 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 1717 struct mtx **vlpp) 1718 { 1719 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1720 struct mtx *blp; 1721 1722 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1723 cache_assert_vnode_locked(vp); 1724 1725 if (ncp->nc_flag & NCF_NEGATIVE) { 1726 if (*vlpp != NULL) { 1727 mtx_unlock(*vlpp); 1728 *vlpp = NULL; 1729 } 1730 cache_zap_negative_locked_vnode_kl(ncp, vp); 1731 return (true); 1732 } 1733 1734 pvlp = VP2VNODELOCK(vp); 1735 blp = NCP2BUCKETLOCK(ncp); 1736 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1737 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1738 1739 if (*vlpp == vlp1 || *vlpp == vlp2) { 1740 to_unlock = *vlpp; 1741 *vlpp = NULL; 1742 } else { 1743 if (*vlpp != NULL) { 1744 mtx_unlock(*vlpp); 1745 *vlpp = NULL; 1746 } 1747 cache_sort_vnodes(&vlp1, &vlp2); 1748 if (vlp1 == pvlp) { 1749 mtx_lock(vlp2); 1750 to_unlock = vlp2; 1751 } else { 1752 if (!mtx_trylock(vlp1)) 1753 goto out_relock; 1754 to_unlock = vlp1; 1755 } 1756 } 1757 mtx_lock(blp); 1758 cache_zap_locked(ncp); 1759 mtx_unlock(blp); 1760 if (to_unlock != NULL) 1761 mtx_unlock(to_unlock); 1762 return (true); 1763 1764 out_relock: 1765 mtx_unlock(vlp2); 1766 mtx_lock(vlp1); 1767 mtx_lock(vlp2); 1768 MPASS(*vlpp == NULL); 1769 *vlpp = vlp1; 1770 return (false); 1771 } 1772 1773 /* 1774 * If trylocking failed we can get here. We know enough to take all needed locks 1775 * in the right order and re-lookup the entry. 
1776 */ 1777 static int 1778 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1779 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1780 struct mtx *blp) 1781 { 1782 struct namecache *rncp; 1783 struct mtx *rvlp; 1784 1785 cache_assert_bucket_unlocked(ncp); 1786 1787 cache_sort_vnodes(&dvlp, &vlp); 1788 cache_lock_vnodes(dvlp, vlp); 1789 mtx_lock(blp); 1790 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1791 if (rncp == ncp && cache_ncp_match(rncp, dvp, cnp)) 1792 break; 1793 } 1794 if (rncp == NULL) 1795 goto out_mismatch; 1796 1797 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1798 rvlp = VP2VNODELOCK(rncp->nc_vp); 1799 else 1800 rvlp = NULL; 1801 if (rvlp != vlp) 1802 goto out_mismatch; 1803 1804 cache_zap_locked(rncp); 1805 mtx_unlock(blp); 1806 cache_unlock_vnodes(dvlp, vlp); 1807 atomic_add_long(&zap_bucket_relock_success, 1); 1808 return (0); 1809 1810 out_mismatch: 1811 mtx_unlock(blp); 1812 cache_unlock_vnodes(dvlp, vlp); 1813 return (EAGAIN); 1814 } 1815 1816 static int __noinline 1817 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp, 1818 uint32_t hash, struct mtx *blp) 1819 { 1820 struct mtx *dvlp, *vlp; 1821 struct vnode *dvp; 1822 1823 cache_assert_bucket_locked(ncp); 1824 1825 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1826 vlp = NULL; 1827 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1828 vlp = VP2VNODELOCK(ncp->nc_vp); 1829 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1830 cache_zap_locked(ncp); 1831 mtx_unlock(blp); 1832 cache_unlock_vnodes(dvlp, vlp); 1833 return (0); 1834 } 1835 1836 dvp = ncp->nc_dvp; 1837 mtx_unlock(blp); 1838 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1839 } 1840 1841 static __noinline int 1842 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) 1843 { 1844 struct namecache *ncp; 1845 struct mtx *blp; 1846 struct mtx *dvlp, *dvlp2; 1847 uint32_t hash; 1848 int error; 1849 1850 if (cnp->cn_namelen == 2 && 1851 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1852 dvlp = VP2VNODELOCK(dvp); 1853 dvlp2 = NULL; 1854 mtx_lock(dvlp); 1855 retry_dotdot: 1856 ncp = dvp->v_cache_dd; 1857 if (ncp == NULL) { 1858 mtx_unlock(dvlp); 1859 if (dvlp2 != NULL) 1860 mtx_unlock(dvlp2); 1861 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1862 return (0); 1863 } 1864 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1865 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) 1866 goto retry_dotdot; 1867 MPASS(dvp->v_cache_dd == NULL); 1868 mtx_unlock(dvlp); 1869 if (dvlp2 != NULL) 1870 mtx_unlock(dvlp2); 1871 cache_free(ncp); 1872 } else { 1873 atomic_store_ptr(&dvp->v_cache_dd, NULL); 1874 mtx_unlock(dvlp); 1875 if (dvlp2 != NULL) 1876 mtx_unlock(dvlp2); 1877 } 1878 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1879 return (1); 1880 } 1881 1882 /* 1883 * XXX note that access here is completely unlocked with no provisions 1884 * to keep the hash allocated. If one is sufficiently unlucky a 1885 * parallel cache resize can reallocate the hash, unmap backing pages 1886 * and cause the empty check below to fault. 1887 * 1888 * Fixing this has epsilon priority, but can be done with no overhead 1889 * for this codepath with sufficient effort. 
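	 *
	 * One possible shape of a fix (not implemented; just a sketch) is to
	 * perform the emptiness check within an SMR section, mirroring the
	 * lockless lookup path, so that a concurrent cache_changesize()
	 * cannot free the table while it is being inspected:
	 *
	 *	vfs_smr_enter();
	 *	empty = CK_SLIST_EMPTY(NCHHASH(hash));
	 *	vfs_smr_exit();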
1890 */ 1891 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1892 blp = HASH2BUCKETLOCK(hash); 1893 retry: 1894 if (CK_SLIST_EMPTY(NCHHASH(hash))) 1895 goto out_no_entry; 1896 1897 mtx_lock(blp); 1898 ncp = cache_ncp_find(dvp, cnp, hash); 1899 if (ncp == NULL) { 1900 mtx_unlock(blp); 1901 goto out_no_entry; 1902 } 1903 1904 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1905 if (__predict_false(error != 0)) { 1906 atomic_add_long(&zap_bucket_fail, 1); 1907 goto retry; 1908 } 1909 counter_u64_add(numposzaps, 1); 1910 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1911 cache_free(ncp); 1912 return (1); 1913 out_no_entry: 1914 counter_u64_add(nummisszap, 1); 1915 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1916 return (0); 1917 } 1918 1919 static int __noinline 1920 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1921 struct timespec *tsp, int *ticksp) 1922 { 1923 int ltype; 1924 1925 *vpp = dvp; 1926 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1927 if (tsp != NULL) 1928 timespecclear(tsp); 1929 if (ticksp != NULL) 1930 *ticksp = ticks; 1931 vrefact(*vpp); 1932 /* 1933 * When we lookup "." we still can be asked to lock it 1934 * differently... 1935 */ 1936 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1937 if (ltype != VOP_ISLOCKED(*vpp)) { 1938 if (ltype == LK_EXCLUSIVE) { 1939 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1940 if (VN_IS_DOOMED((*vpp))) { 1941 /* forced unmount */ 1942 vrele(*vpp); 1943 *vpp = NULL; 1944 return (ENOENT); 1945 } 1946 } else 1947 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1948 } 1949 return (-1); 1950 } 1951 1952 static int __noinline 1953 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1954 struct timespec *tsp, int *ticksp) 1955 { 1956 struct namecache_ts *ncp_ts; 1957 struct namecache *ncp; 1958 struct mtx *dvlp; 1959 enum vgetstate vs; 1960 int error, ltype; 1961 bool whiteout; 1962 1963 MPASS((cnp->cn_flags & ISDOTDOT) != 0); 1964 1965 if ((cnp->cn_flags & MAKEENTRY) == 0) { 1966 cache_remove_cnp(dvp, cnp); 1967 return (0); 1968 } 1969 1970 retry: 1971 dvlp = VP2VNODELOCK(dvp); 1972 mtx_lock(dvlp); 1973 ncp = dvp->v_cache_dd; 1974 if (ncp == NULL) { 1975 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, ".."); 1976 mtx_unlock(dvlp); 1977 return (0); 1978 } 1979 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1980 if (ncp->nc_flag & NCF_NEGATIVE) 1981 *vpp = NULL; 1982 else 1983 *vpp = ncp->nc_vp; 1984 } else 1985 *vpp = ncp->nc_dvp; 1986 if (*vpp == NULL) 1987 goto negative_success; 1988 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); 1989 cache_out_ts(ncp, tsp, ticksp); 1990 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1991 NCF_DTS && tsp != NULL) { 1992 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1993 *tsp = ncp_ts->nc_dotdottime; 1994 } 1995 1996 MPASS(dvp != *vpp); 1997 ltype = VOP_ISLOCKED(dvp); 1998 VOP_UNLOCK(dvp); 1999 vs = vget_prep(*vpp); 2000 mtx_unlock(dvlp); 2001 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 2002 vn_lock(dvp, ltype | LK_RETRY); 2003 if (VN_IS_DOOMED(dvp)) { 2004 if (error == 0) 2005 vput(*vpp); 2006 *vpp = NULL; 2007 return (ENOENT); 2008 } 2009 if (error) { 2010 *vpp = NULL; 2011 goto retry; 2012 } 2013 return (-1); 2014 negative_success: 2015 if (__predict_false(cnp->cn_nameiop == CREATE)) { 2016 if (cnp->cn_flags & ISLASTCN) { 2017 counter_u64_add(numnegzaps, 1); 2018 cache_zap_negative_locked_vnode_kl(ncp, dvp); 2019 mtx_unlock(dvlp); 2020 cache_free(ncp); 2021 return (0); 2022 } 2023 } 2024 2025 
whiteout = (ncp->nc_flag & NCF_WHITE); 2026 cache_out_ts(ncp, tsp, ticksp); 2027 if (cache_neg_hit_prep(ncp)) 2028 cache_neg_promote(ncp); 2029 else 2030 cache_neg_hit_finish(ncp); 2031 mtx_unlock(dvlp); 2032 if (whiteout) 2033 cnp->cn_flags |= ISWHITEOUT; 2034 return (ENOENT); 2035 } 2036 2037 /** 2038 * Lookup a name in the name cache 2039 * 2040 * # Arguments 2041 * 2042 * - dvp: Parent directory in which to search. 2043 * - vpp: Return argument. Will contain desired vnode on cache hit. 2044 * - cnp: Parameters of the name search. The most interesting bits of 2045 * the cn_flags field have the following meanings: 2046 * - MAKEENTRY: If clear, free an entry from the cache rather than look 2047 * it up. 2048 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 2049 * - tsp: Return storage for cache timestamp. On a successful (positive 2050 * or negative) lookup, tsp will be filled with any timespec that 2051 * was stored when this cache entry was created. However, it will 2052 * be clear for "." entries. 2053 * - ticks: Return storage for alternate cache timestamp. On a successful 2054 * (positive or negative) lookup, it will contain the ticks value 2055 * that was current when the cache entry was created, unless cnp 2056 * was ".". 2057 * 2058 * Either both tsp and ticks have to be provided or neither of them. 2059 * 2060 * # Returns 2061 * 2062 * - -1: A positive cache hit. vpp will contain the desired vnode. 2063 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 2064 * to a forced unmount. vpp will not be modified. If the entry 2065 * is a whiteout, then the ISWHITEOUT flag will be set in 2066 * cnp->cn_flags. 2067 * - 0: A cache miss. vpp will not be modified. 2068 * 2069 * # Locking 2070 * 2071 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 2072 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 2073 * lock is not recursively acquired. 2074 */ 2075 static int __noinline 2076 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 2077 struct timespec *tsp, int *ticksp) 2078 { 2079 struct namecache *ncp; 2080 struct mtx *blp; 2081 uint32_t hash; 2082 enum vgetstate vs; 2083 int error; 2084 bool whiteout; 2085 2086 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 2087 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0); 2088 2089 retry: 2090 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2091 blp = HASH2BUCKETLOCK(hash); 2092 mtx_lock(blp); 2093 2094 ncp = cache_ncp_find(dvp, cnp, hash); 2095 if (__predict_false(ncp == NULL)) { 2096 mtx_unlock(blp); 2097 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); 2098 counter_u64_add(nummiss, 1); 2099 return (0); 2100 } 2101 2102 if (ncp->nc_flag & NCF_NEGATIVE) 2103 goto negative_success; 2104 2105 counter_u64_add(numposhits, 1); 2106 *vpp = ncp->nc_vp; 2107 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 2108 cache_out_ts(ncp, tsp, ticksp); 2109 MPASS(dvp != *vpp); 2110 vs = vget_prep(*vpp); 2111 mtx_unlock(blp); 2112 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 2113 if (error) { 2114 *vpp = NULL; 2115 goto retry; 2116 } 2117 return (-1); 2118 negative_success: 2119 /* 2120 * We don't get here with regular lookup apart from corner cases. 
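	 *
	 * In particular, the lockless cache_lookup() hands negative hits with
	 * CREATE + ISLASTCN off to this fallback (see its out_fallback
	 * label), which is why CREATE is predicted true below.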
2121 */ 2122 if (__predict_true(cnp->cn_nameiop == CREATE)) { 2123 if (cnp->cn_flags & ISLASTCN) { 2124 counter_u64_add(numnegzaps, 1); 2125 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 2126 if (__predict_false(error != 0)) { 2127 atomic_add_long(&zap_bucket_fail2, 1); 2128 goto retry; 2129 } 2130 cache_free(ncp); 2131 return (0); 2132 } 2133 } 2134 2135 whiteout = (ncp->nc_flag & NCF_WHITE); 2136 cache_out_ts(ncp, tsp, ticksp); 2137 if (cache_neg_hit_prep(ncp)) 2138 cache_neg_promote(ncp); 2139 else 2140 cache_neg_hit_finish(ncp); 2141 mtx_unlock(blp); 2142 if (whiteout) 2143 cnp->cn_flags |= ISWHITEOUT; 2144 return (ENOENT); 2145 } 2146 2147 int 2148 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 2149 struct timespec *tsp, int *ticksp) 2150 { 2151 struct namecache *ncp; 2152 uint32_t hash; 2153 enum vgetstate vs; 2154 int error; 2155 bool whiteout, neg_promote; 2156 u_short nc_flag; 2157 2158 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); 2159 2160 #ifdef DEBUG_CACHE 2161 if (__predict_false(!doingcache)) { 2162 cnp->cn_flags &= ~MAKEENTRY; 2163 return (0); 2164 } 2165 #endif 2166 2167 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2168 if (cnp->cn_namelen == 1) 2169 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 2170 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') 2171 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); 2172 } 2173 2174 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 2175 2176 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { 2177 cache_remove_cnp(dvp, cnp); 2178 return (0); 2179 } 2180 2181 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2182 vfs_smr_enter(); 2183 2184 ncp = cache_ncp_find(dvp, cnp, hash); 2185 if (__predict_false(ncp == NULL)) { 2186 vfs_smr_exit(); 2187 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); 2188 counter_u64_add(nummiss, 1); 2189 return (0); 2190 } 2191 2192 nc_flag = atomic_load_char(&ncp->nc_flag); 2193 if (nc_flag & NCF_NEGATIVE) 2194 goto negative_success; 2195 2196 counter_u64_add(numposhits, 1); 2197 *vpp = ncp->nc_vp; 2198 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 2199 cache_out_ts(ncp, tsp, ticksp); 2200 MPASS(dvp != *vpp); 2201 if (!cache_ncp_canuse(ncp)) { 2202 vfs_smr_exit(); 2203 *vpp = NULL; 2204 goto out_fallback; 2205 } 2206 vs = vget_prep_smr(*vpp); 2207 vfs_smr_exit(); 2208 if (__predict_false(vs == VGET_NONE)) { 2209 *vpp = NULL; 2210 goto out_fallback; 2211 } 2212 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 2213 if (error) { 2214 *vpp = NULL; 2215 goto out_fallback; 2216 } 2217 return (-1); 2218 negative_success: 2219 if (cnp->cn_nameiop == CREATE) { 2220 if (cnp->cn_flags & ISLASTCN) { 2221 vfs_smr_exit(); 2222 goto out_fallback; 2223 } 2224 } 2225 2226 cache_out_ts(ncp, tsp, ticksp); 2227 whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE); 2228 neg_promote = cache_neg_hit_prep(ncp); 2229 if (!cache_ncp_canuse(ncp)) { 2230 cache_neg_hit_abort(ncp); 2231 vfs_smr_exit(); 2232 goto out_fallback; 2233 } 2234 if (neg_promote) { 2235 vfs_smr_exit(); 2236 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) 2237 goto out_fallback; 2238 } else { 2239 cache_neg_hit_finish(ncp); 2240 vfs_smr_exit(); 2241 } 2242 if (whiteout) 2243 cnp->cn_flags |= ISWHITEOUT; 2244 return (ENOENT); 2245 out_fallback: 2246 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp)); 2247 } 2248 2249 struct celockstate { 2250 struct mtx *vlp[3]; 2251 struct mtx *blp[2]; 2252 }; 2253 
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 2254 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 2255 2256 static inline void 2257 cache_celockstate_init(struct celockstate *cel) 2258 { 2259 2260 bzero(cel, sizeof(*cel)); 2261 } 2262 2263 static void 2264 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 2265 struct vnode *dvp) 2266 { 2267 struct mtx *vlp1, *vlp2; 2268 2269 MPASS(cel->vlp[0] == NULL); 2270 MPASS(cel->vlp[1] == NULL); 2271 MPASS(cel->vlp[2] == NULL); 2272 2273 MPASS(vp != NULL || dvp != NULL); 2274 2275 vlp1 = VP2VNODELOCK(vp); 2276 vlp2 = VP2VNODELOCK(dvp); 2277 cache_sort_vnodes(&vlp1, &vlp2); 2278 2279 if (vlp1 != NULL) { 2280 mtx_lock(vlp1); 2281 cel->vlp[0] = vlp1; 2282 } 2283 mtx_lock(vlp2); 2284 cel->vlp[1] = vlp2; 2285 } 2286 2287 static void 2288 cache_unlock_vnodes_cel(struct celockstate *cel) 2289 { 2290 2291 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 2292 2293 if (cel->vlp[0] != NULL) 2294 mtx_unlock(cel->vlp[0]); 2295 if (cel->vlp[1] != NULL) 2296 mtx_unlock(cel->vlp[1]); 2297 if (cel->vlp[2] != NULL) 2298 mtx_unlock(cel->vlp[2]); 2299 } 2300 2301 static bool 2302 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 2303 { 2304 struct mtx *vlp; 2305 bool ret; 2306 2307 cache_assert_vlp_locked(cel->vlp[0]); 2308 cache_assert_vlp_locked(cel->vlp[1]); 2309 MPASS(cel->vlp[2] == NULL); 2310 2311 MPASS(vp != NULL); 2312 vlp = VP2VNODELOCK(vp); 2313 2314 ret = true; 2315 if (vlp >= cel->vlp[1]) { 2316 mtx_lock(vlp); 2317 } else { 2318 if (mtx_trylock(vlp)) 2319 goto out; 2320 cache_unlock_vnodes_cel(cel); 2321 atomic_add_long(&cache_lock_vnodes_cel_3_failures, 1); 2322 if (vlp < cel->vlp[0]) { 2323 mtx_lock(vlp); 2324 mtx_lock(cel->vlp[0]); 2325 mtx_lock(cel->vlp[1]); 2326 } else { 2327 if (cel->vlp[0] != NULL) 2328 mtx_lock(cel->vlp[0]); 2329 mtx_lock(vlp); 2330 mtx_lock(cel->vlp[1]); 2331 } 2332 ret = false; 2333 } 2334 out: 2335 cel->vlp[2] = vlp; 2336 return (ret); 2337 } 2338 2339 static void 2340 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 2341 struct mtx *blp2) 2342 { 2343 2344 MPASS(cel->blp[0] == NULL); 2345 MPASS(cel->blp[1] == NULL); 2346 2347 cache_sort_vnodes(&blp1, &blp2); 2348 2349 if (blp1 != NULL) { 2350 mtx_lock(blp1); 2351 cel->blp[0] = blp1; 2352 } 2353 mtx_lock(blp2); 2354 cel->blp[1] = blp2; 2355 } 2356 2357 static void 2358 cache_unlock_buckets_cel(struct celockstate *cel) 2359 { 2360 2361 if (cel->blp[0] != NULL) 2362 mtx_unlock(cel->blp[0]); 2363 mtx_unlock(cel->blp[1]); 2364 } 2365 2366 /* 2367 * Lock part of the cache affected by the insertion. 2368 * 2369 * This means vnodelocks for dvp, vp and the relevant bucketlock. 2370 * However, insertion can result in removal of an old entry. In this 2371 * case we have an additional vnode and bucketlock pair to lock. 2372 * 2373 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 2374 * preserving the locking order (smaller address first). 
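 *
 * The order is enforced with cache_sort_vnodes().  If the third vnodelock
 * cannot be trylocked in order, cache_lock_vnodes_cel_3() drops everything
 * and relocks in order, after which the caller re-validates the ".." entry
 * state and restarts if it changed.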
2375 */ 2376 static void 2377 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2378 uint32_t hash) 2379 { 2380 struct namecache *ncp; 2381 struct mtx *blps[2]; 2382 u_char nc_flag; 2383 2384 blps[0] = HASH2BUCKETLOCK(hash); 2385 for (;;) { 2386 blps[1] = NULL; 2387 cache_lock_vnodes_cel(cel, dvp, vp); 2388 if (vp == NULL || vp->v_type != VDIR) 2389 break; 2390 ncp = atomic_load_consume_ptr(&vp->v_cache_dd); 2391 if (ncp == NULL) 2392 break; 2393 nc_flag = atomic_load_char(&ncp->nc_flag); 2394 if ((nc_flag & NCF_ISDOTDOT) == 0) 2395 break; 2396 MPASS(ncp->nc_dvp == vp); 2397 blps[1] = NCP2BUCKETLOCK(ncp); 2398 if ((nc_flag & NCF_NEGATIVE) != 0) 2399 break; 2400 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2401 break; 2402 /* 2403 * All vnodes got re-locked. Re-validate the state and if 2404 * nothing changed we are done. Otherwise restart. 2405 */ 2406 if (ncp == vp->v_cache_dd && 2407 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2408 blps[1] == NCP2BUCKETLOCK(ncp) && 2409 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2410 break; 2411 cache_unlock_vnodes_cel(cel); 2412 cel->vlp[0] = NULL; 2413 cel->vlp[1] = NULL; 2414 cel->vlp[2] = NULL; 2415 } 2416 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2417 } 2418 2419 static void 2420 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2421 uint32_t hash) 2422 { 2423 struct namecache *ncp; 2424 struct mtx *blps[2]; 2425 u_char nc_flag; 2426 2427 blps[0] = HASH2BUCKETLOCK(hash); 2428 for (;;) { 2429 blps[1] = NULL; 2430 cache_lock_vnodes_cel(cel, dvp, vp); 2431 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); 2432 if (ncp == NULL) 2433 break; 2434 nc_flag = atomic_load_char(&ncp->nc_flag); 2435 if ((nc_flag & NCF_ISDOTDOT) == 0) 2436 break; 2437 MPASS(ncp->nc_dvp == dvp); 2438 blps[1] = NCP2BUCKETLOCK(ncp); 2439 if ((nc_flag & NCF_NEGATIVE) != 0) 2440 break; 2441 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2442 break; 2443 if (ncp == dvp->v_cache_dd && 2444 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2445 blps[1] == NCP2BUCKETLOCK(ncp) && 2446 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2447 break; 2448 cache_unlock_vnodes_cel(cel); 2449 cel->vlp[0] = NULL; 2450 cel->vlp[1] = NULL; 2451 cel->vlp[2] = NULL; 2452 } 2453 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2454 } 2455 2456 static void 2457 cache_enter_unlock(struct celockstate *cel) 2458 { 2459 2460 cache_unlock_buckets_cel(cel); 2461 cache_unlock_vnodes_cel(cel); 2462 } 2463 2464 static void __noinline 2465 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 2466 struct componentname *cnp) 2467 { 2468 struct celockstate cel; 2469 struct namecache *ncp; 2470 uint32_t hash; 2471 int len; 2472 2473 if (atomic_load_ptr(&dvp->v_cache_dd) == NULL) 2474 return; 2475 len = cnp->cn_namelen; 2476 cache_celockstate_init(&cel); 2477 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2478 cache_enter_lock_dd(&cel, dvp, vp, hash); 2479 ncp = dvp->v_cache_dd; 2480 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 2481 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 2482 cache_zap_locked(ncp); 2483 } else { 2484 ncp = NULL; 2485 } 2486 atomic_store_ptr(&dvp->v_cache_dd, NULL); 2487 cache_enter_unlock(&cel); 2488 if (ncp != NULL) 2489 cache_free(ncp); 2490 } 2491 2492 /* 2493 * Add an entry to the cache. 
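 *
 * Passing vp == NULL records a negative entry; ".." entries are special
 * cased via NCF_ISDOTDOT and v_cache_dd.  The typical consumer is a
 * filesystem lookup routine; roughly (a sketch, not lifted from any
 * particular filesystem):
 *
 *	if ((cnp->cn_flags & MAKEENTRY) != 0)
 *		cache_enter(dvp, vp, cnp);
 *
 * with vp set to NULL when the lookup failed and a negative entry is
 * desired.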
2494 */ 2495 void 2496 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2497 struct timespec *tsp, struct timespec *dtsp) 2498 { 2499 struct celockstate cel; 2500 struct namecache *ncp, *n2, *ndd; 2501 struct namecache_ts *ncp_ts; 2502 uint32_t hash; 2503 int flag; 2504 int len; 2505 2506 KASSERT(cnp->cn_namelen <= NAME_MAX, 2507 ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen, 2508 NAME_MAX)); 2509 VNPASS(!VN_IS_DOOMED(dvp), dvp); 2510 VNPASS(dvp->v_type != VNON, dvp); 2511 if (vp != NULL) { 2512 VNPASS(!VN_IS_DOOMED(vp), vp); 2513 VNPASS(vp->v_type != VNON, vp); 2514 } 2515 if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { 2516 KASSERT(dvp == vp, 2517 ("%s: different vnodes for dot entry (%p; %p)\n", __func__, 2518 dvp, vp)); 2519 } else { 2520 KASSERT(dvp != vp, 2521 ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__, 2522 cnp->cn_nameptr, dvp)); 2523 } 2524 2525 #ifdef DEBUG_CACHE 2526 if (__predict_false(!doingcache)) 2527 return; 2528 #endif 2529 2530 flag = 0; 2531 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2532 if (cnp->cn_namelen == 1) 2533 return; 2534 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 2535 cache_enter_dotdot_prep(dvp, vp, cnp); 2536 flag = NCF_ISDOTDOT; 2537 } 2538 } 2539 2540 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 2541 if (ncp == NULL) 2542 return; 2543 2544 cache_celockstate_init(&cel); 2545 ndd = NULL; 2546 ncp_ts = NULL; 2547 2548 /* 2549 * Calculate the hash key and setup as much of the new 2550 * namecache entry as possible before acquiring the lock. 2551 */ 2552 ncp->nc_flag = flag | NCF_WIP; 2553 ncp->nc_vp = vp; 2554 if (vp == NULL) 2555 cache_neg_init(ncp); 2556 ncp->nc_dvp = dvp; 2557 if (tsp != NULL) { 2558 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 2559 ncp_ts->nc_time = *tsp; 2560 ncp_ts->nc_ticks = ticks; 2561 ncp_ts->nc_nc.nc_flag |= NCF_TS; 2562 if (dtsp != NULL) { 2563 ncp_ts->nc_dotdottime = *dtsp; 2564 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 2565 } 2566 } 2567 len = ncp->nc_nlen = cnp->cn_namelen; 2568 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2569 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 2570 ncp->nc_name[len] = '\0'; 2571 cache_enter_lock(&cel, dvp, vp, hash); 2572 2573 /* 2574 * See if this vnode or negative entry is already in the cache 2575 * with this name. This can happen with concurrent lookups of 2576 * the same path name. 2577 */ 2578 n2 = cache_ncp_find(dvp, cnp, hash); 2579 if (n2 != NULL) { 2580 MPASS(cache_ncp_canuse(n2)); 2581 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 2582 KASSERT(vp == NULL, 2583 ("%s: found entry pointing to a different vnode " 2584 "(%p != %p); name [%s]", 2585 __func__, NULL, vp, cnp->cn_nameptr)); 2586 else 2587 KASSERT(n2->nc_vp == vp, 2588 ("%s: found entry pointing to a different vnode " 2589 "(%p != %p); name [%s]", 2590 __func__, n2->nc_vp, vp, cnp->cn_nameptr)); 2591 /* 2592 * Entries are supposed to be immutable unless in the 2593 * process of getting destroyed. Accommodating for 2594 * changing timestamps is possible but not worth it. 2595 * This should be harmless in terms of correctness, in 2596 * the worst case resulting in an earlier expiration. 2597 * Alternatively, the found entry can be replaced 2598 * altogether. 
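		 *
		 * For now the new entry is simply discarded (out_unlock_free
		 * below) and the existing one is kept as is.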
2599 */ 2600 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == 2601 (ncp->nc_flag & (NCF_TS | NCF_DTS))); 2602 #if 0 2603 if (tsp != NULL) { 2604 KASSERT((n2->nc_flag & NCF_TS) != 0, 2605 ("no NCF_TS")); 2606 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 2607 n2_ts->nc_time = ncp_ts->nc_time; 2608 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2609 if (dtsp != NULL) { 2610 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2611 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2612 } 2613 } 2614 #endif 2615 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, 2616 vp); 2617 goto out_unlock_free; 2618 } 2619 2620 if (flag == NCF_ISDOTDOT) { 2621 /* 2622 * See if we are trying to add .. entry, but some other lookup 2623 * has populated v_cache_dd pointer already. 2624 */ 2625 if (dvp->v_cache_dd != NULL) 2626 goto out_unlock_free; 2627 KASSERT(vp == NULL || vp->v_type == VDIR, 2628 ("wrong vnode type %p", vp)); 2629 atomic_thread_fence_rel(); 2630 atomic_store_ptr(&dvp->v_cache_dd, ncp); 2631 } else if (vp != NULL) { 2632 /* 2633 * Take the slow path in INOTIFY(). This flag will be lazily 2634 * cleared by cache_vop_inotify() once all directories referring 2635 * to vp are unwatched. 2636 */ 2637 if (__predict_false((vn_irflag_read(dvp) & VIRF_INOTIFY) != 0)) 2638 vn_irflag_set_cond(vp, VIRF_INOTIFY_PARENT); 2639 2640 /* 2641 * For this case, the cache entry maps both the 2642 * directory name in it and the name ".." for the 2643 * directory's parent. 2644 */ 2645 if ((ndd = vp->v_cache_dd) != NULL) { 2646 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2647 cache_zap_locked(ndd); 2648 else 2649 ndd = NULL; 2650 } 2651 atomic_thread_fence_rel(); 2652 atomic_store_ptr(&vp->v_cache_dd, ncp); 2653 } 2654 2655 if (flag != NCF_ISDOTDOT) { 2656 if (LIST_EMPTY(&dvp->v_cache_src)) { 2657 cache_hold_vnode(dvp); 2658 } 2659 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2660 } 2661 2662 /* 2663 * If the entry is "negative", we place it into the 2664 * "negative" cache queue, otherwise, we place it into the 2665 * destination vnode's cache entries queue. 2666 */ 2667 if (vp != NULL) { 2668 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2669 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2670 vp); 2671 } else { 2672 if (cnp->cn_flags & ISWHITEOUT) 2673 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE); 2674 cache_neg_insert(ncp); 2675 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2676 ncp->nc_name); 2677 } 2678 2679 /* 2680 * Insert the new namecache entry into the appropriate chain 2681 * within the cache entries table. 2682 */ 2683 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2684 2685 atomic_thread_fence_rel(); 2686 /* 2687 * Mark the entry as fully constructed. 2688 * It is immutable past this point until its removal. 2689 */ 2690 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2691 2692 cache_enter_unlock(&cel); 2693 if (ndd != NULL) 2694 cache_free(ndd); 2695 return; 2696 out_unlock_free: 2697 cache_enter_unlock(&cel); 2698 cache_free(ncp); 2699 return; 2700 } 2701 2702 /* 2703 * A variant of the above accepting flags. 2704 * 2705 * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it. 2706 * 2707 * TODO: this routine is a hack. It blindly removes the old entry, even if it 2708 * happens to match and it is doing it in an inefficient manner. It was added 2709 * to accommodate NFS which runs into a case where the target for a given name 2710 * may change from under it. 
Note this does nothing to solve the following 2711 * race: 2 callers of cache_enter_time_flags pass a different target vnode for 2712 * the same [dvp, cnp]. It may be argued that code doing this is broken. 2713 */ 2714 void 2715 cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2716 struct timespec *tsp, struct timespec *dtsp, int flags) 2717 { 2718 2719 MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0); 2720 2721 if (flags & VFS_CACHE_DROPOLD) 2722 cache_remove_cnp(dvp, cnp); 2723 cache_enter_time(dvp, vp, cnp, tsp, dtsp); 2724 } 2725 2726 static u_long 2727 cache_roundup_2(u_long val) 2728 { 2729 u_long res; 2730 2731 for (res = 1; res <= val; res <<= 1) 2732 continue; 2733 2734 return (res); 2735 } 2736 2737 static struct nchashhead * 2738 nchinittbl(u_long elements, u_long *hashmask) 2739 { 2740 struct nchashhead *hashtbl; 2741 u_long hashsize, i; 2742 2743 hashsize = cache_roundup_2(elements) / 2; 2744 2745 hashtbl = malloc(hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2746 for (i = 0; i < hashsize; i++) 2747 CK_SLIST_INIT(&hashtbl[i]); 2748 *hashmask = hashsize - 1; 2749 return (hashtbl); 2750 } 2751 2752 static void 2753 ncfreetbl(struct nchashhead *hashtbl) 2754 { 2755 2756 free(hashtbl, M_VFSCACHE); 2757 } 2758 2759 /* 2760 * Name cache initialization, from vfs_init() when we are booting 2761 */ 2762 static void 2763 nchinit(void *dummy __unused) 2764 { 2765 u_int i; 2766 2767 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2768 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2769 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2770 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2771 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2772 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2773 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2774 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2775 2776 VFS_SMR_ZONE_SET(cache_zone_small); 2777 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2778 VFS_SMR_ZONE_SET(cache_zone_large); 2779 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2780 2781 ncsize = desiredvnodes * ncsizefactor; 2782 cache_recalc_neg_min(); 2783 nchashtbl = nchinittbl(ncsize, &nchash); 2784 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2785 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2786 ncbuckethash = 7; 2787 if (ncbuckethash > nchash) 2788 ncbuckethash = nchash; 2789 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2790 M_WAITOK | M_ZERO); 2791 for (i = 0; i < numbucketlocks; i++) 2792 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2793 ncvnodehash = ncbuckethash; 2794 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2795 M_WAITOK | M_ZERO); 2796 for (i = 0; i < numvnodelocks; i++) 2797 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2798 2799 for (i = 0; i < numneglists; i++) { 2800 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); 2801 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2802 TAILQ_INIT(&neglists[i].nl_list); 2803 TAILQ_INIT(&neglists[i].nl_hotlist); 2804 } 2805 } 2806 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2807 2808 void 2809 cache_vnode_init(struct vnode *vp) 2810 { 2811 2812 LIST_INIT(&vp->v_cache_src); 2813 TAILQ_INIT(&vp->v_cache_dst); 2814 vp->v_cache_dd = NULL; 2815 cache_prehash(vp); 2816 } 2817 2818 /* 2819 * 
Induce transient cache misses for lockless operation in cache_lookup() by 2820 * using a temporary hash table. 2821 * 2822 * This will force a fs lookup. 2823 * 2824 * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time 2825 * to observe all CPUs not performing the lookup. 2826 */ 2827 static void 2828 cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash) 2829 { 2830 2831 MPASS(temphash < nchash); 2832 /* 2833 * Change the size. The new size is smaller and can safely be used 2834 * against the existing table. All lookups which now hash wrong will 2835 * result in a cache miss, which all callers are supposed to know how 2836 * to handle. 2837 */ 2838 atomic_store_long(&nchash, temphash); 2839 atomic_thread_fence_rel(); 2840 vfs_smr_synchronize(); 2841 /* 2842 * At this point everyone sees the updated hash value, but they still 2843 * see the old table. 2844 */ 2845 atomic_store_ptr(&nchashtbl, temptbl); 2846 atomic_thread_fence_rel(); 2847 vfs_smr_synchronize(); 2848 /* 2849 * At this point everyone sees the updated table pointer and size pair. 2850 */ 2851 } 2852 2853 /* 2854 * Set the new hash table. 2855 * 2856 * Similarly to cache_changesize_set_temp(), this has to synchronize against 2857 * lockless operation in cache_lookup(). 2858 */ 2859 static void 2860 cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash) 2861 { 2862 2863 MPASS(nchash < new_hash); 2864 /* 2865 * Change the pointer first. This wont result in out of bounds access 2866 * since the temporary table is guaranteed to be smaller. 2867 */ 2868 atomic_store_ptr(&nchashtbl, new_tbl); 2869 atomic_thread_fence_rel(); 2870 vfs_smr_synchronize(); 2871 /* 2872 * At this point everyone sees the updated pointer value, but they 2873 * still see the old size. 2874 */ 2875 atomic_store_long(&nchash, new_hash); 2876 atomic_thread_fence_rel(); 2877 vfs_smr_synchronize(); 2878 /* 2879 * At this point everyone sees the updated table pointer and size pair. 2880 */ 2881 } 2882 2883 void 2884 cache_changesize(u_long newmaxvnodes) 2885 { 2886 struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl; 2887 u_long new_nchash, old_nchash, temphash; 2888 struct namecache *ncp; 2889 uint32_t hash; 2890 u_long newncsize; 2891 u_long i; 2892 2893 newncsize = newmaxvnodes * ncsizefactor; 2894 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2895 if (newmaxvnodes < numbucketlocks) 2896 newmaxvnodes = numbucketlocks; 2897 2898 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2899 /* If same hash table size, nothing to do */ 2900 if (nchash == new_nchash) { 2901 ncfreetbl(new_nchashtbl); 2902 return; 2903 } 2904 2905 temptbl = nchinittbl(1, &temphash); 2906 2907 /* 2908 * Move everything from the old hash table to the new table. 2909 * None of the namecache entries in the table can be removed 2910 * because to do so, they have to be removed from the hash table. 
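	 *
	 * Lockless lookups may still be traversing the old buckets, hence the
	 * switch to a small temporary table first and to the final one only
	 * afterwards.  cache_changesize_set_temp() and
	 * cache_changesize_set_new() each wait out readers with
	 * vfs_smr_synchronize(), so no lookup ever pairs a table pointer with
	 * a mask that is too large for that table.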
2911 */ 2912 cache_lock_all_vnodes(); 2913 cache_lock_all_buckets(); 2914 old_nchashtbl = nchashtbl; 2915 old_nchash = nchash; 2916 cache_changesize_set_temp(temptbl, temphash); 2917 for (i = 0; i <= old_nchash; i++) { 2918 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2919 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2920 ncp->nc_dvp); 2921 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2922 CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash); 2923 } 2924 } 2925 ncsize = newncsize; 2926 cache_recalc_neg_min(); 2927 cache_changesize_set_new(new_nchashtbl, new_nchash); 2928 cache_unlock_all_buckets(); 2929 cache_unlock_all_vnodes(); 2930 ncfreetbl(old_nchashtbl); 2931 ncfreetbl(temptbl); 2932 } 2933 2934 /* 2935 * Remove all entries from and to a particular vnode. 2936 */ 2937 static void 2938 cache_purge_impl(struct vnode *vp) 2939 { 2940 struct cache_freebatch batch; 2941 struct namecache *ncp; 2942 struct mtx *vlp, *vlp2; 2943 2944 TAILQ_INIT(&batch); 2945 vlp = VP2VNODELOCK(vp); 2946 vlp2 = NULL; 2947 mtx_lock(vlp); 2948 retry: 2949 while (!LIST_EMPTY(&vp->v_cache_src)) { 2950 ncp = LIST_FIRST(&vp->v_cache_src); 2951 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2952 goto retry; 2953 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2954 } 2955 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2956 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2957 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2958 goto retry; 2959 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2960 } 2961 ncp = vp->v_cache_dd; 2962 if (ncp != NULL) { 2963 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2964 ("lost dotdot link")); 2965 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2966 goto retry; 2967 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2968 } 2969 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2970 mtx_unlock(vlp); 2971 if (vlp2 != NULL) 2972 mtx_unlock(vlp2); 2973 cache_free_batch(&batch); 2974 } 2975 2976 /* 2977 * Opportunistic check to see if there is anything to do. 2978 */ 2979 static bool 2980 cache_has_entries(struct vnode *vp) 2981 { 2982 2983 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2984 atomic_load_ptr(&vp->v_cache_dd) == NULL) 2985 return (false); 2986 return (true); 2987 } 2988 2989 void 2990 cache_purge(struct vnode *vp) 2991 { 2992 2993 SDT_PROBE1(vfs, namecache, purge, done, vp); 2994 if (!cache_has_entries(vp)) 2995 return; 2996 cache_purge_impl(vp); 2997 } 2998 2999 /* 3000 * Only to be used by vgone. 3001 */ 3002 void 3003 cache_purge_vgone(struct vnode *vp) 3004 { 3005 struct mtx *vlp; 3006 3007 VNPASS(VN_IS_DOOMED(vp), vp); 3008 if (cache_has_entries(vp)) { 3009 cache_purge_impl(vp); 3010 return; 3011 } 3012 3013 /* 3014 * Serialize against a potential thread doing cache_purge. 3015 */ 3016 vlp = VP2VNODELOCK(vp); 3017 mtx_wait_unlocked(vlp); 3018 if (cache_has_entries(vp)) { 3019 cache_purge_impl(vp); 3020 return; 3021 } 3022 return; 3023 } 3024 3025 /* 3026 * Remove all negative entries for a particular directory vnode. 
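 *
 * Negative entries hang off the directory's v_cache_src list, so a single
 * pass over that list under the vnodelock suffices; positive entries
 * encountered on the way are left alone.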
3027 */ 3028 void 3029 cache_purge_negative(struct vnode *vp) 3030 { 3031 struct cache_freebatch batch; 3032 struct namecache *ncp, *nnp; 3033 struct mtx *vlp; 3034 3035 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 3036 if (LIST_EMPTY(&vp->v_cache_src)) 3037 return; 3038 TAILQ_INIT(&batch); 3039 vlp = VP2VNODELOCK(vp); 3040 mtx_lock(vlp); 3041 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 3042 if (!(ncp->nc_flag & NCF_NEGATIVE)) 3043 continue; 3044 cache_zap_negative_locked_vnode_kl(ncp, vp); 3045 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 3046 } 3047 mtx_unlock(vlp); 3048 cache_free_batch(&batch); 3049 } 3050 3051 /* 3052 * Entry points for modifying VOP operations. 3053 */ 3054 void 3055 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 3056 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 3057 { 3058 3059 ASSERT_VOP_IN_SEQC(fdvp); 3060 ASSERT_VOP_IN_SEQC(fvp); 3061 ASSERT_VOP_IN_SEQC(tdvp); 3062 if (tvp != NULL) 3063 ASSERT_VOP_IN_SEQC(tvp); 3064 3065 cache_purge(fvp); 3066 if (tvp != NULL) { 3067 cache_purge(tvp); 3068 KASSERT(!cache_remove_cnp(tdvp, tcnp), 3069 ("%s: lingering negative entry", __func__)); 3070 } else { 3071 cache_remove_cnp(tdvp, tcnp); 3072 } 3073 3074 /* 3075 * TODO 3076 * 3077 * Historically renaming was always purging all revelang entries, 3078 * but that's quite wasteful. In particular turns out that in many cases 3079 * the target file is immediately accessed after rename, inducing a cache 3080 * miss. 3081 * 3082 * Recode this to reduce relocking and reuse the existing entry (if any) 3083 * instead of just removing it above and allocating a new one here. 3084 */ 3085 cache_enter(tdvp, fvp, tcnp); 3086 } 3087 3088 void 3089 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) 3090 { 3091 3092 ASSERT_VOP_IN_SEQC(dvp); 3093 ASSERT_VOP_IN_SEQC(vp); 3094 cache_purge(vp); 3095 } 3096 3097 #ifdef INVARIANTS 3098 /* 3099 * Validate that if an entry exists it matches. 3100 */ 3101 void 3102 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) 3103 { 3104 struct namecache *ncp; 3105 struct mtx *blp; 3106 uint32_t hash; 3107 3108 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3109 if (CK_SLIST_EMPTY(NCHHASH(hash))) 3110 return; 3111 blp = HASH2BUCKETLOCK(hash); 3112 mtx_lock(blp); 3113 ncp = cache_ncp_find(dvp, cnp, hash); 3114 if (ncp != NULL && ncp->nc_vp != vp) { 3115 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n", 3116 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp); 3117 } 3118 mtx_unlock(blp); 3119 } 3120 3121 void 3122 cache_assert_no_entries(struct vnode *vp) 3123 { 3124 3125 VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp); 3126 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); 3127 VNPASS(vp->v_cache_dd == NULL, vp); 3128 } 3129 #endif 3130 3131 /* 3132 * Flush all entries referencing a particular filesystem. 3133 */ 3134 void 3135 cache_purgevfs(struct mount *mp) 3136 { 3137 struct vnode *vp, *mvp; 3138 size_t visited __sdt_used, purged __sdt_used; 3139 3140 visited = purged = 0; 3141 /* 3142 * Somewhat wasteful iteration over all vnodes. Would be better to 3143 * support filtering and avoid the interlock to begin with. 
3144 */ 3145 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 3146 visited++; 3147 if (!cache_has_entries(vp)) { 3148 VI_UNLOCK(vp); 3149 continue; 3150 } 3151 vholdl(vp); 3152 VI_UNLOCK(vp); 3153 cache_purge(vp); 3154 purged++; 3155 vdrop(vp); 3156 } 3157 3158 SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged); 3159 } 3160 3161 /* 3162 * Perform canonical checks and cache lookup and pass on to filesystem 3163 * through the vop_cachedlookup only if needed. 3164 */ 3165 3166 int 3167 vfs_cache_lookup(struct vop_lookup_args *ap) 3168 { 3169 struct vnode *dvp; 3170 int error; 3171 struct vnode **vpp = ap->a_vpp; 3172 struct componentname *cnp = ap->a_cnp; 3173 int flags = cnp->cn_flags; 3174 3175 *vpp = NULL; 3176 dvp = ap->a_dvp; 3177 3178 if (dvp->v_type != VDIR) 3179 return (ENOTDIR); 3180 3181 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 3182 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 3183 return (EROFS); 3184 3185 error = vn_dir_check_exec(dvp, cnp); 3186 if (error != 0) 3187 return (error); 3188 3189 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 3190 if (error == 0) 3191 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 3192 if (error == -1) 3193 return (0); 3194 return (error); 3195 } 3196 3197 /* Implementation of the getcwd syscall. */ 3198 int 3199 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 3200 { 3201 char *buf, *retbuf; 3202 size_t buflen; 3203 int error; 3204 3205 buflen = uap->buflen; 3206 if (__predict_false(buflen < 2)) 3207 return (EINVAL); 3208 if (buflen > MAXPATHLEN) 3209 buflen = MAXPATHLEN; 3210 3211 buf = uma_zalloc(namei_zone, M_WAITOK); 3212 error = vn_getcwd(buf, &retbuf, &buflen); 3213 if (error == 0) 3214 error = copyout(retbuf, uap->buf, buflen); 3215 uma_zfree(namei_zone, buf); 3216 return (error); 3217 } 3218 3219 int 3220 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 3221 { 3222 struct pwd *pwd; 3223 int error; 3224 3225 vfs_smr_enter(); 3226 pwd = pwd_get_smr(); 3227 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 3228 buflen, 0); 3229 VFS_SMR_ASSERT_NOT_ENTERED(); 3230 if (error < 0) { 3231 pwd = pwd_hold(curthread); 3232 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 3233 retbuf, buflen); 3234 pwd_drop(pwd); 3235 } 3236 3237 #ifdef KTRACE 3238 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 3239 ktrnamei(*retbuf); 3240 #endif 3241 return (error); 3242 } 3243 3244 /* 3245 * Canonicalize a path by walking it forward and back. 3246 * 3247 * BUGS: 3248 * - Nothing guarantees the integrity of the entire chain. Consider the case 3249 * where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of 3250 * "foo" into "quux" during the backwards walk. The result will be 3251 * "quux/bar/baz/qux", which could not have been obtained by an incremental 3252 * walk in userspace. Moreover, the path we return is inaccessible if the 3253 * calling thread lacks permission to traverse "quux". 
3254 */ 3255 static int 3256 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 3257 size_t size, int flags, enum uio_seg pathseg) 3258 { 3259 struct nameidata nd; 3260 char *retbuf, *freebuf; 3261 int error; 3262 3263 if (flags != 0) 3264 return (EINVAL); 3265 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1, 3266 pathseg, path, fd, &cap_fstat_rights); 3267 if ((error = namei(&nd)) != 0) 3268 return (error); 3269 3270 if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR && 3271 (nd.ni_vp->v_vflag & VV_ROOT) != 0) { 3272 struct vnode *covered_vp; 3273 3274 /* 3275 * This happens if vp is a file mount. The call to 3276 * vn_fullpath_hardlink can panic if path resolution can't be 3277 * handled without the directory. 3278 * 3279 * To resolve this, we find the vnode which was mounted on - 3280 * this should have a unique global path since we disallow 3281 * mounting on linked files. 3282 */ 3283 error = vn_lock(nd.ni_vp, LK_SHARED); 3284 if (error != 0) 3285 goto out; 3286 covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered; 3287 vref(covered_vp); 3288 VOP_UNLOCK(nd.ni_vp); 3289 error = vn_fullpath(covered_vp, &retbuf, &freebuf); 3290 vrele(covered_vp); 3291 } else { 3292 error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, 3293 nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, &retbuf, 3294 &freebuf, &size); 3295 } 3296 if (error == 0) { 3297 size_t len; 3298 3299 len = strlen(retbuf) + 1; 3300 if (size < len) 3301 error = ENAMETOOLONG; 3302 else if (pathseg == UIO_USERSPACE) 3303 error = copyout(retbuf, buf, len); 3304 else 3305 memcpy(buf, retbuf, len); 3306 free(freebuf, M_TEMP); 3307 } 3308 out: 3309 vrele(nd.ni_vp); 3310 vrele(nd.ni_dvp); 3311 NDFREE_PNBUF(&nd); 3312 return (error); 3313 } 3314 3315 int 3316 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 3317 { 3318 3319 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 3320 uap->flags, UIO_USERSPACE)); 3321 } 3322 3323 /* 3324 * Retrieve the full filesystem path that correspond to a vnode from the name 3325 * cache (if available) 3326 */ 3327 int 3328 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 3329 { 3330 struct pwd *pwd; 3331 char *buf; 3332 size_t buflen; 3333 int error; 3334 3335 if (__predict_false(vp == NULL)) 3336 return (EINVAL); 3337 3338 buflen = MAXPATHLEN; 3339 buf = malloc(buflen, M_TEMP, M_WAITOK); 3340 vfs_smr_enter(); 3341 pwd = pwd_get_smr(); 3342 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); 3343 VFS_SMR_ASSERT_NOT_ENTERED(); 3344 if (error < 0) { 3345 pwd = pwd_hold(curthread); 3346 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); 3347 pwd_drop(pwd); 3348 } 3349 if (error == 0) 3350 *freebuf = buf; 3351 else 3352 free(buf, M_TEMP); 3353 return (error); 3354 } 3355 3356 /* 3357 * This function is similar to vn_fullpath, but it attempts to lookup the 3358 * pathname relative to the global root mount point. This is required for the 3359 * auditing sub-system, as audited pathnames must be absolute, relative to the 3360 * global root mount point. 
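 *
 * Concretely, the reverse walk is rooted at rootvnode instead of the
 * process's pwd_rdir.  As with vn_fullpath(), on success the caller owns
 * the returned buffer; a sketch of the usual pattern:
 *
 *	char *retbuf, *freebuf;
 *
 *	if (vn_fullpath_global(vp, &retbuf, &freebuf) == 0) {
 *		... use retbuf ...
 *		free(freebuf, M_TEMP);
 *	}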
3361 */ 3362 int 3363 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 3364 { 3365 char *buf; 3366 size_t buflen; 3367 int error; 3368 3369 if (__predict_false(vp == NULL)) 3370 return (EINVAL); 3371 buflen = MAXPATHLEN; 3372 buf = malloc(buflen, M_TEMP, M_WAITOK); 3373 vfs_smr_enter(); 3374 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); 3375 VFS_SMR_ASSERT_NOT_ENTERED(); 3376 if (error < 0) { 3377 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 3378 } 3379 if (error == 0) 3380 *freebuf = buf; 3381 else 3382 free(buf, M_TEMP); 3383 return (error); 3384 } 3385 3386 static struct namecache * 3387 vn_dd_from_dst(struct vnode *vp) 3388 { 3389 struct namecache *ncp; 3390 3391 cache_assert_vnode_locked(vp); 3392 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 3393 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3394 return (ncp); 3395 } 3396 return (NULL); 3397 } 3398 3399 int 3400 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) 3401 { 3402 struct vnode *dvp; 3403 struct namecache *ncp; 3404 struct mtx *vlp; 3405 int error; 3406 3407 vlp = VP2VNODELOCK(*vp); 3408 mtx_lock(vlp); 3409 ncp = (*vp)->v_cache_dd; 3410 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 3411 KASSERT(ncp == vn_dd_from_dst(*vp), 3412 ("%s: mismatch for dd entry (%p != %p)", __func__, 3413 ncp, vn_dd_from_dst(*vp))); 3414 } else { 3415 ncp = vn_dd_from_dst(*vp); 3416 } 3417 if (ncp != NULL) { 3418 if (*buflen < ncp->nc_nlen) { 3419 mtx_unlock(vlp); 3420 vrele(*vp); 3421 counter_u64_add(numfullpathfail4, 1); 3422 error = ENOMEM; 3423 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3424 vp, NULL); 3425 return (error); 3426 } 3427 *buflen -= ncp->nc_nlen; 3428 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3429 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 3430 ncp->nc_name, vp); 3431 dvp = *vp; 3432 *vp = ncp->nc_dvp; 3433 vref(*vp); 3434 mtx_unlock(vlp); 3435 vrele(dvp); 3436 return (0); 3437 } 3438 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 3439 3440 mtx_unlock(vlp); 3441 vn_lock(*vp, LK_SHARED | LK_RETRY); 3442 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); 3443 vput(*vp); 3444 if (error) { 3445 counter_u64_add(numfullpathfail2, 1); 3446 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3447 return (error); 3448 } 3449 3450 *vp = dvp; 3451 if (VN_IS_DOOMED(dvp)) { 3452 /* forced unmount */ 3453 vrele(dvp); 3454 error = ENOENT; 3455 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3456 return (error); 3457 } 3458 /* 3459 * *vp has its use count incremented still. 3460 */ 3461 3462 return (0); 3463 } 3464 3465 /* 3466 * Resolve a directory to a pathname. 3467 * 3468 * The name of the directory can always be found in the namecache or fetched 3469 * from the filesystem. There is also guaranteed to be only one parent, meaning 3470 * we can just follow vnodes up until we find the root. 3471 * 3472 * The vnode must be referenced. 
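 *
 * The reference is consumed: every return path below either drops it with
 * vrele()/vput() or hands it to vn_vptocnp(), which releases it in turn.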
3473 */ 3474 static int 3475 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3476 size_t *len, size_t addend) 3477 { 3478 #ifdef KDTRACE_HOOKS 3479 struct vnode *startvp = vp; 3480 #endif 3481 struct vnode *vp1; 3482 size_t buflen; 3483 int error; 3484 bool slash_prefixed; 3485 3486 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 3487 VNPASS(vp->v_usecount > 0, vp); 3488 3489 buflen = *len; 3490 3491 slash_prefixed = true; 3492 if (addend == 0) { 3493 MPASS(*len >= 2); 3494 buflen--; 3495 buf[buflen] = '\0'; 3496 slash_prefixed = false; 3497 } 3498 3499 error = 0; 3500 3501 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 3502 counter_u64_add(numfullpathcalls, 1); 3503 while (vp != rdir && vp != rootvnode) { 3504 /* 3505 * The vp vnode must be already fully constructed, 3506 * since it is either found in namecache or obtained 3507 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 3508 * without obtaining the vnode lock. 3509 */ 3510 if ((vp->v_vflag & VV_ROOT) != 0) { 3511 vn_lock(vp, LK_RETRY | LK_SHARED); 3512 3513 /* 3514 * With the vnode locked, check for races with 3515 * unmount, forced or not. Note that we 3516 * already verified that vp is not equal to 3517 * the root vnode, which means that 3518 * mnt_vnodecovered can be NULL only for the 3519 * case of unmount. 3520 */ 3521 if (VN_IS_DOOMED(vp) || 3522 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 3523 vp1->v_mountedhere != vp->v_mount) { 3524 vput(vp); 3525 error = ENOENT; 3526 SDT_PROBE3(vfs, namecache, fullpath, return, 3527 error, vp, NULL); 3528 break; 3529 } 3530 3531 vref(vp1); 3532 vput(vp); 3533 vp = vp1; 3534 continue; 3535 } 3536 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 3537 error = vn_vptocnp(&vp, buf, &buflen); 3538 if (error) 3539 break; 3540 if (buflen == 0) { 3541 vrele(vp); 3542 error = ENOMEM; 3543 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3544 startvp, NULL); 3545 break; 3546 } 3547 buf[--buflen] = '/'; 3548 slash_prefixed = true; 3549 } 3550 if (error) 3551 return (error); 3552 if (!slash_prefixed) { 3553 if (buflen == 0) { 3554 vrele(vp); 3555 counter_u64_add(numfullpathfail4, 1); 3556 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 3557 startvp, NULL); 3558 return (ENOMEM); 3559 } 3560 buf[--buflen] = '/'; 3561 } 3562 counter_u64_add(numfullpathfound, 1); 3563 vrele(vp); 3564 3565 *retbuf = buf + buflen; 3566 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 3567 *len -= buflen; 3568 *len += addend; 3569 return (0); 3570 } 3571 3572 /* 3573 * Resolve an arbitrary vnode to a pathname. 
3574 * 3575 * Note 2 caveats: 3576 * - hardlinks are not tracked, thus if the vnode is not a directory this can 3577 * resolve to a different path than the one used to find it 3578 * - namecache is not mandatory, meaning names are not guaranteed to be added 3579 * (in which case resolving fails) 3580 */ 3581 static void __inline 3582 cache_rev_failed_impl(int *reason, int line) 3583 { 3584 3585 *reason = line; 3586 } 3587 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 3588 3589 static int 3590 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 3591 char **retbuf, size_t *buflen, size_t addend) 3592 { 3593 #ifdef KDTRACE_HOOKS 3594 struct vnode *startvp = vp; 3595 #endif 3596 struct vnode *tvp; 3597 struct mount *mp; 3598 struct namecache *ncp; 3599 size_t orig_buflen; 3600 int reason; 3601 int error; 3602 #ifdef KDTRACE_HOOKS 3603 int i; 3604 #endif 3605 seqc_t vp_seqc, tvp_seqc; 3606 u_char nc_flag; 3607 3608 VFS_SMR_ASSERT_ENTERED(); 3609 3610 if (!atomic_load_char(&cache_fast_lookup_enabled)) { 3611 vfs_smr_exit(); 3612 return (-1); 3613 } 3614 3615 orig_buflen = *buflen; 3616 3617 if (addend == 0) { 3618 MPASS(*buflen >= 2); 3619 *buflen -= 1; 3620 buf[*buflen] = '\0'; 3621 } 3622 3623 if (vp == rdir || vp == rootvnode) { 3624 if (addend == 0) { 3625 *buflen -= 1; 3626 buf[*buflen] = '/'; 3627 } 3628 goto out_ok; 3629 } 3630 3631 #ifdef KDTRACE_HOOKS 3632 i = 0; 3633 #endif 3634 error = -1; 3635 ncp = NULL; /* for sdt probe down below */ 3636 vp_seqc = vn_seqc_read_any(vp); 3637 if (seqc_in_modify(vp_seqc)) { 3638 cache_rev_failed(&reason); 3639 goto out_abort; 3640 } 3641 3642 for (;;) { 3643 #ifdef KDTRACE_HOOKS 3644 i++; 3645 #endif 3646 if ((vp->v_vflag & VV_ROOT) != 0) { 3647 mp = atomic_load_ptr(&vp->v_mount); 3648 if (mp == NULL) { 3649 cache_rev_failed(&reason); 3650 goto out_abort; 3651 } 3652 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 3653 tvp_seqc = vn_seqc_read_any(tvp); 3654 if (seqc_in_modify(tvp_seqc)) { 3655 cache_rev_failed(&reason); 3656 goto out_abort; 3657 } 3658 if (!vn_seqc_consistent(vp, vp_seqc)) { 3659 cache_rev_failed(&reason); 3660 goto out_abort; 3661 } 3662 vp = tvp; 3663 vp_seqc = tvp_seqc; 3664 continue; 3665 } 3666 ncp = atomic_load_consume_ptr(&vp->v_cache_dd); 3667 if (ncp == NULL) { 3668 cache_rev_failed(&reason); 3669 goto out_abort; 3670 } 3671 nc_flag = atomic_load_char(&ncp->nc_flag); 3672 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3673 cache_rev_failed(&reason); 3674 goto out_abort; 3675 } 3676 if (ncp->nc_nlen >= *buflen) { 3677 cache_rev_failed(&reason); 3678 error = ENOMEM; 3679 goto out_abort; 3680 } 3681 *buflen -= ncp->nc_nlen; 3682 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3683 *buflen -= 1; 3684 buf[*buflen] = '/'; 3685 tvp = ncp->nc_dvp; 3686 tvp_seqc = vn_seqc_read_any(tvp); 3687 if (seqc_in_modify(tvp_seqc)) { 3688 cache_rev_failed(&reason); 3689 goto out_abort; 3690 } 3691 if (!vn_seqc_consistent(vp, vp_seqc)) { 3692 cache_rev_failed(&reason); 3693 goto out_abort; 3694 } 3695 /* 3696 * Acquire fence provided by vn_seqc_read_any above. 
3697 */ 3698 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) { 3699 cache_rev_failed(&reason); 3700 goto out_abort; 3701 } 3702 if (!cache_ncp_canuse(ncp)) { 3703 cache_rev_failed(&reason); 3704 goto out_abort; 3705 } 3706 vp = tvp; 3707 vp_seqc = tvp_seqc; 3708 if (vp == rdir || vp == rootvnode) 3709 break; 3710 } 3711 out_ok: 3712 vfs_smr_exit(); 3713 *retbuf = buf + *buflen; 3714 *buflen = orig_buflen - *buflen + addend; 3715 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 3716 return (0); 3717 3718 out_abort: 3719 *buflen = orig_buflen; 3720 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 3721 vfs_smr_exit(); 3722 return (error); 3723 } 3724 3725 static int 3726 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3727 size_t *buflen) 3728 { 3729 size_t orig_buflen, addend; 3730 int error; 3731 3732 if (*buflen < 2) 3733 return (EINVAL); 3734 3735 orig_buflen = *buflen; 3736 3737 vref(vp); 3738 addend = 0; 3739 if (vp->v_type != VDIR) { 3740 *buflen -= 1; 3741 buf[*buflen] = '\0'; 3742 error = vn_vptocnp(&vp, buf, buflen); 3743 if (error) 3744 return (error); 3745 if (*buflen == 0) { 3746 vrele(vp); 3747 return (ENOMEM); 3748 } 3749 *buflen -= 1; 3750 buf[*buflen] = '/'; 3751 addend = orig_buflen - *buflen; 3752 } 3753 3754 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); 3755 } 3756 3757 /* 3758 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 3759 * 3760 * Since the namecache does not track hardlinks, the caller is expected to 3761 * first look up the target vnode with WANTPARENT flag passed to namei to get 3762 * dvp and vp. 3763 * 3764 * Then we have 2 cases: 3765 * - if the found vnode is a directory, the path can be constructed just by 3766 * following names up the chain 3767 * - otherwise we populate the buffer with the saved name and start resolving 3768 * from the parent 3769 */ 3770 int 3771 vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp, 3772 const char *hrdl_name, size_t hrdl_name_length, 3773 char **retbuf, char **freebuf, size_t *buflen) 3774 { 3775 char *buf, *tmpbuf; 3776 struct pwd *pwd; 3777 size_t addend; 3778 int error; 3779 __enum_uint8(vtype) type; 3780 3781 if (*buflen < 2) 3782 return (EINVAL); 3783 if (*buflen > MAXPATHLEN) 3784 *buflen = MAXPATHLEN; 3785 3786 buf = malloc(*buflen, M_TEMP, M_WAITOK); 3787 3788 addend = 0; 3789 3790 /* 3791 * Check for VBAD to work around the vp_crossmp bug in lookup(). 3792 * 3793 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 3794 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 3795 * If the type is VDIR (like in this very case) we can skip looking 3796 * at ni_dvp in the first place. However, since vnodes get passed here 3797 * unlocked the target may transition to doomed state (type == VBAD) 3798 * before we get to evaluate the condition. If this happens, we will 3799 * populate part of the buffer and descend to vn_fullpath_dir with 3800 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 
3801 */ 3802 type = atomic_load_8(&vp->v_type); 3803 if (type == VBAD) { 3804 error = ENOENT; 3805 goto out_bad; 3806 } 3807 if (type != VDIR) { 3808 addend = hrdl_name_length + 2; 3809 if (*buflen < addend) { 3810 error = ENOMEM; 3811 goto out_bad; 3812 } 3813 *buflen -= addend; 3814 tmpbuf = buf + *buflen; 3815 tmpbuf[0] = '/'; 3816 memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length); 3817 tmpbuf[addend - 1] = '\0'; 3818 vp = dvp; 3819 } 3820 3821 vfs_smr_enter(); 3822 pwd = pwd_get_smr(); 3823 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3824 addend); 3825 VFS_SMR_ASSERT_NOT_ENTERED(); 3826 if (error < 0) { 3827 pwd = pwd_hold(curthread); 3828 vref(vp); 3829 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3830 addend); 3831 pwd_drop(pwd); 3832 } 3833 if (error != 0) 3834 goto out_bad; 3835 3836 *freebuf = buf; 3837 3838 return (0); 3839 out_bad: 3840 free(buf, M_TEMP); 3841 return (error); 3842 } 3843 3844 struct vnode * 3845 vn_dir_dd_ino(struct vnode *vp) 3846 { 3847 struct namecache *ncp; 3848 struct vnode *ddvp; 3849 struct mtx *vlp; 3850 enum vgetstate vs; 3851 3852 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3853 vlp = VP2VNODELOCK(vp); 3854 mtx_lock(vlp); 3855 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3856 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3857 continue; 3858 ddvp = ncp->nc_dvp; 3859 vs = vget_prep(ddvp); 3860 mtx_unlock(vlp); 3861 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3862 return (NULL); 3863 return (ddvp); 3864 } 3865 mtx_unlock(vlp); 3866 return (NULL); 3867 } 3868 3869 int 3870 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3871 { 3872 struct namecache *ncp; 3873 struct mtx *vlp; 3874 int l; 3875 3876 vlp = VP2VNODELOCK(vp); 3877 mtx_lock(vlp); 3878 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3879 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3880 break; 3881 if (ncp == NULL) { 3882 mtx_unlock(vlp); 3883 return (ENOENT); 3884 } 3885 l = min(ncp->nc_nlen, buflen - 1); 3886 memcpy(buf, ncp->nc_name, l); 3887 mtx_unlock(vlp); 3888 buf[l] = '\0'; 3889 return (0); 3890 } 3891 3892 /* 3893 * This function updates path string to vnode's full global path 3894 * and checks the size of the new path string against the pathlen argument. 3895 * 3896 * Requires a locked, referenced vnode. 3897 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3898 * 3899 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3900 * because it falls back to the ".." lookup if the namecache lookup fails. 3901 */ 3902 int 3903 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3904 u_int pathlen) 3905 { 3906 struct nameidata nd; 3907 struct vnode *vp1; 3908 char *rpath, *fbuf; 3909 int error; 3910 3911 ASSERT_VOP_ELOCKED(vp, __func__); 3912 3913 /* Construct global filesystem path from vp. */ 3914 VOP_UNLOCK(vp); 3915 error = vn_fullpath_global(vp, &rpath, &fbuf); 3916 3917 if (error != 0) { 3918 vrele(vp); 3919 return (error); 3920 } 3921 3922 if (strlen(rpath) >= pathlen) { 3923 vrele(vp); 3924 error = ENAMETOOLONG; 3925 goto out; 3926 } 3927 3928 /* 3929 * Re-lookup the vnode by path to detect a possible rename. 3930 * As a side effect, the vnode is relocked. 3931 * If vnode was renamed, return ENOENT. 
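 * For example, if the file was moved from /a/b to /c/b after
 * vn_fullpath_global() returned, re-resolving the caller's path either
 * fails or yields a different vnode, so a stale path is never installed.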
3932 */ 3933 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path); 3934 error = namei(&nd); 3935 if (error != 0) { 3936 vrele(vp); 3937 goto out; 3938 } 3939 NDFREE_PNBUF(&nd); 3940 vp1 = nd.ni_vp; 3941 vrele(vp); 3942 if (vp1 == vp) 3943 strcpy(path, rpath); 3944 else { 3945 vput(vp1); 3946 error = ENOENT; 3947 } 3948 3949 out: 3950 free(fbuf, M_TEMP); 3951 return (error); 3952 } 3953 3954 /* 3955 * This is similar to vn_path_to_global_path but allows for regular 3956 * files which may not be present in the cache. 3957 * 3958 * Requires a locked, referenced vnode. 3959 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3960 */ 3961 int 3962 vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp, 3963 struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name, 3964 size_t leaf_length) 3965 { 3966 struct nameidata nd; 3967 struct vnode *vp1; 3968 char *rpath, *fbuf; 3969 size_t len; 3970 int error; 3971 3972 ASSERT_VOP_ELOCKED(vp, __func__); 3973 3974 /* 3975 * Construct global filesystem path from dvp, vp and leaf 3976 * name. 3977 */ 3978 VOP_UNLOCK(vp); 3979 len = pathlen; 3980 error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length, 3981 &rpath, &fbuf, &len); 3982 3983 if (error != 0) { 3984 vrele(vp); 3985 return (error); 3986 } 3987 3988 if (strlen(rpath) >= pathlen) { 3989 vrele(vp); 3990 error = ENAMETOOLONG; 3991 goto out; 3992 } 3993 3994 /* 3995 * Re-lookup the vnode by path to detect a possible rename. 3996 * As a side effect, the vnode is relocked. 3997 * If vnode was renamed, return ENOENT. 3998 */ 3999 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path); 4000 error = namei(&nd); 4001 if (error != 0) { 4002 vrele(vp); 4003 goto out; 4004 } 4005 NDFREE_PNBUF(&nd); 4006 vp1 = nd.ni_vp; 4007 vrele(vp); 4008 if (vp1 == vp) 4009 strcpy(path, rpath); 4010 else { 4011 vput(vp1); 4012 error = ENOENT; 4013 } 4014 4015 out: 4016 free(fbuf, M_TEMP); 4017 return (error); 4018 } 4019 4020 void 4021 cache_vop_inotify(struct vnode *vp, int event, uint32_t cookie) 4022 { 4023 struct mtx *vlp; 4024 struct namecache *ncp; 4025 int isdir; 4026 bool logged, self; 4027 4028 isdir = vp->v_type == VDIR ? IN_ISDIR : 0; 4029 self = (vn_irflag_read(vp) & VIRF_INOTIFY) != 0 && 4030 (vp->v_type != VDIR || (event & ~_IN_DIR_EVENTS) != 0); 4031 4032 if (self) { 4033 int selfevent; 4034 4035 if (event == _IN_ATTRIB_LINKCOUNT) 4036 selfevent = IN_ATTRIB; 4037 else 4038 selfevent = event; 4039 inotify_log(vp, NULL, 0, selfevent | isdir, cookie); 4040 } 4041 if ((event & IN_ALL_EVENTS) == 0) 4042 return; 4043 4044 logged = false; 4045 vlp = VP2VNODELOCK(vp); 4046 mtx_lock(vlp); 4047 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 4048 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 4049 continue; 4050 if ((vn_irflag_read(ncp->nc_dvp) & VIRF_INOTIFY) != 0) { 4051 /* 4052 * XXX-MJ if the vnode has two links in the same 4053 * dir, we'll log the same event twice. 4054 */ 4055 inotify_log(ncp->nc_dvp, ncp->nc_name, ncp->nc_nlen, 4056 event | isdir, cookie); 4057 logged = true; 4058 } 4059 } 4060 if (!logged && (vn_irflag_read(vp) & VIRF_INOTIFY_PARENT) != 0) { 4061 /* 4062 * We didn't find a watched directory that contains this vnode, 4063 * so stop calling VOP_INOTIFY for operations on the vnode. 
4064 */ 4065 vn_irflag_unset(vp, VIRF_INOTIFY_PARENT); 4066 } 4067 mtx_unlock(vlp); 4068 } 4069 4070 #ifdef DDB 4071 static void 4072 db_print_vpath(struct vnode *vp) 4073 { 4074 4075 while (vp != NULL) { 4076 db_printf("%p: ", vp); 4077 if (vp == rootvnode) { 4078 db_printf("/"); 4079 vp = NULL; 4080 } else { 4081 if (vp->v_vflag & VV_ROOT) { 4082 db_printf("<mount point>"); 4083 vp = vp->v_mount->mnt_vnodecovered; 4084 } else { 4085 struct namecache *ncp; 4086 char *ncn; 4087 int i; 4088 4089 ncp = TAILQ_FIRST(&vp->v_cache_dst); 4090 if (ncp != NULL) { 4091 ncn = ncp->nc_name; 4092 for (i = 0; i < ncp->nc_nlen; i++) 4093 db_printf("%c", *ncn++); 4094 vp = ncp->nc_dvp; 4095 } else { 4096 vp = NULL; 4097 } 4098 } 4099 } 4100 db_printf("\n"); 4101 } 4102 4103 return; 4104 } 4105 4106 DB_SHOW_COMMAND(vpath, db_show_vpath) 4107 { 4108 struct vnode *vp; 4109 4110 if (!have_addr) { 4111 db_printf("usage: show vpath <struct vnode *>\n"); 4112 return; 4113 } 4114 4115 vp = (struct vnode *)addr; 4116 db_print_vpath(vp); 4117 } 4118 4119 #endif 4120 4121 static int cache_fast_lookup = 1; 4122 4123 #define CACHE_FPL_FAILED -2020 4124 4125 static int 4126 cache_vop_bad_vexec(struct vop_fplookup_vexec_args *v) 4127 { 4128 vn_printf(v->a_vp, "no proper vop_fplookup_vexec\n"); 4129 panic("no proper vop_fplookup_vexec"); 4130 } 4131 4132 static int 4133 cache_vop_bad_symlink(struct vop_fplookup_symlink_args *v) 4134 { 4135 vn_printf(v->a_vp, "no proper vop_fplookup_symlink\n"); 4136 panic("no proper vop_fplookup_symlink"); 4137 } 4138 4139 void 4140 cache_vop_vector_register(struct vop_vector *v) 4141 { 4142 size_t ops; 4143 4144 ops = 0; 4145 if (v->vop_fplookup_vexec != NULL) { 4146 ops++; 4147 } 4148 if (v->vop_fplookup_symlink != NULL) { 4149 ops++; 4150 } 4151 4152 if (ops == 2) { 4153 return; 4154 } 4155 4156 if (ops == 0) { 4157 v->vop_fplookup_vexec = cache_vop_bad_vexec; 4158 v->vop_fplookup_symlink = cache_vop_bad_symlink; 4159 return; 4160 } 4161 4162 printf("%s: invalid vop vector %p -- either all or none fplookup vops " 4163 "need to be provided", __func__, v); 4164 if (v->vop_fplookup_vexec == NULL) { 4165 printf("%s: missing vop_fplookup_vexec\n", __func__); 4166 } 4167 if (v->vop_fplookup_symlink == NULL) { 4168 printf("%s: missing vop_fplookup_symlink\n", __func__); 4169 } 4170 panic("bad vop vector %p", v); 4171 } 4172 4173 #ifdef INVARIANTS 4174 void 4175 cache_validate_vop_vector(struct mount *mp, struct vop_vector *vops) 4176 { 4177 if (mp == NULL) 4178 return; 4179 4180 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 4181 return; 4182 4183 if (vops->vop_fplookup_vexec == NULL || 4184 vops->vop_fplookup_vexec == cache_vop_bad_vexec) 4185 panic("bad vop_fplookup_vexec on vector %p for filesystem %s", 4186 vops, mp->mnt_vfc->vfc_name); 4187 4188 if (vops->vop_fplookup_symlink == NULL || 4189 vops->vop_fplookup_symlink == cache_vop_bad_symlink) 4190 panic("bad vop_fplookup_symlink on vector %p for filesystem %s", 4191 vops, mp->mnt_vfc->vfc_name); 4192 } 4193 #endif 4194 4195 void 4196 cache_fast_lookup_enabled_recalc(void) 4197 { 4198 int lookup_flag; 4199 int mac_on; 4200 4201 #ifdef MAC 4202 mac_on = mac_vnode_check_lookup_enabled(); 4203 mac_on |= mac_vnode_check_readlink_enabled(); 4204 #else 4205 mac_on = 0; 4206 #endif 4207 4208 lookup_flag = atomic_load_int(&cache_fast_lookup); 4209 if (lookup_flag && !mac_on) { 4210 atomic_store_char(&cache_fast_lookup_enabled, true); 4211 } else { 4212 atomic_store_char(&cache_fast_lookup_enabled, false); 4213 } 4214 } 4215 4216 static int 4217 
syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS) 4218 { 4219 int error, old; 4220 4221 old = atomic_load_int(&cache_fast_lookup); 4222 error = sysctl_handle_int(oidp, arg1, arg2, req); 4223 if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup)) 4224 cache_fast_lookup_enabled_recalc(); 4225 return (error); 4226 } 4227 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE, 4228 &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", ""); 4229 4230 /* 4231 * Components of nameidata (or objects it can point to) which may 4232 * need restoring in case fast path lookup fails. 4233 */ 4234 struct nameidata_outer { 4235 size_t ni_pathlen; 4236 uint64_t cn_flags; 4237 }; 4238 4239 struct nameidata_saved { 4240 #ifdef INVARIANTS 4241 char *cn_nameptr; 4242 size_t ni_pathlen; 4243 #endif 4244 }; 4245 4246 #ifdef INVARIANTS 4247 struct cache_fpl_debug { 4248 size_t ni_pathlen; 4249 }; 4250 #endif 4251 4252 struct cache_fpl { 4253 struct nameidata *ndp; 4254 struct componentname *cnp; 4255 char *nulchar; 4256 struct vnode *dvp; 4257 struct vnode *tvp; 4258 seqc_t dvp_seqc; 4259 seqc_t tvp_seqc; 4260 uint32_t hash; 4261 struct nameidata_saved snd; 4262 struct nameidata_outer snd_outer; 4263 int line; 4264 enum cache_fpl_status status:8; 4265 bool in_smr; 4266 bool fsearch; 4267 struct pwd **pwd; 4268 #ifdef INVARIANTS 4269 struct cache_fpl_debug debug; 4270 #endif 4271 }; 4272 4273 static bool cache_fplookup_mp_supported(struct mount *mp); 4274 static bool cache_fplookup_is_mp(struct cache_fpl *fpl); 4275 static int cache_fplookup_cross_mount(struct cache_fpl *fpl); 4276 static int cache_fplookup_partial_setup(struct cache_fpl *fpl); 4277 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl); 4278 static int cache_fplookup_trailingslash(struct cache_fpl *fpl); 4279 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl); 4280 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl); 4281 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n); 4282 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n); 4283 4284 static void 4285 cache_fpl_cleanup_cnp(struct componentname *cnp) 4286 { 4287 4288 uma_zfree(namei_zone, cnp->cn_pnbuf); 4289 cnp->cn_pnbuf = NULL; 4290 cnp->cn_nameptr = NULL; 4291 } 4292 4293 static struct vnode * 4294 cache_fpl_handle_root(struct cache_fpl *fpl) 4295 { 4296 struct nameidata *ndp; 4297 struct componentname *cnp; 4298 4299 ndp = fpl->ndp; 4300 cnp = fpl->cnp; 4301 4302 MPASS(*(cnp->cn_nameptr) == '/'); 4303 cnp->cn_nameptr++; 4304 cache_fpl_pathlen_dec(fpl); 4305 4306 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 4307 do { 4308 cnp->cn_nameptr++; 4309 cache_fpl_pathlen_dec(fpl); 4310 } while (*(cnp->cn_nameptr) == '/'); 4311 } 4312 4313 return (ndp->ni_rootdir); 4314 } 4315 4316 static void 4317 cache_fpl_checkpoint_outer(struct cache_fpl *fpl) 4318 { 4319 4320 fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen; 4321 fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags; 4322 } 4323 4324 static void 4325 cache_fpl_checkpoint(struct cache_fpl *fpl) 4326 { 4327 4328 #ifdef INVARIANTS 4329 fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 4330 fpl->snd.ni_pathlen = fpl->debug.ni_pathlen; 4331 #endif 4332 } 4333 4334 static void 4335 cache_fpl_restore_partial(struct cache_fpl *fpl) 4336 { 4337 4338 fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags; 4339 #ifdef INVARIANTS 4340 fpl->debug.ni_pathlen = fpl->snd.ni_pathlen; 4341 #endif 4342 } 4343 4344 static void 4345 
cache_fpl_restore_abort(struct cache_fpl *fpl) 4346 { 4347 4348 cache_fpl_restore_partial(fpl); 4349 /* 4350 * It is 0 on entry by API contract. 4351 */ 4352 fpl->ndp->ni_resflags = 0; 4353 fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf; 4354 fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen; 4355 } 4356 4357 #ifdef INVARIANTS 4358 #define cache_fpl_smr_assert_entered(fpl) ({ \ 4359 struct cache_fpl *_fpl = (fpl); \ 4360 MPASS(_fpl->in_smr == true); \ 4361 VFS_SMR_ASSERT_ENTERED(); \ 4362 }) 4363 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 4364 struct cache_fpl *_fpl = (fpl); \ 4365 MPASS(_fpl->in_smr == false); \ 4366 VFS_SMR_ASSERT_NOT_ENTERED(); \ 4367 }) 4368 static void 4369 cache_fpl_assert_status(struct cache_fpl *fpl) 4370 { 4371 4372 switch (fpl->status) { 4373 case CACHE_FPL_STATUS_UNSET: 4374 __assert_unreachable(); 4375 break; 4376 case CACHE_FPL_STATUS_DESTROYED: 4377 case CACHE_FPL_STATUS_ABORTED: 4378 case CACHE_FPL_STATUS_PARTIAL: 4379 case CACHE_FPL_STATUS_HANDLED: 4380 break; 4381 } 4382 } 4383 #else 4384 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 4385 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 4386 #define cache_fpl_assert_status(fpl) do { } while (0) 4387 #endif 4388 4389 #define cache_fpl_smr_enter_initial(fpl) ({ \ 4390 struct cache_fpl *_fpl = (fpl); \ 4391 vfs_smr_enter(); \ 4392 _fpl->in_smr = true; \ 4393 }) 4394 4395 #define cache_fpl_smr_enter(fpl) ({ \ 4396 struct cache_fpl *_fpl = (fpl); \ 4397 MPASS(_fpl->in_smr == false); \ 4398 vfs_smr_enter(); \ 4399 _fpl->in_smr = true; \ 4400 }) 4401 4402 #define cache_fpl_smr_exit(fpl) ({ \ 4403 struct cache_fpl *_fpl = (fpl); \ 4404 MPASS(_fpl->in_smr == true); \ 4405 vfs_smr_exit(); \ 4406 _fpl->in_smr = false; \ 4407 }) 4408 4409 static int 4410 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line) 4411 { 4412 4413 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 4414 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 4415 ("%s: converting to abort from %d at %d, set at %d\n", 4416 __func__, fpl->status, line, fpl->line)); 4417 } 4418 cache_fpl_smr_assert_not_entered(fpl); 4419 fpl->status = CACHE_FPL_STATUS_ABORTED; 4420 fpl->line = line; 4421 return (CACHE_FPL_FAILED); 4422 } 4423 4424 #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__) 4425 4426 static int __noinline 4427 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 4428 { 4429 struct nameidata *ndp; 4430 struct componentname *cnp; 4431 4432 ndp = fpl->ndp; 4433 cnp = fpl->cnp; 4434 4435 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 4436 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 4437 ("%s: converting to abort from %d at %d, set at %d\n", 4438 __func__, fpl->status, line, fpl->line)); 4439 } 4440 fpl->status = CACHE_FPL_STATUS_ABORTED; 4441 fpl->line = line; 4442 if (fpl->in_smr) 4443 cache_fpl_smr_exit(fpl); 4444 cache_fpl_restore_abort(fpl); 4445 /* 4446 * Resolving symlinks overwrites data passed by the caller. 4447 * Let namei know. 
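 * If at least one symlink was traversed (ni_loopcnt > 0), cn_pnbuf no
 * longer holds the caller's original path and the lookup cannot simply be
 * restarted, hence CACHE_FPL_STATUS_DESTROYED and freeing of the buffer.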
4448 */ 4449 if (ndp->ni_loopcnt > 0) { 4450 fpl->status = CACHE_FPL_STATUS_DESTROYED; 4451 cache_fpl_cleanup_cnp(cnp); 4452 } 4453 return (CACHE_FPL_FAILED); 4454 } 4455 4456 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 4457 4458 static int __noinline 4459 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 4460 { 4461 4462 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 4463 ("%s: setting to partial at %d, but already set to %d at %d\n", 4464 __func__, line, fpl->status, fpl->line)); 4465 cache_fpl_smr_assert_entered(fpl); 4466 fpl->status = CACHE_FPL_STATUS_PARTIAL; 4467 fpl->line = line; 4468 return (cache_fplookup_partial_setup(fpl)); 4469 } 4470 4471 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 4472 4473 static int 4474 cache_fpl_handled_impl(struct cache_fpl *fpl, int line) 4475 { 4476 4477 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 4478 ("%s: setting to handled at %d, but already set to %d at %d\n", 4479 __func__, line, fpl->status, fpl->line)); 4480 cache_fpl_smr_assert_not_entered(fpl); 4481 fpl->status = CACHE_FPL_STATUS_HANDLED; 4482 fpl->line = line; 4483 return (0); 4484 } 4485 4486 #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__) 4487 4488 static int 4489 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line) 4490 { 4491 4492 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 4493 ("%s: setting to handled at %d, but already set to %d at %d\n", 4494 __func__, line, fpl->status, fpl->line)); 4495 MPASS(error != 0); 4496 MPASS(error != CACHE_FPL_FAILED); 4497 cache_fpl_smr_assert_not_entered(fpl); 4498 fpl->status = CACHE_FPL_STATUS_HANDLED; 4499 fpl->line = line; 4500 fpl->dvp = NULL; 4501 fpl->tvp = NULL; 4502 return (error); 4503 } 4504 4505 #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__) 4506 4507 static bool 4508 cache_fpl_terminated(struct cache_fpl *fpl) 4509 { 4510 4511 return (fpl->status != CACHE_FPL_STATUS_UNSET); 4512 } 4513 4514 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 4515 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ 4516 FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | ISRESTARTED | WILLBEDIR | \ 4517 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \ 4518 OPENWRITE | WANTIOCTLCAPS | NAMEILOOKUP) 4519 4520 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 4521 (ISDOTDOT | MAKEENTRY | ISLASTCN) 4522 4523 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 4524 "supported and internal flags overlap"); 4525 4526 static bool 4527 cache_fpl_islastcn(struct nameidata *ndp) 4528 { 4529 4530 return (*ndp->ni_next == 0); 4531 } 4532 4533 static bool 4534 cache_fpl_istrailingslash(struct cache_fpl *fpl) 4535 { 4536 4537 MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf); 4538 return (*(fpl->nulchar - 1) == '/'); 4539 } 4540 4541 static bool 4542 cache_fpl_isdotdot(struct componentname *cnp) 4543 { 4544 4545 if (cnp->cn_namelen == 2 && 4546 cnp->cn_nameptr[1] == '.' 
&& cnp->cn_nameptr[0] == '.') 4547 return (true); 4548 return (false); 4549 } 4550 4551 static bool 4552 cache_can_fplookup(struct cache_fpl *fpl) 4553 { 4554 struct nameidata *ndp; 4555 struct componentname *cnp; 4556 struct thread *td; 4557 4558 ndp = fpl->ndp; 4559 cnp = fpl->cnp; 4560 td = curthread; 4561 4562 if (!atomic_load_char(&cache_fast_lookup_enabled)) { 4563 cache_fpl_aborted_early(fpl); 4564 return (false); 4565 } 4566 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 4567 cache_fpl_aborted_early(fpl); 4568 return (false); 4569 } 4570 if (IN_CAPABILITY_MODE(td) || CAP_TRACING(td)) { 4571 cache_fpl_aborted_early(fpl); 4572 return (false); 4573 } 4574 if (AUDITING_TD(td)) { 4575 cache_fpl_aborted_early(fpl); 4576 return (false); 4577 } 4578 if (ndp->ni_startdir != NULL) { 4579 cache_fpl_aborted_early(fpl); 4580 return (false); 4581 } 4582 return (true); 4583 } 4584 4585 static int __noinline 4586 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 4587 { 4588 struct nameidata *ndp; 4589 struct componentname *cnp; 4590 int error, flags; 4591 4592 ndp = fpl->ndp; 4593 cnp = fpl->cnp; 4594 4595 error = fgetvp_lookup_smr(ndp, vpp, &flags); 4596 if (__predict_false(error != 0)) { 4597 return (cache_fpl_aborted(fpl)); 4598 } 4599 if (__predict_false((flags & O_RESOLVE_BENEATH) != 0)) { 4600 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & RBENEATH) == 0, 4601 "RBENEATH supported by fplookup"); 4602 cache_fpl_smr_exit(fpl); 4603 cache_fpl_aborted(fpl); 4604 return (EOPNOTSUPP); 4605 } 4606 fpl->fsearch = (flags & FSEARCH) != 0; 4607 if ((*vpp)->v_type != VDIR) { 4608 if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) { 4609 cache_fpl_smr_exit(fpl); 4610 return (cache_fpl_handled_error(fpl, ENOTDIR)); 4611 } 4612 } 4613 return (0); 4614 } 4615 4616 static int __noinline 4617 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 4618 uint32_t hash) 4619 { 4620 struct componentname *cnp; 4621 struct vnode *dvp; 4622 4623 cnp = fpl->cnp; 4624 dvp = fpl->dvp; 4625 4626 cache_fpl_smr_exit(fpl); 4627 if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) 4628 return (cache_fpl_handled_error(fpl, ENOENT)); 4629 else 4630 return (cache_fpl_aborted(fpl)); 4631 } 4632 4633 /* 4634 * The target vnode is not supported, prepare for the slow path to take over. 4635 */ 4636 static int __noinline 4637 cache_fplookup_partial_setup(struct cache_fpl *fpl) 4638 { 4639 struct nameidata *ndp; 4640 struct componentname *cnp; 4641 enum vgetstate dvs; 4642 struct vnode *dvp; 4643 struct pwd *pwd; 4644 seqc_t dvp_seqc; 4645 4646 ndp = fpl->ndp; 4647 cnp = fpl->cnp; 4648 pwd = *(fpl->pwd); 4649 dvp = fpl->dvp; 4650 dvp_seqc = fpl->dvp_seqc; 4651 4652 if (!pwd_hold_smr(pwd)) { 4653 return (cache_fpl_aborted(fpl)); 4654 } 4655 4656 /* 4657 * Note that seqc is checked before the vnode is locked, so by 4658 * the time regular lookup gets to it it may have moved. 4659 * 4660 * Ultimately this does not affect correctness, any lookup errors 4661 * are userspace racing with itself. It is guaranteed that any 4662 * path which ultimately gets found could also have been found 4663 * by regular lookup going all the way in absence of concurrent 4664 * modifications. 
4665 */ 4666 dvs = vget_prep_smr(dvp); 4667 cache_fpl_smr_exit(fpl); 4668 if (__predict_false(dvs == VGET_NONE)) { 4669 pwd_drop(pwd); 4670 return (cache_fpl_aborted(fpl)); 4671 } 4672 4673 vget_finish_ref(dvp, dvs); 4674 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4675 vrele(dvp); 4676 pwd_drop(pwd); 4677 return (cache_fpl_aborted(fpl)); 4678 } 4679 4680 cache_fpl_restore_partial(fpl); 4681 #ifdef INVARIANTS 4682 if (cnp->cn_nameptr != fpl->snd.cn_nameptr) { 4683 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__, 4684 cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf); 4685 } 4686 #endif 4687 4688 ndp->ni_startdir = dvp; 4689 cnp->cn_flags |= MAKEENTRY; 4690 if (cache_fpl_islastcn(ndp)) 4691 cnp->cn_flags |= ISLASTCN; 4692 if (cache_fpl_isdotdot(cnp)) 4693 cnp->cn_flags |= ISDOTDOT; 4694 4695 /* 4696 * Skip potential extra slashes parsing did not take care of. 4697 * cache_fplookup_skip_slashes explains the mechanism. 4698 */ 4699 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 4700 do { 4701 cnp->cn_nameptr++; 4702 cache_fpl_pathlen_dec(fpl); 4703 } while (*(cnp->cn_nameptr) == '/'); 4704 } 4705 4706 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; 4707 #ifdef INVARIANTS 4708 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 4709 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 4710 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 4711 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 4712 } 4713 #endif 4714 return (0); 4715 } 4716 4717 static int 4718 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 4719 { 4720 struct componentname *cnp; 4721 struct vnode *tvp; 4722 seqc_t tvp_seqc; 4723 int error, lkflags; 4724 4725 cnp = fpl->cnp; 4726 tvp = fpl->tvp; 4727 tvp_seqc = fpl->tvp_seqc; 4728 4729 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4730 lkflags = LK_SHARED; 4731 if ((cnp->cn_flags & LOCKSHARED) == 0) 4732 lkflags = LK_EXCLUSIVE; 4733 error = vget_finish(tvp, lkflags, tvs); 4734 if (__predict_false(error != 0)) { 4735 return (cache_fpl_aborted(fpl)); 4736 } 4737 } else { 4738 vget_finish_ref(tvp, tvs); 4739 } 4740 4741 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 4742 if ((cnp->cn_flags & LOCKLEAF) != 0) 4743 vput(tvp); 4744 else 4745 vrele(tvp); 4746 return (cache_fpl_aborted(fpl)); 4747 } 4748 4749 return (cache_fpl_handled(fpl)); 4750 } 4751 4752 /* 4753 * They want to possibly modify the state of the namecache. 4754 */ 4755 static int __noinline 4756 cache_fplookup_final_modifying(struct cache_fpl *fpl) 4757 { 4758 struct nameidata *ndp __diagused; 4759 struct componentname *cnp; 4760 enum vgetstate dvs; 4761 struct vnode *dvp, *tvp; 4762 struct mount *mp; 4763 seqc_t dvp_seqc; 4764 int error; 4765 bool docache; 4766 4767 ndp = fpl->ndp; 4768 cnp = fpl->cnp; 4769 dvp = fpl->dvp; 4770 dvp_seqc = fpl->dvp_seqc; 4771 4772 MPASS(*(cnp->cn_nameptr) != '/'); 4773 MPASS(cache_fpl_islastcn(ndp)); 4774 if ((cnp->cn_flags & LOCKPARENT) == 0) 4775 MPASS((cnp->cn_flags & WANTPARENT) != 0); 4776 MPASS((cnp->cn_flags & TRAILINGSLASH) == 0); 4777 MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE || 4778 cnp->cn_nameiop == RENAME); 4779 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 4780 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 4781 4782 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; 4783 if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) 4784 docache = false; 4785 4786 /* 4787 * Regular lookup nulifies the slash, which we don't do here. 
4788 * Don't take chances with filesystem routines seeing it for 4789 * the last entry. 4790 */ 4791 if (cache_fpl_istrailingslash(fpl)) { 4792 return (cache_fpl_partial(fpl)); 4793 } 4794 4795 mp = atomic_load_ptr(&dvp->v_mount); 4796 if (__predict_false(mp == NULL)) { 4797 return (cache_fpl_aborted(fpl)); 4798 } 4799 4800 if (__predict_false(mp->mnt_flag & MNT_RDONLY)) { 4801 cache_fpl_smr_exit(fpl); 4802 /* 4803 * Original code keeps not checking for CREATE which 4804 * might be a bug. For now let the old lookup decide. 4805 */ 4806 if (cnp->cn_nameiop == CREATE) { 4807 return (cache_fpl_aborted(fpl)); 4808 } 4809 return (cache_fpl_handled_error(fpl, EROFS)); 4810 } 4811 4812 if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) { 4813 cache_fpl_smr_exit(fpl); 4814 return (cache_fpl_handled_error(fpl, EEXIST)); 4815 } 4816 4817 /* 4818 * Secure access to dvp; check cache_fplookup_partial_setup for 4819 * reasoning. 4820 * 4821 * XXX At least UFS requires its lookup routine to be called for 4822 * the last path component, which leads to some level of complication 4823 * and inefficiency: 4824 * - the target routine always locks the target vnode, but our caller 4825 * may not need it locked 4826 * - some of the VOP machinery asserts that the parent is locked, which 4827 * once more may be not required 4828 * 4829 * TODO: add a flag for filesystems which don't need this. 4830 */ 4831 dvs = vget_prep_smr(dvp); 4832 cache_fpl_smr_exit(fpl); 4833 if (__predict_false(dvs == VGET_NONE)) { 4834 return (cache_fpl_aborted(fpl)); 4835 } 4836 4837 vget_finish_ref(dvp, dvs); 4838 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4839 vrele(dvp); 4840 return (cache_fpl_aborted(fpl)); 4841 } 4842 4843 error = vn_lock(dvp, LK_EXCLUSIVE); 4844 if (__predict_false(error != 0)) { 4845 vrele(dvp); 4846 return (cache_fpl_aborted(fpl)); 4847 } 4848 4849 tvp = NULL; 4850 cnp->cn_flags |= ISLASTCN; 4851 if (docache) 4852 cnp->cn_flags |= MAKEENTRY; 4853 if (cache_fpl_isdotdot(cnp)) 4854 cnp->cn_flags |= ISDOTDOT; 4855 cnp->cn_lkflags = LK_EXCLUSIVE; 4856 error = VOP_LOOKUP(dvp, &tvp, cnp); 4857 switch (error) { 4858 case EJUSTRETURN: 4859 case 0: 4860 break; 4861 case ENOTDIR: 4862 case ENOENT: 4863 vput(dvp); 4864 return (cache_fpl_handled_error(fpl, error)); 4865 default: 4866 vput(dvp); 4867 return (cache_fpl_aborted(fpl)); 4868 } 4869 4870 fpl->tvp = tvp; 4871 4872 if (tvp == NULL) { 4873 MPASS(error == EJUSTRETURN); 4874 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4875 VOP_UNLOCK(dvp); 4876 } 4877 return (cache_fpl_handled(fpl)); 4878 } 4879 4880 /* 4881 * There are very hairy corner cases concerning various flag combinations 4882 * and locking state. In particular here we only hold one lock instead of 4883 * two. 4884 * 4885 * Skip the complexity as it is of no significance for normal workloads. 4886 */ 4887 if (__predict_false(tvp == dvp)) { 4888 vput(dvp); 4889 vrele(tvp); 4890 return (cache_fpl_aborted(fpl)); 4891 } 4892 4893 /* 4894 * If they want the symlink itself we are fine, but if they want to 4895 * follow it regular lookup has to be engaged. 4896 */ 4897 if (tvp->v_type == VLNK) { 4898 if ((cnp->cn_flags & FOLLOW) != 0) { 4899 vput(dvp); 4900 vput(tvp); 4901 return (cache_fpl_aborted(fpl)); 4902 } 4903 } 4904 4905 /* 4906 * Since we expect this to be the terminal vnode it should almost never 4907 * be a mount point. 
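 * If it does happen to be one, aborting to regular lookup below takes care
 * of crossing it.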
4908 */ 4909 if (__predict_false(cache_fplookup_is_mp(fpl))) { 4910 vput(dvp); 4911 vput(tvp); 4912 return (cache_fpl_aborted(fpl)); 4913 } 4914 4915 if ((cnp->cn_flags & FAILIFEXISTS) != 0) { 4916 vput(dvp); 4917 vput(tvp); 4918 return (cache_fpl_handled_error(fpl, EEXIST)); 4919 } 4920 4921 if ((cnp->cn_flags & LOCKLEAF) == 0) { 4922 VOP_UNLOCK(tvp); 4923 } 4924 4925 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4926 VOP_UNLOCK(dvp); 4927 } 4928 4929 return (cache_fpl_handled(fpl)); 4930 } 4931 4932 static int __noinline 4933 cache_fplookup_modifying(struct cache_fpl *fpl) 4934 { 4935 struct nameidata *ndp; 4936 4937 ndp = fpl->ndp; 4938 4939 if (!cache_fpl_islastcn(ndp)) { 4940 return (cache_fpl_partial(fpl)); 4941 } 4942 return (cache_fplookup_final_modifying(fpl)); 4943 } 4944 4945 static int __noinline 4946 cache_fplookup_final_withparent(struct cache_fpl *fpl) 4947 { 4948 struct componentname *cnp; 4949 enum vgetstate dvs, tvs; 4950 struct vnode *dvp, *tvp; 4951 seqc_t dvp_seqc; 4952 int error; 4953 4954 cnp = fpl->cnp; 4955 dvp = fpl->dvp; 4956 dvp_seqc = fpl->dvp_seqc; 4957 tvp = fpl->tvp; 4958 4959 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 4960 4961 /* 4962 * This is less efficient than it can be for simplicity. 4963 */ 4964 dvs = vget_prep_smr(dvp); 4965 if (__predict_false(dvs == VGET_NONE)) { 4966 return (cache_fpl_aborted(fpl)); 4967 } 4968 tvs = vget_prep_smr(tvp); 4969 if (__predict_false(tvs == VGET_NONE)) { 4970 cache_fpl_smr_exit(fpl); 4971 vget_abort(dvp, dvs); 4972 return (cache_fpl_aborted(fpl)); 4973 } 4974 4975 cache_fpl_smr_exit(fpl); 4976 4977 if ((cnp->cn_flags & LOCKPARENT) != 0) { 4978 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 4979 if (__predict_false(error != 0)) { 4980 vget_abort(tvp, tvs); 4981 return (cache_fpl_aborted(fpl)); 4982 } 4983 } else { 4984 vget_finish_ref(dvp, dvs); 4985 } 4986 4987 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4988 vget_abort(tvp, tvs); 4989 if ((cnp->cn_flags & LOCKPARENT) != 0) 4990 vput(dvp); 4991 else 4992 vrele(dvp); 4993 return (cache_fpl_aborted(fpl)); 4994 } 4995 4996 error = cache_fplookup_final_child(fpl, tvs); 4997 if (__predict_false(error != 0)) { 4998 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED || 4999 fpl->status == CACHE_FPL_STATUS_DESTROYED); 5000 if ((cnp->cn_flags & LOCKPARENT) != 0) 5001 vput(dvp); 5002 else 5003 vrele(dvp); 5004 return (error); 5005 } 5006 5007 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 5008 return (0); 5009 } 5010 5011 static int 5012 cache_fplookup_final(struct cache_fpl *fpl) 5013 { 5014 struct componentname *cnp; 5015 enum vgetstate tvs; 5016 struct vnode *dvp, *tvp; 5017 seqc_t dvp_seqc; 5018 5019 cnp = fpl->cnp; 5020 dvp = fpl->dvp; 5021 dvp_seqc = fpl->dvp_seqc; 5022 tvp = fpl->tvp; 5023 5024 MPASS(*(cnp->cn_nameptr) != '/'); 5025 5026 if (cnp->cn_nameiop != LOOKUP) { 5027 return (cache_fplookup_final_modifying(fpl)); 5028 } 5029 5030 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 5031 return (cache_fplookup_final_withparent(fpl)); 5032 5033 tvs = vget_prep_smr(tvp); 5034 if (__predict_false(tvs == VGET_NONE)) { 5035 return (cache_fpl_partial(fpl)); 5036 } 5037 5038 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 5039 cache_fpl_smr_exit(fpl); 5040 vget_abort(tvp, tvs); 5041 return (cache_fpl_aborted(fpl)); 5042 } 5043 5044 cache_fpl_smr_exit(fpl); 5045 return (cache_fplookup_final_child(fpl, tvs)); 5046 } 5047 5048 /* 5049 * Comment from locked lookup: 5050 * Check for degenerate name (e.g. / or "") which is a way of talking about a 5051 * directory, e.g. like "/." 
or ".". 5052 */ 5053 static int __noinline 5054 cache_fplookup_degenerate(struct cache_fpl *fpl) 5055 { 5056 struct componentname *cnp; 5057 struct vnode *dvp; 5058 enum vgetstate dvs; 5059 int error, lkflags; 5060 #ifdef INVARIANTS 5061 char *cp; 5062 #endif 5063 5064 fpl->tvp = fpl->dvp; 5065 fpl->tvp_seqc = fpl->dvp_seqc; 5066 5067 cnp = fpl->cnp; 5068 dvp = fpl->dvp; 5069 5070 #ifdef INVARIANTS 5071 for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) { 5072 KASSERT(*cp == '/', 5073 ("%s: encountered non-slash; string [%s]\n", __func__, 5074 cnp->cn_pnbuf)); 5075 } 5076 #endif 5077 5078 if (__predict_false(cnp->cn_nameiop != LOOKUP)) { 5079 cache_fpl_smr_exit(fpl); 5080 return (cache_fpl_handled_error(fpl, EISDIR)); 5081 } 5082 5083 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) { 5084 return (cache_fplookup_final_withparent(fpl)); 5085 } 5086 5087 dvs = vget_prep_smr(dvp); 5088 cache_fpl_smr_exit(fpl); 5089 if (__predict_false(dvs == VGET_NONE)) { 5090 return (cache_fpl_aborted(fpl)); 5091 } 5092 5093 if ((cnp->cn_flags & LOCKLEAF) != 0) { 5094 lkflags = LK_SHARED; 5095 if ((cnp->cn_flags & LOCKSHARED) == 0) 5096 lkflags = LK_EXCLUSIVE; 5097 error = vget_finish(dvp, lkflags, dvs); 5098 if (__predict_false(error != 0)) { 5099 return (cache_fpl_aborted(fpl)); 5100 } 5101 } else { 5102 vget_finish_ref(dvp, dvs); 5103 } 5104 return (cache_fpl_handled(fpl)); 5105 } 5106 5107 static int __noinline 5108 cache_fplookup_emptypath(struct cache_fpl *fpl) 5109 { 5110 struct nameidata *ndp; 5111 struct componentname *cnp; 5112 enum vgetstate tvs; 5113 struct vnode *tvp; 5114 int error, lkflags; 5115 5116 fpl->tvp = fpl->dvp; 5117 fpl->tvp_seqc = fpl->dvp_seqc; 5118 5119 ndp = fpl->ndp; 5120 cnp = fpl->cnp; 5121 tvp = fpl->tvp; 5122 5123 MPASS(*cnp->cn_pnbuf == '\0'); 5124 5125 if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) { 5126 cache_fpl_smr_exit(fpl); 5127 return (cache_fpl_handled_error(fpl, ENOENT)); 5128 } 5129 5130 MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0); 5131 5132 tvs = vget_prep_smr(tvp); 5133 cache_fpl_smr_exit(fpl); 5134 if (__predict_false(tvs == VGET_NONE)) { 5135 return (cache_fpl_aborted(fpl)); 5136 } 5137 5138 if ((cnp->cn_flags & LOCKLEAF) != 0) { 5139 lkflags = LK_SHARED; 5140 if ((cnp->cn_flags & LOCKSHARED) == 0) 5141 lkflags = LK_EXCLUSIVE; 5142 error = vget_finish(tvp, lkflags, tvs); 5143 if (__predict_false(error != 0)) { 5144 return (cache_fpl_aborted(fpl)); 5145 } 5146 } else { 5147 vget_finish_ref(tvp, tvs); 5148 } 5149 5150 ndp->ni_resflags |= NIRES_EMPTYPATH; 5151 return (cache_fpl_handled(fpl)); 5152 } 5153 5154 static int __noinline 5155 cache_fplookup_noentry(struct cache_fpl *fpl) 5156 { 5157 struct nameidata *ndp; 5158 struct componentname *cnp; 5159 enum vgetstate dvs; 5160 struct vnode *dvp, *tvp; 5161 seqc_t dvp_seqc; 5162 int error; 5163 5164 ndp = fpl->ndp; 5165 cnp = fpl->cnp; 5166 dvp = fpl->dvp; 5167 dvp_seqc = fpl->dvp_seqc; 5168 5169 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 5170 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 5171 if (cnp->cn_nameiop == LOOKUP) 5172 MPASS((cnp->cn_flags & NOCACHE) == 0); 5173 MPASS(!cache_fpl_isdotdot(cnp)); 5174 5175 /* 5176 * Hack: delayed name len checking. 
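 * cache_fplookup_parse() does not bound the component length, so an
 * overlong name is only rejected once a cache miss lands here (or in
 * cache_fplookup_failed_vexec()); a positive hit implies the name was
 * short enough to have been cached in the first place.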
5177 */ 5178 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 5179 cache_fpl_smr_exit(fpl); 5180 return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); 5181 } 5182 5183 if (cnp->cn_nameptr[0] == '/') { 5184 return (cache_fplookup_skip_slashes(fpl)); 5185 } 5186 5187 if (cnp->cn_pnbuf[0] == '\0') { 5188 return (cache_fplookup_emptypath(fpl)); 5189 } 5190 5191 if (cnp->cn_nameptr[0] == '\0') { 5192 if (fpl->tvp == NULL) { 5193 return (cache_fplookup_degenerate(fpl)); 5194 } 5195 return (cache_fplookup_trailingslash(fpl)); 5196 } 5197 5198 if (cnp->cn_nameiop != LOOKUP) { 5199 fpl->tvp = NULL; 5200 return (cache_fplookup_modifying(fpl)); 5201 } 5202 5203 /* 5204 * Only try to fill in the component if it is the last one, 5205 * otherwise not only there may be several to handle but the 5206 * walk may be complicated. 5207 */ 5208 if (!cache_fpl_islastcn(ndp)) { 5209 return (cache_fpl_partial(fpl)); 5210 } 5211 5212 /* 5213 * Regular lookup nulifies the slash, which we don't do here. 5214 * Don't take chances with filesystem routines seeing it for 5215 * the last entry. 5216 */ 5217 if (cache_fpl_istrailingslash(fpl)) { 5218 return (cache_fpl_partial(fpl)); 5219 } 5220 5221 /* 5222 * Secure access to dvp; check cache_fplookup_partial_setup for 5223 * reasoning. 5224 */ 5225 dvs = vget_prep_smr(dvp); 5226 cache_fpl_smr_exit(fpl); 5227 if (__predict_false(dvs == VGET_NONE)) { 5228 return (cache_fpl_aborted(fpl)); 5229 } 5230 5231 vget_finish_ref(dvp, dvs); 5232 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 5233 vrele(dvp); 5234 return (cache_fpl_aborted(fpl)); 5235 } 5236 5237 error = vn_lock(dvp, LK_SHARED); 5238 if (__predict_false(error != 0)) { 5239 vrele(dvp); 5240 return (cache_fpl_aborted(fpl)); 5241 } 5242 5243 tvp = NULL; 5244 /* 5245 * TODO: provide variants which don't require locking either vnode. 5246 */ 5247 cnp->cn_flags |= ISLASTCN | MAKEENTRY; 5248 cnp->cn_lkflags = LK_SHARED; 5249 if ((cnp->cn_flags & LOCKSHARED) == 0) { 5250 cnp->cn_lkflags = LK_EXCLUSIVE; 5251 } 5252 error = VOP_LOOKUP(dvp, &tvp, cnp); 5253 switch (error) { 5254 case EJUSTRETURN: 5255 case 0: 5256 break; 5257 case ENOTDIR: 5258 case ENOENT: 5259 vput(dvp); 5260 return (cache_fpl_handled_error(fpl, error)); 5261 default: 5262 vput(dvp); 5263 return (cache_fpl_aborted(fpl)); 5264 } 5265 5266 fpl->tvp = tvp; 5267 5268 if (tvp == NULL) { 5269 MPASS(error == EJUSTRETURN); 5270 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 5271 vput(dvp); 5272 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 5273 VOP_UNLOCK(dvp); 5274 } 5275 return (cache_fpl_handled(fpl)); 5276 } 5277 5278 if (tvp->v_type == VLNK) { 5279 if ((cnp->cn_flags & FOLLOW) != 0) { 5280 vput(dvp); 5281 vput(tvp); 5282 return (cache_fpl_aborted(fpl)); 5283 } 5284 } 5285 5286 if (__predict_false(cache_fplookup_is_mp(fpl))) { 5287 vput(dvp); 5288 vput(tvp); 5289 return (cache_fpl_aborted(fpl)); 5290 } 5291 5292 if ((cnp->cn_flags & LOCKLEAF) == 0) { 5293 VOP_UNLOCK(tvp); 5294 } 5295 5296 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 5297 vput(dvp); 5298 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 5299 VOP_UNLOCK(dvp); 5300 } 5301 return (cache_fpl_handled(fpl)); 5302 } 5303 5304 static int __noinline 5305 cache_fplookup_dot(struct cache_fpl *fpl) 5306 { 5307 int error; 5308 5309 MPASS(!seqc_in_modify(fpl->dvp_seqc)); 5310 5311 if (__predict_false(fpl->dvp->v_type != VDIR)) { 5312 cache_fpl_smr_exit(fpl); 5313 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5314 } 5315 5316 /* 5317 * Just re-assign the value. 
seqc will be checked later for the first 5318 * non-dot path component in line and/or before deciding to return the 5319 * vnode. 5320 */ 5321 fpl->tvp = fpl->dvp; 5322 fpl->tvp_seqc = fpl->dvp_seqc; 5323 5324 SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp); 5325 5326 error = 0; 5327 if (cache_fplookup_is_mp(fpl)) { 5328 error = cache_fplookup_cross_mount(fpl); 5329 } 5330 return (error); 5331 } 5332 5333 static int __noinline 5334 cache_fplookup_dotdot(struct cache_fpl *fpl) 5335 { 5336 struct nameidata *ndp; 5337 struct namecache *ncp; 5338 struct vnode *dvp; 5339 u_char nc_flag; 5340 5341 ndp = fpl->ndp; 5342 dvp = fpl->dvp; 5343 5344 MPASS(cache_fpl_isdotdot(fpl->cnp)); 5345 5346 /* 5347 * XXX this is racy the same way regular lookup is 5348 */ 5349 if (vfs_lookup_isroot(ndp, dvp)) { 5350 fpl->tvp = dvp; 5351 fpl->tvp_seqc = vn_seqc_read_any(dvp); 5352 if (seqc_in_modify(fpl->tvp_seqc)) { 5353 return (cache_fpl_aborted(fpl)); 5354 } 5355 return (0); 5356 } 5357 5358 if ((dvp->v_vflag & VV_ROOT) != 0) { 5359 /* 5360 * TODO 5361 * The opposite of climb mount is needed here. 5362 */ 5363 return (cache_fpl_partial(fpl)); 5364 } 5365 5366 if (__predict_false(dvp->v_type != VDIR)) { 5367 cache_fpl_smr_exit(fpl); 5368 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5369 } 5370 5371 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); 5372 if (ncp == NULL) { 5373 return (cache_fpl_aborted(fpl)); 5374 } 5375 5376 nc_flag = atomic_load_char(&ncp->nc_flag); 5377 if ((nc_flag & NCF_ISDOTDOT) != 0) { 5378 if ((nc_flag & NCF_NEGATIVE) != 0) 5379 return (cache_fpl_aborted(fpl)); 5380 fpl->tvp = ncp->nc_vp; 5381 } else { 5382 fpl->tvp = ncp->nc_dvp; 5383 } 5384 5385 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 5386 if (seqc_in_modify(fpl->tvp_seqc)) { 5387 return (cache_fpl_partial(fpl)); 5388 } 5389 5390 /* 5391 * Acquire fence provided by vn_seqc_read_any above. 5392 */ 5393 if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) { 5394 return (cache_fpl_aborted(fpl)); 5395 } 5396 5397 if (!cache_ncp_canuse(ncp)) { 5398 return (cache_fpl_aborted(fpl)); 5399 } 5400 5401 return (0); 5402 } 5403 5404 static int __noinline 5405 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) 5406 { 5407 u_char nc_flag __diagused; 5408 bool neg_promote; 5409 5410 #ifdef INVARIANTS 5411 nc_flag = atomic_load_char(&ncp->nc_flag); 5412 MPASS((nc_flag & NCF_NEGATIVE) != 0); 5413 #endif 5414 /* 5415 * If they want to create an entry we need to replace this one. 5416 */ 5417 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 5418 fpl->tvp = NULL; 5419 return (cache_fplookup_modifying(fpl)); 5420 } 5421 neg_promote = cache_neg_hit_prep(ncp); 5422 if (!cache_fpl_neg_ncp_canuse(ncp)) { 5423 cache_neg_hit_abort(ncp); 5424 return (cache_fpl_partial(fpl)); 5425 } 5426 if (neg_promote) { 5427 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 5428 } 5429 cache_neg_hit_finish(ncp); 5430 cache_fpl_smr_exit(fpl); 5431 return (cache_fpl_handled_error(fpl, ENOENT)); 5432 } 5433 5434 /* 5435 * Resolve a symlink. Called by filesystem-specific routines. 5436 * 5437 * Code flow is: 5438 * ... 
-> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve 5439 */ 5440 int 5441 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len) 5442 { 5443 struct nameidata *ndp; 5444 struct componentname *cnp; 5445 size_t adjust; 5446 5447 ndp = fpl->ndp; 5448 cnp = fpl->cnp; 5449 5450 if (__predict_false(len == 0)) { 5451 return (ENOENT); 5452 } 5453 5454 if (__predict_false(len > MAXPATHLEN - 2)) { 5455 if (cache_fpl_istrailingslash(fpl)) { 5456 return (EAGAIN); 5457 } 5458 } 5459 5460 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1; 5461 #ifdef INVARIANTS 5462 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 5463 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 5464 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 5465 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 5466 } 5467 #endif 5468 5469 if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) { 5470 return (ENAMETOOLONG); 5471 } 5472 5473 if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) { 5474 return (ELOOP); 5475 } 5476 5477 adjust = len; 5478 if (ndp->ni_pathlen > 1) { 5479 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen); 5480 } else { 5481 if (cache_fpl_istrailingslash(fpl)) { 5482 adjust = len + 1; 5483 cnp->cn_pnbuf[len] = '/'; 5484 cnp->cn_pnbuf[len + 1] = '\0'; 5485 } else { 5486 cnp->cn_pnbuf[len] = '\0'; 5487 } 5488 } 5489 bcopy(string, cnp->cn_pnbuf, len); 5490 5491 ndp->ni_pathlen += adjust; 5492 cache_fpl_pathlen_add(fpl, adjust); 5493 cnp->cn_nameptr = cnp->cn_pnbuf; 5494 fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; 5495 fpl->tvp = NULL; 5496 return (0); 5497 } 5498 5499 static int __noinline 5500 cache_fplookup_symlink(struct cache_fpl *fpl) 5501 { 5502 struct mount *mp; 5503 struct nameidata *ndp; 5504 struct componentname *cnp; 5505 struct vnode *dvp, *tvp; 5506 struct pwd *pwd; 5507 int error; 5508 5509 ndp = fpl->ndp; 5510 cnp = fpl->cnp; 5511 dvp = fpl->dvp; 5512 tvp = fpl->tvp; 5513 pwd = *(fpl->pwd); 5514 5515 if (cache_fpl_islastcn(ndp)) { 5516 if ((cnp->cn_flags & FOLLOW) == 0) { 5517 return (cache_fplookup_final(fpl)); 5518 } 5519 } 5520 5521 mp = atomic_load_ptr(&dvp->v_mount); 5522 if (__predict_false(mp == NULL)) { 5523 return (cache_fpl_aborted(fpl)); 5524 } 5525 5526 /* 5527 * Note this check races against setting the flag just like regular 5528 * lookup. 5529 */ 5530 if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) { 5531 cache_fpl_smr_exit(fpl); 5532 return (cache_fpl_handled_error(fpl, EACCES)); 5533 } 5534 5535 error = VOP_FPLOOKUP_SYMLINK(tvp, fpl); 5536 if (__predict_false(error != 0)) { 5537 switch (error) { 5538 case EAGAIN: 5539 return (cache_fpl_partial(fpl)); 5540 case ENOENT: 5541 case ENAMETOOLONG: 5542 case ELOOP: 5543 cache_fpl_smr_exit(fpl); 5544 return (cache_fpl_handled_error(fpl, error)); 5545 default: 5546 return (cache_fpl_aborted(fpl)); 5547 } 5548 } 5549 5550 if (*(cnp->cn_nameptr) == '/') { 5551 fpl->dvp = cache_fpl_handle_root(fpl); 5552 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 5553 if (seqc_in_modify(fpl->dvp_seqc)) { 5554 return (cache_fpl_aborted(fpl)); 5555 } 5556 /* 5557 * The main loop assumes that ->dvp points to a vnode belonging 5558 * to a filesystem which can do lockless lookup, but the absolute 5559 * symlink can be wandering off to one which does not. 
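 * Hence the MNTK_FPLOOKUP re-check below: if the new root belongs to a
 * filesystem which cannot do lockless lookup, state is checkpointed and
 * regular lookup takes over from this point.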
5560 */ 5561 mp = atomic_load_ptr(&fpl->dvp->v_mount); 5562 if (__predict_false(mp == NULL)) { 5563 return (cache_fpl_aborted(fpl)); 5564 } 5565 if (!cache_fplookup_mp_supported(mp)) { 5566 cache_fpl_checkpoint(fpl); 5567 return (cache_fpl_partial(fpl)); 5568 } 5569 if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir)) { 5570 return (cache_fpl_aborted(fpl)); 5571 } 5572 } 5573 return (0); 5574 } 5575 5576 static int 5577 cache_fplookup_next(struct cache_fpl *fpl) 5578 { 5579 struct componentname *cnp; 5580 struct namecache *ncp; 5581 struct vnode *dvp, *tvp; 5582 u_char nc_flag; 5583 uint32_t hash; 5584 int error; 5585 5586 cnp = fpl->cnp; 5587 dvp = fpl->dvp; 5588 hash = fpl->hash; 5589 5590 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 5591 if (cnp->cn_namelen == 1) { 5592 return (cache_fplookup_dot(fpl)); 5593 } 5594 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 5595 return (cache_fplookup_dotdot(fpl)); 5596 } 5597 } 5598 5599 MPASS(!cache_fpl_isdotdot(cnp)); 5600 5601 ncp = cache_ncp_find(dvp, cnp, hash); 5602 if (__predict_false(ncp == NULL)) { 5603 return (cache_fplookup_noentry(fpl)); 5604 } 5605 5606 tvp = atomic_load_ptr(&ncp->nc_vp); 5607 nc_flag = atomic_load_char(&ncp->nc_flag); 5608 if ((nc_flag & NCF_NEGATIVE) != 0) { 5609 return (cache_fplookup_neg(fpl, ncp, hash)); 5610 } 5611 5612 if (!cache_ncp_canuse(ncp)) { 5613 return (cache_fpl_partial(fpl)); 5614 } 5615 5616 fpl->tvp = tvp; 5617 fpl->tvp_seqc = vn_seqc_read_any(tvp); 5618 if (seqc_in_modify(fpl->tvp_seqc)) { 5619 return (cache_fpl_partial(fpl)); 5620 } 5621 5622 counter_u64_add(numposhits, 1); 5623 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 5624 5625 error = 0; 5626 if (cache_fplookup_is_mp(fpl)) { 5627 error = cache_fplookup_cross_mount(fpl); 5628 } 5629 return (error); 5630 } 5631 5632 static bool 5633 cache_fplookup_mp_supported(struct mount *mp) 5634 { 5635 5636 MPASS(mp != NULL); 5637 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 5638 return (false); 5639 return (true); 5640 } 5641 5642 /* 5643 * Walk up the mount stack (if any). 5644 * 5645 * Correctness is provided in the following ways: 5646 * - all vnodes are protected from freeing with SMR 5647 * - struct mount objects are type stable making them always safe to access 5648 * - stability of the particular mount is provided by busying it 5649 * - relationship between the vnode which is mounted on and the mount is 5650 * verified with the vnode sequence counter after busying 5651 * - association between root vnode of the mount and the mount is protected 5652 * by busy 5653 * 5654 * From that point on we can read the sequence counter of the root vnode 5655 * and get the next mount on the stack (if any) using the same protection. 5656 * 5657 * By the end of successful walk we are guaranteed the reached state was 5658 * indeed present at least at some point which matches the regular lookup. 
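 * For example, with fs1 mounted on /mnt and fs2 mounted on fs1's root, the
 * loop busies fs1, re-validates the covered vnode's seqc, moves to fs1's
 * root vnode, finds fs2 mounted there and repeats, terminating at fs2's
 * root.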
5659 */ 5660 static int __noinline 5661 cache_fplookup_climb_mount(struct cache_fpl *fpl) 5662 { 5663 struct mount *mp, *prev_mp; 5664 struct mount_pcpu *mpcpu, *prev_mpcpu; 5665 struct vnode *vp; 5666 seqc_t vp_seqc; 5667 5668 vp = fpl->tvp; 5669 vp_seqc = fpl->tvp_seqc; 5670 5671 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp); 5672 mp = atomic_load_ptr(&vp->v_mountedhere); 5673 if (__predict_false(mp == NULL)) { 5674 return (0); 5675 } 5676 5677 prev_mp = NULL; 5678 for (;;) { 5679 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 5680 if (prev_mp != NULL) 5681 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5682 return (cache_fpl_partial(fpl)); 5683 } 5684 if (prev_mp != NULL) 5685 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5686 if (!vn_seqc_consistent(vp, vp_seqc)) { 5687 vfs_op_thread_exit_crit(mp, mpcpu); 5688 return (cache_fpl_partial(fpl)); 5689 } 5690 if (!cache_fplookup_mp_supported(mp)) { 5691 vfs_op_thread_exit_crit(mp, mpcpu); 5692 return (cache_fpl_partial(fpl)); 5693 } 5694 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5695 if (vp == NULL) { 5696 vfs_op_thread_exit_crit(mp, mpcpu); 5697 return (cache_fpl_partial(fpl)); 5698 } 5699 vp_seqc = vn_seqc_read_any(vp); 5700 if (seqc_in_modify(vp_seqc)) { 5701 vfs_op_thread_exit_crit(mp, mpcpu); 5702 return (cache_fpl_partial(fpl)); 5703 } 5704 prev_mp = mp; 5705 prev_mpcpu = mpcpu; 5706 mp = atomic_load_ptr(&vp->v_mountedhere); 5707 if (mp == NULL) 5708 break; 5709 } 5710 5711 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5712 fpl->tvp = vp; 5713 fpl->tvp_seqc = vp_seqc; 5714 return (0); 5715 } 5716 5717 static int __noinline 5718 cache_fplookup_cross_mount(struct cache_fpl *fpl) 5719 { 5720 struct mount *mp; 5721 struct mount_pcpu *mpcpu; 5722 struct vnode *vp; 5723 seqc_t vp_seqc; 5724 5725 vp = fpl->tvp; 5726 vp_seqc = fpl->tvp_seqc; 5727 5728 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp); 5729 mp = atomic_load_ptr(&vp->v_mountedhere); 5730 if (__predict_false(mp == NULL)) { 5731 return (0); 5732 } 5733 5734 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 5735 return (cache_fpl_partial(fpl)); 5736 } 5737 if (!vn_seqc_consistent(vp, vp_seqc)) { 5738 vfs_op_thread_exit_crit(mp, mpcpu); 5739 return (cache_fpl_partial(fpl)); 5740 } 5741 if (!cache_fplookup_mp_supported(mp)) { 5742 vfs_op_thread_exit_crit(mp, mpcpu); 5743 return (cache_fpl_partial(fpl)); 5744 } 5745 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5746 if (__predict_false(vp == NULL)) { 5747 vfs_op_thread_exit_crit(mp, mpcpu); 5748 return (cache_fpl_partial(fpl)); 5749 } 5750 vp_seqc = vn_seqc_read_any(vp); 5751 vfs_op_thread_exit_crit(mp, mpcpu); 5752 if (seqc_in_modify(vp_seqc)) { 5753 return (cache_fpl_partial(fpl)); 5754 } 5755 mp = atomic_load_ptr(&vp->v_mountedhere); 5756 if (__predict_false(mp != NULL)) { 5757 /* 5758 * There are possibly more mount points on top. 5759 * Normally this does not happen so for simplicity just start 5760 * over. 5761 */ 5762 return (cache_fplookup_climb_mount(fpl)); 5763 } 5764 5765 fpl->tvp = vp; 5766 fpl->tvp_seqc = vp_seqc; 5767 return (0); 5768 } 5769 5770 /* 5771 * Check if a vnode is mounted on. 5772 */ 5773 static bool 5774 cache_fplookup_is_mp(struct cache_fpl *fpl) 5775 { 5776 struct vnode *vp; 5777 5778 vp = fpl->tvp; 5779 return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0); 5780 } 5781 5782 /* 5783 * Parse the path. 5784 * 5785 * The code was originally copy-pasted from regular lookup and despite 5786 * clean ups leaves performance on the table. 
Any modifications here 5787 * must take into account that in case off fallback the resulting 5788 * nameidata state has to be compatible with the original. 5789 */ 5790 5791 /* 5792 * Debug ni_pathlen tracking. 5793 */ 5794 #ifdef INVARIANTS 5795 static void 5796 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) 5797 { 5798 5799 fpl->debug.ni_pathlen += n; 5800 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, 5801 ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen)); 5802 } 5803 5804 static void 5805 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) 5806 { 5807 5808 fpl->debug.ni_pathlen -= n; 5809 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, 5810 ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen)); 5811 } 5812 5813 static void 5814 cache_fpl_pathlen_inc(struct cache_fpl *fpl) 5815 { 5816 5817 cache_fpl_pathlen_add(fpl, 1); 5818 } 5819 5820 static void 5821 cache_fpl_pathlen_dec(struct cache_fpl *fpl) 5822 { 5823 5824 cache_fpl_pathlen_sub(fpl, 1); 5825 } 5826 #else 5827 static void 5828 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) 5829 { 5830 } 5831 5832 static void 5833 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) 5834 { 5835 } 5836 5837 static void 5838 cache_fpl_pathlen_inc(struct cache_fpl *fpl) 5839 { 5840 } 5841 5842 static void 5843 cache_fpl_pathlen_dec(struct cache_fpl *fpl) 5844 { 5845 } 5846 #endif 5847 5848 static void 5849 cache_fplookup_parse(struct cache_fpl *fpl) 5850 { 5851 struct nameidata *ndp; 5852 struct componentname *cnp; 5853 struct vnode *dvp; 5854 char *cp; 5855 uint32_t hash; 5856 5857 ndp = fpl->ndp; 5858 cnp = fpl->cnp; 5859 dvp = fpl->dvp; 5860 5861 /* 5862 * Find the end of this path component, it is either / or nul. 5863 * 5864 * Store / as a temporary sentinel so that we only have one character 5865 * to test for. Pathnames tend to be short so this should not be 5866 * resulting in cache misses. 5867 * 5868 * TODO: fix this to be word-sized. 5869 */ 5870 MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf); 5871 KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar, 5872 ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n", 5873 __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1], 5874 fpl->nulchar, cnp->cn_pnbuf)); 5875 KASSERT(*fpl->nulchar == '\0', 5876 ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar, 5877 cnp->cn_pnbuf)); 5878 hash = cache_get_hash_iter_start(dvp); 5879 *fpl->nulchar = '/'; 5880 for (cp = cnp->cn_nameptr; *cp != '/'; cp++) { 5881 KASSERT(*cp != '\0', 5882 ("%s: encountered unexpected nul; string [%s]\n", __func__, 5883 cnp->cn_nameptr)); 5884 hash = cache_get_hash_iter(*cp, hash); 5885 continue; 5886 } 5887 *fpl->nulchar = '\0'; 5888 fpl->hash = cache_get_hash_iter_finish(hash); 5889 5890 cnp->cn_namelen = cp - cnp->cn_nameptr; 5891 cache_fpl_pathlen_sub(fpl, cnp->cn_namelen); 5892 5893 #ifdef INVARIANTS 5894 /* 5895 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since 5896 * we are going to fail this lookup with ENAMETOOLONG (see below). 5897 */ 5898 if (cnp->cn_namelen <= NAME_MAX) { 5899 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) { 5900 panic("%s: mismatched hash for [%s] len %ld", __func__, 5901 cnp->cn_nameptr, cnp->cn_namelen); 5902 } 5903 } 5904 #endif 5905 5906 /* 5907 * Hack: we have to check if the found path component's length exceeds 5908 * NAME_MAX. 
However, the condition is very rarely true and check can 5909 * be elided in the common case -- if an entry was found in the cache, 5910 * then it could not have been too long to begin with. 5911 */ 5912 ndp->ni_next = cp; 5913 } 5914 5915 static void 5916 cache_fplookup_parse_advance(struct cache_fpl *fpl) 5917 { 5918 struct nameidata *ndp; 5919 struct componentname *cnp; 5920 5921 ndp = fpl->ndp; 5922 cnp = fpl->cnp; 5923 5924 cnp->cn_nameptr = ndp->ni_next; 5925 KASSERT(*(cnp->cn_nameptr) == '/', 5926 ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__, 5927 cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf)); 5928 cnp->cn_nameptr++; 5929 cache_fpl_pathlen_dec(fpl); 5930 } 5931 5932 /* 5933 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry. 5934 * 5935 * Lockless lookup tries to elide checking for spurious slashes and should they 5936 * be present is guaranteed to fail to find an entry. In this case the caller 5937 * must check if the name starts with a slash and call this routine. It is 5938 * going to fast forward across the spurious slashes and set the state up for 5939 * retry. 5940 */ 5941 static int __noinline 5942 cache_fplookup_skip_slashes(struct cache_fpl *fpl) 5943 { 5944 struct nameidata *ndp; 5945 struct componentname *cnp; 5946 5947 ndp = fpl->ndp; 5948 cnp = fpl->cnp; 5949 5950 MPASS(*(cnp->cn_nameptr) == '/'); 5951 do { 5952 cnp->cn_nameptr++; 5953 cache_fpl_pathlen_dec(fpl); 5954 } while (*(cnp->cn_nameptr) == '/'); 5955 5956 /* 5957 * Go back to one slash so that cache_fplookup_parse_advance has 5958 * something to skip. 5959 */ 5960 cnp->cn_nameptr--; 5961 cache_fpl_pathlen_inc(fpl); 5962 5963 /* 5964 * cache_fplookup_parse_advance starts from ndp->ni_next 5965 */ 5966 ndp->ni_next = cnp->cn_nameptr; 5967 5968 /* 5969 * See cache_fplookup_dot. 5970 */ 5971 fpl->tvp = fpl->dvp; 5972 fpl->tvp_seqc = fpl->dvp_seqc; 5973 5974 return (0); 5975 } 5976 5977 /* 5978 * Handle trailing slashes (e.g., "foo/"). 5979 * 5980 * If a trailing slash is found the terminal vnode must be a directory. 5981 * Regular lookup shortens the path by nulifying the first trailing slash and 5982 * sets the TRAILINGSLASH flag to denote this took place. There are several 5983 * checks on it performed later. 5984 * 5985 * Similarly to spurious slashes, lockless lookup handles this in a speculative 5986 * manner relying on an invariant that a non-directory vnode will get a miss. 5987 * In this case cn_nameptr[0] == '\0' and cn_namelen == 0. 5988 * 5989 * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/" 5990 * and denotes this is the last path component, which avoids looping back. 5991 * 5992 * Only plain lookups are supported for now to restrict corner cases to handle. 
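 * Anything other than a LOOKUP operation aborts to regular lookup.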

/*
 * Handle trailing slashes (e.g., "foo/").
 *
 * If a trailing slash is found, the terminal vnode must be a directory.
 * Regular lookup shortens the path by nullifying the first trailing slash and
 * sets the TRAILINGSLASH flag to denote this took place.  Several checks on
 * the flag are performed later.
 *
 * Similarly to spurious slashes, lockless lookup handles this speculatively,
 * relying on the invariant that a non-directory vnode will get a miss.
 * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
 *
 * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
 * and denotes this is the last path component, which avoids looping back.
 *
 * Only plain lookups are supported for now, to restrict the corner cases that
 * need handling.
 */
static int __noinline
cache_fplookup_trailingslash(struct cache_fpl *fpl)
{
#ifdef INVARIANTS
	size_t ni_pathlen;
#endif
	struct nameidata *ndp;
	struct componentname *cnp;
	struct namecache *ncp;
	struct vnode *tvp;
	char *cn_nameptr_orig, *cn_nameptr_slash;
	seqc_t tvp_seqc;
	u_char nc_flag;

	ndp = fpl->ndp;
	cnp = fpl->cnp;
	tvp = fpl->tvp;
	tvp_seqc = fpl->tvp_seqc;

	MPASS(fpl->dvp == fpl->tvp);
	KASSERT(cache_fpl_istrailingslash(fpl),
	    ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1,
	    cnp->cn_pnbuf));
	KASSERT(cnp->cn_nameptr[0] == '\0',
	    ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0],
	    cnp->cn_pnbuf));
	KASSERT(cnp->cn_namelen == 0,
	    ("%s: expected namelen 0, got %ld; string [%s]\n", __func__, cnp->cn_namelen,
	    cnp->cn_pnbuf));
	MPASS(cnp->cn_nameptr > cnp->cn_pnbuf);

	if (cnp->cn_nameiop != LOOKUP) {
		return (cache_fpl_aborted(fpl));
	}

	if (__predict_false(tvp->v_type != VDIR)) {
		if (!vn_seqc_consistent(tvp, tvp_seqc)) {
			return (cache_fpl_aborted(fpl));
		}
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled_error(fpl, ENOTDIR));
	}

	/*
	 * Denote the last component.
	 */
	ndp->ni_next = &cnp->cn_nameptr[0];
	MPASS(cache_fpl_islastcn(ndp));

	/*
	 * Unwind trailing slashes.
	 */
	cn_nameptr_orig = cnp->cn_nameptr;
	while (cnp->cn_nameptr >= cnp->cn_pnbuf) {
		cnp->cn_nameptr--;
		if (cnp->cn_nameptr[0] != '/') {
			break;
		}
	}

	/*
	 * Unwind to the beginning of the path component.
	 *
	 * Note the path may or may not have started with a slash.
	 */
	cn_nameptr_slash = cnp->cn_nameptr;
	while (cnp->cn_nameptr > cnp->cn_pnbuf) {
		cnp->cn_nameptr--;
		if (cnp->cn_nameptr[0] == '/') {
			break;
		}
	}
	if (cnp->cn_nameptr[0] == '/') {
		cnp->cn_nameptr++;
	}

	cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1;
	cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr);
	cache_fpl_checkpoint(fpl);

#ifdef INVARIANTS
	ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
	if (ni_pathlen != fpl->debug.ni_pathlen) {
		panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s]; full string [%s]\n",
		    __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
		    cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
	}
#endif

	/*
	 * If this was a "./" lookup the parent directory is already correct.
	 */
	if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) {
		return (0);
	}

	/*
	 * Otherwise we need to look it up.
	 */
	tvp = fpl->tvp;
	ncp = atomic_load_consume_ptr(&tvp->v_cache_dd);
	if (__predict_false(ncp == NULL)) {
		return (cache_fpl_aborted(fpl));
	}
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_ISDOTDOT) != 0) {
		return (cache_fpl_aborted(fpl));
	}
	fpl->dvp = ncp->nc_dvp;
	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
	if (seqc_in_modify(fpl->dvp_seqc)) {
		return (cache_fpl_aborted(fpl));
	}
	return (0);
}
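
/*
 * Worked example for "foo/bar/" (offsets into cn_pnbuf are illustrative):
 *
 * On entry cn_nameptr points at the terminating nul (offset 8) with
 * cn_namelen == 0.  The first loop steps back over the trailing '/' and
 * stops at 'r' (offset 6); the second loop stops at the '/' separating
 * "foo" and "bar" (offset 3) and the pointer is then bumped to 'b'
 * (offset 4).  This yields cn_namelen == 3 ("bar") and ni_pathlen grows
 * by 4, i.e. back to covering "bar/" plus the nul.  Since the component
 * is not ".", the parent is re-derived from the v_cache_dd entry of the
 * resolved directory vnode.
 */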

/*
 * See the API contract for VOP_FPLOOKUP_VEXEC.
 */
static int __noinline
cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
{
	struct componentname *cnp;
	struct vnode *dvp;
	seqc_t dvp_seqc;

	cnp = fpl->cnp;
	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;

	/*
	 * Hack: delayed empty path checking.
	 */
	if (cnp->cn_pnbuf[0] == '\0') {
		return (cache_fplookup_emptypath(fpl));
	}

	/*
	 * TODO: Due to ignoring trailing slashes, lookup will perform a
	 * permission check on the last dir when it should not be doing it.  It
	 * may fail, but said failure should be ignored.  It is possible to fix
	 * it up fully without resorting to regular lookup, but for now just
	 * abort.
	 */
	if (cache_fpl_istrailingslash(fpl)) {
		return (cache_fpl_aborted(fpl));
	}

	/*
	 * Hack: delayed degenerate path checking.
	 */
	if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) {
		return (cache_fplookup_degenerate(fpl));
	}

	/*
	 * Hack: delayed name len checking.
	 */
	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
	}

	/*
	 * Hack: they may be looking up foo/bar, where foo is not a directory.
	 * In such a case we need to return ENOTDIR, but we may happen to get
	 * here with a different error.
	 */
	if (dvp->v_type != VDIR) {
		error = ENOTDIR;
	}

	/*
	 * Hack: handle O_SEARCH.
	 *
	 * Open Group Base Specifications Issue 7, 2018 edition states:
	 * <quote>
	 * If the access mode of the open file description associated with the
	 * file descriptor is not O_SEARCH, the function shall check whether
	 * directory searches are permitted using the current permissions of
	 * the directory underlying the file descriptor.  If the access mode is
	 * O_SEARCH, the function shall not perform the check.
	 * </quote>
	 *
	 * Regular lookup tests for the NOEXECCHECK flag for every path
	 * component to decide whether to do the permission check.  However,
	 * since most lookups never have the flag (and when they do it is only
	 * present for the first path component), lockless lookup only acts on
	 * it if there is a permission problem.  Here the flag is represented
	 * with a boolean so that we don't have to clear it on the way out.
	 *
	 * For simplicity this always aborts.
	 * TODO: check if this is the first lookup and ignore the permission
	 * problem.  Note the flag has to survive fallback (if it happens to be
	 * performed).
	 */
	if (fpl->fsearch) {
		return (cache_fpl_aborted(fpl));
	}

	switch (error) {
	case EAGAIN:
		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_partial(fpl);
		}
		break;
	default:
		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_smr_exit(fpl);
			cache_fpl_handled_error(fpl, error);
		}
		break;
	}
	return (error);
}
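
/*
 * Example of the O_SEARCH case handled above: for a directory opened with
 * O_SEARCH, an openat(fd, "file", ...) must not require execute permission on
 * the directory referenced by fd itself.  In the lockless lookup this is
 * captured by fpl->fsearch (set up when the starting directory is resolved
 * from a file descriptor, see cache_fplookup_dirfd), which is why a vexec
 * failure on it cannot simply be turned into an error here.
 */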

static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct mount *mp;
	int error;

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cache_fpl_checkpoint(fpl);

	/*
	 * The vnode at hand is almost always stable, skip checking for it.
	 * Worst case this postpones the check to the end of the iteration of
	 * the main loop.
	 */
	fpl->dvp = dvp;
	fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);

	mp = atomic_load_ptr(&dvp->v_mount);
	if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
		return (cache_fpl_aborted(fpl));
	}

	MPASS(fpl->tvp == NULL);

	for (;;) {
		cache_fplookup_parse(fpl);

		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
		if (__predict_false(error != 0)) {
			error = cache_fplookup_failed_vexec(fpl, error);
			break;
		}

		error = cache_fplookup_next(fpl);
		if (__predict_false(cache_fpl_terminated(fpl))) {
			break;
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		if (fpl->tvp->v_type == VLNK) {
			error = cache_fplookup_symlink(fpl);
			if (cache_fpl_terminated(fpl)) {
				break;
			}
		} else {
			if (cache_fpl_islastcn(ndp)) {
				error = cache_fplookup_final(fpl);
				break;
			}

			if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
				error = cache_fpl_aborted(fpl);
				break;
			}

			fpl->dvp = fpl->tvp;
			fpl->dvp_seqc = fpl->tvp_seqc;
			cache_fplookup_parse_advance(fpl);
		}

		cache_fpl_checkpoint(fpl);
	}

	return (error);
}
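
/*
 * Example pass through the loop above for "etc/rc.conf" relative to the
 * current directory:
 *
 * 1. parse "etc", check VEXEC on the starting dvp, find the "etc" vnode via
 *    cache_fplookup_next; it is not the last component and dvp is still
 *    seqc-consistent, so dvp becomes "etc" and the parser advances.
 * 2. parse "rc.conf", check VEXEC on "etc", find the entry; this is the last
 *    component, so cache_fplookup_final performs the final validation and
 *    secures the result.
 *
 * Any failure along the way either aborts (falling back to the locked
 * lookup) or terminates with a definitive error.
 */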

/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the
 * criteria outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 *	vn_lock(current);
 *	for (;;) {
 *		next = find();
 *		vn_lock(next);
 *		vn_unlock(current);
 *		current = next;
 *		if (last)
 *			break;
 *	}
 *	return (current);
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
 * we made the jump. This includes things like permissions, mount points etc.
 * Counter modification is provided by enclosing relevant places in
 * vn_seqc_write_begin()/end() calls.
 *
 * Thus this translates to:
 *
 *	vfs_smr_enter();
 *	dvp_seqc = seqc_read_any(dvp);
 *	if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 *		abort();
 *	for (;;) {
 *		tvp = find();
 *		tvp_seqc = seqc_read_any(tvp);
 *		if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 *			abort();
 *		if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 *			abort();
 *		dvp = tvp; // we know nothing of importance has changed
 *		dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 *		if (last)
 *			break;
 *	}
 *	vget(); // secure the vnode
 *	if (!seqc_consistent(tvp, tvp_seqc)) // final check
 *		abort();
 *	// at this point we know nothing has changed for any parent<->child pair
 *	// as they were crossed during the lookup, meaning we matched the guarantee
 *	// of the locked variant
 *	return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection which they must never exit
 * - EAGAIN can be returned to denote checking could not be performed; it is
 *   always valid to return it
 * - if the sequence counter has not changed the result must be valid
 * - if the sequence counter has changed both false positives and false
 *   negatives are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *   (a sketch follows this comment)
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning that absent other
 *   means it should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 */
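
/*
 * A minimal sketch of a filesystem-side routine following the contract above.
 * The "myfs" names are illustrative only, not an existing filesystem:
 *
 *	static int
 *	myfs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct myfs_node *node;
 *
 *		// ->v_data may be concurrently cleared by VOP_RECLAIM
 *		node = atomic_load_ptr(&v->a_vp->v_data);
 *		if (__predict_false(node == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(node->mode, node->uid, node->gid,
 *		    v->a_cred));
 *	}
 *
 * The node itself has to be freed with vfs_smr-aware machinery (e.g., an
 * SMR-enabled UMA zone) for the unlocked dereference to be safe.
 */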

int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	int error;

	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.in_smr = false;
	fpl.ndp = ndp;
	fpl.cnp = cnp = &ndp->ni_cnd;
	MPASS(ndp->ni_lcf == 0);
	KASSERT((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
	    ("%s: internal flags found in cn_flags %" PRIx64, __func__,
	    cnp->cn_flags));
	MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
	MPASS(ndp->ni_resflags == 0);

	if (__predict_false(!cache_can_fplookup(&fpl))) {
		*status = fpl.status;
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint_outer(&fpl);

	cache_fpl_smr_enter_initial(&fpl);
#ifdef INVARIANTS
	fpl.debug.ni_pathlen = ndp->ni_pathlen;
#endif
	fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
	fpl.fsearch = false;
	fpl.tvp = NULL; /* for degenerate path handling */
	fpl.pwd = pwdp;
	pwd = pwd_get_smr();
	*(fpl.pwd) = pwd;
	namei_setup_rootdir(ndp, cnp, pwd);
	ndp->ni_topdir = pwd->pwd_jdir;

	if (cnp->cn_pnbuf[0] == '/') {
		dvp = cache_fpl_handle_root(&fpl);
		ndp->ni_resflags = NIRES_ABS;
	} else {
		if (ndp->ni_dirfd == AT_FDCWD) {
			dvp = pwd->pwd_cdir;
		} else {
			error = cache_fplookup_dirfd(&fpl, &dvp);
			if (__predict_false(error != 0)) {
				goto out;
			}
		}
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
	error = cache_fplookup_impl(dvp, &fpl);
out:
	cache_fpl_smr_assert_not_entered(&fpl);
	cache_fpl_assert_status(&fpl);
	*status = fpl.status;
	if (SDT_PROBES_ENABLED()) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		if (fpl.status == CACHE_FPL_STATUS_HANDLED)
			SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
			    ndp);
	}

	if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
		MPASS(error != CACHE_FPL_FAILED);
		if (error != 0) {
			cache_fpl_cleanup_cnp(fpl.cnp);
			MPASS(fpl.dvp == NULL);
			MPASS(fpl.tvp == NULL);
		}
		ndp->ni_dvp = fpl.dvp;
		ndp->ni_vp = fpl.tvp;
	}
	return (error);
}