/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#ifdef INVARIANTS
#include <machine/_inttypes.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

/*
 * High level overview of name caching in the VFS layer.
 *
 * Originally caching was implemented as part of UFS, later extracted to allow
 * use by other filesystems. A decision was made to make it optional and
 * completely detached from the rest of the kernel, which comes with limitations
 * outlined near the end of this comment block.
 *
 * This fundamental choice needs to be revisited. In the meantime, the current
 * state is described below. Significance of all notable routines is explained
 * in comments placed above their implementation.
 * Scattered throughout the file are TODO comments indicating shortcomings
 * which can be fixed without reworking everything (most of the fixes will
 * likely be reusable). Various details are omitted from this explanation to
 * not clutter the overview, they have to be checked by reading the code and
 * associated commentary.
 *
 * Keep in mind that it's individual path components which are cached, not full
 * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries,
 * one for each name.
 *
 * I. Data organization
 *
 * Entries are described by "struct namecache" objects and stored in a hash
 * table. See cache_get_hash for more information.
 *
 * "struct vnode" contains pointers to source entries (names which can be found
 * when traversing through said vnode), destination entries (names of that
 * vnode; see "Limitations" for a breakdown on the subject) and a pointer to
 * the parent vnode.
 *
 * The (directory vnode; name) tuple reliably determines the target entry if
 * it exists.
 *
 * Since there are no small locks at this time (all are 32 bytes in size on
 * LP64), the code works around the problem by introducing lock arrays to
 * protect hash buckets and vnode lists.
 *
 * II. Filesystem integration
 *
 * Filesystems participating in name caching do the following:
 * - set vop_lookup routine to vfs_cache_lookup
 * - set vop_cachedlookup to whatever can perform the lookup if the above fails
 * - if they support lockless lookup (see below), vop_fplookup_vexec and
 *   vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
 *   mount point
 * - call cache_purge or cache_vop_* routines to eliminate stale entries as
 *   applicable
 * - call cache_enter to add entries depending on the MAKEENTRY flag
 *
 * With the above in mind, there are 2 entry points when doing lookups:
 * - ... -> namei -> cache_fplookup -- this is the default
 * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei
 *   should the above fail
 *
 * Example code flow how an entry is added:
 * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
 * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
 *
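 * A minimal sketch of the glue described above, assuming a hypothetical
 * filesystem "foofs" (the fs-specific routine names are illustrative, the
 * vop_vector fields are the real ones):
 *
 *	static struct vop_vector foofs_vnodeops = {
 *		.vop_default =		&default_vnodeops,
 *		.vop_lookup =		vfs_cache_lookup,
 *		.vop_cachedlookup =	foofs_cachedlookup,
 *		...
 *	};
 *
 * where foofs_cachedlookup performs the actual directory scan and calls
 * cache_enter for components it resolved (provided MAKEENTRY is set).
 *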
 * III. Performance considerations
 *
 * For the lockless case, forward lookup avoids any writes to shared areas apart
 * from the terminal path component. In other words non-modifying lookups of
 * different files don't suffer any scalability problems in the namecache.
 * Looking up the same file is limited by VFS and goes beyond the scope of this
 * file.
 *
 * At least on amd64 the single-threaded bottleneck for long paths is hashing
 * (see cache_get_hash). There are cases where the code issues an acquire fence
 * multiple times, they can be combined on architectures which suffer from it.
 *
 * For the locked case each encountered vnode has to be referenced and locked in
 * order to be handed out to the caller (normally that's namei). This
 * introduces a significant single-threaded hit and serialization when
 * multi-threaded.
 *
 * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached --
 * it avoids any writes to shared areas for any of the components.
 *
 * Unrelated insertions are partially serialized on updating the global entry
 * counter and possibly serialized on colliding bucket or vnode locks.
 *
 * IV. Observability
 *
 * Note not everything has an explicit dtrace probe nor should it have, thus
 * some of the one-liners below depend on implementation details.
 *
 * Examples:
 *
 * # Check what lookups failed to be handled in a lockless manner. Column 1 is
 * # line number, column 2 is status code (see cache_fpl_status)
 * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
 *
 * # Lengths of names added by binary name
 * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
 *
 * # Same as above but only those which exceed 64 characters
 * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }'
 *
 * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what
 * # path is it
 * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }'
 *
 * V. Limitations and implementation defects
 *
 * - since it is possible there is no entry for an open file, tools like
 *   "procstat" may fail to resolve fd -> vnode -> path to anything
 * - even if a filesystem adds an entry, it may get purged (e.g., due to memory
 *   shortage) in which case the above problem applies
 * - hardlinks are not tracked, thus if a vnode is reachable in more than one
 *   way, resolving a name may return a different path than the one used to
 *   open it (even if said path is still valid)
 * - by default entries are not added for newly created files
 * - adding an entry may need to evict a negative entry first, which happens in 2
 *   distinct places (evicting on lookup, adding in a later VOP) making it
 *   impossible to simply reuse it
 * - there is a simple scheme to evict negative entries as the cache is approaching
 *   its capacity, but it is very unclear if doing so is a good idea to begin with
 * - vnodes are subject to being recycled even if the target inode is left in memory,
 *   which loses the name cache entries when it perhaps should not. in case of tmpfs
 *   names get duplicated -- kept by the filesystem itself and the namecache separately
 * - struct namecache has a fixed size and comes in 2 variants, often wasting
 *   space. now hard to replace with malloc due to dependence on SMR, which
 *   requires UMA zones to opt in
 * - lack of better integration with the kernel also turns nullfs into a layered
 *   filesystem instead of something which can take advantage of caching
 *
 * Appendix A: where is the time lost, expanding on paragraph III
 *
 * While some care went into optimizing lookups, there is still plenty of
 * performance left on the table, most notably from a single-threaded standpoint.
 * Below is a woefully incomplete list of changes which can help. Ideas are
 * mostly sketched out, no claim is made all kinks or prerequisites are laid
 * out.
 *
 * Note there is performance lost all over VFS.
 *
 * === SMR-only lookup
 *
 * For commonly used ops like stat(2), when the terminal vnode *is* cached,
 * lockless lookup could refrain from refing/locking the found vnode and
 * instead return while within the SMR section. Then a call to, say,
 * vop_stat_smr could do the work (or fail with EAGAIN), finally the result
 * would be validated with seqc not changing.
 * This would be faster single-threaded as it dodges atomics and would provide
 * full scalability for multicore uses. This would *not* work for open(2) or
 * other calls which need the vnode to hang around for the long haul, but would
 * work for the aforementioned stat(2) as well as access(2), readlink(2),
 * realpathat(2) and probably more.
 *
 * === hotpatching for sdt probes
 *
 * They result in *tons* of branches all over with rather regrettable codegen
 * at times. Removing sdt probes altogether gives over 2% boost in lookup rate.
 * Reworking the code to patch itself at runtime with asm goto would solve it.
 * asm goto is fully supported by gcc and clang.
 *
 * === copyinstr
 *
 * On all architectures it operates one byte at a time, while it could be
 * word-sized instead thanks to the Mycroft trick.
 *
 * The API itself is rather pessimal for path lookup, accepting arbitrary sizes
 * and *optionally* filling in the length parameter.
 *
 * Instead a new routine (copyinpath?) could be introduced, demanding a buffer
 * size which is a multiple of the word size (and never zero), with the length
 * always returned. On top of it the routine could be allowed to transform the
 * buffer in arbitrary ways, most notably writing past the found length (not to
 * be confused with writing past the buffer size) -- this would allow word-sized
 * movs while checking for '\0' later.
 *
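 * One possible shape for such a routine, purely illustrative (neither the name
 * nor the exact signature is settled anywhere, this merely restates the
 * constraints listed above):
 *
 *	int copyinpath(const char *uaddr, char *kaddr, size_t len,
 *	    size_t *lencopiedp);
 *
 * where len would be required to be a non-zero multiple of the word size and
 * *lencopiedp would always be filled in on success.
 *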
 * === detour through namei
 *
 * Currently the lockless lookup suffers from being called from namei, which
 * then has to check if things worked out locklessly. Instead the lockless
 * lookup could be the actual entry point which calls what is currently namei
 * as a fallback.
 *
 * === avoidable branches in cache_can_fplookup
 *
 * The cache_fast_lookup_enabled flag check could be hotpatchable (in fact if
 * this is off, none of fplookup code should execute).
 *
 * Both audit and capsicum branches can be combined into one, but it requires
 * paying off a lot of tech debt first.
 *
 * ni_startdir could be indicated with a flag in cn_flags, eliminating the
 * branch.
 *
 * === mount stacks
 *
 * Crossing a mount requires checking if perhaps something is mounted on top.
 * Instead, an additional entry could be added to struct mount with a pointer
 * to the final mount on the stack. This would be recalculated on each
 * mount/unmount.
 *
 * === root vnodes
 *
 * It could become part of the API contract to *always* have a rootvnode set in
 * mnt_rootvnode. Such vnodes are annotated with VV_ROOT and vnlru would have
 * to be modified to always skip them.
 */

static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache");

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t");
SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

static char __read_frequently cache_fast_lookup_enabled = true;

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
	u_char neg_hit;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
		struct negstate nu_neg;	/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct timespec	nc_time;	/* timespec provided by fs */
	struct timespec	nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	int	nc_pad;
	struct namecache nc_nc;
};

TAILQ_HEAD(cache_freebatch, namecache);

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned. Since others
 * may be in the same spot, suffer a little bit and enforce the
 * alignment for everyone. Note this is a nop for 64-bit platforms.
 */
#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)

/*
 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
 * 4.4 BSD codebase. Later on struct namecache was tweaked to become
 * smaller and the value was bumped to retain the total size, but it
 * was never re-evaluated for suitability. A simple test counting
 * lengths during package building shows that the value of 45 covers
 * about 86% of all added entries, reaching 99% at 65.
 *
 * Regardless of the above, use of dedicated zones instead of malloc may be
 * inducing additional waste. This may be hard to address as said zones are
 * tied to VFS SMR. Even if retaining them, the current split should be
 * re-evaluated.
 */
#ifdef __LP64__
#define CACHE_PATH_CUTOFF	45
#define CACHE_LARGE_PAD		6
#else
#define CACHE_PATH_CUTOFF	41
#define CACHE_LARGE_PAD		2
#endif

#define CACHE_ZONE_SMALL_SIZE		(offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
#define CACHE_ZONE_LARGE_SIZE		(offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
#define CACHE_ZONE_LARGE_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
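
/*
 * Worked example for the asserts above (assuming a typical LP64 layout, where
 * nc_name ends up at offset 58 and CACHE_ZONE_ALIGNMENT + 1 is 8):
 * CACHE_ZONE_SMALL_SIZE is 58 + 45 + 1 = 104 and CACHE_ZONE_LARGE_SIZE is
 * 58 + 255 + 1 + 6 = 320, both divisible by 8, with the _TS_ variants adding
 * another 40 bytes for the leading namecache_ts members.
 */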

#define	nc_vp		n_un.nu_vp
#define	nc_neg		n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define	NEG_HOT		0x01

static bool	cache_neg_evict_cond(u_long lnumcache);

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
#define cache_ncp_canuse(ncp)	({					\
	struct namecache *_ncp = (ncp);					\
	u_char _nc_flag;						\
									\
	atomic_thread_fence_acq();					\
	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);	\
})

/*
 * Like the above but also checks NCF_WHITE.
 */
#define cache_fpl_neg_ncp_canuse(ncp)	({				\
	struct namecache *_ncp = (ncp);					\
	u_char _nc_flag;						\
									\
	atomic_thread_fence_acq();					\
	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0);	\
})
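
/*
 * Sketch of the intended lockless consumer pattern (see the lookup routines
 * for the real thing; the angle-bracketed steps are placeholders):
 *
 *	vfs_smr_enter();
 *	<find ncp in the hash chain>;
 *	<copy out the fields of interest>;
 *	if (!cache_ncp_canuse(ncp)) {
 *		vfs_smr_exit();
 *		<discard the copies and fall back to the locked lookup>;
 *	}
 *
 * The release fence in cache_ncp_invalidate pairs with the acquire fence
 * issued by the macros above.
 */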

VFS_SMR_DECLARE;

static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache parameters");

static u_int __read_mostly	ncsize;	/* the size as computed on creation or resizing */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RD, &ncsize, 0,
    "Total namecache capacity");

u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");

static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");

/*
 * Negative entry % of namecache capacity above which automatic eviction is allowed.
 *
 * Check cache_neg_evict_cond for details.
 */
static u_int ncnegminpct = 3;

static u_int __read_mostly	neg_min;	/* the above recomputed against ncsize */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
    "Negative entry count above which automatic eviction is allowed");

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static u_int __exclusive_cache_line neg_cycle;

#define ncneghash	3
#define	numneglists	(ncneghash + 1)

struct neglist {
	struct mtx		nl_evict_lock;
	struct mtx		nl_lock __aligned(CACHE_LINE_SIZE);
	TAILQ_HEAD(, namecache) nl_list;
	TAILQ_HEAD(, namecache) nl_hotlist;
	u_long			nl_hotnum;
} __aligned(CACHE_LINE_SIZE);

static struct neglist neglists[numneglists];

static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly	ncbuckethash;
static struct mtx_padalign __read_mostly *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly	ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	*tsp = ncp_ts->nc_time;
	*ticksp = ncp_ts->nc_ticks;
}

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");

#define STATNODE_ULONG(name, varname, descr)					\
	SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define STATNODE_COUNTER(name, varname, descr)					\
	static COUNTER_U64_DEFINE_EARLY(varname);				\
	SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
	    descr);
STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
STATNODE_ULONG(count, numcache, "Number of cache entries");
STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(poszaps, numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(negzaps, numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
/* These count for vn_getcwd(), too. */
STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");

/*
 * Debug or developer statistics.
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache debugging");
#define DEBUGNODE_ULONG(name, varname, descr)					\
	SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define DEBUGNODE_COUNTER(name, varname, descr)					\
	static COUNTER_U64_DEFINE_EARLY(varname);				\
	SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
	    descr);
DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
    "Number of successful removals after relocking");
static long zap_bucket_fail;
DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
static long zap_bucket_fail2;
DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
static long cache_lock_vnodes_cel_3_failures;
DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");

static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, size_t addend);
static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen);
static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *len, size_t addend);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

static inline void
cache_assert_vlp_locked(struct mtx *vlp)
{

	if (vlp != NULL)
		mtx_assert(vlp, MA_OWNED);
}

static inline void
cache_assert_vnode_locked(struct vnode *vp)
{
	struct mtx *vlp;

	vlp = VP2VNODELOCK(vp);
	cache_assert_vlp_locked(vlp);
}

/*
 * Directory vnodes with entries are held for two reasons:
 * 1. make them less of a target for reclamation in vnlru
 * 2. suffer smaller performance penalty in locked lookup as requeueing is avoided
 *
 * It will be feasible to stop doing it altogether if all filesystems start
 * supporting lockless lookup.
 */
static void
cache_hold_vnode(struct vnode *vp)
{

	cache_assert_vnode_locked(vp);
	VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
	vhold(vp);
	counter_u64_add(numcachehv, 1);
}

static void
cache_drop_vnode(struct vnode *vp)
{

	/*
	 * Called after all locks are dropped, meaning we can't assert
	 * on the state of v_cache_src.
	 */
	vdrop(vp);
	counter_u64_add(numcachehv, -1);
}

/*
 * UMA zones.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

char *
cache_symlink_alloc(size_t size, int flags)
{

	if (size < CACHE_ZONE_SMALL_SIZE) {
		return (uma_zalloc_smr(cache_zone_small, flags));
	}
	if (size < CACHE_ZONE_LARGE_SIZE) {
		return (uma_zalloc_smr(cache_zone_large, flags));
	}
	counter_u64_add(symlinktoobig, 1);
	SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
	return (NULL);
}

void
cache_symlink_free(char *string, size_t size)
{

	MPASS(string != NULL);
	KASSERT(size < CACHE_ZONE_LARGE_SIZE,
	    ("%s: size %zu too big", __func__, size));

	if (size < CACHE_ZONE_SMALL_SIZE) {
		uma_zfree_smr(cache_zone_small, string);
		return;
	}
	if (size < CACHE_ZONE_LARGE_SIZE) {
		uma_zfree_smr(cache_zone_large, string);
		return;
	}
	__assert_unreachable();
}

static struct namecache *
cache_alloc_uma(int len, bool ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free_uma(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static struct namecache *
cache_alloc(int len, bool ts)
{
	u_long lnumcache;

	/*
	 * Avoid blowout in namecache entries.
	 *
	 * Bugs:
	 * 1. filesystems may end up trying to add an already existing entry
	 * (for example this can happen after a cache miss during concurrent
	 * lookup), in which case we will call cache_neg_evict despite not
	 * adding anything.
	 * 2. the routine may fail to free anything and no provisions are made
	 * to make it try harder (see the inside for failure modes)
	 * 3. it only ever looks at negative entries.
	 */
	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
	if (cache_neg_evict_cond(lnumcache)) {
		lnumcache = atomic_load_long(&numcache);
	}
	if (__predict_false(lnumcache >= ncsize)) {
		atomic_subtract_long(&numcache, 1);
		counter_u64_add(numdrops, 1);
		return (NULL);
	}
	return (cache_alloc_uma(len, ts));
}

static void
cache_free(struct namecache *ncp)
{

	MPASS(ncp != NULL);
	if ((ncp->nc_flag & NCF_DVDROP) != 0) {
		cache_drop_vnode(ncp->nc_dvp);
	}
	cache_free_uma(ncp);
	atomic_subtract_long(&numcache, 1);
}

static void
cache_free_batch(struct cache_freebatch *batch)
{
	struct namecache *ncp, *nnp;
	int i;

	i = 0;
	if (TAILQ_EMPTY(batch))
		goto out;
	TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
		if ((ncp->nc_flag & NCF_DVDROP) != 0) {
			cache_drop_vnode(ncp->nc_dvp);
		}
		cache_free_uma(ncp);
		i++;
	}
	atomic_subtract_long(&numcache, i);
out:
	SDT_PROBE1(vfs, namecache, purge, batch, i);
}

/*
 * Hashing.
 *
 * The code was made to use FNV in 2001 and this choice needs to be revisited.
 *
 * Short summary of the difficulty:
 * The longest name which can be inserted is NAME_MAX characters in length (or
 * 255 at the time of writing this comment), while the majority of names used
 * in practice are significantly shorter (mostly below 10). More importantly,
 * the majority of lookups performed find names even shorter than that.
 *
 * This poses a problem where hashes which do better than FNV past word size
 * (or so) tend to come with additional overhead when finalizing the result,
 * making them noticeably slower for the most commonly used range.
 *
 * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c
 *
 * When looking it up the most time consuming part by a large margin (at least
 * on amd64) is hashing. Replacing FNV with something which pessimizes short
 * input would make the slowest part stand out even more.
 */
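
/*
 * Illustration of the interface below (a sketch, not compiled): the hash is
 * seeded with the directory's v_nchash (set up in cache_prehash) and folds
 * the name one byte at a time, so computing it in one go or iteratively is
 * expected to produce the same value:
 *
 *	hash = cache_get_hash(name, len, dvp);
 *
 *	hash = cache_get_hash_iter_start(dvp);
 *	for (i = 0; i < len; i++)
 *		hash = cache_get_hash_iter(name[i], hash);
 *	hash = cache_get_hash_iter_finish(hash);
 */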

/*
 * TODO: With the value stored we can do better than computing the hash based
 * on the address.
 */
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}

static uint32_t
cache_get_hash_iter_start(struct vnode *dvp)
{

	return (dvp->v_nchash);
}

static uint32_t
cache_get_hash_iter(char c, uint32_t hash)
{

	return (fnv_32_buf(&c, 1, hash));
}

static uint32_t
cache_get_hash_iter_finish(uint32_t hash)
{

	return (hash);
}

static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct mtx *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_OWNED);
}

static void
cache_assert_bucket_unlocked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_NOTOWNED);
}
#else
#define cache_assert_bucket_locked(x) do { } while (0)
#define cache_assert_bucket_unlocked(x) do { } while (0)
#endif

#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_lock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_unlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}
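
/*
 * Sketch of how the helpers above compose on the update side (this is the
 * pattern used by cache_zap_unlocked_bucket later in this file): the two
 * vnode locks are always taken in address order, the bucket lock last.
 *
 *	cache_sort_vnodes(&dvlp, &vlp);
 *	cache_lock_vnodes(dvlp, vlp);
 *	mtx_lock(blp);
 *	<look the entry up again and zap it if still there>;
 *	mtx_unlock(blp);
 *	cache_unlock_vnodes(dvlp, vlp);
 */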

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

static void
cache_recalc_neg_min(void)
{

	neg_min = (ncsize * ncnegminpct) / 100;
}

static int
sysctl_negminpct(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int error;

	val = ncnegminpct;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (val == ncnegminpct)
		return (0);
	if (val < 0 || val > 99)
		return (EINVAL);
	ncnegminpct = val;
	cache_recalc_neg_min();
	return (0);
}

SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
    "I", "Negative entry % of namecache capacity above which automatic eviction is allowed");

#ifdef DEBUG_CACHE
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * Various workloads create plenty of negative entries and barely use them
 * afterwards. Moreover malicious users can keep performing bogus lookups
 * adding even more entries. For example "make tinderbox" as of writing this
 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 * negative.
 *
 * As such, a rather aggressive eviction method is needed. The currently
 * employed method is a placeholder.
 *
 * Entries are split over numneglists separate lists, each of which is further
 * split into hot and cold entries. Entries get promoted after getting a hit.
 * Eviction happens on addition of a new entry.
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache negative entry statistics");

SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
    "Number of negative cache entries");

static COUNTER_U64_DEFINE_EARLY(neg_created);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
    "Number of created negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evicted);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
    "Number of evicted negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
    &neg_evict_skipped_empty,
    "Number of times evicting failed due to lack of entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
    &neg_evict_skipped_missed,
    "Number of times evicting failed due to target entry disappearing");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
    &neg_evict_skipped_contended,
    "Number of times evicting failed due to contention");

SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
    "Number of cache hits (negative)");

static int
sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
{
	int i, out;

	out = 0;
	for (i = 0; i < numneglists; i++)
		out += neglists[i].nl_hotnum;

	return (SYSCTL_OUT(req, &out, sizeof(out)));
}
SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
    "Number of hot negative entries");

static void
cache_neg_init(struct namecache *ncp)
{
	struct negstate *ns;

	ncp->nc_flag |= NCF_NEGATIVE;
	ns = NCP2NEGSTATE(ncp);
	ns->neg_flag = 0;
	ns->neg_hit = 0;
	counter_u64_add(neg_created, 1);
}

#define CACHE_NEG_PROMOTION_THRESH 2
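
/*
 * Sketch of how the routines below compose on the lookup side (this is the
 * pattern used by the locked lookup code later in this file):
 *
 *	if (cache_neg_hit_prep(ncp))
 *		cache_neg_promote(ncp);
 *	else
 *		cache_neg_hit_finish(ncp);
 *
 * i.e. an entry is moved to the hot list only once it has accumulated
 * CACHE_NEG_PROMOTION_THRESH hits.
 */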

static bool
cache_neg_hit_prep(struct namecache *ncp)
{
	struct negstate *ns;
	u_char n;

	ns = NCP2NEGSTATE(ncp);
	n = atomic_load_char(&ns->neg_hit);
	for (;;) {
		if (n >= CACHE_NEG_PROMOTION_THRESH)
			return (false);
		if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
			break;
	}
	return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
}

/*
 * Nothing to do here but it is provided for completeness as some
 * cache_neg_hit_prep callers may end up returning without even
 * trying to promote.
 */
#define cache_neg_hit_abort(ncp)	do { } while (0)

static void
cache_neg_hit_finish(struct namecache *ncp)
{

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
	counter_u64_add(numneghits, 1);
}

/*
 * Move a negative entry to the hot list.
 */
static void
cache_neg_promote_locked(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	if ((ns->neg_flag & NEG_HOT) == 0) {
		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
		nl->nl_hotnum++;
		ns->neg_flag |= NEG_HOT;
	}
}

/*
 * Move a hot negative entry to the cold list.
 */
static void
cache_neg_demote_locked(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	MPASS(ns->neg_flag & NEG_HOT);
	TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	nl->nl_hotnum--;
	ns->neg_flag &= ~NEG_HOT;
	atomic_store_char(&ns->neg_hit, 0);
}

/*
 * Move a negative entry to the hot list if it matches the lookup.
 *
 * We have to take locks, but they may be contended and in the worst
 * case we may need to go off CPU. We don't want to spin within the
 * smr section and we can't block with it. Exiting the section means
 * the found entry could have been evicted. We are going to look it
 * up again.
 */
static bool
cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
    struct namecache *oncp, uint32_t hash)
{
	struct namecache *ncp;
	struct neglist *nl;
	u_char nc_flag;

	nl = NCP2NEGLIST(oncp);

	mtx_lock(&nl->nl_lock);
	/*
	 * For hash iteration.
	 */
	vfs_smr_enter();

	/*
	 * Avoid all surprises by only succeeding if we got the same entry and
	 * bailing completely otherwise.
	 * XXX There are no provisions to keep the vnode around, meaning we may
	 * end up promoting a negative entry for a *new* vnode and returning
	 * ENOENT on its account. This is the error we want to return anyway
	 * and promotion is harmless.
	 *
	 * In particular at this point there can be a new ncp which matches the
	 * search but hashes to a different neglist.
	 */
	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp == oncp)
			break;
	}

	/*
	 * No match to begin with.
	 */
	if (__predict_false(ncp == NULL)) {
		goto out_abort;
	}

	/*
	 * The newly found entry may be something different...
	 */
	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
		goto out_abort;
	}

	/*
	 * ... and not even negative.
	 */
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_NEGATIVE) == 0) {
		goto out_abort;
	}

	if (!cache_ncp_canuse(ncp)) {
		goto out_abort;
	}

	cache_neg_promote_locked(ncp);
	cache_neg_hit_finish(ncp);
	vfs_smr_exit();
	mtx_unlock(&nl->nl_lock);
	return (true);
out_abort:
	vfs_smr_exit();
	mtx_unlock(&nl->nl_lock);
	return (false);
}

static void
cache_neg_promote(struct namecache *ncp)
{
	struct neglist *nl;

	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	cache_neg_promote_locked(ncp);
	mtx_unlock(&nl->nl_lock);
}

static void
cache_neg_insert(struct namecache *ncp)
{
	struct neglist *nl;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	mtx_unlock(&nl->nl_lock);
	atomic_add_long(&numneg, 1);
}

static void
cache_neg_remove(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	ns = NCP2NEGSTATE(ncp);
	mtx_lock(&nl->nl_lock);
	if ((ns->neg_flag & NEG_HOT) != 0) {
		TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
		nl->nl_hotnum--;
	} else {
		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
	}
	mtx_unlock(&nl->nl_lock);
	atomic_subtract_long(&numneg, 1);
}

static struct neglist *
cache_neg_evict_select_list(void)
{
	struct neglist *nl;
	u_int c;

	c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
	nl = &neglists[c % numneglists];
	if (!mtx_trylock(&nl->nl_evict_lock)) {
		counter_u64_add(neg_evict_skipped_contended, 1);
		return (NULL);
	}
	return (nl);
}

static struct namecache *
cache_neg_evict_select_entry(struct neglist *nl)
{
	struct namecache *ncp, *lncp;
	struct negstate *ns, *lns;
	int i;

	mtx_assert(&nl->nl_evict_lock, MA_OWNED);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	ncp = TAILQ_FIRST(&nl->nl_list);
	if (ncp == NULL)
		return (NULL);
	lncp = ncp;
	lns = NCP2NEGSTATE(lncp);
	for (i = 1; i < 4; i++) {
		ncp = TAILQ_NEXT(ncp, nc_dst);
		if (ncp == NULL)
			break;
		ns = NCP2NEGSTATE(ncp);
		if (ns->neg_hit < lns->neg_hit) {
			lncp = ncp;
			lns = ns;
		}
	}
	return (lncp);
}

static bool
cache_neg_evict(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *nl;
	struct vnode *dvp;
	struct mtx *dvlp;
	struct mtx *blp;
	uint32_t hash;
	u_char nlen;
	bool evicted;

	nl = cache_neg_evict_select_list();
	if (nl == NULL) {
		return (false);
	}

	mtx_lock(&nl->nl_lock);
	ncp = TAILQ_FIRST(&nl->nl_hotlist);
	if (ncp != NULL) {
		cache_neg_demote_locked(ncp);
	}
	ncp = cache_neg_evict_select_entry(nl);
	if (ncp == NULL) {
		counter_u64_add(neg_evict_skipped_empty, 1);
		mtx_unlock(&nl->nl_lock);
		mtx_unlock(&nl->nl_evict_lock);
		return (false);
	}
	nlen = ncp->nc_nlen;
	dvp = ncp->nc_dvp;
	hash = cache_get_hash(ncp->nc_name, nlen, dvp);
	dvlp = VP2VNODELOCK(dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_unlock(&nl->nl_lock);
	mtx_unlock(&nl->nl_evict_lock);
	mtx_lock(dvlp);
	mtx_lock(blp);
	/*
	 * Note that since all locks were dropped above, the entry may be
	 * gone or reallocated to be something else.
	 */
	CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
		if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
		    ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
			break;
	}
	if (ncp2 == NULL) {
		counter_u64_add(neg_evict_skipped_missed, 1);
		ncp = NULL;
		evicted = false;
	} else {
		MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
		MPASS(blp == NCP2BUCKETLOCK(ncp));
		SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(neg_evicted, 1);
		evicted = true;
	}
	mtx_unlock(blp);
	mtx_unlock(dvlp);
	if (ncp != NULL)
		cache_free(ncp);
	return (evicted);
}

/*
 * Maybe evict a negative entry to create more room.
 *
 * The ncnegfactor parameter limits what fraction of the total count
 * can consist of negative entries. However, if the cache is just
 * warming up this leads to excessive evictions. As such, ncnegminpct
 * (recomputed to neg_min) dictates whether the above should be
 * applied.
 *
 * Try evicting if the cache is close to full capacity regardless of
 * other considerations.
 */
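/*
 * Worked example (the numbers are illustrative): with ncsize = 100000 and
 * ncnegminpct = 3, neg_min is 3000.  An insertion bumping numcache to 20000
 * while numneg is 5000 attempts an eviction since 5000 >= neg_min and
 * 5000 * ncnegfactor (5) >= 20000; with numneg at 2000 nothing happens as
 * 2000 < neg_min.  Once numcache exceeds ncsize - 1000 an eviction is
 * attempted regardless of the negative entry count.
 */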
static bool
cache_neg_evict_cond(u_long lnumcache)
{
	u_long lnumneg;

	if (ncsize - 1000 < lnumcache)
		goto out_evict;
	lnumneg = atomic_load_long(&numneg);
	if (lnumneg < neg_min)
		return (false);
	if (lnumneg * ncnegfactor < lnumcache)
		return (false);
out_evict:
	return (cache_neg_evict());
}

/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;
	struct vnode *dvp, *vp;

	dvp = ncp->nc_dvp;
	vp = ncp->nc_vp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(vp);
	cache_assert_vnode_locked(dvp);
	cache_assert_bucket_locked(ncp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
		TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
		if (ncp == vp->v_cache_dd) {
			atomic_store_ptr(&vp->v_cache_dd, NULL);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
		cache_neg_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == dvp->v_cache_dd) {
			atomic_store_ptr(&dvp->v_cache_dd, NULL);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
		}
	}
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct mtx *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

/*
 * If trylocking failed we can get here. We know enough to take all needed locks
 * in the right order and re-lookup the entry.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct mtx *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_unlocked(ncp);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	mtx_lock(blp);
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_bucket_relock_success, 1);
		return (0);
	}

	mtx_unlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct mtx *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	mtx_unlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct mtx *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
&& cnp->cn_nameptr[1] == '.') { 1787 dvlp = VP2VNODELOCK(dvp); 1788 dvlp2 = NULL; 1789 mtx_lock(dvlp); 1790 retry_dotdot: 1791 ncp = dvp->v_cache_dd; 1792 if (ncp == NULL) { 1793 mtx_unlock(dvlp); 1794 if (dvlp2 != NULL) 1795 mtx_unlock(dvlp2); 1796 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1797 return (0); 1798 } 1799 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1800 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) 1801 goto retry_dotdot; 1802 MPASS(dvp->v_cache_dd == NULL); 1803 mtx_unlock(dvlp); 1804 if (dvlp2 != NULL) 1805 mtx_unlock(dvlp2); 1806 cache_free(ncp); 1807 } else { 1808 atomic_store_ptr(&dvp->v_cache_dd, NULL); 1809 mtx_unlock(dvlp); 1810 if (dvlp2 != NULL) 1811 mtx_unlock(dvlp2); 1812 } 1813 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1814 return (1); 1815 } 1816 1817 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1818 blp = HASH2BUCKETLOCK(hash); 1819 retry: 1820 if (CK_SLIST_EMPTY(NCHHASH(hash))) 1821 goto out_no_entry; 1822 1823 mtx_lock(blp); 1824 1825 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1826 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1827 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1828 break; 1829 } 1830 1831 if (ncp == NULL) { 1832 mtx_unlock(blp); 1833 goto out_no_entry; 1834 } 1835 1836 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1837 if (__predict_false(error != 0)) { 1838 zap_bucket_fail++; 1839 goto retry; 1840 } 1841 counter_u64_add(numposzaps, 1); 1842 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1843 cache_free(ncp); 1844 return (1); 1845 out_no_entry: 1846 counter_u64_add(nummisszap, 1); 1847 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1848 return (0); 1849 } 1850 1851 static int __noinline 1852 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1853 struct timespec *tsp, int *ticksp) 1854 { 1855 int ltype; 1856 1857 *vpp = dvp; 1858 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1859 if (tsp != NULL) 1860 timespecclear(tsp); 1861 if (ticksp != NULL) 1862 *ticksp = ticks; 1863 vrefact(*vpp); 1864 /* 1865 * When we lookup "." we still can be asked to lock it 1866 * differently... 
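 *
 * For example, the caller may hold the directory shared while requesting
 * LK_EXCLUSIVE for the result; since *vpp is dvp itself, the existing lock is
 * converted in place with LK_UPGRADE (or LK_DOWNGRADE for the opposite case)
 * instead of being acquired recursively.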
1867 */ 1868 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1869 if (ltype != VOP_ISLOCKED(*vpp)) { 1870 if (ltype == LK_EXCLUSIVE) { 1871 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1872 if (VN_IS_DOOMED((*vpp))) { 1873 /* forced unmount */ 1874 vrele(*vpp); 1875 *vpp = NULL; 1876 return (ENOENT); 1877 } 1878 } else 1879 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1880 } 1881 return (-1); 1882 } 1883 1884 static int __noinline 1885 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1886 struct timespec *tsp, int *ticksp) 1887 { 1888 struct namecache_ts *ncp_ts; 1889 struct namecache *ncp; 1890 struct mtx *dvlp; 1891 enum vgetstate vs; 1892 int error, ltype; 1893 bool whiteout; 1894 1895 MPASS((cnp->cn_flags & ISDOTDOT) != 0); 1896 1897 if ((cnp->cn_flags & MAKEENTRY) == 0) { 1898 cache_remove_cnp(dvp, cnp); 1899 return (0); 1900 } 1901 1902 retry: 1903 dvlp = VP2VNODELOCK(dvp); 1904 mtx_lock(dvlp); 1905 ncp = dvp->v_cache_dd; 1906 if (ncp == NULL) { 1907 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, ".."); 1908 mtx_unlock(dvlp); 1909 return (0); 1910 } 1911 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1912 if (ncp->nc_flag & NCF_NEGATIVE) 1913 *vpp = NULL; 1914 else 1915 *vpp = ncp->nc_vp; 1916 } else 1917 *vpp = ncp->nc_dvp; 1918 if (*vpp == NULL) 1919 goto negative_success; 1920 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); 1921 cache_out_ts(ncp, tsp, ticksp); 1922 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1923 NCF_DTS && tsp != NULL) { 1924 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1925 *tsp = ncp_ts->nc_dotdottime; 1926 } 1927 1928 MPASS(dvp != *vpp); 1929 ltype = VOP_ISLOCKED(dvp); 1930 VOP_UNLOCK(dvp); 1931 vs = vget_prep(*vpp); 1932 mtx_unlock(dvlp); 1933 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1934 vn_lock(dvp, ltype | LK_RETRY); 1935 if (VN_IS_DOOMED(dvp)) { 1936 if (error == 0) 1937 vput(*vpp); 1938 *vpp = NULL; 1939 return (ENOENT); 1940 } 1941 if (error) { 1942 *vpp = NULL; 1943 goto retry; 1944 } 1945 return (-1); 1946 negative_success: 1947 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1948 if (cnp->cn_flags & ISLASTCN) { 1949 counter_u64_add(numnegzaps, 1); 1950 cache_zap_negative_locked_vnode_kl(ncp, dvp); 1951 mtx_unlock(dvlp); 1952 cache_free(ncp); 1953 return (0); 1954 } 1955 } 1956 1957 whiteout = (ncp->nc_flag & NCF_WHITE); 1958 cache_out_ts(ncp, tsp, ticksp); 1959 if (cache_neg_hit_prep(ncp)) 1960 cache_neg_promote(ncp); 1961 else 1962 cache_neg_hit_finish(ncp); 1963 mtx_unlock(dvlp); 1964 if (whiteout) 1965 cnp->cn_flags |= ISWHITEOUT; 1966 return (ENOENT); 1967 } 1968 1969 /** 1970 * Lookup a name in the name cache 1971 * 1972 * # Arguments 1973 * 1974 * - dvp: Parent directory in which to search. 1975 * - vpp: Return argument. Will contain desired vnode on cache hit. 1976 * - cnp: Parameters of the name search. The most interesting bits of 1977 * the cn_flags field have the following meanings: 1978 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1979 * it up. 1980 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1981 * - tsp: Return storage for cache timestamp. On a successful (positive 1982 * or negative) lookup, tsp will be filled with any timespec that 1983 * was stored when this cache entry was created. However, it will 1984 * be clear for "." entries. 1985 * - ticks: Return storage for alternate cache timestamp. 
On a successful 1986 * (positive or negative) lookup, it will contain the ticks value 1987 * that was current when the cache entry was created, unless cnp 1988 * was ".". 1989 * 1990 * Either both tsp and ticks have to be provided or neither of them. 1991 * 1992 * # Returns 1993 * 1994 * - -1: A positive cache hit. vpp will contain the desired vnode. 1995 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1996 * to a forced unmount. vpp will not be modified. If the entry 1997 * is a whiteout, then the ISWHITEOUT flag will be set in 1998 * cnp->cn_flags. 1999 * - 0: A cache miss. vpp will not be modified. 2000 * 2001 * # Locking 2002 * 2003 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 2004 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 2005 * lock is not recursively acquired. 2006 */ 2007 static int __noinline 2008 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 2009 struct timespec *tsp, int *ticksp) 2010 { 2011 struct namecache *ncp; 2012 struct mtx *blp; 2013 uint32_t hash; 2014 enum vgetstate vs; 2015 int error; 2016 bool whiteout; 2017 2018 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 2019 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0); 2020 2021 retry: 2022 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2023 blp = HASH2BUCKETLOCK(hash); 2024 mtx_lock(blp); 2025 2026 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 2027 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 2028 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 2029 break; 2030 } 2031 2032 if (__predict_false(ncp == NULL)) { 2033 mtx_unlock(blp); 2034 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); 2035 counter_u64_add(nummiss, 1); 2036 return (0); 2037 } 2038 2039 if (ncp->nc_flag & NCF_NEGATIVE) 2040 goto negative_success; 2041 2042 counter_u64_add(numposhits, 1); 2043 *vpp = ncp->nc_vp; 2044 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 2045 cache_out_ts(ncp, tsp, ticksp); 2046 MPASS(dvp != *vpp); 2047 vs = vget_prep(*vpp); 2048 mtx_unlock(blp); 2049 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 2050 if (error) { 2051 *vpp = NULL; 2052 goto retry; 2053 } 2054 return (-1); 2055 negative_success: 2056 /* 2057 * We don't get here with regular lookup apart from corner cases. 
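 *
 * The typical way in is the lockless cache_lookup() punting on a negative
 * entry for CREATE of the last path component (hence the __predict_true
 * below), plus rare races where the entry got invalidated or promoted
 * between the SMR section and this locked retry.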
2058 */ 2059 if (__predict_true(cnp->cn_nameiop == CREATE)) { 2060 if (cnp->cn_flags & ISLASTCN) { 2061 counter_u64_add(numnegzaps, 1); 2062 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 2063 if (__predict_false(error != 0)) { 2064 zap_bucket_fail2++; 2065 goto retry; 2066 } 2067 cache_free(ncp); 2068 return (0); 2069 } 2070 } 2071 2072 whiteout = (ncp->nc_flag & NCF_WHITE); 2073 cache_out_ts(ncp, tsp, ticksp); 2074 if (cache_neg_hit_prep(ncp)) 2075 cache_neg_promote(ncp); 2076 else 2077 cache_neg_hit_finish(ncp); 2078 mtx_unlock(blp); 2079 if (whiteout) 2080 cnp->cn_flags |= ISWHITEOUT; 2081 return (ENOENT); 2082 } 2083 2084 int 2085 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 2086 struct timespec *tsp, int *ticksp) 2087 { 2088 struct namecache *ncp; 2089 uint32_t hash; 2090 enum vgetstate vs; 2091 int error; 2092 bool whiteout, neg_promote; 2093 u_short nc_flag; 2094 2095 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); 2096 2097 #ifdef DEBUG_CACHE 2098 if (__predict_false(!doingcache)) { 2099 cnp->cn_flags &= ~MAKEENTRY; 2100 return (0); 2101 } 2102 #endif 2103 2104 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2105 if (cnp->cn_namelen == 1) 2106 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 2107 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') 2108 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); 2109 } 2110 2111 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 2112 2113 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { 2114 cache_remove_cnp(dvp, cnp); 2115 return (0); 2116 } 2117 2118 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2119 vfs_smr_enter(); 2120 2121 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 2122 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 2123 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 2124 break; 2125 } 2126 2127 if (__predict_false(ncp == NULL)) { 2128 vfs_smr_exit(); 2129 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); 2130 counter_u64_add(nummiss, 1); 2131 return (0); 2132 } 2133 2134 nc_flag = atomic_load_char(&ncp->nc_flag); 2135 if (nc_flag & NCF_NEGATIVE) 2136 goto negative_success; 2137 2138 counter_u64_add(numposhits, 1); 2139 *vpp = ncp->nc_vp; 2140 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 2141 cache_out_ts(ncp, tsp, ticksp); 2142 MPASS(dvp != *vpp); 2143 if (!cache_ncp_canuse(ncp)) { 2144 vfs_smr_exit(); 2145 *vpp = NULL; 2146 goto out_fallback; 2147 } 2148 vs = vget_prep_smr(*vpp); 2149 vfs_smr_exit(); 2150 if (__predict_false(vs == VGET_NONE)) { 2151 *vpp = NULL; 2152 goto out_fallback; 2153 } 2154 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 2155 if (error) { 2156 *vpp = NULL; 2157 goto out_fallback; 2158 } 2159 return (-1); 2160 negative_success: 2161 if (cnp->cn_nameiop == CREATE) { 2162 if (cnp->cn_flags & ISLASTCN) { 2163 vfs_smr_exit(); 2164 goto out_fallback; 2165 } 2166 } 2167 2168 cache_out_ts(ncp, tsp, ticksp); 2169 whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE); 2170 neg_promote = cache_neg_hit_prep(ncp); 2171 if (!cache_ncp_canuse(ncp)) { 2172 cache_neg_hit_abort(ncp); 2173 vfs_smr_exit(); 2174 goto out_fallback; 2175 } 2176 if (neg_promote) { 2177 vfs_smr_exit(); 2178 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) 2179 goto out_fallback; 2180 } else { 2181 cache_neg_hit_finish(ncp); 2182 vfs_smr_exit(); 2183 } 2184 if (whiteout) 2185 cnp->cn_flags |= ISWHITEOUT; 2186 return (ENOENT); 2187 out_fallback: 2188 return (cache_lookup_fallback(dvp, 
vpp, cnp, tsp, ticksp)); 2189 } 2190 2191 struct celockstate { 2192 struct mtx *vlp[3]; 2193 struct mtx *blp[2]; 2194 }; 2195 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 2196 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 2197 2198 static inline void 2199 cache_celockstate_init(struct celockstate *cel) 2200 { 2201 2202 bzero(cel, sizeof(*cel)); 2203 } 2204 2205 static void 2206 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 2207 struct vnode *dvp) 2208 { 2209 struct mtx *vlp1, *vlp2; 2210 2211 MPASS(cel->vlp[0] == NULL); 2212 MPASS(cel->vlp[1] == NULL); 2213 MPASS(cel->vlp[2] == NULL); 2214 2215 MPASS(vp != NULL || dvp != NULL); 2216 2217 vlp1 = VP2VNODELOCK(vp); 2218 vlp2 = VP2VNODELOCK(dvp); 2219 cache_sort_vnodes(&vlp1, &vlp2); 2220 2221 if (vlp1 != NULL) { 2222 mtx_lock(vlp1); 2223 cel->vlp[0] = vlp1; 2224 } 2225 mtx_lock(vlp2); 2226 cel->vlp[1] = vlp2; 2227 } 2228 2229 static void 2230 cache_unlock_vnodes_cel(struct celockstate *cel) 2231 { 2232 2233 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 2234 2235 if (cel->vlp[0] != NULL) 2236 mtx_unlock(cel->vlp[0]); 2237 if (cel->vlp[1] != NULL) 2238 mtx_unlock(cel->vlp[1]); 2239 if (cel->vlp[2] != NULL) 2240 mtx_unlock(cel->vlp[2]); 2241 } 2242 2243 static bool 2244 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 2245 { 2246 struct mtx *vlp; 2247 bool ret; 2248 2249 cache_assert_vlp_locked(cel->vlp[0]); 2250 cache_assert_vlp_locked(cel->vlp[1]); 2251 MPASS(cel->vlp[2] == NULL); 2252 2253 MPASS(vp != NULL); 2254 vlp = VP2VNODELOCK(vp); 2255 2256 ret = true; 2257 if (vlp >= cel->vlp[1]) { 2258 mtx_lock(vlp); 2259 } else { 2260 if (mtx_trylock(vlp)) 2261 goto out; 2262 cache_lock_vnodes_cel_3_failures++; 2263 cache_unlock_vnodes_cel(cel); 2264 if (vlp < cel->vlp[0]) { 2265 mtx_lock(vlp); 2266 mtx_lock(cel->vlp[0]); 2267 mtx_lock(cel->vlp[1]); 2268 } else { 2269 if (cel->vlp[0] != NULL) 2270 mtx_lock(cel->vlp[0]); 2271 mtx_lock(vlp); 2272 mtx_lock(cel->vlp[1]); 2273 } 2274 ret = false; 2275 } 2276 out: 2277 cel->vlp[2] = vlp; 2278 return (ret); 2279 } 2280 2281 static void 2282 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 2283 struct mtx *blp2) 2284 { 2285 2286 MPASS(cel->blp[0] == NULL); 2287 MPASS(cel->blp[1] == NULL); 2288 2289 cache_sort_vnodes(&blp1, &blp2); 2290 2291 if (blp1 != NULL) { 2292 mtx_lock(blp1); 2293 cel->blp[0] = blp1; 2294 } 2295 mtx_lock(blp2); 2296 cel->blp[1] = blp2; 2297 } 2298 2299 static void 2300 cache_unlock_buckets_cel(struct celockstate *cel) 2301 { 2302 2303 if (cel->blp[0] != NULL) 2304 mtx_unlock(cel->blp[0]); 2305 mtx_unlock(cel->blp[1]); 2306 } 2307 2308 /* 2309 * Lock part of the cache affected by the insertion. 2310 * 2311 * This means vnodelocks for dvp, vp and the relevant bucketlock. 2312 * However, insertion can result in removal of an old entry. In this 2313 * case we have an additional vnode and bucketlock pair to lock. 2314 * 2315 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 2316 * preserving the locking order (smaller address first). 
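 *
 * Namely (see cache_enter_lock below): the vnode locks for dvp, vp and the
 * vnode which vp's cached ".." entry points at, plus the bucket lock for the
 * new entry's hash chain and the bucket lock of said ".." entry, which may
 * get evicted by the insertion.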
2317 */ 2318 static void 2319 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2320 uint32_t hash) 2321 { 2322 struct namecache *ncp; 2323 struct mtx *blps[2]; 2324 u_char nc_flag; 2325 2326 blps[0] = HASH2BUCKETLOCK(hash); 2327 for (;;) { 2328 blps[1] = NULL; 2329 cache_lock_vnodes_cel(cel, dvp, vp); 2330 if (vp == NULL || vp->v_type != VDIR) 2331 break; 2332 ncp = atomic_load_consume_ptr(&vp->v_cache_dd); 2333 if (ncp == NULL) 2334 break; 2335 nc_flag = atomic_load_char(&ncp->nc_flag); 2336 if ((nc_flag & NCF_ISDOTDOT) == 0) 2337 break; 2338 MPASS(ncp->nc_dvp == vp); 2339 blps[1] = NCP2BUCKETLOCK(ncp); 2340 if ((nc_flag & NCF_NEGATIVE) != 0) 2341 break; 2342 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2343 break; 2344 /* 2345 * All vnodes got re-locked. Re-validate the state and if 2346 * nothing changed we are done. Otherwise restart. 2347 */ 2348 if (ncp == vp->v_cache_dd && 2349 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2350 blps[1] == NCP2BUCKETLOCK(ncp) && 2351 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2352 break; 2353 cache_unlock_vnodes_cel(cel); 2354 cel->vlp[0] = NULL; 2355 cel->vlp[1] = NULL; 2356 cel->vlp[2] = NULL; 2357 } 2358 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2359 } 2360 2361 static void 2362 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2363 uint32_t hash) 2364 { 2365 struct namecache *ncp; 2366 struct mtx *blps[2]; 2367 u_char nc_flag; 2368 2369 blps[0] = HASH2BUCKETLOCK(hash); 2370 for (;;) { 2371 blps[1] = NULL; 2372 cache_lock_vnodes_cel(cel, dvp, vp); 2373 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); 2374 if (ncp == NULL) 2375 break; 2376 nc_flag = atomic_load_char(&ncp->nc_flag); 2377 if ((nc_flag & NCF_ISDOTDOT) == 0) 2378 break; 2379 MPASS(ncp->nc_dvp == dvp); 2380 blps[1] = NCP2BUCKETLOCK(ncp); 2381 if ((nc_flag & NCF_NEGATIVE) != 0) 2382 break; 2383 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2384 break; 2385 if (ncp == dvp->v_cache_dd && 2386 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2387 blps[1] == NCP2BUCKETLOCK(ncp) && 2388 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2389 break; 2390 cache_unlock_vnodes_cel(cel); 2391 cel->vlp[0] = NULL; 2392 cel->vlp[1] = NULL; 2393 cel->vlp[2] = NULL; 2394 } 2395 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2396 } 2397 2398 static void 2399 cache_enter_unlock(struct celockstate *cel) 2400 { 2401 2402 cache_unlock_buckets_cel(cel); 2403 cache_unlock_vnodes_cel(cel); 2404 } 2405 2406 static void __noinline 2407 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 2408 struct componentname *cnp) 2409 { 2410 struct celockstate cel; 2411 struct namecache *ncp; 2412 uint32_t hash; 2413 int len; 2414 2415 if (atomic_load_ptr(&dvp->v_cache_dd) == NULL) 2416 return; 2417 len = cnp->cn_namelen; 2418 cache_celockstate_init(&cel); 2419 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2420 cache_enter_lock_dd(&cel, dvp, vp, hash); 2421 ncp = dvp->v_cache_dd; 2422 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 2423 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 2424 cache_zap_locked(ncp); 2425 } else { 2426 ncp = NULL; 2427 } 2428 atomic_store_ptr(&dvp->v_cache_dd, NULL); 2429 cache_enter_unlock(&cel); 2430 if (ncp != NULL) 2431 cache_free(ncp); 2432 } 2433 2434 /* 2435 * Add an entry to the cache. 
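 *
 * A sketch of typical use from a filesystem's VOP_CACHEDLOOKUP routine, with
 * variable names chosen for illustration only:
 *
 *	if ((cnp->cn_flags & MAKEENTRY) != 0)
 *		cache_enter(dvp, *vpp, cnp);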
2436 */ 2437 void 2438 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2439 struct timespec *tsp, struct timespec *dtsp) 2440 { 2441 struct celockstate cel; 2442 struct namecache *ncp, *n2, *ndd; 2443 struct namecache_ts *ncp_ts; 2444 struct nchashhead *ncpp; 2445 uint32_t hash; 2446 int flag; 2447 int len; 2448 2449 KASSERT(cnp->cn_namelen <= NAME_MAX, 2450 ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen, 2451 NAME_MAX)); 2452 VNPASS(!VN_IS_DOOMED(dvp), dvp); 2453 VNPASS(dvp->v_type != VNON, dvp); 2454 if (vp != NULL) { 2455 VNPASS(!VN_IS_DOOMED(vp), vp); 2456 VNPASS(vp->v_type != VNON, vp); 2457 } 2458 if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { 2459 KASSERT(dvp == vp, 2460 ("%s: different vnodes for dot entry (%p; %p)\n", __func__, 2461 dvp, vp)); 2462 } else { 2463 KASSERT(dvp != vp, 2464 ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__, 2465 cnp->cn_nameptr, dvp)); 2466 } 2467 2468 #ifdef DEBUG_CACHE 2469 if (__predict_false(!doingcache)) 2470 return; 2471 #endif 2472 2473 flag = 0; 2474 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2475 if (cnp->cn_namelen == 1) 2476 return; 2477 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 2478 cache_enter_dotdot_prep(dvp, vp, cnp); 2479 flag = NCF_ISDOTDOT; 2480 } 2481 } 2482 2483 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 2484 if (ncp == NULL) 2485 return; 2486 2487 cache_celockstate_init(&cel); 2488 ndd = NULL; 2489 ncp_ts = NULL; 2490 2491 /* 2492 * Calculate the hash key and setup as much of the new 2493 * namecache entry as possible before acquiring the lock. 2494 */ 2495 ncp->nc_flag = flag | NCF_WIP; 2496 ncp->nc_vp = vp; 2497 if (vp == NULL) 2498 cache_neg_init(ncp); 2499 ncp->nc_dvp = dvp; 2500 if (tsp != NULL) { 2501 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 2502 ncp_ts->nc_time = *tsp; 2503 ncp_ts->nc_ticks = ticks; 2504 ncp_ts->nc_nc.nc_flag |= NCF_TS; 2505 if (dtsp != NULL) { 2506 ncp_ts->nc_dotdottime = *dtsp; 2507 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 2508 } 2509 } 2510 len = ncp->nc_nlen = cnp->cn_namelen; 2511 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2512 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 2513 ncp->nc_name[len] = '\0'; 2514 cache_enter_lock(&cel, dvp, vp, hash); 2515 2516 /* 2517 * See if this vnode or negative entry is already in the cache 2518 * with this name. This can happen with concurrent lookups of 2519 * the same path name. 2520 */ 2521 ncpp = NCHHASH(hash); 2522 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 2523 if (n2->nc_dvp == dvp && 2524 n2->nc_nlen == cnp->cn_namelen && 2525 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 2526 MPASS(cache_ncp_canuse(n2)); 2527 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 2528 KASSERT(vp == NULL, 2529 ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]", 2530 __func__, NULL, vp, cnp->cn_nameptr)); 2531 else 2532 KASSERT(n2->nc_vp == vp, 2533 ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]", 2534 __func__, n2->nc_vp, vp, cnp->cn_nameptr)); 2535 /* 2536 * Entries are supposed to be immutable unless in the 2537 * process of getting destroyed. Accommodating for 2538 * changing timestamps is possible but not worth it. 2539 * This should be harmless in terms of correctness, in 2540 * the worst case resulting in an earlier expiration. 2541 * Alternatively, the found entry can be replaced 2542 * altogether. 
2543 */ 2544 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 2545 #if 0 2546 if (tsp != NULL) { 2547 KASSERT((n2->nc_flag & NCF_TS) != 0, 2548 ("no NCF_TS")); 2549 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 2550 n2_ts->nc_time = ncp_ts->nc_time; 2551 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2552 if (dtsp != NULL) { 2553 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2554 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2555 } 2556 } 2557 #endif 2558 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, 2559 vp); 2560 goto out_unlock_free; 2561 } 2562 } 2563 2564 if (flag == NCF_ISDOTDOT) { 2565 /* 2566 * See if we are trying to add .. entry, but some other lookup 2567 * has populated v_cache_dd pointer already. 2568 */ 2569 if (dvp->v_cache_dd != NULL) 2570 goto out_unlock_free; 2571 KASSERT(vp == NULL || vp->v_type == VDIR, 2572 ("wrong vnode type %p", vp)); 2573 atomic_thread_fence_rel(); 2574 atomic_store_ptr(&dvp->v_cache_dd, ncp); 2575 } 2576 2577 if (vp != NULL) { 2578 if (flag != NCF_ISDOTDOT) { 2579 /* 2580 * For this case, the cache entry maps both the 2581 * directory name in it and the name ".." for the 2582 * directory's parent. 2583 */ 2584 if ((ndd = vp->v_cache_dd) != NULL) { 2585 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2586 cache_zap_locked(ndd); 2587 else 2588 ndd = NULL; 2589 } 2590 atomic_thread_fence_rel(); 2591 atomic_store_ptr(&vp->v_cache_dd, ncp); 2592 } else if (vp->v_type != VDIR) { 2593 if (vp->v_cache_dd != NULL) { 2594 atomic_store_ptr(&vp->v_cache_dd, NULL); 2595 } 2596 } 2597 } 2598 2599 if (flag != NCF_ISDOTDOT) { 2600 if (LIST_EMPTY(&dvp->v_cache_src)) { 2601 cache_hold_vnode(dvp); 2602 } 2603 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2604 } 2605 2606 /* 2607 * If the entry is "negative", we place it into the 2608 * "negative" cache queue, otherwise, we place it into the 2609 * destination vnode's cache entries queue. 2610 */ 2611 if (vp != NULL) { 2612 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2613 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2614 vp); 2615 } else { 2616 if (cnp->cn_flags & ISWHITEOUT) 2617 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE); 2618 cache_neg_insert(ncp); 2619 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2620 ncp->nc_name); 2621 } 2622 2623 /* 2624 * Insert the new namecache entry into the appropriate chain 2625 * within the cache entries table. 2626 */ 2627 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2628 2629 atomic_thread_fence_rel(); 2630 /* 2631 * Mark the entry as fully constructed. 2632 * It is immutable past this point until its removal. 2633 */ 2634 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2635 2636 cache_enter_unlock(&cel); 2637 if (ndd != NULL) 2638 cache_free(ndd); 2639 return; 2640 out_unlock_free: 2641 cache_enter_unlock(&cel); 2642 cache_free(ncp); 2643 return; 2644 } 2645 2646 /* 2647 * A variant of the above accepting flags. 2648 * 2649 * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it. 2650 * 2651 * TODO: this routine is a hack. It blindly removes the old entry, even if it 2652 * happens to match and it is doing it in an inefficient manner. It was added 2653 * to accommodate NFS which runs into a case where the target for a given name 2654 * may change from under it. Note this does nothing to solve the following 2655 * race: 2 callers of cache_enter_time_flags pass a different target vnode for 2656 * the same [dvp, cnp]. It may be argued that code doing this is broken. 
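 *
 * A minimal usage sketch, where ts is a caller-filled struct timespec:
 *
 *	cache_enter_time_flags(dvp, vp, cnp, &ts, NULL, VFS_CACHE_DROPOLD);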
2657 */ 2658 void 2659 cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2660 struct timespec *tsp, struct timespec *dtsp, int flags) 2661 { 2662 2663 MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0); 2664 2665 if (flags & VFS_CACHE_DROPOLD) 2666 cache_remove_cnp(dvp, cnp); 2667 cache_enter_time(dvp, vp, cnp, tsp, dtsp); 2668 } 2669 2670 static u_long 2671 cache_roundup_2(u_long val) 2672 { 2673 u_long res; 2674 2675 for (res = 1; res <= val; res <<= 1) 2676 continue; 2677 2678 return (res); 2679 } 2680 2681 static struct nchashhead * 2682 nchinittbl(u_long elements, u_long *hashmask) 2683 { 2684 struct nchashhead *hashtbl; 2685 u_long hashsize, i; 2686 2687 hashsize = cache_roundup_2(elements) / 2; 2688 2689 hashtbl = malloc(hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2690 for (i = 0; i < hashsize; i++) 2691 CK_SLIST_INIT(&hashtbl[i]); 2692 *hashmask = hashsize - 1; 2693 return (hashtbl); 2694 } 2695 2696 static void 2697 ncfreetbl(struct nchashhead *hashtbl) 2698 { 2699 2700 free(hashtbl, M_VFSCACHE); 2701 } 2702 2703 /* 2704 * Name cache initialization, from vfs_init() when we are booting 2705 */ 2706 static void 2707 nchinit(void *dummy __unused) 2708 { 2709 u_int i; 2710 2711 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2712 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2713 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2714 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2715 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2716 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2717 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2718 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2719 2720 VFS_SMR_ZONE_SET(cache_zone_small); 2721 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2722 VFS_SMR_ZONE_SET(cache_zone_large); 2723 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2724 2725 ncsize = desiredvnodes * ncsizefactor; 2726 cache_recalc_neg_min(); 2727 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2728 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2729 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2730 ncbuckethash = 7; 2731 if (ncbuckethash > nchash) 2732 ncbuckethash = nchash; 2733 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2734 M_WAITOK | M_ZERO); 2735 for (i = 0; i < numbucketlocks; i++) 2736 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2737 ncvnodehash = ncbuckethash; 2738 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2739 M_WAITOK | M_ZERO); 2740 for (i = 0; i < numvnodelocks; i++) 2741 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2742 2743 for (i = 0; i < numneglists; i++) { 2744 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); 2745 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2746 TAILQ_INIT(&neglists[i].nl_list); 2747 TAILQ_INIT(&neglists[i].nl_hotlist); 2748 } 2749 } 2750 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2751 2752 void 2753 cache_vnode_init(struct vnode *vp) 2754 { 2755 2756 LIST_INIT(&vp->v_cache_src); 2757 TAILQ_INIT(&vp->v_cache_dst); 2758 vp->v_cache_dd = NULL; 2759 cache_prehash(vp); 2760 } 2761 2762 /* 2763 * Induce transient cache misses for lockless operation in cache_lookup() by 2764 * using a temporary hash table. 2765 * 2766 * This will force a fs lookup. 
2767 * 2768 * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time 2769 * to observe all CPUs not performing the lookup. 2770 */ 2771 static void 2772 cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash) 2773 { 2774 2775 MPASS(temphash < nchash); 2776 /* 2777 * Change the size. The new size is smaller and can safely be used 2778 * against the existing table. All lookups which now hash wrong will 2779 * result in a cache miss, which all callers are supposed to know how 2780 * to handle. 2781 */ 2782 atomic_store_long(&nchash, temphash); 2783 atomic_thread_fence_rel(); 2784 vfs_smr_synchronize(); 2785 /* 2786 * At this point everyone sees the updated hash value, but they still 2787 * see the old table. 2788 */ 2789 atomic_store_ptr(&nchashtbl, temptbl); 2790 atomic_thread_fence_rel(); 2791 vfs_smr_synchronize(); 2792 /* 2793 * At this point everyone sees the updated table pointer and size pair. 2794 */ 2795 } 2796 2797 /* 2798 * Set the new hash table. 2799 * 2800 * Similarly to cache_changesize_set_temp(), this has to synchronize against 2801 * lockless operation in cache_lookup(). 2802 */ 2803 static void 2804 cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash) 2805 { 2806 2807 MPASS(nchash < new_hash); 2808 /* 2809 * Change the pointer first. This wont result in out of bounds access 2810 * since the temporary table is guaranteed to be smaller. 2811 */ 2812 atomic_store_ptr(&nchashtbl, new_tbl); 2813 atomic_thread_fence_rel(); 2814 vfs_smr_synchronize(); 2815 /* 2816 * At this point everyone sees the updated pointer value, but they 2817 * still see the old size. 2818 */ 2819 atomic_store_long(&nchash, new_hash); 2820 atomic_thread_fence_rel(); 2821 vfs_smr_synchronize(); 2822 /* 2823 * At this point everyone sees the updated table pointer and size pair. 2824 */ 2825 } 2826 2827 void 2828 cache_changesize(u_long newmaxvnodes) 2829 { 2830 struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl; 2831 u_long new_nchash, old_nchash, temphash; 2832 struct namecache *ncp; 2833 uint32_t hash; 2834 u_long newncsize; 2835 u_long i; 2836 2837 newncsize = newmaxvnodes * ncsizefactor; 2838 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2839 if (newmaxvnodes < numbucketlocks) 2840 newmaxvnodes = numbucketlocks; 2841 2842 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2843 /* If same hash table size, nothing to do */ 2844 if (nchash == new_nchash) { 2845 ncfreetbl(new_nchashtbl); 2846 return; 2847 } 2848 2849 temptbl = nchinittbl(1, &temphash); 2850 2851 /* 2852 * Move everything from the old hash table to the new table. 2853 * None of the namecache entries in the table can be removed 2854 * because to do so, they have to be removed from the hash table. 
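 *
 * In outline the resize below proceeds as follows (a sketch of the code
 * which comes next): lock all vnode and bucket locks, switch lookups to the
 * tiny temporary table (turning them into cache misses), rehash every entry
 * into the new table, publish the new table and size, then unlock.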
2855 */ 2856 cache_lock_all_vnodes(); 2857 cache_lock_all_buckets(); 2858 old_nchashtbl = nchashtbl; 2859 old_nchash = nchash; 2860 cache_changesize_set_temp(temptbl, temphash); 2861 for (i = 0; i <= old_nchash; i++) { 2862 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2863 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2864 ncp->nc_dvp); 2865 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2866 CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash); 2867 } 2868 } 2869 ncsize = newncsize; 2870 cache_recalc_neg_min(); 2871 cache_changesize_set_new(new_nchashtbl, new_nchash); 2872 cache_unlock_all_buckets(); 2873 cache_unlock_all_vnodes(); 2874 ncfreetbl(old_nchashtbl); 2875 ncfreetbl(temptbl); 2876 } 2877 2878 /* 2879 * Remove all entries from and to a particular vnode. 2880 */ 2881 static void 2882 cache_purge_impl(struct vnode *vp) 2883 { 2884 struct cache_freebatch batch; 2885 struct namecache *ncp; 2886 struct mtx *vlp, *vlp2; 2887 2888 TAILQ_INIT(&batch); 2889 vlp = VP2VNODELOCK(vp); 2890 vlp2 = NULL; 2891 mtx_lock(vlp); 2892 retry: 2893 while (!LIST_EMPTY(&vp->v_cache_src)) { 2894 ncp = LIST_FIRST(&vp->v_cache_src); 2895 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2896 goto retry; 2897 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2898 } 2899 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2900 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2901 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2902 goto retry; 2903 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2904 } 2905 ncp = vp->v_cache_dd; 2906 if (ncp != NULL) { 2907 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2908 ("lost dotdot link")); 2909 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2910 goto retry; 2911 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2912 } 2913 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2914 mtx_unlock(vlp); 2915 if (vlp2 != NULL) 2916 mtx_unlock(vlp2); 2917 cache_free_batch(&batch); 2918 } 2919 2920 /* 2921 * Opportunistic check to see if there is anything to do. 2922 */ 2923 static bool 2924 cache_has_entries(struct vnode *vp) 2925 { 2926 2927 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2928 atomic_load_ptr(&vp->v_cache_dd) == NULL) 2929 return (false); 2930 return (true); 2931 } 2932 2933 void 2934 cache_purge(struct vnode *vp) 2935 { 2936 2937 SDT_PROBE1(vfs, namecache, purge, done, vp); 2938 if (!cache_has_entries(vp)) 2939 return; 2940 cache_purge_impl(vp); 2941 } 2942 2943 /* 2944 * Only to be used by vgone. 2945 */ 2946 void 2947 cache_purge_vgone(struct vnode *vp) 2948 { 2949 struct mtx *vlp; 2950 2951 VNPASS(VN_IS_DOOMED(vp), vp); 2952 if (cache_has_entries(vp)) { 2953 cache_purge_impl(vp); 2954 return; 2955 } 2956 2957 /* 2958 * Serialize against a potential thread doing cache_purge. 2959 */ 2960 vlp = VP2VNODELOCK(vp); 2961 mtx_wait_unlocked(vlp); 2962 if (cache_has_entries(vp)) { 2963 cache_purge_impl(vp); 2964 return; 2965 } 2966 return; 2967 } 2968 2969 /* 2970 * Remove all negative entries for a particular directory vnode. 
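 *
 * Typically called by a filesystem after a directory changed in a way which
 * may have turned existing negative entries stale (e.g., a new name showed
 * up in it); a usage sketch:
 *
 *	cache_purge_negative(dvp);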
2971 */ 2972 void 2973 cache_purge_negative(struct vnode *vp) 2974 { 2975 struct cache_freebatch batch; 2976 struct namecache *ncp, *nnp; 2977 struct mtx *vlp; 2978 2979 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2980 if (LIST_EMPTY(&vp->v_cache_src)) 2981 return; 2982 TAILQ_INIT(&batch); 2983 vlp = VP2VNODELOCK(vp); 2984 mtx_lock(vlp); 2985 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2986 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2987 continue; 2988 cache_zap_negative_locked_vnode_kl(ncp, vp); 2989 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2990 } 2991 mtx_unlock(vlp); 2992 cache_free_batch(&batch); 2993 } 2994 2995 /* 2996 * Entry points for modifying VOP operations. 2997 */ 2998 void 2999 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 3000 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 3001 { 3002 3003 ASSERT_VOP_IN_SEQC(fdvp); 3004 ASSERT_VOP_IN_SEQC(fvp); 3005 ASSERT_VOP_IN_SEQC(tdvp); 3006 if (tvp != NULL) 3007 ASSERT_VOP_IN_SEQC(tvp); 3008 3009 cache_purge(fvp); 3010 if (tvp != NULL) { 3011 cache_purge(tvp); 3012 KASSERT(!cache_remove_cnp(tdvp, tcnp), 3013 ("%s: lingering negative entry", __func__)); 3014 } else { 3015 cache_remove_cnp(tdvp, tcnp); 3016 } 3017 3018 /* 3019 * TODO 3020 * 3021 * Historically renaming was always purging all relevant entries, 3022 * but that's quite wasteful. In particular it turns out that in many cases 3023 * the target file is immediately accessed after rename, inducing a cache 3024 * miss. 3025 * 3026 * Recode this to reduce relocking and reuse the existing entry (if any) 3027 * instead of just removing it above and allocating a new one here. 3028 */ 3029 cache_enter(tdvp, fvp, tcnp); 3030 } 3031 3032 void 3033 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) 3034 { 3035 3036 ASSERT_VOP_IN_SEQC(dvp); 3037 ASSERT_VOP_IN_SEQC(vp); 3038 cache_purge(vp); 3039 } 3040 3041 #ifdef INVARIANTS 3042 /* 3043 * Validate that if an entry exists it matches. 3044 */ 3045 void 3046 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) 3047 { 3048 struct namecache *ncp; 3049 struct mtx *blp; 3050 uint32_t hash; 3051 3052 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3053 if (CK_SLIST_EMPTY(NCHHASH(hash))) 3054 return; 3055 blp = HASH2BUCKETLOCK(hash); 3056 mtx_lock(blp); 3057 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3058 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3059 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) { 3060 if (ncp->nc_vp != vp) 3061 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n", 3062 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp); 3063 } 3064 } 3065 mtx_unlock(blp); 3066 } 3067 3068 void 3069 cache_assert_no_entries(struct vnode *vp) 3070 { 3071 3072 VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp); 3073 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); 3074 VNPASS(vp->v_cache_dd == NULL, vp); 3075 } 3076 #endif 3077 3078 /* 3079 * Flush all entries referencing a particular filesystem. 3080 */ 3081 void 3082 cache_purgevfs(struct mount *mp) 3083 { 3084 struct vnode *vp, *mvp; 3085 size_t visited __sdt_used, purged __sdt_used; 3086 3087 visited = purged = 0; 3088 /* 3089 * Somewhat wasteful iteration over all vnodes. Would be better to 3090 * support filtering and avoid the interlock to begin with.
3091 */ 3092 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 3093 visited++; 3094 if (!cache_has_entries(vp)) { 3095 VI_UNLOCK(vp); 3096 continue; 3097 } 3098 vholdl(vp); 3099 VI_UNLOCK(vp); 3100 cache_purge(vp); 3101 purged++; 3102 vdrop(vp); 3103 } 3104 3105 SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged); 3106 } 3107 3108 /* 3109 * Perform canonical checks and cache lookup and pass on to filesystem 3110 * through the vop_cachedlookup only if needed. 3111 */ 3112 3113 int 3114 vfs_cache_lookup(struct vop_lookup_args *ap) 3115 { 3116 struct vnode *dvp; 3117 int error; 3118 struct vnode **vpp = ap->a_vpp; 3119 struct componentname *cnp = ap->a_cnp; 3120 int flags = cnp->cn_flags; 3121 3122 *vpp = NULL; 3123 dvp = ap->a_dvp; 3124 3125 if (dvp->v_type != VDIR) 3126 return (ENOTDIR); 3127 3128 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 3129 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 3130 return (EROFS); 3131 3132 error = vn_dir_check_exec(dvp, cnp); 3133 if (error != 0) 3134 return (error); 3135 3136 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 3137 if (error == 0) 3138 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 3139 if (error == -1) 3140 return (0); 3141 return (error); 3142 } 3143 3144 /* Implementation of the getcwd syscall. */ 3145 int 3146 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 3147 { 3148 char *buf, *retbuf; 3149 size_t buflen; 3150 int error; 3151 3152 buflen = uap->buflen; 3153 if (__predict_false(buflen < 2)) 3154 return (EINVAL); 3155 if (buflen > MAXPATHLEN) 3156 buflen = MAXPATHLEN; 3157 3158 buf = uma_zalloc(namei_zone, M_WAITOK); 3159 error = vn_getcwd(buf, &retbuf, &buflen); 3160 if (error == 0) 3161 error = copyout(retbuf, uap->buf, buflen); 3162 uma_zfree(namei_zone, buf); 3163 return (error); 3164 } 3165 3166 int 3167 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 3168 { 3169 struct pwd *pwd; 3170 int error; 3171 3172 vfs_smr_enter(); 3173 pwd = pwd_get_smr(); 3174 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 3175 buflen, 0); 3176 VFS_SMR_ASSERT_NOT_ENTERED(); 3177 if (error < 0) { 3178 pwd = pwd_hold(curthread); 3179 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 3180 retbuf, buflen); 3181 pwd_drop(pwd); 3182 } 3183 3184 #ifdef KTRACE 3185 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 3186 ktrnamei(*retbuf); 3187 #endif 3188 return (error); 3189 } 3190 3191 /* 3192 * Canonicalize a path by walking it forward and back. 3193 * 3194 * BUGS: 3195 * - Nothing guarantees the integrity of the entire chain. Consider the case 3196 * where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of 3197 * "foo" into "quux" during the backwards walk. The result will be 3198 * "quux/bar/baz/qux", which could not have been obtained by an incremental 3199 * walk in userspace. Moreover, the path we return is inaccessible if the 3200 * calling thread lacks permission to traverse "quux". 
3201 */ 3202 static int 3203 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 3204 size_t size, int flags, enum uio_seg pathseg) 3205 { 3206 struct nameidata nd; 3207 char *retbuf, *freebuf; 3208 int error; 3209 3210 if (flags != 0) 3211 return (EINVAL); 3212 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1, 3213 pathseg, path, fd, &cap_fstat_rights); 3214 if ((error = namei(&nd)) != 0) 3215 return (error); 3216 3217 if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR && 3218 (nd.ni_vp->v_vflag & VV_ROOT) != 0) { 3219 /* 3220 * This happens if vp is a file mount. The call to 3221 * vn_fullpath_hardlink can panic if path resolution can't be 3222 * handled without the directory. 3223 * 3224 * To resolve this, we find the vnode which was mounted on - 3225 * this should have a unique global path since we disallow 3226 * mounting on linked files. 3227 */ 3228 struct vnode *covered_vp; 3229 error = vn_lock(nd.ni_vp, LK_SHARED); 3230 if (error != 0) 3231 goto out; 3232 covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered; 3233 vref(covered_vp); 3234 VOP_UNLOCK(nd.ni_vp); 3235 error = vn_fullpath(covered_vp, &retbuf, &freebuf); 3236 vrele(covered_vp); 3237 } else { 3238 error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, nd.ni_cnd.cn_nameptr, 3239 nd.ni_cnd.cn_namelen, &retbuf, &freebuf, &size); 3240 } 3241 if (error == 0) { 3242 error = copyout(retbuf, buf, size); 3243 free(freebuf, M_TEMP); 3244 } 3245 out: 3246 vrele(nd.ni_vp); 3247 vrele(nd.ni_dvp); 3248 NDFREE_PNBUF(&nd); 3249 return (error); 3250 } 3251 3252 int 3253 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 3254 { 3255 3256 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 3257 uap->flags, UIO_USERSPACE)); 3258 } 3259 3260 /* 3261 * Retrieve the full filesystem path that correspond to a vnode from the name 3262 * cache (if available) 3263 */ 3264 int 3265 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 3266 { 3267 struct pwd *pwd; 3268 char *buf; 3269 size_t buflen; 3270 int error; 3271 3272 if (__predict_false(vp == NULL)) 3273 return (EINVAL); 3274 3275 buflen = MAXPATHLEN; 3276 buf = malloc(buflen, M_TEMP, M_WAITOK); 3277 vfs_smr_enter(); 3278 pwd = pwd_get_smr(); 3279 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); 3280 VFS_SMR_ASSERT_NOT_ENTERED(); 3281 if (error < 0) { 3282 pwd = pwd_hold(curthread); 3283 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); 3284 pwd_drop(pwd); 3285 } 3286 if (error == 0) 3287 *freebuf = buf; 3288 else 3289 free(buf, M_TEMP); 3290 return (error); 3291 } 3292 3293 /* 3294 * This function is similar to vn_fullpath, but it attempts to lookup the 3295 * pathname relative to the global root mount point. This is required for the 3296 * auditing sub-system, as audited pathnames must be absolute, relative to the 3297 * global root mount point. 
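 *
 * As with vn_fullpath, on success *retbuf points into an internally allocated
 * buffer returned via *freebuf, which the caller releases; a usage sketch:
 *
 *	char *retbuf, *freebuf;
 *
 *	error = vn_fullpath_global(vp, &retbuf, &freebuf);
 *	if (error == 0) {
 *		use retbuf, then:
 *		free(freebuf, M_TEMP);
 *	}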
3298 */ 3299 int 3300 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 3301 { 3302 char *buf; 3303 size_t buflen; 3304 int error; 3305 3306 if (__predict_false(vp == NULL)) 3307 return (EINVAL); 3308 buflen = MAXPATHLEN; 3309 buf = malloc(buflen, M_TEMP, M_WAITOK); 3310 vfs_smr_enter(); 3311 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); 3312 VFS_SMR_ASSERT_NOT_ENTERED(); 3313 if (error < 0) { 3314 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 3315 } 3316 if (error == 0) 3317 *freebuf = buf; 3318 else 3319 free(buf, M_TEMP); 3320 return (error); 3321 } 3322 3323 static struct namecache * 3324 vn_dd_from_dst(struct vnode *vp) 3325 { 3326 struct namecache *ncp; 3327 3328 cache_assert_vnode_locked(vp); 3329 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 3330 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3331 return (ncp); 3332 } 3333 return (NULL); 3334 } 3335 3336 int 3337 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) 3338 { 3339 struct vnode *dvp; 3340 struct namecache *ncp; 3341 struct mtx *vlp; 3342 int error; 3343 3344 vlp = VP2VNODELOCK(*vp); 3345 mtx_lock(vlp); 3346 ncp = (*vp)->v_cache_dd; 3347 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 3348 KASSERT(ncp == vn_dd_from_dst(*vp), 3349 ("%s: mismatch for dd entry (%p != %p)", __func__, 3350 ncp, vn_dd_from_dst(*vp))); 3351 } else { 3352 ncp = vn_dd_from_dst(*vp); 3353 } 3354 if (ncp != NULL) { 3355 if (*buflen < ncp->nc_nlen) { 3356 mtx_unlock(vlp); 3357 vrele(*vp); 3358 counter_u64_add(numfullpathfail4, 1); 3359 error = ENOMEM; 3360 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3361 vp, NULL); 3362 return (error); 3363 } 3364 *buflen -= ncp->nc_nlen; 3365 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3366 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 3367 ncp->nc_name, vp); 3368 dvp = *vp; 3369 *vp = ncp->nc_dvp; 3370 vref(*vp); 3371 mtx_unlock(vlp); 3372 vrele(dvp); 3373 return (0); 3374 } 3375 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 3376 3377 mtx_unlock(vlp); 3378 vn_lock(*vp, LK_SHARED | LK_RETRY); 3379 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); 3380 vput(*vp); 3381 if (error) { 3382 counter_u64_add(numfullpathfail2, 1); 3383 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3384 return (error); 3385 } 3386 3387 *vp = dvp; 3388 if (VN_IS_DOOMED(dvp)) { 3389 /* forced unmount */ 3390 vrele(dvp); 3391 error = ENOENT; 3392 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3393 return (error); 3394 } 3395 /* 3396 * *vp has its use count incremented still. 3397 */ 3398 3399 return (0); 3400 } 3401 3402 /* 3403 * Resolve a directory to a pathname. 3404 * 3405 * The name of the directory can always be found in the namecache or fetched 3406 * from the filesystem. There is also guaranteed to be only one parent, meaning 3407 * we can just follow vnodes up until we find the root. 3408 * 3409 * The vnode must be referenced. 
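 *
 * The reference is consumed regardless of the outcome. The path is assembled
 * backwards: buf is filled from its end and on success *retbuf points into
 * buf at the start of the resulting string.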
3410 */ 3411 static int 3412 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3413 size_t *len, size_t addend) 3414 { 3415 #ifdef KDTRACE_HOOKS 3416 struct vnode *startvp = vp; 3417 #endif 3418 struct vnode *vp1; 3419 size_t buflen; 3420 int error; 3421 bool slash_prefixed; 3422 3423 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 3424 VNPASS(vp->v_usecount > 0, vp); 3425 3426 buflen = *len; 3427 3428 slash_prefixed = true; 3429 if (addend == 0) { 3430 MPASS(*len >= 2); 3431 buflen--; 3432 buf[buflen] = '\0'; 3433 slash_prefixed = false; 3434 } 3435 3436 error = 0; 3437 3438 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 3439 counter_u64_add(numfullpathcalls, 1); 3440 while (vp != rdir && vp != rootvnode) { 3441 /* 3442 * The vp vnode must be already fully constructed, 3443 * since it is either found in namecache or obtained 3444 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 3445 * without obtaining the vnode lock. 3446 */ 3447 if ((vp->v_vflag & VV_ROOT) != 0) { 3448 vn_lock(vp, LK_RETRY | LK_SHARED); 3449 3450 /* 3451 * With the vnode locked, check for races with 3452 * unmount, forced or not. Note that we 3453 * already verified that vp is not equal to 3454 * the root vnode, which means that 3455 * mnt_vnodecovered can be NULL only for the 3456 * case of unmount. 3457 */ 3458 if (VN_IS_DOOMED(vp) || 3459 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 3460 vp1->v_mountedhere != vp->v_mount) { 3461 vput(vp); 3462 error = ENOENT; 3463 SDT_PROBE3(vfs, namecache, fullpath, return, 3464 error, vp, NULL); 3465 break; 3466 } 3467 3468 vref(vp1); 3469 vput(vp); 3470 vp = vp1; 3471 continue; 3472 } 3473 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 3474 error = vn_vptocnp(&vp, buf, &buflen); 3475 if (error) 3476 break; 3477 if (buflen == 0) { 3478 vrele(vp); 3479 error = ENOMEM; 3480 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3481 startvp, NULL); 3482 break; 3483 } 3484 buf[--buflen] = '/'; 3485 slash_prefixed = true; 3486 } 3487 if (error) 3488 return (error); 3489 if (!slash_prefixed) { 3490 if (buflen == 0) { 3491 vrele(vp); 3492 counter_u64_add(numfullpathfail4, 1); 3493 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 3494 startvp, NULL); 3495 return (ENOMEM); 3496 } 3497 buf[--buflen] = '/'; 3498 } 3499 counter_u64_add(numfullpathfound, 1); 3500 vrele(vp); 3501 3502 *retbuf = buf + buflen; 3503 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 3504 *len -= buflen; 3505 *len += addend; 3506 return (0); 3507 } 3508 3509 /* 3510 * Resolve an arbitrary vnode to a pathname. 
3511 * 3512 * Note 2 caveats: 3513 * - hardlinks are not tracked, thus if the vnode is not a directory this can 3514 * resolve to a different path than the one used to find it 3515 * - namecache is not mandatory, meaning names are not guaranteed to be added 3516 * (in which case resolving fails) 3517 */ 3518 static void __inline 3519 cache_rev_failed_impl(int *reason, int line) 3520 { 3521 3522 *reason = line; 3523 } 3524 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 3525 3526 static int 3527 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 3528 char **retbuf, size_t *buflen, size_t addend) 3529 { 3530 #ifdef KDTRACE_HOOKS 3531 struct vnode *startvp = vp; 3532 #endif 3533 struct vnode *tvp; 3534 struct mount *mp; 3535 struct namecache *ncp; 3536 size_t orig_buflen; 3537 int reason; 3538 int error; 3539 #ifdef KDTRACE_HOOKS 3540 int i; 3541 #endif 3542 seqc_t vp_seqc, tvp_seqc; 3543 u_char nc_flag; 3544 3545 VFS_SMR_ASSERT_ENTERED(); 3546 3547 if (!atomic_load_char(&cache_fast_lookup_enabled)) { 3548 vfs_smr_exit(); 3549 return (-1); 3550 } 3551 3552 orig_buflen = *buflen; 3553 3554 if (addend == 0) { 3555 MPASS(*buflen >= 2); 3556 *buflen -= 1; 3557 buf[*buflen] = '\0'; 3558 } 3559 3560 if (vp == rdir || vp == rootvnode) { 3561 if (addend == 0) { 3562 *buflen -= 1; 3563 buf[*buflen] = '/'; 3564 } 3565 goto out_ok; 3566 } 3567 3568 #ifdef KDTRACE_HOOKS 3569 i = 0; 3570 #endif 3571 error = -1; 3572 ncp = NULL; /* for sdt probe down below */ 3573 vp_seqc = vn_seqc_read_any(vp); 3574 if (seqc_in_modify(vp_seqc)) { 3575 cache_rev_failed(&reason); 3576 goto out_abort; 3577 } 3578 3579 for (;;) { 3580 #ifdef KDTRACE_HOOKS 3581 i++; 3582 #endif 3583 if ((vp->v_vflag & VV_ROOT) != 0) { 3584 mp = atomic_load_ptr(&vp->v_mount); 3585 if (mp == NULL) { 3586 cache_rev_failed(&reason); 3587 goto out_abort; 3588 } 3589 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 3590 tvp_seqc = vn_seqc_read_any(tvp); 3591 if (seqc_in_modify(tvp_seqc)) { 3592 cache_rev_failed(&reason); 3593 goto out_abort; 3594 } 3595 if (!vn_seqc_consistent(vp, vp_seqc)) { 3596 cache_rev_failed(&reason); 3597 goto out_abort; 3598 } 3599 vp = tvp; 3600 vp_seqc = tvp_seqc; 3601 continue; 3602 } 3603 ncp = atomic_load_consume_ptr(&vp->v_cache_dd); 3604 if (ncp == NULL) { 3605 cache_rev_failed(&reason); 3606 goto out_abort; 3607 } 3608 nc_flag = atomic_load_char(&ncp->nc_flag); 3609 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3610 cache_rev_failed(&reason); 3611 goto out_abort; 3612 } 3613 if (ncp->nc_nlen >= *buflen) { 3614 cache_rev_failed(&reason); 3615 error = ENOMEM; 3616 goto out_abort; 3617 } 3618 *buflen -= ncp->nc_nlen; 3619 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3620 *buflen -= 1; 3621 buf[*buflen] = '/'; 3622 tvp = ncp->nc_dvp; 3623 tvp_seqc = vn_seqc_read_any(tvp); 3624 if (seqc_in_modify(tvp_seqc)) { 3625 cache_rev_failed(&reason); 3626 goto out_abort; 3627 } 3628 if (!vn_seqc_consistent(vp, vp_seqc)) { 3629 cache_rev_failed(&reason); 3630 goto out_abort; 3631 } 3632 /* 3633 * Acquire fence provided by vn_seqc_read_any above. 
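 * It ensures the reload of v_cache_dd and the cache_ncp_canuse check below
 * cannot observe state older than what the sequence counter read saw, which
 * is what makes a successful re-check meaningful.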
3634 */ 3635 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) { 3636 cache_rev_failed(&reason); 3637 goto out_abort; 3638 } 3639 if (!cache_ncp_canuse(ncp)) { 3640 cache_rev_failed(&reason); 3641 goto out_abort; 3642 } 3643 vp = tvp; 3644 vp_seqc = tvp_seqc; 3645 if (vp == rdir || vp == rootvnode) 3646 break; 3647 } 3648 out_ok: 3649 vfs_smr_exit(); 3650 *retbuf = buf + *buflen; 3651 *buflen = orig_buflen - *buflen + addend; 3652 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 3653 return (0); 3654 3655 out_abort: 3656 *buflen = orig_buflen; 3657 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 3658 vfs_smr_exit(); 3659 return (error); 3660 } 3661 3662 static int 3663 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3664 size_t *buflen) 3665 { 3666 size_t orig_buflen, addend; 3667 int error; 3668 3669 if (*buflen < 2) 3670 return (EINVAL); 3671 3672 orig_buflen = *buflen; 3673 3674 vref(vp); 3675 addend = 0; 3676 if (vp->v_type != VDIR) { 3677 *buflen -= 1; 3678 buf[*buflen] = '\0'; 3679 error = vn_vptocnp(&vp, buf, buflen); 3680 if (error) 3681 return (error); 3682 if (*buflen == 0) { 3683 vrele(vp); 3684 return (ENOMEM); 3685 } 3686 *buflen -= 1; 3687 buf[*buflen] = '/'; 3688 addend = orig_buflen - *buflen; 3689 } 3690 3691 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); 3692 } 3693 3694 /* 3695 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 3696 * 3697 * Since the namecache does not track hardlinks, the caller is expected to 3698 * first look up the target vnode with WANTPARENT flag passed to namei to get 3699 * dvp and vp. 3700 * 3701 * Then we have 2 cases: 3702 * - if the found vnode is a directory, the path can be constructed just by 3703 * following names up the chain 3704 * - otherwise we populate the buffer with the saved name and start resolving 3705 * from the parent 3706 */ 3707 int 3708 vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp, 3709 const char *hrdl_name, size_t hrdl_name_length, 3710 char **retbuf, char **freebuf, size_t *buflen) 3711 { 3712 char *buf, *tmpbuf; 3713 struct pwd *pwd; 3714 size_t addend; 3715 int error; 3716 __enum_uint8(vtype) type; 3717 3718 if (*buflen < 2) 3719 return (EINVAL); 3720 if (*buflen > MAXPATHLEN) 3721 *buflen = MAXPATHLEN; 3722 3723 buf = malloc(*buflen, M_TEMP, M_WAITOK); 3724 3725 addend = 0; 3726 3727 /* 3728 * Check for VBAD to work around the vp_crossmp bug in lookup(). 3729 * 3730 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 3731 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 3732 * If the type is VDIR (like in this very case) we can skip looking 3733 * at ni_dvp in the first place. However, since vnodes get passed here 3734 * unlocked the target may transition to doomed state (type == VBAD) 3735 * before we get to evaluate the condition. If this happens, we will 3736 * populate part of the buffer and descend to vn_fullpath_dir with 3737 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 
3738 */ 3739 type = atomic_load_8(&vp->v_type); 3740 if (type == VBAD) { 3741 error = ENOENT; 3742 goto out_bad; 3743 } 3744 if (type != VDIR) { 3745 addend = hrdl_name_length + 2; 3746 if (*buflen < addend) { 3747 error = ENOMEM; 3748 goto out_bad; 3749 } 3750 *buflen -= addend; 3751 tmpbuf = buf + *buflen; 3752 tmpbuf[0] = '/'; 3753 memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length); 3754 tmpbuf[addend - 1] = '\0'; 3755 vp = dvp; 3756 } 3757 3758 vfs_smr_enter(); 3759 pwd = pwd_get_smr(); 3760 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3761 addend); 3762 VFS_SMR_ASSERT_NOT_ENTERED(); 3763 if (error < 0) { 3764 pwd = pwd_hold(curthread); 3765 vref(vp); 3766 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3767 addend); 3768 pwd_drop(pwd); 3769 } 3770 if (error != 0) 3771 goto out_bad; 3772 3773 *freebuf = buf; 3774 3775 return (0); 3776 out_bad: 3777 free(buf, M_TEMP); 3778 return (error); 3779 } 3780 3781 struct vnode * 3782 vn_dir_dd_ino(struct vnode *vp) 3783 { 3784 struct namecache *ncp; 3785 struct vnode *ddvp; 3786 struct mtx *vlp; 3787 enum vgetstate vs; 3788 3789 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3790 vlp = VP2VNODELOCK(vp); 3791 mtx_lock(vlp); 3792 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3793 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3794 continue; 3795 ddvp = ncp->nc_dvp; 3796 vs = vget_prep(ddvp); 3797 mtx_unlock(vlp); 3798 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3799 return (NULL); 3800 return (ddvp); 3801 } 3802 mtx_unlock(vlp); 3803 return (NULL); 3804 } 3805 3806 int 3807 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3808 { 3809 struct namecache *ncp; 3810 struct mtx *vlp; 3811 int l; 3812 3813 vlp = VP2VNODELOCK(vp); 3814 mtx_lock(vlp); 3815 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3816 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3817 break; 3818 if (ncp == NULL) { 3819 mtx_unlock(vlp); 3820 return (ENOENT); 3821 } 3822 l = min(ncp->nc_nlen, buflen - 1); 3823 memcpy(buf, ncp->nc_name, l); 3824 mtx_unlock(vlp); 3825 buf[l] = '\0'; 3826 return (0); 3827 } 3828 3829 /* 3830 * This function updates path string to vnode's full global path 3831 * and checks the size of the new path string against the pathlen argument. 3832 * 3833 * Requires a locked, referenced vnode. 3834 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3835 * 3836 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3837 * because it falls back to the ".." lookup if the namecache lookup fails. 3838 */ 3839 int 3840 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3841 u_int pathlen) 3842 { 3843 struct nameidata nd; 3844 struct vnode *vp1; 3845 char *rpath, *fbuf; 3846 int error; 3847 3848 ASSERT_VOP_ELOCKED(vp, __func__); 3849 3850 /* Construct global filesystem path from vp. */ 3851 VOP_UNLOCK(vp); 3852 error = vn_fullpath_global(vp, &rpath, &fbuf); 3853 3854 if (error != 0) { 3855 vrele(vp); 3856 return (error); 3857 } 3858 3859 if (strlen(rpath) >= pathlen) { 3860 vrele(vp); 3861 error = ENAMETOOLONG; 3862 goto out; 3863 } 3864 3865 /* 3866 * Re-lookup the vnode by path to detect a possible rename. 3867 * As a side effect, the vnode is relocked. 3868 * If vnode was renamed, return ENOENT. 
3869 */ 3870 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path); 3871 error = namei(&nd); 3872 if (error != 0) { 3873 vrele(vp); 3874 goto out; 3875 } 3876 NDFREE_PNBUF(&nd); 3877 vp1 = nd.ni_vp; 3878 vrele(vp); 3879 if (vp1 == vp) 3880 strcpy(path, rpath); 3881 else { 3882 vput(vp1); 3883 error = ENOENT; 3884 } 3885 3886 out: 3887 free(fbuf, M_TEMP); 3888 return (error); 3889 } 3890 3891 /* 3892 * This is similar to vn_path_to_global_path but allows for regular 3893 * files which may not be present in the cache. 3894 * 3895 * Requires a locked, referenced vnode. 3896 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3897 */ 3898 int 3899 vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp, 3900 struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name, 3901 size_t leaf_length) 3902 { 3903 struct nameidata nd; 3904 struct vnode *vp1; 3905 char *rpath, *fbuf; 3906 size_t len; 3907 int error; 3908 3909 ASSERT_VOP_ELOCKED(vp, __func__); 3910 3911 /* 3912 * Construct global filesystem path from dvp, vp and leaf 3913 * name. 3914 */ 3915 VOP_UNLOCK(vp); 3916 len = pathlen; 3917 error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length, 3918 &rpath, &fbuf, &len); 3919 3920 if (error != 0) { 3921 vrele(vp); 3922 return (error); 3923 } 3924 3925 if (strlen(rpath) >= pathlen) { 3926 vrele(vp); 3927 error = ENAMETOOLONG; 3928 goto out; 3929 } 3930 3931 /* 3932 * Re-lookup the vnode by path to detect a possible rename. 3933 * As a side effect, the vnode is relocked. 3934 * If vnode was renamed, return ENOENT. 3935 */ 3936 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path); 3937 error = namei(&nd); 3938 if (error != 0) { 3939 vrele(vp); 3940 goto out; 3941 } 3942 NDFREE_PNBUF(&nd); 3943 vp1 = nd.ni_vp; 3944 vrele(vp); 3945 if (vp1 == vp) 3946 strcpy(path, rpath); 3947 else { 3948 vput(vp1); 3949 error = ENOENT; 3950 } 3951 3952 out: 3953 free(fbuf, M_TEMP); 3954 return (error); 3955 } 3956 3957 #ifdef DDB 3958 static void 3959 db_print_vpath(struct vnode *vp) 3960 { 3961 3962 while (vp != NULL) { 3963 db_printf("%p: ", vp); 3964 if (vp == rootvnode) { 3965 db_printf("/"); 3966 vp = NULL; 3967 } else { 3968 if (vp->v_vflag & VV_ROOT) { 3969 db_printf("<mount point>"); 3970 vp = vp->v_mount->mnt_vnodecovered; 3971 } else { 3972 struct namecache *ncp; 3973 char *ncn; 3974 int i; 3975 3976 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3977 if (ncp != NULL) { 3978 ncn = ncp->nc_name; 3979 for (i = 0; i < ncp->nc_nlen; i++) 3980 db_printf("%c", *ncn++); 3981 vp = ncp->nc_dvp; 3982 } else { 3983 vp = NULL; 3984 } 3985 } 3986 } 3987 db_printf("\n"); 3988 } 3989 3990 return; 3991 } 3992 3993 DB_SHOW_COMMAND(vpath, db_show_vpath) 3994 { 3995 struct vnode *vp; 3996 3997 if (!have_addr) { 3998 db_printf("usage: show vpath <struct vnode *>\n"); 3999 return; 4000 } 4001 4002 vp = (struct vnode *)addr; 4003 db_print_vpath(vp); 4004 } 4005 4006 #endif 4007 4008 static int cache_fast_lookup = 1; 4009 4010 #define CACHE_FPL_FAILED -2020 4011 4012 static int 4013 cache_vop_bad_vexec(struct vop_fplookup_vexec_args *v) 4014 { 4015 vn_printf(v->a_vp, "no proper vop_fplookup_vexec\n"); 4016 panic("no proper vop_fplookup_vexec"); 4017 } 4018 4019 static int 4020 cache_vop_bad_symlink(struct vop_fplookup_symlink_args *v) 4021 { 4022 vn_printf(v->a_vp, "no proper vop_fplookup_symlink\n"); 4023 panic("no proper vop_fplookup_symlink"); 4024 } 4025 4026 void 4027 cache_vop_vector_register(struct vop_vector *v) 4028 { 4029 size_t ops; 4030 4031 ops = 0; 
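	/*
	 * Count how many of the two lockless lookup vops are provided.
	 * The contract is all-or-nothing: 2 means the vector is complete,
	 * 0 means the panicking placeholders get installed below, anything
	 * else is a bug in the filesystem's vop vector and gets reported.
	 */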
4032 if (v->vop_fplookup_vexec != NULL) { 4033 ops++; 4034 } 4035 if (v->vop_fplookup_symlink != NULL) { 4036 ops++; 4037 } 4038 4039 if (ops == 2) { 4040 return; 4041 } 4042 4043 if (ops == 0) { 4044 v->vop_fplookup_vexec = cache_vop_bad_vexec; 4045 v->vop_fplookup_symlink = cache_vop_bad_symlink; 4046 return; 4047 } 4048 4049 printf("%s: invalid vop vector %p -- either all or none fplookup vops " 4050 "need to be provided", __func__, v); 4051 if (v->vop_fplookup_vexec == NULL) { 4052 printf("%s: missing vop_fplookup_vexec\n", __func__); 4053 } 4054 if (v->vop_fplookup_symlink == NULL) { 4055 printf("%s: missing vop_fplookup_symlink\n", __func__); 4056 } 4057 panic("bad vop vector %p", v); 4058 } 4059 4060 #ifdef INVARIANTS 4061 void 4062 cache_validate_vop_vector(struct mount *mp, struct vop_vector *vops) 4063 { 4064 if (mp == NULL) 4065 return; 4066 4067 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 4068 return; 4069 4070 if (vops->vop_fplookup_vexec == NULL || 4071 vops->vop_fplookup_vexec == cache_vop_bad_vexec) 4072 panic("bad vop_fplookup_vexec on vector %p for filesystem %s", 4073 vops, mp->mnt_vfc->vfc_name); 4074 4075 if (vops->vop_fplookup_symlink == NULL || 4076 vops->vop_fplookup_symlink == cache_vop_bad_symlink) 4077 panic("bad vop_fplookup_symlink on vector %p for filesystem %s", 4078 vops, mp->mnt_vfc->vfc_name); 4079 } 4080 #endif 4081 4082 void 4083 cache_fast_lookup_enabled_recalc(void) 4084 { 4085 int lookup_flag; 4086 int mac_on; 4087 4088 #ifdef MAC 4089 mac_on = mac_vnode_check_lookup_enabled(); 4090 mac_on |= mac_vnode_check_readlink_enabled(); 4091 #else 4092 mac_on = 0; 4093 #endif 4094 4095 lookup_flag = atomic_load_int(&cache_fast_lookup); 4096 if (lookup_flag && !mac_on) { 4097 atomic_store_char(&cache_fast_lookup_enabled, true); 4098 } else { 4099 atomic_store_char(&cache_fast_lookup_enabled, false); 4100 } 4101 } 4102 4103 static int 4104 syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS) 4105 { 4106 int error, old; 4107 4108 old = atomic_load_int(&cache_fast_lookup); 4109 error = sysctl_handle_int(oidp, arg1, arg2, req); 4110 if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup)) 4111 cache_fast_lookup_enabled_recalc(); 4112 return (error); 4113 } 4114 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE, 4115 &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", ""); 4116 4117 /* 4118 * Components of nameidata (or objects it can point to) which may 4119 * need restoring in case fast path lookup fails. 
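 *
 * Roughly: nameidata_outer holds what is unconditionally rolled back on
 * abort (ni_pathlen and cn_flags), while nameidata_saved and cache_fpl_debug
 * only carry extra state under INVARIANTS for cross-checking.  See
 * cache_fpl_checkpoint*, cache_fpl_restore_partial and
 * cache_fpl_restore_abort below.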
4120 */ 4121 struct nameidata_outer { 4122 size_t ni_pathlen; 4123 int cn_flags; 4124 }; 4125 4126 struct nameidata_saved { 4127 #ifdef INVARIANTS 4128 char *cn_nameptr; 4129 size_t ni_pathlen; 4130 #endif 4131 }; 4132 4133 #ifdef INVARIANTS 4134 struct cache_fpl_debug { 4135 size_t ni_pathlen; 4136 }; 4137 #endif 4138 4139 struct cache_fpl { 4140 struct nameidata *ndp; 4141 struct componentname *cnp; 4142 char *nulchar; 4143 struct vnode *dvp; 4144 struct vnode *tvp; 4145 seqc_t dvp_seqc; 4146 seqc_t tvp_seqc; 4147 uint32_t hash; 4148 struct nameidata_saved snd; 4149 struct nameidata_outer snd_outer; 4150 int line; 4151 enum cache_fpl_status status:8; 4152 bool in_smr; 4153 bool fsearch; 4154 struct pwd **pwd; 4155 #ifdef INVARIANTS 4156 struct cache_fpl_debug debug; 4157 #endif 4158 }; 4159 4160 static bool cache_fplookup_mp_supported(struct mount *mp); 4161 static bool cache_fplookup_is_mp(struct cache_fpl *fpl); 4162 static int cache_fplookup_cross_mount(struct cache_fpl *fpl); 4163 static int cache_fplookup_partial_setup(struct cache_fpl *fpl); 4164 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl); 4165 static int cache_fplookup_trailingslash(struct cache_fpl *fpl); 4166 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl); 4167 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl); 4168 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n); 4169 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n); 4170 4171 static void 4172 cache_fpl_cleanup_cnp(struct componentname *cnp) 4173 { 4174 4175 uma_zfree(namei_zone, cnp->cn_pnbuf); 4176 cnp->cn_pnbuf = NULL; 4177 cnp->cn_nameptr = NULL; 4178 } 4179 4180 static struct vnode * 4181 cache_fpl_handle_root(struct cache_fpl *fpl) 4182 { 4183 struct nameidata *ndp; 4184 struct componentname *cnp; 4185 4186 ndp = fpl->ndp; 4187 cnp = fpl->cnp; 4188 4189 MPASS(*(cnp->cn_nameptr) == '/'); 4190 cnp->cn_nameptr++; 4191 cache_fpl_pathlen_dec(fpl); 4192 4193 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 4194 do { 4195 cnp->cn_nameptr++; 4196 cache_fpl_pathlen_dec(fpl); 4197 } while (*(cnp->cn_nameptr) == '/'); 4198 } 4199 4200 return (ndp->ni_rootdir); 4201 } 4202 4203 static void 4204 cache_fpl_checkpoint_outer(struct cache_fpl *fpl) 4205 { 4206 4207 fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen; 4208 fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags; 4209 } 4210 4211 static void 4212 cache_fpl_checkpoint(struct cache_fpl *fpl) 4213 { 4214 4215 #ifdef INVARIANTS 4216 fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 4217 fpl->snd.ni_pathlen = fpl->debug.ni_pathlen; 4218 #endif 4219 } 4220 4221 static void 4222 cache_fpl_restore_partial(struct cache_fpl *fpl) 4223 { 4224 4225 fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags; 4226 #ifdef INVARIANTS 4227 fpl->debug.ni_pathlen = fpl->snd.ni_pathlen; 4228 #endif 4229 } 4230 4231 static void 4232 cache_fpl_restore_abort(struct cache_fpl *fpl) 4233 { 4234 4235 cache_fpl_restore_partial(fpl); 4236 /* 4237 * It is 0 on entry by API contract. 
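 * Clearing it below is therefore purely defensive, undoing anything the
 * fast path might have set before bailing out.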
4238 */ 4239 fpl->ndp->ni_resflags = 0; 4240 fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf; 4241 fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen; 4242 } 4243 4244 #ifdef INVARIANTS 4245 #define cache_fpl_smr_assert_entered(fpl) ({ \ 4246 struct cache_fpl *_fpl = (fpl); \ 4247 MPASS(_fpl->in_smr == true); \ 4248 VFS_SMR_ASSERT_ENTERED(); \ 4249 }) 4250 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 4251 struct cache_fpl *_fpl = (fpl); \ 4252 MPASS(_fpl->in_smr == false); \ 4253 VFS_SMR_ASSERT_NOT_ENTERED(); \ 4254 }) 4255 static void 4256 cache_fpl_assert_status(struct cache_fpl *fpl) 4257 { 4258 4259 switch (fpl->status) { 4260 case CACHE_FPL_STATUS_UNSET: 4261 __assert_unreachable(); 4262 break; 4263 case CACHE_FPL_STATUS_DESTROYED: 4264 case CACHE_FPL_STATUS_ABORTED: 4265 case CACHE_FPL_STATUS_PARTIAL: 4266 case CACHE_FPL_STATUS_HANDLED: 4267 break; 4268 } 4269 } 4270 #else 4271 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 4272 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 4273 #define cache_fpl_assert_status(fpl) do { } while (0) 4274 #endif 4275 4276 #define cache_fpl_smr_enter_initial(fpl) ({ \ 4277 struct cache_fpl *_fpl = (fpl); \ 4278 vfs_smr_enter(); \ 4279 _fpl->in_smr = true; \ 4280 }) 4281 4282 #define cache_fpl_smr_enter(fpl) ({ \ 4283 struct cache_fpl *_fpl = (fpl); \ 4284 MPASS(_fpl->in_smr == false); \ 4285 vfs_smr_enter(); \ 4286 _fpl->in_smr = true; \ 4287 }) 4288 4289 #define cache_fpl_smr_exit(fpl) ({ \ 4290 struct cache_fpl *_fpl = (fpl); \ 4291 MPASS(_fpl->in_smr == true); \ 4292 vfs_smr_exit(); \ 4293 _fpl->in_smr = false; \ 4294 }) 4295 4296 static int 4297 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line) 4298 { 4299 4300 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 4301 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 4302 ("%s: converting to abort from %d at %d, set at %d\n", 4303 __func__, fpl->status, line, fpl->line)); 4304 } 4305 cache_fpl_smr_assert_not_entered(fpl); 4306 fpl->status = CACHE_FPL_STATUS_ABORTED; 4307 fpl->line = line; 4308 return (CACHE_FPL_FAILED); 4309 } 4310 4311 #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__) 4312 4313 static int __noinline 4314 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 4315 { 4316 struct nameidata *ndp; 4317 struct componentname *cnp; 4318 4319 ndp = fpl->ndp; 4320 cnp = fpl->cnp; 4321 4322 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 4323 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 4324 ("%s: converting to abort from %d at %d, set at %d\n", 4325 __func__, fpl->status, line, fpl->line)); 4326 } 4327 fpl->status = CACHE_FPL_STATUS_ABORTED; 4328 fpl->line = line; 4329 if (fpl->in_smr) 4330 cache_fpl_smr_exit(fpl); 4331 cache_fpl_restore_abort(fpl); 4332 /* 4333 * Resolving symlinks overwrites data passed by the caller. 4334 * Let namei know. 
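 * This is done by switching the status to DESTROYED and freeing the path
 * buffer below, forcing namei to restart the operation from scratch instead
 * of retrying with state which is no longer valid.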
4335 */ 4336 if (ndp->ni_loopcnt > 0) { 4337 fpl->status = CACHE_FPL_STATUS_DESTROYED; 4338 cache_fpl_cleanup_cnp(cnp); 4339 } 4340 return (CACHE_FPL_FAILED); 4341 } 4342 4343 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 4344 4345 static int __noinline 4346 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 4347 { 4348 4349 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 4350 ("%s: setting to partial at %d, but already set to %d at %d\n", 4351 __func__, line, fpl->status, fpl->line)); 4352 cache_fpl_smr_assert_entered(fpl); 4353 fpl->status = CACHE_FPL_STATUS_PARTIAL; 4354 fpl->line = line; 4355 return (cache_fplookup_partial_setup(fpl)); 4356 } 4357 4358 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 4359 4360 static int 4361 cache_fpl_handled_impl(struct cache_fpl *fpl, int line) 4362 { 4363 4364 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 4365 ("%s: setting to handled at %d, but already set to %d at %d\n", 4366 __func__, line, fpl->status, fpl->line)); 4367 cache_fpl_smr_assert_not_entered(fpl); 4368 fpl->status = CACHE_FPL_STATUS_HANDLED; 4369 fpl->line = line; 4370 return (0); 4371 } 4372 4373 #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__) 4374 4375 static int 4376 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line) 4377 { 4378 4379 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 4380 ("%s: setting to handled at %d, but already set to %d at %d\n", 4381 __func__, line, fpl->status, fpl->line)); 4382 MPASS(error != 0); 4383 MPASS(error != CACHE_FPL_FAILED); 4384 cache_fpl_smr_assert_not_entered(fpl); 4385 fpl->status = CACHE_FPL_STATUS_HANDLED; 4386 fpl->line = line; 4387 fpl->dvp = NULL; 4388 fpl->tvp = NULL; 4389 return (error); 4390 } 4391 4392 #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__) 4393 4394 static bool 4395 cache_fpl_terminated(struct cache_fpl *fpl) 4396 { 4397 4398 return (fpl->status != CACHE_FPL_STATUS_UNSET); 4399 } 4400 4401 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 4402 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ 4403 FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | ISRESTARTED | WILLBEDIR | \ 4404 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \ 4405 OPENWRITE | WANTIOCTLCAPS) 4406 4407 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 4408 (ISDOTDOT | MAKEENTRY | ISLASTCN) 4409 4410 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 4411 "supported and internal flags overlap"); 4412 4413 static bool 4414 cache_fpl_islastcn(struct nameidata *ndp) 4415 { 4416 4417 return (*ndp->ni_next == 0); 4418 } 4419 4420 static bool 4421 cache_fpl_istrailingslash(struct cache_fpl *fpl) 4422 { 4423 4424 MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf); 4425 return (*(fpl->nulchar - 1) == '/'); 4426 } 4427 4428 static bool 4429 cache_fpl_isdotdot(struct componentname *cnp) 4430 { 4431 4432 if (cnp->cn_namelen == 2 && 4433 cnp->cn_nameptr[1] == '.' 
&& cnp->cn_nameptr[0] == '.') 4434 return (true); 4435 return (false); 4436 } 4437 4438 static bool 4439 cache_can_fplookup(struct cache_fpl *fpl) 4440 { 4441 struct nameidata *ndp; 4442 struct componentname *cnp; 4443 struct thread *td; 4444 4445 ndp = fpl->ndp; 4446 cnp = fpl->cnp; 4447 td = curthread; 4448 4449 if (!atomic_load_char(&cache_fast_lookup_enabled)) { 4450 cache_fpl_aborted_early(fpl); 4451 return (false); 4452 } 4453 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 4454 cache_fpl_aborted_early(fpl); 4455 return (false); 4456 } 4457 if (IN_CAPABILITY_MODE(td)) { 4458 cache_fpl_aborted_early(fpl); 4459 return (false); 4460 } 4461 if (AUDITING_TD(td)) { 4462 cache_fpl_aborted_early(fpl); 4463 return (false); 4464 } 4465 if (ndp->ni_startdir != NULL) { 4466 cache_fpl_aborted_early(fpl); 4467 return (false); 4468 } 4469 return (true); 4470 } 4471 4472 static int __noinline 4473 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 4474 { 4475 struct nameidata *ndp; 4476 struct componentname *cnp; 4477 int error; 4478 bool fsearch; 4479 4480 ndp = fpl->ndp; 4481 cnp = fpl->cnp; 4482 4483 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); 4484 if (__predict_false(error != 0)) { 4485 return (cache_fpl_aborted(fpl)); 4486 } 4487 fpl->fsearch = fsearch; 4488 if ((*vpp)->v_type != VDIR) { 4489 if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) { 4490 cache_fpl_smr_exit(fpl); 4491 return (cache_fpl_handled_error(fpl, ENOTDIR)); 4492 } 4493 } 4494 return (0); 4495 } 4496 4497 static int __noinline 4498 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 4499 uint32_t hash) 4500 { 4501 struct componentname *cnp; 4502 struct vnode *dvp; 4503 4504 cnp = fpl->cnp; 4505 dvp = fpl->dvp; 4506 4507 cache_fpl_smr_exit(fpl); 4508 if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) 4509 return (cache_fpl_handled_error(fpl, ENOENT)); 4510 else 4511 return (cache_fpl_aborted(fpl)); 4512 } 4513 4514 /* 4515 * The target vnode is not supported, prepare for the slow path to take over. 4516 */ 4517 static int __noinline 4518 cache_fplookup_partial_setup(struct cache_fpl *fpl) 4519 { 4520 struct nameidata *ndp; 4521 struct componentname *cnp; 4522 enum vgetstate dvs; 4523 struct vnode *dvp; 4524 struct pwd *pwd; 4525 seqc_t dvp_seqc; 4526 4527 ndp = fpl->ndp; 4528 cnp = fpl->cnp; 4529 pwd = *(fpl->pwd); 4530 dvp = fpl->dvp; 4531 dvp_seqc = fpl->dvp_seqc; 4532 4533 if (!pwd_hold_smr(pwd)) { 4534 return (cache_fpl_aborted(fpl)); 4535 } 4536 4537 /* 4538 * Note that seqc is checked before the vnode is locked, so by 4539 * the time regular lookup gets to it it may have moved. 4540 * 4541 * Ultimately this does not affect correctness, any lookup errors 4542 * are userspace racing with itself. It is guaranteed that any 4543 * path which ultimately gets found could also have been found 4544 * by regular lookup going all the way in absence of concurrent 4545 * modifications. 
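 *
 * A condensed sketch of the hand-off performed below:
 *	dvs = vget_prep_smr(dvp);		(fails if the vnode is on its way out)
 *	cache_fpl_smr_exit(fpl);
 *	vget_finish_ref(dvp, dvs);
 *	vn_seqc_consistent(dvp, dvp_seqc);	(abort on mismatch)
 * Only once the reference is secured and the counter re-validated is dvp
 * handed over to regular lookup as ni_startdir.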
4546 */ 4547 dvs = vget_prep_smr(dvp); 4548 cache_fpl_smr_exit(fpl); 4549 if (__predict_false(dvs == VGET_NONE)) { 4550 pwd_drop(pwd); 4551 return (cache_fpl_aborted(fpl)); 4552 } 4553 4554 vget_finish_ref(dvp, dvs); 4555 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4556 vrele(dvp); 4557 pwd_drop(pwd); 4558 return (cache_fpl_aborted(fpl)); 4559 } 4560 4561 cache_fpl_restore_partial(fpl); 4562 #ifdef INVARIANTS 4563 if (cnp->cn_nameptr != fpl->snd.cn_nameptr) { 4564 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__, 4565 cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf); 4566 } 4567 #endif 4568 4569 ndp->ni_startdir = dvp; 4570 cnp->cn_flags |= MAKEENTRY; 4571 if (cache_fpl_islastcn(ndp)) 4572 cnp->cn_flags |= ISLASTCN; 4573 if (cache_fpl_isdotdot(cnp)) 4574 cnp->cn_flags |= ISDOTDOT; 4575 4576 /* 4577 * Skip potential extra slashes parsing did not take care of. 4578 * cache_fplookup_skip_slashes explains the mechanism. 4579 */ 4580 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 4581 do { 4582 cnp->cn_nameptr++; 4583 cache_fpl_pathlen_dec(fpl); 4584 } while (*(cnp->cn_nameptr) == '/'); 4585 } 4586 4587 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; 4588 #ifdef INVARIANTS 4589 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 4590 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 4591 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 4592 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 4593 } 4594 #endif 4595 return (0); 4596 } 4597 4598 static int 4599 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 4600 { 4601 struct componentname *cnp; 4602 struct vnode *tvp; 4603 seqc_t tvp_seqc; 4604 int error, lkflags; 4605 4606 cnp = fpl->cnp; 4607 tvp = fpl->tvp; 4608 tvp_seqc = fpl->tvp_seqc; 4609 4610 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4611 lkflags = LK_SHARED; 4612 if ((cnp->cn_flags & LOCKSHARED) == 0) 4613 lkflags = LK_EXCLUSIVE; 4614 error = vget_finish(tvp, lkflags, tvs); 4615 if (__predict_false(error != 0)) { 4616 return (cache_fpl_aborted(fpl)); 4617 } 4618 } else { 4619 vget_finish_ref(tvp, tvs); 4620 } 4621 4622 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 4623 if ((cnp->cn_flags & LOCKLEAF) != 0) 4624 vput(tvp); 4625 else 4626 vrele(tvp); 4627 return (cache_fpl_aborted(fpl)); 4628 } 4629 4630 return (cache_fpl_handled(fpl)); 4631 } 4632 4633 /* 4634 * They want to possibly modify the state of the namecache. 4635 */ 4636 static int __noinline 4637 cache_fplookup_final_modifying(struct cache_fpl *fpl) 4638 { 4639 struct nameidata *ndp __diagused; 4640 struct componentname *cnp; 4641 enum vgetstate dvs; 4642 struct vnode *dvp, *tvp; 4643 struct mount *mp; 4644 seqc_t dvp_seqc; 4645 int error; 4646 bool docache; 4647 4648 ndp = fpl->ndp; 4649 cnp = fpl->cnp; 4650 dvp = fpl->dvp; 4651 dvp_seqc = fpl->dvp_seqc; 4652 4653 MPASS(*(cnp->cn_nameptr) != '/'); 4654 MPASS(cache_fpl_islastcn(ndp)); 4655 if ((cnp->cn_flags & LOCKPARENT) == 0) 4656 MPASS((cnp->cn_flags & WANTPARENT) != 0); 4657 MPASS((cnp->cn_flags & TRAILINGSLASH) == 0); 4658 MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE || 4659 cnp->cn_nameiop == RENAME); 4660 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 4661 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 4662 4663 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; 4664 if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) 4665 docache = false; 4666 4667 /* 4668 * Regular lookup nulifies the slash, which we don't do here. 
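 * (E.g. for a terminal component spelled "dir/" the buffer still contains
 * the slash at this point.)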
4669 * Don't take chances with filesystem routines seeing it for 4670 * the last entry. 4671 */ 4672 if (cache_fpl_istrailingslash(fpl)) { 4673 return (cache_fpl_partial(fpl)); 4674 } 4675 4676 mp = atomic_load_ptr(&dvp->v_mount); 4677 if (__predict_false(mp == NULL)) { 4678 return (cache_fpl_aborted(fpl)); 4679 } 4680 4681 if (__predict_false(mp->mnt_flag & MNT_RDONLY)) { 4682 cache_fpl_smr_exit(fpl); 4683 /* 4684 * Original code keeps not checking for CREATE which 4685 * might be a bug. For now let the old lookup decide. 4686 */ 4687 if (cnp->cn_nameiop == CREATE) { 4688 return (cache_fpl_aborted(fpl)); 4689 } 4690 return (cache_fpl_handled_error(fpl, EROFS)); 4691 } 4692 4693 if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) { 4694 cache_fpl_smr_exit(fpl); 4695 return (cache_fpl_handled_error(fpl, EEXIST)); 4696 } 4697 4698 /* 4699 * Secure access to dvp; check cache_fplookup_partial_setup for 4700 * reasoning. 4701 * 4702 * XXX At least UFS requires its lookup routine to be called for 4703 * the last path component, which leads to some level of complication 4704 * and inefficiency: 4705 * - the target routine always locks the target vnode, but our caller 4706 * may not need it locked 4707 * - some of the VOP machinery asserts that the parent is locked, which 4708 * once more may be not required 4709 * 4710 * TODO: add a flag for filesystems which don't need this. 4711 */ 4712 dvs = vget_prep_smr(dvp); 4713 cache_fpl_smr_exit(fpl); 4714 if (__predict_false(dvs == VGET_NONE)) { 4715 return (cache_fpl_aborted(fpl)); 4716 } 4717 4718 vget_finish_ref(dvp, dvs); 4719 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4720 vrele(dvp); 4721 return (cache_fpl_aborted(fpl)); 4722 } 4723 4724 error = vn_lock(dvp, LK_EXCLUSIVE); 4725 if (__predict_false(error != 0)) { 4726 vrele(dvp); 4727 return (cache_fpl_aborted(fpl)); 4728 } 4729 4730 tvp = NULL; 4731 cnp->cn_flags |= ISLASTCN; 4732 if (docache) 4733 cnp->cn_flags |= MAKEENTRY; 4734 if (cache_fpl_isdotdot(cnp)) 4735 cnp->cn_flags |= ISDOTDOT; 4736 cnp->cn_lkflags = LK_EXCLUSIVE; 4737 error = VOP_LOOKUP(dvp, &tvp, cnp); 4738 switch (error) { 4739 case EJUSTRETURN: 4740 case 0: 4741 break; 4742 case ENOTDIR: 4743 case ENOENT: 4744 vput(dvp); 4745 return (cache_fpl_handled_error(fpl, error)); 4746 default: 4747 vput(dvp); 4748 return (cache_fpl_aborted(fpl)); 4749 } 4750 4751 fpl->tvp = tvp; 4752 4753 if (tvp == NULL) { 4754 MPASS(error == EJUSTRETURN); 4755 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4756 VOP_UNLOCK(dvp); 4757 } 4758 return (cache_fpl_handled(fpl)); 4759 } 4760 4761 /* 4762 * There are very hairy corner cases concerning various flag combinations 4763 * and locking state. In particular here we only hold one lock instead of 4764 * two. 4765 * 4766 * Skip the complexity as it is of no significance for normal workloads. 4767 */ 4768 if (__predict_false(tvp == dvp)) { 4769 vput(dvp); 4770 vrele(tvp); 4771 return (cache_fpl_aborted(fpl)); 4772 } 4773 4774 /* 4775 * If they want the symlink itself we are fine, but if they want to 4776 * follow it regular lookup has to be engaged. 4777 */ 4778 if (tvp->v_type == VLNK) { 4779 if ((cnp->cn_flags & FOLLOW) != 0) { 4780 vput(dvp); 4781 vput(tvp); 4782 return (cache_fpl_aborted(fpl)); 4783 } 4784 } 4785 4786 /* 4787 * Since we expect this to be the terminal vnode it should almost never 4788 * be a mount point. 
4789 */ 4790 if (__predict_false(cache_fplookup_is_mp(fpl))) { 4791 vput(dvp); 4792 vput(tvp); 4793 return (cache_fpl_aborted(fpl)); 4794 } 4795 4796 if ((cnp->cn_flags & FAILIFEXISTS) != 0) { 4797 vput(dvp); 4798 vput(tvp); 4799 return (cache_fpl_handled_error(fpl, EEXIST)); 4800 } 4801 4802 if ((cnp->cn_flags & LOCKLEAF) == 0) { 4803 VOP_UNLOCK(tvp); 4804 } 4805 4806 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4807 VOP_UNLOCK(dvp); 4808 } 4809 4810 return (cache_fpl_handled(fpl)); 4811 } 4812 4813 static int __noinline 4814 cache_fplookup_modifying(struct cache_fpl *fpl) 4815 { 4816 struct nameidata *ndp; 4817 4818 ndp = fpl->ndp; 4819 4820 if (!cache_fpl_islastcn(ndp)) { 4821 return (cache_fpl_partial(fpl)); 4822 } 4823 return (cache_fplookup_final_modifying(fpl)); 4824 } 4825 4826 static int __noinline 4827 cache_fplookup_final_withparent(struct cache_fpl *fpl) 4828 { 4829 struct componentname *cnp; 4830 enum vgetstate dvs, tvs; 4831 struct vnode *dvp, *tvp; 4832 seqc_t dvp_seqc; 4833 int error; 4834 4835 cnp = fpl->cnp; 4836 dvp = fpl->dvp; 4837 dvp_seqc = fpl->dvp_seqc; 4838 tvp = fpl->tvp; 4839 4840 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 4841 4842 /* 4843 * This is less efficient than it can be for simplicity. 4844 */ 4845 dvs = vget_prep_smr(dvp); 4846 if (__predict_false(dvs == VGET_NONE)) { 4847 return (cache_fpl_aborted(fpl)); 4848 } 4849 tvs = vget_prep_smr(tvp); 4850 if (__predict_false(tvs == VGET_NONE)) { 4851 cache_fpl_smr_exit(fpl); 4852 vget_abort(dvp, dvs); 4853 return (cache_fpl_aborted(fpl)); 4854 } 4855 4856 cache_fpl_smr_exit(fpl); 4857 4858 if ((cnp->cn_flags & LOCKPARENT) != 0) { 4859 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 4860 if (__predict_false(error != 0)) { 4861 vget_abort(tvp, tvs); 4862 return (cache_fpl_aborted(fpl)); 4863 } 4864 } else { 4865 vget_finish_ref(dvp, dvs); 4866 } 4867 4868 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4869 vget_abort(tvp, tvs); 4870 if ((cnp->cn_flags & LOCKPARENT) != 0) 4871 vput(dvp); 4872 else 4873 vrele(dvp); 4874 return (cache_fpl_aborted(fpl)); 4875 } 4876 4877 error = cache_fplookup_final_child(fpl, tvs); 4878 if (__predict_false(error != 0)) { 4879 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED || 4880 fpl->status == CACHE_FPL_STATUS_DESTROYED); 4881 if ((cnp->cn_flags & LOCKPARENT) != 0) 4882 vput(dvp); 4883 else 4884 vrele(dvp); 4885 return (error); 4886 } 4887 4888 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 4889 return (0); 4890 } 4891 4892 static int 4893 cache_fplookup_final(struct cache_fpl *fpl) 4894 { 4895 struct componentname *cnp; 4896 enum vgetstate tvs; 4897 struct vnode *dvp, *tvp; 4898 seqc_t dvp_seqc; 4899 4900 cnp = fpl->cnp; 4901 dvp = fpl->dvp; 4902 dvp_seqc = fpl->dvp_seqc; 4903 tvp = fpl->tvp; 4904 4905 MPASS(*(cnp->cn_nameptr) != '/'); 4906 4907 if (cnp->cn_nameiop != LOOKUP) { 4908 return (cache_fplookup_final_modifying(fpl)); 4909 } 4910 4911 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 4912 return (cache_fplookup_final_withparent(fpl)); 4913 4914 tvs = vget_prep_smr(tvp); 4915 if (__predict_false(tvs == VGET_NONE)) { 4916 return (cache_fpl_partial(fpl)); 4917 } 4918 4919 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4920 cache_fpl_smr_exit(fpl); 4921 vget_abort(tvp, tvs); 4922 return (cache_fpl_aborted(fpl)); 4923 } 4924 4925 cache_fpl_smr_exit(fpl); 4926 return (cache_fplookup_final_child(fpl, tvs)); 4927 } 4928 4929 /* 4930 * Comment from locked lookup: 4931 * Check for degenerate name (e.g. / or "") which is a way of talking about a 4932 * directory, e.g. like "/." 
or ".". 4933 */ 4934 static int __noinline 4935 cache_fplookup_degenerate(struct cache_fpl *fpl) 4936 { 4937 struct componentname *cnp; 4938 struct vnode *dvp; 4939 enum vgetstate dvs; 4940 int error, lkflags; 4941 #ifdef INVARIANTS 4942 char *cp; 4943 #endif 4944 4945 fpl->tvp = fpl->dvp; 4946 fpl->tvp_seqc = fpl->dvp_seqc; 4947 4948 cnp = fpl->cnp; 4949 dvp = fpl->dvp; 4950 4951 #ifdef INVARIANTS 4952 for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) { 4953 KASSERT(*cp == '/', 4954 ("%s: encountered non-slash; string [%s]\n", __func__, 4955 cnp->cn_pnbuf)); 4956 } 4957 #endif 4958 4959 if (__predict_false(cnp->cn_nameiop != LOOKUP)) { 4960 cache_fpl_smr_exit(fpl); 4961 return (cache_fpl_handled_error(fpl, EISDIR)); 4962 } 4963 4964 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) { 4965 return (cache_fplookup_final_withparent(fpl)); 4966 } 4967 4968 dvs = vget_prep_smr(dvp); 4969 cache_fpl_smr_exit(fpl); 4970 if (__predict_false(dvs == VGET_NONE)) { 4971 return (cache_fpl_aborted(fpl)); 4972 } 4973 4974 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4975 lkflags = LK_SHARED; 4976 if ((cnp->cn_flags & LOCKSHARED) == 0) 4977 lkflags = LK_EXCLUSIVE; 4978 error = vget_finish(dvp, lkflags, dvs); 4979 if (__predict_false(error != 0)) { 4980 return (cache_fpl_aborted(fpl)); 4981 } 4982 } else { 4983 vget_finish_ref(dvp, dvs); 4984 } 4985 return (cache_fpl_handled(fpl)); 4986 } 4987 4988 static int __noinline 4989 cache_fplookup_emptypath(struct cache_fpl *fpl) 4990 { 4991 struct nameidata *ndp; 4992 struct componentname *cnp; 4993 enum vgetstate tvs; 4994 struct vnode *tvp; 4995 int error, lkflags; 4996 4997 fpl->tvp = fpl->dvp; 4998 fpl->tvp_seqc = fpl->dvp_seqc; 4999 5000 ndp = fpl->ndp; 5001 cnp = fpl->cnp; 5002 tvp = fpl->tvp; 5003 5004 MPASS(*cnp->cn_pnbuf == '\0'); 5005 5006 if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) { 5007 cache_fpl_smr_exit(fpl); 5008 return (cache_fpl_handled_error(fpl, ENOENT)); 5009 } 5010 5011 MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0); 5012 5013 tvs = vget_prep_smr(tvp); 5014 cache_fpl_smr_exit(fpl); 5015 if (__predict_false(tvs == VGET_NONE)) { 5016 return (cache_fpl_aborted(fpl)); 5017 } 5018 5019 if ((cnp->cn_flags & LOCKLEAF) != 0) { 5020 lkflags = LK_SHARED; 5021 if ((cnp->cn_flags & LOCKSHARED) == 0) 5022 lkflags = LK_EXCLUSIVE; 5023 error = vget_finish(tvp, lkflags, tvs); 5024 if (__predict_false(error != 0)) { 5025 return (cache_fpl_aborted(fpl)); 5026 } 5027 } else { 5028 vget_finish_ref(tvp, tvs); 5029 } 5030 5031 ndp->ni_resflags |= NIRES_EMPTYPATH; 5032 return (cache_fpl_handled(fpl)); 5033 } 5034 5035 static int __noinline 5036 cache_fplookup_noentry(struct cache_fpl *fpl) 5037 { 5038 struct nameidata *ndp; 5039 struct componentname *cnp; 5040 enum vgetstate dvs; 5041 struct vnode *dvp, *tvp; 5042 seqc_t dvp_seqc; 5043 int error; 5044 5045 ndp = fpl->ndp; 5046 cnp = fpl->cnp; 5047 dvp = fpl->dvp; 5048 dvp_seqc = fpl->dvp_seqc; 5049 5050 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 5051 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 5052 if (cnp->cn_nameiop == LOOKUP) 5053 MPASS((cnp->cn_flags & NOCACHE) == 0); 5054 MPASS(!cache_fpl_isdotdot(cnp)); 5055 5056 /* 5057 * Hack: delayed name len checking. 
5058 */ 5059 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 5060 cache_fpl_smr_exit(fpl); 5061 return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); 5062 } 5063 5064 if (cnp->cn_nameptr[0] == '/') { 5065 return (cache_fplookup_skip_slashes(fpl)); 5066 } 5067 5068 if (cnp->cn_pnbuf[0] == '\0') { 5069 return (cache_fplookup_emptypath(fpl)); 5070 } 5071 5072 if (cnp->cn_nameptr[0] == '\0') { 5073 if (fpl->tvp == NULL) { 5074 return (cache_fplookup_degenerate(fpl)); 5075 } 5076 return (cache_fplookup_trailingslash(fpl)); 5077 } 5078 5079 if (cnp->cn_nameiop != LOOKUP) { 5080 fpl->tvp = NULL; 5081 return (cache_fplookup_modifying(fpl)); 5082 } 5083 5084 /* 5085 * Only try to fill in the component if it is the last one, 5086 * otherwise not only there may be several to handle but the 5087 * walk may be complicated. 5088 */ 5089 if (!cache_fpl_islastcn(ndp)) { 5090 return (cache_fpl_partial(fpl)); 5091 } 5092 5093 /* 5094 * Regular lookup nulifies the slash, which we don't do here. 5095 * Don't take chances with filesystem routines seeing it for 5096 * the last entry. 5097 */ 5098 if (cache_fpl_istrailingslash(fpl)) { 5099 return (cache_fpl_partial(fpl)); 5100 } 5101 5102 /* 5103 * Secure access to dvp; check cache_fplookup_partial_setup for 5104 * reasoning. 5105 */ 5106 dvs = vget_prep_smr(dvp); 5107 cache_fpl_smr_exit(fpl); 5108 if (__predict_false(dvs == VGET_NONE)) { 5109 return (cache_fpl_aborted(fpl)); 5110 } 5111 5112 vget_finish_ref(dvp, dvs); 5113 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 5114 vrele(dvp); 5115 return (cache_fpl_aborted(fpl)); 5116 } 5117 5118 error = vn_lock(dvp, LK_SHARED); 5119 if (__predict_false(error != 0)) { 5120 vrele(dvp); 5121 return (cache_fpl_aborted(fpl)); 5122 } 5123 5124 tvp = NULL; 5125 /* 5126 * TODO: provide variants which don't require locking either vnode. 5127 */ 5128 cnp->cn_flags |= ISLASTCN | MAKEENTRY; 5129 cnp->cn_lkflags = LK_SHARED; 5130 if ((cnp->cn_flags & LOCKSHARED) == 0) { 5131 cnp->cn_lkflags = LK_EXCLUSIVE; 5132 } 5133 error = VOP_LOOKUP(dvp, &tvp, cnp); 5134 switch (error) { 5135 case EJUSTRETURN: 5136 case 0: 5137 break; 5138 case ENOTDIR: 5139 case ENOENT: 5140 vput(dvp); 5141 return (cache_fpl_handled_error(fpl, error)); 5142 default: 5143 vput(dvp); 5144 return (cache_fpl_aborted(fpl)); 5145 } 5146 5147 fpl->tvp = tvp; 5148 5149 if (tvp == NULL) { 5150 MPASS(error == EJUSTRETURN); 5151 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 5152 vput(dvp); 5153 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 5154 VOP_UNLOCK(dvp); 5155 } 5156 return (cache_fpl_handled(fpl)); 5157 } 5158 5159 if (tvp->v_type == VLNK) { 5160 if ((cnp->cn_flags & FOLLOW) != 0) { 5161 vput(dvp); 5162 vput(tvp); 5163 return (cache_fpl_aborted(fpl)); 5164 } 5165 } 5166 5167 if (__predict_false(cache_fplookup_is_mp(fpl))) { 5168 vput(dvp); 5169 vput(tvp); 5170 return (cache_fpl_aborted(fpl)); 5171 } 5172 5173 if ((cnp->cn_flags & LOCKLEAF) == 0) { 5174 VOP_UNLOCK(tvp); 5175 } 5176 5177 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 5178 vput(dvp); 5179 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 5180 VOP_UNLOCK(dvp); 5181 } 5182 return (cache_fpl_handled(fpl)); 5183 } 5184 5185 static int __noinline 5186 cache_fplookup_dot(struct cache_fpl *fpl) 5187 { 5188 int error; 5189 5190 MPASS(!seqc_in_modify(fpl->dvp_seqc)); 5191 5192 if (__predict_false(fpl->dvp->v_type != VDIR)) { 5193 cache_fpl_smr_exit(fpl); 5194 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5195 } 5196 5197 /* 5198 * Just re-assign the value. 
seqc will be checked later for the first 5199 * non-dot path component in line and/or before deciding to return the 5200 * vnode. 5201 */ 5202 fpl->tvp = fpl->dvp; 5203 fpl->tvp_seqc = fpl->dvp_seqc; 5204 5205 SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp); 5206 5207 error = 0; 5208 if (cache_fplookup_is_mp(fpl)) { 5209 error = cache_fplookup_cross_mount(fpl); 5210 } 5211 return (error); 5212 } 5213 5214 static int __noinline 5215 cache_fplookup_dotdot(struct cache_fpl *fpl) 5216 { 5217 struct nameidata *ndp; 5218 struct componentname *cnp; 5219 struct namecache *ncp; 5220 struct vnode *dvp; 5221 struct prison *pr; 5222 u_char nc_flag; 5223 5224 ndp = fpl->ndp; 5225 cnp = fpl->cnp; 5226 dvp = fpl->dvp; 5227 5228 MPASS(cache_fpl_isdotdot(cnp)); 5229 5230 /* 5231 * XXX this is racy the same way regular lookup is 5232 */ 5233 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 5234 pr = pr->pr_parent) 5235 if (dvp == pr->pr_root) 5236 break; 5237 5238 if (dvp == ndp->ni_rootdir || 5239 dvp == ndp->ni_topdir || 5240 dvp == rootvnode || 5241 pr != NULL) { 5242 fpl->tvp = dvp; 5243 fpl->tvp_seqc = vn_seqc_read_any(dvp); 5244 if (seqc_in_modify(fpl->tvp_seqc)) { 5245 return (cache_fpl_aborted(fpl)); 5246 } 5247 return (0); 5248 } 5249 5250 if ((dvp->v_vflag & VV_ROOT) != 0) { 5251 /* 5252 * TODO 5253 * The opposite of climb mount is needed here. 5254 */ 5255 return (cache_fpl_partial(fpl)); 5256 } 5257 5258 if (__predict_false(dvp->v_type != VDIR)) { 5259 cache_fpl_smr_exit(fpl); 5260 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5261 } 5262 5263 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); 5264 if (ncp == NULL) { 5265 return (cache_fpl_aborted(fpl)); 5266 } 5267 5268 nc_flag = atomic_load_char(&ncp->nc_flag); 5269 if ((nc_flag & NCF_ISDOTDOT) != 0) { 5270 if ((nc_flag & NCF_NEGATIVE) != 0) 5271 return (cache_fpl_aborted(fpl)); 5272 fpl->tvp = ncp->nc_vp; 5273 } else { 5274 fpl->tvp = ncp->nc_dvp; 5275 } 5276 5277 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 5278 if (seqc_in_modify(fpl->tvp_seqc)) { 5279 return (cache_fpl_partial(fpl)); 5280 } 5281 5282 /* 5283 * Acquire fence provided by vn_seqc_read_any above. 5284 */ 5285 if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) { 5286 return (cache_fpl_aborted(fpl)); 5287 } 5288 5289 if (!cache_ncp_canuse(ncp)) { 5290 return (cache_fpl_aborted(fpl)); 5291 } 5292 5293 return (0); 5294 } 5295 5296 static int __noinline 5297 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) 5298 { 5299 u_char nc_flag __diagused; 5300 bool neg_promote; 5301 5302 #ifdef INVARIANTS 5303 nc_flag = atomic_load_char(&ncp->nc_flag); 5304 MPASS((nc_flag & NCF_NEGATIVE) != 0); 5305 #endif 5306 /* 5307 * If they want to create an entry we need to replace this one. 5308 */ 5309 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 5310 fpl->tvp = NULL; 5311 return (cache_fplookup_modifying(fpl)); 5312 } 5313 neg_promote = cache_neg_hit_prep(ncp); 5314 if (!cache_fpl_neg_ncp_canuse(ncp)) { 5315 cache_neg_hit_abort(ncp); 5316 return (cache_fpl_partial(fpl)); 5317 } 5318 if (neg_promote) { 5319 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 5320 } 5321 cache_neg_hit_finish(ncp); 5322 cache_fpl_smr_exit(fpl); 5323 return (cache_fpl_handled_error(fpl, ENOENT)); 5324 } 5325 5326 /* 5327 * Resolve a symlink. Called by filesystem-specific routines. 5328 * 5329 * Code flow is: 5330 * ... 
-> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve 5331 */ 5332 int 5333 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len) 5334 { 5335 struct nameidata *ndp; 5336 struct componentname *cnp; 5337 size_t adjust; 5338 5339 ndp = fpl->ndp; 5340 cnp = fpl->cnp; 5341 5342 if (__predict_false(len == 0)) { 5343 return (ENOENT); 5344 } 5345 5346 if (__predict_false(len > MAXPATHLEN - 2)) { 5347 if (cache_fpl_istrailingslash(fpl)) { 5348 return (EAGAIN); 5349 } 5350 } 5351 5352 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1; 5353 #ifdef INVARIANTS 5354 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 5355 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 5356 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 5357 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 5358 } 5359 #endif 5360 5361 if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) { 5362 return (ENAMETOOLONG); 5363 } 5364 5365 if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) { 5366 return (ELOOP); 5367 } 5368 5369 adjust = len; 5370 if (ndp->ni_pathlen > 1) { 5371 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen); 5372 } else { 5373 if (cache_fpl_istrailingslash(fpl)) { 5374 adjust = len + 1; 5375 cnp->cn_pnbuf[len] = '/'; 5376 cnp->cn_pnbuf[len + 1] = '\0'; 5377 } else { 5378 cnp->cn_pnbuf[len] = '\0'; 5379 } 5380 } 5381 bcopy(string, cnp->cn_pnbuf, len); 5382 5383 ndp->ni_pathlen += adjust; 5384 cache_fpl_pathlen_add(fpl, adjust); 5385 cnp->cn_nameptr = cnp->cn_pnbuf; 5386 fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; 5387 fpl->tvp = NULL; 5388 return (0); 5389 } 5390 5391 static int __noinline 5392 cache_fplookup_symlink(struct cache_fpl *fpl) 5393 { 5394 struct mount *mp; 5395 struct nameidata *ndp; 5396 struct componentname *cnp; 5397 struct vnode *dvp, *tvp; 5398 int error; 5399 5400 ndp = fpl->ndp; 5401 cnp = fpl->cnp; 5402 dvp = fpl->dvp; 5403 tvp = fpl->tvp; 5404 5405 if (cache_fpl_islastcn(ndp)) { 5406 if ((cnp->cn_flags & FOLLOW) == 0) { 5407 return (cache_fplookup_final(fpl)); 5408 } 5409 } 5410 5411 mp = atomic_load_ptr(&dvp->v_mount); 5412 if (__predict_false(mp == NULL)) { 5413 return (cache_fpl_aborted(fpl)); 5414 } 5415 5416 /* 5417 * Note this check races against setting the flag just like regular 5418 * lookup. 5419 */ 5420 if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) { 5421 cache_fpl_smr_exit(fpl); 5422 return (cache_fpl_handled_error(fpl, EACCES)); 5423 } 5424 5425 error = VOP_FPLOOKUP_SYMLINK(tvp, fpl); 5426 if (__predict_false(error != 0)) { 5427 switch (error) { 5428 case EAGAIN: 5429 return (cache_fpl_partial(fpl)); 5430 case ENOENT: 5431 case ENAMETOOLONG: 5432 case ELOOP: 5433 cache_fpl_smr_exit(fpl); 5434 return (cache_fpl_handled_error(fpl, error)); 5435 default: 5436 return (cache_fpl_aborted(fpl)); 5437 } 5438 } 5439 5440 if (*(cnp->cn_nameptr) == '/') { 5441 fpl->dvp = cache_fpl_handle_root(fpl); 5442 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 5443 if (seqc_in_modify(fpl->dvp_seqc)) { 5444 return (cache_fpl_aborted(fpl)); 5445 } 5446 /* 5447 * The main loop assumes that ->dvp points to a vnode belonging 5448 * to a filesystem which can do lockless lookup, but the absolute 5449 * symlink can be wandering off to one which does not. 
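 * E.g. an absolute target such as "/some/path" may land on a mount without
 * MNTK_FPLOOKUP, which is why the flag is re-checked below and the lookup
 * falls back to the locked variant (after checkpointing the state) instead
 * of continuing.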
5450 */ 5451 mp = atomic_load_ptr(&fpl->dvp->v_mount); 5452 if (__predict_false(mp == NULL)) { 5453 return (cache_fpl_aborted(fpl)); 5454 } 5455 if (!cache_fplookup_mp_supported(mp)) { 5456 cache_fpl_checkpoint(fpl); 5457 return (cache_fpl_partial(fpl)); 5458 } 5459 } 5460 return (0); 5461 } 5462 5463 static int 5464 cache_fplookup_next(struct cache_fpl *fpl) 5465 { 5466 struct componentname *cnp; 5467 struct namecache *ncp; 5468 struct vnode *dvp, *tvp; 5469 u_char nc_flag; 5470 uint32_t hash; 5471 int error; 5472 5473 cnp = fpl->cnp; 5474 dvp = fpl->dvp; 5475 hash = fpl->hash; 5476 5477 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 5478 if (cnp->cn_namelen == 1) { 5479 return (cache_fplookup_dot(fpl)); 5480 } 5481 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 5482 return (cache_fplookup_dotdot(fpl)); 5483 } 5484 } 5485 5486 MPASS(!cache_fpl_isdotdot(cnp)); 5487 5488 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 5489 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 5490 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 5491 break; 5492 } 5493 5494 if (__predict_false(ncp == NULL)) { 5495 return (cache_fplookup_noentry(fpl)); 5496 } 5497 5498 tvp = atomic_load_ptr(&ncp->nc_vp); 5499 nc_flag = atomic_load_char(&ncp->nc_flag); 5500 if ((nc_flag & NCF_NEGATIVE) != 0) { 5501 return (cache_fplookup_neg(fpl, ncp, hash)); 5502 } 5503 5504 if (!cache_ncp_canuse(ncp)) { 5505 return (cache_fpl_partial(fpl)); 5506 } 5507 5508 fpl->tvp = tvp; 5509 fpl->tvp_seqc = vn_seqc_read_any(tvp); 5510 if (seqc_in_modify(fpl->tvp_seqc)) { 5511 return (cache_fpl_partial(fpl)); 5512 } 5513 5514 counter_u64_add(numposhits, 1); 5515 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 5516 5517 error = 0; 5518 if (cache_fplookup_is_mp(fpl)) { 5519 error = cache_fplookup_cross_mount(fpl); 5520 } 5521 return (error); 5522 } 5523 5524 static bool 5525 cache_fplookup_mp_supported(struct mount *mp) 5526 { 5527 5528 MPASS(mp != NULL); 5529 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 5530 return (false); 5531 return (true); 5532 } 5533 5534 /* 5535 * Walk up the mount stack (if any). 5536 * 5537 * Correctness is provided in the following ways: 5538 * - all vnodes are protected from freeing with SMR 5539 * - struct mount objects are type stable making them always safe to access 5540 * - stability of the particular mount is provided by busying it 5541 * - relationship between the vnode which is mounted on and the mount is 5542 * verified with the vnode sequence counter after busying 5543 * - association between root vnode of the mount and the mount is protected 5544 * by busy 5545 * 5546 * From that point on we can read the sequence counter of the root vnode 5547 * and get the next mount on the stack (if any) using the same protection. 5548 * 5549 * By the end of successful walk we are guaranteed the reached state was 5550 * indeed present at least at some point which matches the regular lookup. 
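 *
 * A conceptual sketch of one step of the walk (error handling and the
 * bookkeeping for several stacked mounts elided, see the code for details):
 *
 *	vfs_op_thread_enter_crit(mp, mpcpu);	(busy the mount)
 *	vn_seqc_consistent(vp, vp_seqc);	(covered vnode still matches mp)
 *	vp = mp->mnt_rootvnode;
 *	vp_seqc = vn_seqc_read_any(vp);
 *	mp = vp->v_mountedhere;			(next mount, if any)
 *
 * with the busy reference only dropped once the next level (or the final
 * vnode) has been secured the same way, and any failed check turning into
 * cache_fpl_partial.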
5551 */ 5552 static int __noinline 5553 cache_fplookup_climb_mount(struct cache_fpl *fpl) 5554 { 5555 struct mount *mp, *prev_mp; 5556 struct mount_pcpu *mpcpu, *prev_mpcpu; 5557 struct vnode *vp; 5558 seqc_t vp_seqc; 5559 5560 vp = fpl->tvp; 5561 vp_seqc = fpl->tvp_seqc; 5562 5563 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp); 5564 mp = atomic_load_ptr(&vp->v_mountedhere); 5565 if (__predict_false(mp == NULL)) { 5566 return (0); 5567 } 5568 5569 prev_mp = NULL; 5570 for (;;) { 5571 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 5572 if (prev_mp != NULL) 5573 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5574 return (cache_fpl_partial(fpl)); 5575 } 5576 if (prev_mp != NULL) 5577 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5578 if (!vn_seqc_consistent(vp, vp_seqc)) { 5579 vfs_op_thread_exit_crit(mp, mpcpu); 5580 return (cache_fpl_partial(fpl)); 5581 } 5582 if (!cache_fplookup_mp_supported(mp)) { 5583 vfs_op_thread_exit_crit(mp, mpcpu); 5584 return (cache_fpl_partial(fpl)); 5585 } 5586 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5587 if (vp == NULL) { 5588 vfs_op_thread_exit_crit(mp, mpcpu); 5589 return (cache_fpl_partial(fpl)); 5590 } 5591 vp_seqc = vn_seqc_read_any(vp); 5592 if (seqc_in_modify(vp_seqc)) { 5593 vfs_op_thread_exit_crit(mp, mpcpu); 5594 return (cache_fpl_partial(fpl)); 5595 } 5596 prev_mp = mp; 5597 prev_mpcpu = mpcpu; 5598 mp = atomic_load_ptr(&vp->v_mountedhere); 5599 if (mp == NULL) 5600 break; 5601 } 5602 5603 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5604 fpl->tvp = vp; 5605 fpl->tvp_seqc = vp_seqc; 5606 return (0); 5607 } 5608 5609 static int __noinline 5610 cache_fplookup_cross_mount(struct cache_fpl *fpl) 5611 { 5612 struct mount *mp; 5613 struct mount_pcpu *mpcpu; 5614 struct vnode *vp; 5615 seqc_t vp_seqc; 5616 5617 vp = fpl->tvp; 5618 vp_seqc = fpl->tvp_seqc; 5619 5620 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp); 5621 mp = atomic_load_ptr(&vp->v_mountedhere); 5622 if (__predict_false(mp == NULL)) { 5623 return (0); 5624 } 5625 5626 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 5627 return (cache_fpl_partial(fpl)); 5628 } 5629 if (!vn_seqc_consistent(vp, vp_seqc)) { 5630 vfs_op_thread_exit_crit(mp, mpcpu); 5631 return (cache_fpl_partial(fpl)); 5632 } 5633 if (!cache_fplookup_mp_supported(mp)) { 5634 vfs_op_thread_exit_crit(mp, mpcpu); 5635 return (cache_fpl_partial(fpl)); 5636 } 5637 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5638 if (__predict_false(vp == NULL)) { 5639 vfs_op_thread_exit_crit(mp, mpcpu); 5640 return (cache_fpl_partial(fpl)); 5641 } 5642 vp_seqc = vn_seqc_read_any(vp); 5643 vfs_op_thread_exit_crit(mp, mpcpu); 5644 if (seqc_in_modify(vp_seqc)) { 5645 return (cache_fpl_partial(fpl)); 5646 } 5647 mp = atomic_load_ptr(&vp->v_mountedhere); 5648 if (__predict_false(mp != NULL)) { 5649 /* 5650 * There are possibly more mount points on top. 5651 * Normally this does not happen so for simplicity just start 5652 * over. 5653 */ 5654 return (cache_fplookup_climb_mount(fpl)); 5655 } 5656 5657 fpl->tvp = vp; 5658 fpl->tvp_seqc = vp_seqc; 5659 return (0); 5660 } 5661 5662 /* 5663 * Check if a vnode is mounted on. 5664 */ 5665 static bool 5666 cache_fplookup_is_mp(struct cache_fpl *fpl) 5667 { 5668 struct vnode *vp; 5669 5670 vp = fpl->tvp; 5671 return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0); 5672 } 5673 5674 /* 5675 * Parse the path. 5676 * 5677 * The code was originally copy-pasted from regular lookup and despite 5678 * clean ups leaves performance on the table. 
Any modifications here 5679 * must take into account that in case off fallback the resulting 5680 * nameidata state has to be compatible with the original. 5681 */ 5682 5683 /* 5684 * Debug ni_pathlen tracking. 5685 */ 5686 #ifdef INVARIANTS 5687 static void 5688 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) 5689 { 5690 5691 fpl->debug.ni_pathlen += n; 5692 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, 5693 ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen)); 5694 } 5695 5696 static void 5697 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) 5698 { 5699 5700 fpl->debug.ni_pathlen -= n; 5701 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, 5702 ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen)); 5703 } 5704 5705 static void 5706 cache_fpl_pathlen_inc(struct cache_fpl *fpl) 5707 { 5708 5709 cache_fpl_pathlen_add(fpl, 1); 5710 } 5711 5712 static void 5713 cache_fpl_pathlen_dec(struct cache_fpl *fpl) 5714 { 5715 5716 cache_fpl_pathlen_sub(fpl, 1); 5717 } 5718 #else 5719 static void 5720 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) 5721 { 5722 } 5723 5724 static void 5725 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) 5726 { 5727 } 5728 5729 static void 5730 cache_fpl_pathlen_inc(struct cache_fpl *fpl) 5731 { 5732 } 5733 5734 static void 5735 cache_fpl_pathlen_dec(struct cache_fpl *fpl) 5736 { 5737 } 5738 #endif 5739 5740 static void 5741 cache_fplookup_parse(struct cache_fpl *fpl) 5742 { 5743 struct nameidata *ndp; 5744 struct componentname *cnp; 5745 struct vnode *dvp; 5746 char *cp; 5747 uint32_t hash; 5748 5749 ndp = fpl->ndp; 5750 cnp = fpl->cnp; 5751 dvp = fpl->dvp; 5752 5753 /* 5754 * Find the end of this path component, it is either / or nul. 5755 * 5756 * Store / as a temporary sentinel so that we only have one character 5757 * to test for. Pathnames tend to be short so this should not be 5758 * resulting in cache misses. 5759 * 5760 * TODO: fix this to be word-sized. 5761 */ 5762 MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf); 5763 KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar, 5764 ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n", 5765 __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1], 5766 fpl->nulchar, cnp->cn_pnbuf)); 5767 KASSERT(*fpl->nulchar == '\0', 5768 ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar, 5769 cnp->cn_pnbuf)); 5770 hash = cache_get_hash_iter_start(dvp); 5771 *fpl->nulchar = '/'; 5772 for (cp = cnp->cn_nameptr; *cp != '/'; cp++) { 5773 KASSERT(*cp != '\0', 5774 ("%s: encountered unexpected nul; string [%s]\n", __func__, 5775 cnp->cn_nameptr)); 5776 hash = cache_get_hash_iter(*cp, hash); 5777 continue; 5778 } 5779 *fpl->nulchar = '\0'; 5780 fpl->hash = cache_get_hash_iter_finish(hash); 5781 5782 cnp->cn_namelen = cp - cnp->cn_nameptr; 5783 cache_fpl_pathlen_sub(fpl, cnp->cn_namelen); 5784 5785 #ifdef INVARIANTS 5786 /* 5787 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since 5788 * we are going to fail this lookup with ENAMETOOLONG (see below). 5789 */ 5790 if (cnp->cn_namelen <= NAME_MAX) { 5791 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) { 5792 panic("%s: mismatched hash for [%s] len %ld", __func__, 5793 cnp->cn_nameptr, cnp->cn_namelen); 5794 } 5795 } 5796 #endif 5797 5798 /* 5799 * Hack: we have to check if the found path component's length exceeds 5800 * NAME_MAX. 
However, the condition is very rarely true and the check can 5801 * be elided in the common case -- if an entry was found in the cache, 5802 * then it could not have been too long to begin with. 5803 */ 5804 ndp->ni_next = cp; 5805 } 5806 5807 static void 5808 cache_fplookup_parse_advance(struct cache_fpl *fpl) 5809 { 5810 struct nameidata *ndp; 5811 struct componentname *cnp; 5812 5813 ndp = fpl->ndp; 5814 cnp = fpl->cnp; 5815 5816 cnp->cn_nameptr = ndp->ni_next; 5817 KASSERT(*(cnp->cn_nameptr) == '/', 5818 ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__, 5819 cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf)); 5820 cnp->cn_nameptr++; 5821 cache_fpl_pathlen_dec(fpl); 5822 } 5823 5824 /* 5825 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry. 5826 * 5827 * Lockless lookup tries to elide checking for spurious slashes and, should they 5828 * be present, is guaranteed to fail to find an entry. In this case the caller 5829 * must check if the name starts with a slash and call this routine. It is 5830 * going to fast-forward across the spurious slashes and set the state up for 5831 * retry. 5832 */ 5833 static int __noinline 5834 cache_fplookup_skip_slashes(struct cache_fpl *fpl) 5835 { 5836 struct nameidata *ndp; 5837 struct componentname *cnp; 5838 5839 ndp = fpl->ndp; 5840 cnp = fpl->cnp; 5841 5842 MPASS(*(cnp->cn_nameptr) == '/'); 5843 do { 5844 cnp->cn_nameptr++; 5845 cache_fpl_pathlen_dec(fpl); 5846 } while (*(cnp->cn_nameptr) == '/'); 5847 5848 /* 5849 * Go back to one slash so that cache_fplookup_parse_advance has 5850 * something to skip. 5851 */ 5852 cnp->cn_nameptr--; 5853 cache_fpl_pathlen_inc(fpl); 5854 5855 /* 5856 * cache_fplookup_parse_advance starts from ndp->ni_next. 5857 */ 5858 ndp->ni_next = cnp->cn_nameptr; 5859 5860 /* 5861 * See cache_fplookup_dot. 5862 */ 5863 fpl->tvp = fpl->dvp; 5864 fpl->tvp_seqc = fpl->dvp_seqc; 5865 5866 return (0); 5867 } 5868 5869 /* 5870 * Handle trailing slashes (e.g., "foo/"). 5871 * 5872 * If a trailing slash is found the terminal vnode must be a directory. 5873 * Regular lookup shortens the path by nullifying the first trailing slash and 5874 * sets the TRAILINGSLASH flag to denote this took place. There are several 5875 * checks on it performed later. 5876 * 5877 * Similarly to spurious slashes, lockless lookup handles this in a speculative 5878 * manner relying on an invariant that a non-directory vnode will get a miss. 5879 * In this case cn_nameptr[0] == '\0' and cn_namelen == 0. 5880 * 5881 * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/" 5882 * and denotes this is the last path component, which avoids looping back. 5883 * 5884 * Only plain lookups are supported for now to limit the corner cases that need handling.
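 *
 * A worked example for a buffer containing "foo/bar/":
 *	on entry:	cn_nameptr points just past the trailing slash,
 *			cn_namelen == 0, dvp == tvp == the vnode "foo/bar"
 *			resolved to
 *	unwinding:	step back over the trailing slash(es), then back to
 *			the slash preceding "bar" (or the buffer start)
 *	result:		cn_nameptr points at "bar/", cn_namelen == 3, ni_next
 *			points at the nul so this counts as the last
 *			component; the parent is then re-derived from
 *			tvp->v_cache_dd (unless the component turns out to
 *			be ".")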
5885 */ 5886 static int __noinline 5887 cache_fplookup_trailingslash(struct cache_fpl *fpl) 5888 { 5889 #ifdef INVARIANTS 5890 size_t ni_pathlen; 5891 #endif 5892 struct nameidata *ndp; 5893 struct componentname *cnp; 5894 struct namecache *ncp; 5895 struct vnode *tvp; 5896 char *cn_nameptr_orig, *cn_nameptr_slash; 5897 seqc_t tvp_seqc; 5898 u_char nc_flag; 5899 5900 ndp = fpl->ndp; 5901 cnp = fpl->cnp; 5902 tvp = fpl->tvp; 5903 tvp_seqc = fpl->tvp_seqc; 5904 5905 MPASS(fpl->dvp == fpl->tvp); 5906 KASSERT(cache_fpl_istrailingslash(fpl), 5907 ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1, 5908 cnp->cn_pnbuf)); 5909 KASSERT(cnp->cn_nameptr[0] == '\0', 5910 ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0], 5911 cnp->cn_pnbuf)); 5912 KASSERT(cnp->cn_namelen == 0, 5913 ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen, 5914 cnp->cn_pnbuf)); 5915 MPASS(cnp->cn_nameptr > cnp->cn_pnbuf); 5916 5917 if (cnp->cn_nameiop != LOOKUP) { 5918 return (cache_fpl_aborted(fpl)); 5919 } 5920 5921 if (__predict_false(tvp->v_type != VDIR)) { 5922 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 5923 return (cache_fpl_aborted(fpl)); 5924 } 5925 cache_fpl_smr_exit(fpl); 5926 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5927 } 5928 5929 /* 5930 * Denote the last component. 5931 */ 5932 ndp->ni_next = &cnp->cn_nameptr[0]; 5933 MPASS(cache_fpl_islastcn(ndp)); 5934 5935 /* 5936 * Unwind trailing slashes. 5937 */ 5938 cn_nameptr_orig = cnp->cn_nameptr; 5939 while (cnp->cn_nameptr >= cnp->cn_pnbuf) { 5940 cnp->cn_nameptr--; 5941 if (cnp->cn_nameptr[0] != '/') { 5942 break; 5943 } 5944 } 5945 5946 /* 5947 * Unwind to the beginning of the path component. 5948 * 5949 * Note the path may or may not have started with a slash. 5950 */ 5951 cn_nameptr_slash = cnp->cn_nameptr; 5952 while (cnp->cn_nameptr > cnp->cn_pnbuf) { 5953 cnp->cn_nameptr--; 5954 if (cnp->cn_nameptr[0] == '/') { 5955 break; 5956 } 5957 } 5958 if (cnp->cn_nameptr[0] == '/') { 5959 cnp->cn_nameptr++; 5960 } 5961 5962 cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1; 5963 cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr); 5964 cache_fpl_checkpoint(fpl); 5965 5966 #ifdef INVARIANTS 5967 ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; 5968 if (ni_pathlen != fpl->debug.ni_pathlen) { 5969 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 5970 __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 5971 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 5972 } 5973 #endif 5974 5975 /* 5976 * If this was a "./" lookup the parent directory is already correct. 5977 */ 5978 if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) { 5979 return (0); 5980 } 5981 5982 /* 5983 * Otherwise we need to look it up. 5984 */ 5985 tvp = fpl->tvp; 5986 ncp = atomic_load_consume_ptr(&tvp->v_cache_dd); 5987 if (__predict_false(ncp == NULL)) { 5988 return (cache_fpl_aborted(fpl)); 5989 } 5990 nc_flag = atomic_load_char(&ncp->nc_flag); 5991 if ((nc_flag & NCF_ISDOTDOT) != 0) { 5992 return (cache_fpl_aborted(fpl)); 5993 } 5994 fpl->dvp = ncp->nc_dvp; 5995 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 5996 if (seqc_in_modify(fpl->dvp_seqc)) { 5997 return (cache_fpl_aborted(fpl)); 5998 } 5999 return (0); 6000 } 6001 6002 /* 6003 * See the API contract for VOP_FPLOOKUP_VEXEC. 
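 *
 * In short, the routine below doubles as the spot where several delayed
 * checks are performed (empty path, trailing slash, degenerate path,
 * NAME_MAX, non-directory parent): EAGAIN from the filesystem means the
 * lookup has to be redone with locks held, while any other error is either
 * returned as-is or converted into an abort depending on whether the
 * directory's sequence counter is still consistent.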
 */
static int __noinline
cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
{
	struct componentname *cnp;
	struct vnode *dvp;
	seqc_t dvp_seqc;

	cnp = fpl->cnp;
	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;

	/*
	 * Hack: delayed empty path checking.
	 */
	if (cnp->cn_pnbuf[0] == '\0') {
		return (cache_fplookup_emptypath(fpl));
	}

	/*
	 * TODO: Due to ignoring trailing slashes, lookup will perform a
	 * permission check on the last directory when it should not. It may
	 * fail, but said failure should be ignored. It is possible to fix it
	 * up fully without resorting to regular lookup, but for now just
	 * abort.
	 */
	if (cache_fpl_istrailingslash(fpl)) {
		return (cache_fpl_aborted(fpl));
	}

	/*
	 * Hack: delayed degenerate path checking.
	 */
	if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) {
		return (cache_fplookup_degenerate(fpl));
	}

	/*
	 * Hack: delayed name length checking.
	 */
	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
	}

	/*
	 * Hack: they may be looking up foo/bar, where foo is not a directory.
	 * In such a case we need to return ENOTDIR, but we may happen to get
	 * here with a different error.
	 */
	if (dvp->v_type != VDIR) {
		error = ENOTDIR;
	}

	/*
	 * Hack: handle O_SEARCH.
	 *
	 * Open Group Base Specifications Issue 7, 2018 edition states:
	 * <quote>
	 * If the access mode of the open file description associated with the
	 * file descriptor is not O_SEARCH, the function shall check whether
	 * directory searches are permitted using the current permissions of
	 * the directory underlying the file descriptor. If the access mode is
	 * O_SEARCH, the function shall not perform the check.
	 * </quote>
	 *
	 * Regular lookup tests for the NOEXECCHECK flag for every path
	 * component to decide whether to do the permission check. However,
	 * since most lookups never have the flag (and when they do it is only
	 * present for the first path component), lockless lookup only acts on
	 * it if there is a permission problem. Here the flag is represented
	 * with a boolean so that we don't have to clear it on the way out.
	 *
	 * For simplicity this always aborts.
	 * TODO: check if this is the first lookup and ignore the permission
	 * problem. Note the flag has to survive fallback (if it happens to be
	 * performed).
	 */
	if (fpl->fsearch) {
		return (cache_fpl_aborted(fpl));
	}

	switch (error) {
	case EAGAIN:
		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_partial(fpl);
		}
		break;
	default:
		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_smr_exit(fpl);
			cache_fpl_handled_error(fpl, error);
		}
		break;
	}
	return (error);
}

static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct mount *mp;
	int error;

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cache_fpl_checkpoint(fpl);

	/*
	 * The vnode at hand is almost always stable, so skip checking for it.
	 * Worst case, this postpones the check towards the end of the
	 * iteration of the main loop.
	 */
	fpl->dvp = dvp;
	fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);

	mp = atomic_load_ptr(&dvp->v_mount);
	if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
		return (cache_fpl_aborted(fpl));
	}

	MPASS(fpl->tvp == NULL);

	for (;;) {
		cache_fplookup_parse(fpl);

		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
		if (__predict_false(error != 0)) {
			error = cache_fplookup_failed_vexec(fpl, error);
			break;
		}

		error = cache_fplookup_next(fpl);
		if (__predict_false(cache_fpl_terminated(fpl))) {
			break;
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		if (fpl->tvp->v_type == VLNK) {
			error = cache_fplookup_symlink(fpl);
			if (cache_fpl_terminated(fpl)) {
				break;
			}
		} else {
			if (cache_fpl_islastcn(ndp)) {
				error = cache_fplookup_final(fpl);
				break;
			}

			if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
				error = cache_fpl_aborted(fpl);
				break;
			}

			fpl->dvp = fpl->tvp;
			fpl->dvp_seqc = fpl->tvp_seqc;
			cache_fplookup_parse_advance(fpl);
		}

		cache_fpl_checkpoint(fpl);
	}

	return (error);
}

/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the
 * criteria outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 *	vn_lock(current);
 *	for (;;) {
 *		next = find();
 *		vn_lock(next);
 *		vn_unlock(current);
 *		current = next;
 *		if (last)
 *			break;
 *	}
 *	return (current);
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
 * we made the jump. This includes things like permissions, mount points, etc.
 * Counter modification is provided by enclosing relevant places in
 * vn_seqc_write_begin()/end() calls.
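 *
 * For example, code changing something which lockless lookup depends on
 * (permissions, the parent<->child relationship, mount structure) is
 * expected to bracket the modification roughly like this (illustrative
 * only; the real call sites live in the respective VOPs and cache_*
 * routines):
 *
 *	vn_seqc_write_begin(vp);
 *	... perform the modification ...
 *	vn_seqc_write_end(vp);
 *
 * A lockless lookup racing with the above either observes the counter in
 * the middle of a modification (seqc_in_modify) or finds it changed when
 * re-checked with seqc_consistent, and aborts in both cases.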
 *
 * Thus this translates to:
 *
 *	vfs_smr_enter();
 *	dvp_seqc = seqc_read_any(dvp);
 *	if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 *		abort();
 *	for (;;) {
 *		tvp = find();
 *		tvp_seqc = seqc_read_any(tvp);
 *		if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 *			abort();
 *		if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 *			abort();
 *		dvp = tvp; // we know nothing of importance has changed
 *		dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 *		if (last)
 *			break;
 *	}
 *	vget(); // secure the vnode
 *	if (!seqc_consistent(tvp, tvp_seqc)) // final check
 *		abort();
 *	// at this point we know nothing has changed for any parent<->child pair
 *	// as they were crossed during the lookup, meaning we matched the guarantee
 *	// of the locked variant
 *	return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection which they must never exit
 * - EAGAIN can be returned to denote checking could not be performed; it is
 *   always valid to return it
 * - if the sequence counter has not changed the result must be valid
 * - if the sequence counter has changed, both false positives and false
 *   negatives are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning absent other means it
 *   should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 */
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	int error;

	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.in_smr = false;
	fpl.ndp = ndp;
	fpl.cnp = cnp = &ndp->ni_cnd;
	MPASS(ndp->ni_lcf == 0);
	KASSERT((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
	    ("%s: internal flags found in cn_flags %" PRIx64, __func__,
	    cnp->cn_flags));
	MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
	MPASS(ndp->ni_resflags == 0);

	if (__predict_false(!cache_can_fplookup(&fpl))) {
		*status = fpl.status;
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint_outer(&fpl);

	cache_fpl_smr_enter_initial(&fpl);
#ifdef INVARIANTS
	fpl.debug.ni_pathlen = ndp->ni_pathlen;
#endif
	fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
	fpl.fsearch = false;
	fpl.tvp = NULL; /* for degenerate path handling */
	fpl.pwd = pwdp;
	pwd = pwd_get_smr();
	*(fpl.pwd) = pwd;
	namei_setup_rootdir(ndp, cnp, pwd);
	ndp->ni_topdir = pwd->pwd_jdir;

	if (cnp->cn_pnbuf[0] == '/') {
		dvp = cache_fpl_handle_root(&fpl);
		ndp->ni_resflags = NIRES_ABS;
	} else {
		if (ndp->ni_dirfd == AT_FDCWD) {
			dvp = pwd->pwd_cdir;
		} else {
			error = cache_fplookup_dirfd(&fpl, &dvp);
			if (__predict_false(error != 0)) {
				goto out;
			}
		}
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
	error = cache_fplookup_impl(dvp, &fpl);
out:
	cache_fpl_smr_assert_not_entered(&fpl);
	cache_fpl_assert_status(&fpl);
	*status = fpl.status;
	if (SDT_PROBES_ENABLED()) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		if (fpl.status == CACHE_FPL_STATUS_HANDLED)
			SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
			    ndp);
	}

	if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
		MPASS(error != CACHE_FPL_FAILED);
		if (error != 0) {
			cache_fpl_cleanup_cnp(fpl.cnp);
			MPASS(fpl.dvp == NULL);
			MPASS(fpl.tvp == NULL);
		}
		ndp->ni_dvp = fpl.dvp;
		ndp->ni_vp = fpl.tvp;
	}
	return (error);
}
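
/*
 * Caller-side sketch, roughly what namei does (simplified, with error
 * handling and the EOPNOTSUPP case omitted):
 *
 *	error = cache_fplookup(ndp, &status, &pwd);
 *	switch (status) {
 *	case CACHE_FPL_STATUS_HANDLED:
 *		// the lookup was fully serviced above, error is final
 *		return (error);
 *	case CACHE_FPL_STATUS_PARTIAL:
 *		// continue in locked mode from the directory reached so far
 *		break;
 *	case CACHE_FPL_STATUS_ABORTED:
 *		// redo the lookup with the regular, locked mechanism
 *		break;
 *	}
 */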