/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <sys/capsicum.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
		struct negstate nu_neg;/* negative entry state */
	} n_un;
	u_char nc_flag;			/* flag bits */
	u_char nc_nlen;			/* length of name */
	char nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct timespec	nc_time;	/* timespec provided by fs */
	struct timespec	nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned.  Since others
 * may be in the same spot suffer a little bit and enforce the
 * alignment for everyone.  Note this is a nop for 64-bit platforms.
 */
#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
#define	CACHE_PATH_CUTOFF	39

#define CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
#define CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

#define	nc_vp		n_un.nu_vp
#define	nc_neg		n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	0x01
#define NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define NEG_HOT		0x01

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}
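
/*
 * Note: the NCF_INVALID store above is published with a release fence, which
 * pairs with the acquire fence in cache_ncp_canuse() below.  Lockless (SMR)
 * readers which dereference an entry therefore cannot miss the flag update
 * once the entry started being torn down.
 */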

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
static bool
cache_ncp_canuse(struct namecache *ncp)
{

	atomic_thread_fence_acq();
	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
}

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	rwlock	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks.  The locking
 * order is lower address first.  Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be write-locked to prevent other threads from seeing the entry.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case, this could
 * deadlock.  As such, this is resolved by trylocking and if that fails unlocking
 * the first node, locking everything in order and revalidating the state.
 */

VFS_SMR_DECLARE;

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncpurgeminvnodes;
SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
    "Number of vnodes below which purgevfs ignores the request");
static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");

static struct mtx __exclusive_cache_line	ncneg_shrink_lock;

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;
static struct neglist ncneg_hot;
static u_long numhotneg;

#define ncneghash	3
#define	numneglists	(ncneghash + 1)
static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct rwlock_padalign __read_mostly  *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly  *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}
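
/*
 * Note on the lock striping above: both NCP2NEGLIST() and VP2VNODELOCK()
 * derive an index from the object's address, shifted right by 8 to discard
 * the low bits (which carry little entropy due to allocator alignment) and
 * masked by the pool size.  A given entry or vnode therefore always maps to
 * the same lock without any per-object lock storage.
 */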

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

static struct namecache *
cache_alloc(int len, int ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	if (ncp == NULL)
		return;
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL && ticksp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	if (tsp != NULL)
		*tsp = ncp_ts->nc_time;
	if (ticksp != NULL)
		*ticksp = ncp_ts->nc_ticks;
}

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");
#define STATNODE_ULONG(name, descr)					\
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define STATNODE_COUNTER(name, descr)					\
	static COUNTER_U64_DEFINE_EARLY(name);				\
	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
	    descr);
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
hits"); 455 STATNODE_COUNTER(nummiss, "Number of cache misses"); 456 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 457 STATNODE_COUNTER(numposzaps, 458 "Number of cache hits (positive) we do not want to cache"); 459 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 460 STATNODE_COUNTER(numnegzaps, 461 "Number of cache hits (negative) we do not want to cache"); 462 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 463 /* These count for vn_getcwd(), too. */ 464 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 465 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 466 STATNODE_COUNTER(numfullpathfail2, 467 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 468 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 469 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 470 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 471 "Number of successful removals after relocking"); 472 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 473 "Number of times zap_and_exit failed to lock"); 474 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 475 "Number of times zap_and_exit failed to lock"); 476 static long cache_lock_vnodes_cel_3_failures; 477 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 478 "Number of times 3-way vnode locking failed"); 479 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 480 STATNODE_COUNTER(numneg_evicted, 481 "Number of negative entries evicted when adding a new entry"); 482 STATNODE_COUNTER(shrinking_skipped, 483 "Number of times shrinking was already in progress"); 484 485 static void cache_zap_locked(struct namecache *ncp); 486 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, 487 char **freebuf, size_t *buflen); 488 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 489 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend); 490 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 491 char **retbuf, size_t *buflen); 492 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 493 char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 494 495 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 496 497 static int cache_yield; 498 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, 499 "Number of times cache called yield"); 500 501 static void __noinline 502 cache_maybe_yield(void) 503 { 504 505 if (should_yield()) { 506 cache_yield++; 507 kern_yield(PRI_USER); 508 } 509 } 510 511 static inline void 512 cache_assert_vlp_locked(struct mtx *vlp) 513 { 514 515 if (vlp != NULL) 516 mtx_assert(vlp, MA_OWNED); 517 } 518 519 static inline void 520 cache_assert_vnode_locked(struct vnode *vp) 521 { 522 struct mtx *vlp; 523 524 vlp = VP2VNODELOCK(vp); 525 cache_assert_vlp_locked(vlp); 526 } 527 528 /* 529 * TODO: With the value stored we can do better than computing the hash based 530 * on the address. The choice of FNV should also be revisited. 

/*
 * TODO: With the value stored we can do better than computing the hash based
 * on the address.  The choice of FNV should also be revisited.
 */
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}

static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct rwlock *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp, int mode)
{
	struct rwlock *blp;

	blp = NCP2BUCKETLOCK(ncp);
	rw_assert(blp, mode);
}
#else
#define cache_assert_bucket_locked(x, y) do { } while (0)
#endif

#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wlock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wunlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * A variation of LRU scheme is used.  New entries are hashed into one of
 * numneglists cold lists.  Entries get promoted to the hot list on first hit.
 *
 * The shrinker will demote hot list head and evict from the cold list in a
 * round-robin manner.
 */
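
/*
 * The hot list and each cold list are protected by their own mutexes.  When
 * both are needed (promotion in cache_negative_hit(), or demotion/removal of
 * a hot entry), ncneg_hot.nl_lock is taken before the per-list lock and the
 * NEG_HOT flag is re-checked once the locks are held.
 */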

static void
cache_negative_init(struct namecache *ncp)
{
	struct negstate *negstate;

	ncp->nc_flag |= NCF_NEGATIVE;
	negstate = NCP2NEGSTATE(ncp);
	negstate->neg_flag = 0;
}

static void
cache_negative_hit(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;

	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0)
		return;
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&ncneg_hot.nl_lock);
	mtx_lock(&neglist->nl_lock);
	if ((negstate->neg_flag & NEG_HOT) == 0) {
		numhotneg++;
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
		negstate->neg_flag |= NEG_HOT;
	}
	mtx_unlock(&neglist->nl_lock);
	mtx_unlock(&ncneg_hot.nl_lock);
}

static void
cache_negative_insert(struct namecache *ncp)
{
	struct neglist *neglist;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&neglist->nl_lock);
	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
	mtx_unlock(&neglist->nl_lock);
	atomic_add_rel_long(&numneg, 1);
}
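
/*
 * Remove the entry from its LRU list.  The entry may be getting promoted to
 * the hot list by a concurrent lockless lookup, hence the flag is re-checked
 * after the relevant locks are acquired.
 */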

static void
cache_negative_remove(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;
	bool hot_locked = false;
	bool list_locked = false;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	neglist = NCP2NEGLIST(ncp);
	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		hot_locked = true;
		mtx_lock(&ncneg_hot.nl_lock);
		if ((negstate->neg_flag & NEG_HOT) == 0) {
			list_locked = true;
			mtx_lock(&neglist->nl_lock);
		}
	} else {
		list_locked = true;
		mtx_lock(&neglist->nl_lock);
		/*
		 * We may be racing against promotion in lockless lookup.
		 */
		if ((negstate->neg_flag & NEG_HOT) != 0) {
			mtx_unlock(&neglist->nl_lock);
			hot_locked = true;
			mtx_lock(&ncneg_hot.nl_lock);
			mtx_lock(&neglist->nl_lock);
		}
	}
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		numhotneg--;
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
	}
	if (list_locked)
		mtx_unlock(&neglist->nl_lock);
	if (hot_locked)
		mtx_unlock(&ncneg_hot.nl_lock);
	atomic_subtract_rel_long(&numneg, 1);
}

static void
cache_negative_shrink_select(struct namecache **ncpp,
    struct neglist **neglistpp)
{
	struct neglist *neglist;
	struct namecache *ncp;
	static u_int cycle;
	u_int i;

	*ncpp = ncp = NULL;

	for (i = 0; i < numneglists; i++) {
		neglist = &neglists[(cycle + i) % numneglists];
		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
			continue;
		mtx_lock(&neglist->nl_lock);
		ncp = TAILQ_FIRST(&neglist->nl_list);
		if (ncp != NULL)
			break;
		mtx_unlock(&neglist->nl_lock);
	}

	*neglistpp = neglist;
	*ncpp = ncp;
	cycle++;
}

static void
cache_negative_zap_one(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *neglist;
	struct negstate *negstate;
	struct mtx *dvlp;
	struct rwlock *blp;

	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
	    !mtx_trylock(&ncneg_shrink_lock)) {
		counter_u64_add(shrinking_skipped, 1);
		return;
	}

	mtx_lock(&ncneg_hot.nl_lock);
	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
	if (ncp != NULL) {
		neglist = NCP2NEGLIST(ncp);
		negstate = NCP2NEGSTATE(ncp);
		mtx_lock(&neglist->nl_lock);
		MPASS((negstate->neg_flag & NEG_HOT) != 0);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
		negstate->neg_flag &= ~NEG_HOT;
		numhotneg--;
		mtx_unlock(&neglist->nl_lock);
	}
	mtx_unlock(&ncneg_hot.nl_lock);

	cache_negative_shrink_select(&ncp, &neglist);

	mtx_unlock(&ncneg_shrink_lock);
	if (ncp == NULL)
		return;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	blp = NCP2BUCKETLOCK(ncp);
	mtx_unlock(&neglist->nl_lock);
	mtx_lock(dvlp);
	rw_wlock(blp);
	/*
	 * Enter SMR to safely check the negative list.
	 * Even if the found pointer matches, the entry may now be reallocated
	 * and used by a different vnode.
	 */
	vfs_smr_enter();
	ncp2 = TAILQ_FIRST(&neglist->nl_list);
	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
	    blp != NCP2BUCKETLOCK(ncp2)) {
		vfs_smr_exit();
		ncp = NULL;
	} else {
		vfs_smr_exit();
		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(numneg_evicted, 1);
	}
	rw_wunlock(blp);
	mtx_unlock(dvlp);
	cache_free(ncp);
}

/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_vp);
			ncp->nc_vp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_vp);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_negative_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_dvp);
			ncp->nc_dvp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_dvp);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			counter_u64_add(numcachehv, -1);
		}
	}
	atomic_subtract_rel_long(&numcache, 1);
}
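
/*
 * The cache_zap_*() helpers below all funnel into cache_zap_locked(), but
 * differ in what the caller already holds: either one of the involved vnode
 * locks ("locked_vnode") or the hash bucket lock in a given state ("wlocked",
 * "rlocked" or "unlocked" bucket).  The "_kl" variants may hand extra vnode
 * locks back to the caller through the passed-in pointers.
 */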

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct rwlock *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	rw_wlock(blp);
	cache_zap_locked(ncp);
	rw_wunlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	rw_wlock(blp);
	cache_zap_locked(ncp);
	rw_wunlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

static int __noinline
cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;
	int error = 0;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	pvlp = VP2VNODELOCK(vp);
	if (ncp->nc_flag & NCF_NEGATIVE) {
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		goto out;
	}

	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);
	cache_sort_vnodes(&vlp1, &vlp2);
	if (vlp1 == pvlp) {
		mtx_lock(vlp2);
		to_unlock = vlp2;
	} else {
		if (!mtx_trylock(vlp1)) {
			error = EAGAIN;
			goto out;
		}
		to_unlock = vlp1;
	}
	rw_wlock(blp);
	cache_zap_locked(ncp);
	rw_wunlock(blp);
	mtx_unlock(to_unlock);
out:
	mtx_unlock(pvlp);
	return (error);
}

/*
 * If trylocking failed we can get here.  We know enough to take all needed locks
 * in the right order and re-lookup the entry.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct rwlock *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_locked(ncp, RA_UNLOCKED);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	rw_wlock(blp);
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
		return (0);
	}

	rw_wunlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct rwlock *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	rw_wunlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static int __noinline
cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct rwlock *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp, RA_RLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		rw_runlock(blp);
		rw_wlock(blp);
		cache_zap_locked(ncp);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	rw_runlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static int
cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
    struct mtx **vlpp1, struct mtx **vlpp2)
{
	struct mtx *dvlp, *vlp;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	cache_sort_vnodes(&dvlp, &vlp);

	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
		cache_zap_locked(ncp);
		cache_unlock_vnodes(dvlp, vlp);
		*vlpp1 = NULL;
		*vlpp2 = NULL;
		return (0);
	}

	if (*vlpp1 != NULL)
		mtx_unlock(*vlpp1);
	if (*vlpp2 != NULL)
		mtx_unlock(*vlpp2);
	*vlpp1 = NULL;
	*vlpp2 = NULL;

	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	rw_wunlock(blp);
	*vlpp1 = dvlp;
	*vlpp2 = vlp;
	if (*vlpp1 != NULL)
		mtx_lock(*vlpp1);
	mtx_lock(*vlpp2);
	rw_wlock(blp);
	return (EAGAIN);
}

static void
cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
{

	if (blp != NULL) {
		rw_runlock(blp);
	} else {
		mtx_unlock(vlp);
	}
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
	    dvp, cnp->cn_nameptr);
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}
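
/*
 * Remove any entry matching the given name from dvp.  Returns 1 if an entry
 * was found and zapped, 0 if there was nothing to remove.
 */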

static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct rwlock *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_dvp != dvp)
				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
			if (!cache_zap_locked_vnode_kl2(ncp,
			    dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			vn_seqc_write_begin(dvp);
			dvp->v_cache_dd = NULL;
			vn_seqc_write_end(dvp);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
		return (1);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	rw_wlock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		rw_wunlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail++;
		cache_maybe_yield();
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	cache_free(ncp);
	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
	return (1);
out_no_entry:
	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
	counter_u64_add(nummisszap, 1);
	return (0);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it will
 *		be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  vpp will not be modified.  If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct negstate *negstate;
	struct rwlock *blp;
	struct mtx *dvlp;
	uint32_t hash;
	enum vgetstate vs;
	int error, ltype;
	bool try_smr, doing_smr, whiteout;

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	try_smr = true;
	if (cnp->cn_nameiop == CREATE)
		try_smr = false;
retry:
	doing_smr = false;
	blp = NULL;
	dvlp = NULL;
	error = 0;
	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		counter_u64_add(dotdothits, 1);
		dvlp = VP2VNODELOCK(dvp);
		mtx_lock(dvlp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
			    "..", NULL);
			mtx_unlock(dvlp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_flag & NCF_NEGATIVE)
				*vpp = NULL;
			else
				*vpp = ncp->nc_vp;
		} else
			*vpp = ncp->nc_dvp;
		/* Return failure if negative entry was found. */
		if (*vpp == NULL)
			goto negative_success;
		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
		    dvp, cnp->cn_nameptr, *vpp);
		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
		    *vpp);
		cache_out_ts(ncp, tsp, ticksp);
		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
		    NCF_DTS && tsp != NULL) {
			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
			*tsp = ncp_ts->nc_dotdottime;
		}
		goto success;
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
retry_hashed:
	if (try_smr) {
		vfs_smr_enter();
		doing_smr = true;
		try_smr = false;
	} else {
		blp = HASH2BUCKETLOCK(hash);
		rw_rlock(blp);
	}

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (__predict_false(ncp == NULL)) {
		if (doing_smr)
			vfs_smr_exit();
		else
			rw_runlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	/* We found a "positive" match, return the vnode */
	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
	    dvp, cnp->cn_nameptr, *vpp, ncp);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
	    *vpp);
	cache_out_ts(ncp, tsp, ticksp);
success:
	/*
	 * On success we return a locked and ref'd vnode as per the lookup
	 * protocol.
	 */
	MPASS(dvp != *vpp);
	ltype = 0;	/* silence gcc warning */
	if (cnp->cn_flags & ISDOTDOT) {
		ltype = VOP_ISLOCKED(dvp);
		VOP_UNLOCK(dvp);
	}
	if (doing_smr) {
		if (!cache_ncp_canuse(ncp)) {
			vfs_smr_exit();
			*vpp = NULL;
			goto retry;
		}
		vs = vget_prep_smr(*vpp);
		vfs_smr_exit();
		if (__predict_false(vs == VGET_NONE)) {
			*vpp = NULL;
			goto retry;
		}
	} else {
		vs = vget_prep(*vpp);
		cache_lookup_unlock(blp, dvlp);
	}
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (cnp->cn_flags & ISDOTDOT) {
		vn_lock(dvp, ltype | LK_RETRY);
		if (VN_IS_DOOMED(dvp)) {
			if (error == 0)
				vput(*vpp);
			*vpp = NULL;
			return (ENOENT);
		}
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
	}
	return (-1);

negative_success:
	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		MPASS(!doing_smr);
		counter_u64_add(numnegzaps, 1);
		goto zap_and_exit;
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);

	if (doing_smr) {
		/*
		 * We need to take locks to promote an entry.
		 */
		negstate = NCP2NEGSTATE(ncp);
		if ((negstate->neg_flag & NEG_HOT) == 0 ||
		    !cache_ncp_canuse(ncp)) {
			vfs_smr_exit();
			doing_smr = false;
			goto retry_hashed;
		}
		vfs_smr_exit();
	} else {
		cache_negative_hit(ncp);
		cache_lookup_unlock(blp, dvlp);
	}
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);

zap_and_exit:
	MPASS(!doing_smr);
	if (blp != NULL)
		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
	else
		error = cache_zap_locked_vnode(ncp, dvp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail2++;
		cache_maybe_yield();
		goto retry;
	}
	cache_free(ncp);
	return (0);
}

struct celockstate {
	struct mtx *vlp[3];
	struct rwlock *blp[2];
};
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));

static inline void
cache_celockstate_init(struct celockstate *cel)
{

	bzero(cel, sizeof(*cel));
}

static void
cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
    struct vnode *dvp)
{
	struct mtx *vlp1, *vlp2;

	MPASS(cel->vlp[0] == NULL);
	MPASS(cel->vlp[1] == NULL);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL || dvp != NULL);

	vlp1 = VP2VNODELOCK(vp);
	vlp2 = VP2VNODELOCK(dvp);
	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		mtx_lock(vlp1);
		cel->vlp[0] = vlp1;
	}
	mtx_lock(vlp2);
	cel->vlp[1] = vlp2;
}

static void
cache_unlock_vnodes_cel(struct celockstate *cel)
{

	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);

	if (cel->vlp[0] != NULL)
		mtx_unlock(cel->vlp[0]);
	if (cel->vlp[1] != NULL)
		mtx_unlock(cel->vlp[1]);
	if (cel->vlp[2] != NULL)
		mtx_unlock(cel->vlp[2]);
}
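
/*
 * Lock the vnode lock for a third vnode on top of the two already held in
 * cel, preserving the lower-address-first lock order.  Returns false if the
 * already held locks had to be dropped and reacquired, in which case the
 * caller must revalidate its state.
 */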

static bool
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
{
	struct mtx *vlp;
	bool ret;

	cache_assert_vlp_locked(cel->vlp[0]);
	cache_assert_vlp_locked(cel->vlp[1]);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL);
	vlp = VP2VNODELOCK(vp);

	ret = true;
	if (vlp >= cel->vlp[1]) {
		mtx_lock(vlp);
	} else {
		if (mtx_trylock(vlp))
			goto out;
		cache_lock_vnodes_cel_3_failures++;
		cache_unlock_vnodes_cel(cel);
		if (vlp < cel->vlp[0]) {
			mtx_lock(vlp);
			mtx_lock(cel->vlp[0]);
			mtx_lock(cel->vlp[1]);
		} else {
			if (cel->vlp[0] != NULL)
				mtx_lock(cel->vlp[0]);
			mtx_lock(vlp);
			mtx_lock(cel->vlp[1]);
		}
		ret = false;
	}
out:
	cel->vlp[2] = vlp;
	return (ret);
}

static void
cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
    struct rwlock *blp2)
{

	MPASS(cel->blp[0] == NULL);
	MPASS(cel->blp[1] == NULL);

	cache_sort_vnodes(&blp1, &blp2);

	if (blp1 != NULL) {
		rw_wlock(blp1);
		cel->blp[0] = blp1;
	}
	rw_wlock(blp2);
	cel->blp[1] = blp2;
}

static void
cache_unlock_buckets_cel(struct celockstate *cel)
{

	if (cel->blp[0] != NULL)
		rw_wunlock(cel->blp[0]);
	rw_wunlock(cel->blp[1]);
}

/*
 * Lock part of the cache affected by the insertion.
 *
 * This means vnodelocks for dvp, vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry.  In this
 * case we have an additional vnode and bucketlock pair to lock.  If the
 * entry is negative, ncelock is locked instead of the vnode.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 * preserving the locking order (smaller address first).
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct rwlock *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		if (vp == NULL || vp->v_type != VDIR)
			break;
		ncp = vp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == vp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		/*
		 * All vnodes got re-locked.  Re-validate the state and if
		 * nothing changed we are done.  Otherwise restart.
		 */
		if (ncp == vp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct rwlock *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == dvp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		if (ncp == dvp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}

static void __noinline
cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp)
{
	struct celockstate cel;
	struct namecache *ncp;
	uint32_t hash;
	int len;

	if (dvp->v_cache_dd == NULL)
		return;
	len = cnp->cn_namelen;
	cache_celockstate_init(&cel);
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	cache_enter_lock_dd(&cel, dvp, vp, hash);
	vn_seqc_write_begin(dvp);
	ncp = dvp->v_cache_dd;
	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
		cache_zap_locked(ncp);
	} else {
		ncp = NULL;
	}
	dvp->v_cache_dd = NULL;
	vn_seqc_write_end(dvp);
	cache_enter_unlock(&cel);
	cache_free(ncp);
}
1929 */ 1930 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1931 if (__predict_false(lnumcache >= ncsize)) { 1932 atomic_add_long(&numcache, -1); 1933 counter_u64_add(numdrops, 1); 1934 return; 1935 } 1936 1937 cache_celockstate_init(&cel); 1938 ndd = NULL; 1939 ncp_ts = NULL; 1940 1941 /* 1942 * Calculate the hash key and setup as much of the new 1943 * namecache entry as possible before acquiring the lock. 1944 */ 1945 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1946 ncp->nc_flag = flag | NCF_WIP; 1947 ncp->nc_vp = vp; 1948 if (vp == NULL) 1949 cache_negative_init(ncp); 1950 ncp->nc_dvp = dvp; 1951 if (tsp != NULL) { 1952 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1953 ncp_ts->nc_time = *tsp; 1954 ncp_ts->nc_ticks = ticks; 1955 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1956 if (dtsp != NULL) { 1957 ncp_ts->nc_dotdottime = *dtsp; 1958 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1959 } 1960 } 1961 len = ncp->nc_nlen = cnp->cn_namelen; 1962 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1963 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 1964 ncp->nc_name[len] = '\0'; 1965 cache_enter_lock(&cel, dvp, vp, hash); 1966 1967 /* 1968 * See if this vnode or negative entry is already in the cache 1969 * with this name. This can happen with concurrent lookups of 1970 * the same path name. 1971 */ 1972 ncpp = NCHHASH(hash); 1973 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 1974 if (n2->nc_dvp == dvp && 1975 n2->nc_nlen == cnp->cn_namelen && 1976 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1977 MPASS(cache_ncp_canuse(n2)); 1978 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 1979 KASSERT(vp == NULL, 1980 ("%s: found entry pointing to a different vnode (%p != %p)", 1981 __func__, NULL, vp)); 1982 else 1983 KASSERT(n2->nc_vp == vp, 1984 ("%s: found entry pointing to a different vnode (%p != %p)", 1985 __func__, n2->nc_vp, vp)); 1986 if (tsp != NULL) { 1987 KASSERT((n2->nc_flag & NCF_TS) != 0, 1988 ("no NCF_TS")); 1989 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1990 n2_ts->nc_time = ncp_ts->nc_time; 1991 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1992 if (dtsp != NULL) { 1993 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1994 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1995 } 1996 } 1997 goto out_unlock_free; 1998 } 1999 } 2000 2001 if (flag == NCF_ISDOTDOT) { 2002 /* 2003 * See if we are trying to add .. entry, but some other lookup 2004 * has populated v_cache_dd pointer already. 2005 */ 2006 if (dvp->v_cache_dd != NULL) 2007 goto out_unlock_free; 2008 KASSERT(vp == NULL || vp->v_type == VDIR, 2009 ("wrong vnode type %p", vp)); 2010 vn_seqc_write_begin(dvp); 2011 dvp->v_cache_dd = ncp; 2012 vn_seqc_write_end(dvp); 2013 } 2014 2015 if (vp != NULL) { 2016 if (flag != NCF_ISDOTDOT) { 2017 /* 2018 * For this case, the cache entry maps both the 2019 * directory name in it and the name ".." for the 2020 * directory's parent. 
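 *
 * Consequently vp->v_cache_dd is pointed at the new entry and any
 * previously installed NCF_ISDOTDOT entry (ndd) is zapped here and
 * cache_free()d once all of the locks are dropped.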
2021 */ 2022 vn_seqc_write_begin(vp); 2023 if ((ndd = vp->v_cache_dd) != NULL) { 2024 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2025 cache_zap_locked(ndd); 2026 else 2027 ndd = NULL; 2028 } 2029 vp->v_cache_dd = ncp; 2030 vn_seqc_write_end(vp); 2031 } else if (vp->v_type != VDIR) { 2032 if (vp->v_cache_dd != NULL) { 2033 vn_seqc_write_begin(vp); 2034 vp->v_cache_dd = NULL; 2035 vn_seqc_write_end(vp); 2036 } 2037 } 2038 } 2039 2040 if (flag != NCF_ISDOTDOT) { 2041 if (LIST_EMPTY(&dvp->v_cache_src)) { 2042 vhold(dvp); 2043 counter_u64_add(numcachehv, 1); 2044 } 2045 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2046 } 2047 2048 /* 2049 * If the entry is "negative", we place it into the 2050 * "negative" cache queue, otherwise, we place it into the 2051 * destination vnode's cache entries queue. 2052 */ 2053 if (vp != NULL) { 2054 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2055 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2056 vp); 2057 } else { 2058 if (cnp->cn_flags & ISWHITEOUT) 2059 ncp->nc_flag |= NCF_WHITE; 2060 cache_negative_insert(ncp); 2061 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2062 ncp->nc_name); 2063 } 2064 2065 /* 2066 * Insert the new namecache entry into the appropriate chain 2067 * within the cache entries table. 2068 */ 2069 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2070 2071 atomic_thread_fence_rel(); 2072 /* 2073 * Mark the entry as fully constructed. 2074 * It is immutable past this point until its removal. 2075 */ 2076 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2077 2078 cache_enter_unlock(&cel); 2079 if (numneg * ncnegfactor > lnumcache) 2080 cache_negative_zap_one(); 2081 cache_free(ndd); 2082 return; 2083 out_unlock_free: 2084 cache_enter_unlock(&cel); 2085 atomic_add_long(&numcache, -1); 2086 cache_free(ncp); 2087 return; 2088 } 2089 2090 static u_int 2091 cache_roundup_2(u_int val) 2092 { 2093 u_int res; 2094 2095 for (res = 1; res <= val; res <<= 1) 2096 continue; 2097 2098 return (res); 2099 } 2100 2101 static struct nchashhead * 2102 nchinittbl(u_long elements, u_long *hashmask) 2103 { 2104 struct nchashhead *hashtbl; 2105 u_long hashsize, i; 2106 2107 hashsize = cache_roundup_2(elements) / 2; 2108 2109 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2110 for (i = 0; i < hashsize; i++) 2111 CK_SLIST_INIT(&hashtbl[i]); 2112 *hashmask = hashsize - 1; 2113 return (hashtbl); 2114 } 2115 2116 static void 2117 ncfreetbl(struct nchashhead *hashtbl) 2118 { 2119 2120 free(hashtbl, M_VFSCACHE); 2121 } 2122 2123 /* 2124 * Name cache initialization, from vfs_init() when we are booting 2125 */ 2126 static void 2127 nchinit(void *dummy __unused) 2128 { 2129 u_int i; 2130 2131 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2132 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2133 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2134 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2135 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2136 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2137 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2138 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2139 2140 VFS_SMR_ZONE_SET(cache_zone_small); 2141 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2142 VFS_SMR_ZONE_SET(cache_zone_large); 2143 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2144 2145 ncsize = desiredvnodes * ncsizefactor; 2146 nchashtbl = 
nchinittbl(desiredvnodes * 2, &nchash); 2147 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2148 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2149 ncbuckethash = 7; 2150 if (ncbuckethash > nchash) 2151 ncbuckethash = nchash; 2152 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2153 M_WAITOK | M_ZERO); 2154 for (i = 0; i < numbucketlocks; i++) 2155 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 2156 ncvnodehash = ncbuckethash; 2157 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2158 M_WAITOK | M_ZERO); 2159 for (i = 0; i < numvnodelocks; i++) 2160 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2161 ncpurgeminvnodes = numbucketlocks * 2; 2162 2163 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 2164 M_WAITOK | M_ZERO); 2165 for (i = 0; i < numneglists; i++) { 2166 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2167 TAILQ_INIT(&neglists[i].nl_list); 2168 } 2169 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 2170 TAILQ_INIT(&ncneg_hot.nl_list); 2171 2172 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2173 } 2174 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2175 2176 void 2177 cache_vnode_init(struct vnode *vp) 2178 { 2179 2180 LIST_INIT(&vp->v_cache_src); 2181 TAILQ_INIT(&vp->v_cache_dst); 2182 vp->v_cache_dd = NULL; 2183 cache_prehash(vp); 2184 } 2185 2186 void 2187 cache_changesize(u_long newmaxvnodes) 2188 { 2189 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2190 u_long new_nchash, old_nchash; 2191 struct namecache *ncp; 2192 uint32_t hash; 2193 u_long newncsize; 2194 int i; 2195 2196 newncsize = newmaxvnodes * ncsizefactor; 2197 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2198 if (newmaxvnodes < numbucketlocks) 2199 newmaxvnodes = numbucketlocks; 2200 2201 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2202 /* If same hash table size, nothing to do */ 2203 if (nchash == new_nchash) { 2204 ncfreetbl(new_nchashtbl); 2205 return; 2206 } 2207 /* 2208 * Move everything from the old hash table to the new table. 2209 * None of the namecache entries in the table can be removed 2210 * because to do so, they have to be removed from the hash table. 2211 */ 2212 cache_lock_all_vnodes(); 2213 cache_lock_all_buckets(); 2214 old_nchashtbl = nchashtbl; 2215 old_nchash = nchash; 2216 nchashtbl = new_nchashtbl; 2217 nchash = new_nchash; 2218 for (i = 0; i <= old_nchash; i++) { 2219 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2220 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2221 ncp->nc_dvp); 2222 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2223 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2224 } 2225 } 2226 ncsize = newncsize; 2227 cache_unlock_all_buckets(); 2228 cache_unlock_all_vnodes(); 2229 ncfreetbl(old_nchashtbl); 2230 } 2231 2232 /* 2233 * Invalidate all entries from and to a particular vnode. 
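 *
 * Entries hanging off v_cache_src (names this vnode is a parent of),
 * entries on v_cache_dst (names resolving to this vnode) and the
 * v_cache_dd ".." link are all zapped.  They are unhooked while the
 * relevant vnode locks are held, collected on a local list and only
 * cache_free()d after the locks have been dropped.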
2234 */ 2235 static void 2236 cache_purge_impl(struct vnode *vp) 2237 { 2238 TAILQ_HEAD(, namecache) ncps; 2239 struct namecache *ncp, *nnp; 2240 struct mtx *vlp, *vlp2; 2241 2242 TAILQ_INIT(&ncps); 2243 vlp = VP2VNODELOCK(vp); 2244 vlp2 = NULL; 2245 mtx_assert(vlp, MA_OWNED); 2246 retry: 2247 while (!LIST_EMPTY(&vp->v_cache_src)) { 2248 ncp = LIST_FIRST(&vp->v_cache_src); 2249 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2250 goto retry; 2251 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2252 } 2253 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2254 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2255 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2256 goto retry; 2257 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2258 } 2259 ncp = vp->v_cache_dd; 2260 if (ncp != NULL) { 2261 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2262 ("lost dotdot link")); 2263 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2264 goto retry; 2265 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2266 } 2267 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2268 mtx_unlock(vlp); 2269 if (vlp2 != NULL) 2270 mtx_unlock(vlp2); 2271 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2272 cache_free(ncp); 2273 } 2274 } 2275 2276 void 2277 cache_purge(struct vnode *vp) 2278 { 2279 struct mtx *vlp; 2280 2281 SDT_PROBE1(vfs, namecache, purge, done, vp); 2282 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2283 vp->v_cache_dd == NULL) 2284 return; 2285 vlp = VP2VNODELOCK(vp); 2286 mtx_lock(vlp); 2287 cache_purge_impl(vp); 2288 } 2289 2290 /* 2291 * Only to be used by vgone. 2292 */ 2293 void 2294 cache_purge_vgone(struct vnode *vp) 2295 { 2296 struct mtx *vlp; 2297 2298 VNPASS(VN_IS_DOOMED(vp), vp); 2299 vlp = VP2VNODELOCK(vp); 2300 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2301 vp->v_cache_dd == NULL)) { 2302 mtx_lock(vlp); 2303 cache_purge_impl(vp); 2304 mtx_assert(vlp, MA_NOTOWNED); 2305 return; 2306 } 2307 2308 /* 2309 * All the NULL pointer state we found above may be transient. 2310 * Serialize against a possible thread doing cache_purge. 2311 */ 2312 mtx_wait_unlocked(vlp); 2313 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2314 vp->v_cache_dd == NULL)) { 2315 mtx_lock(vlp); 2316 cache_purge_impl(vp); 2317 mtx_assert(vlp, MA_NOTOWNED); 2318 return; 2319 } 2320 return; 2321 } 2322 2323 /* 2324 * Invalidate all negative entries for a particular directory vnode. 
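 *
 * Only entries on v_cache_src which carry NCF_NEGATIVE are zapped;
 * positive entries for existing children are left alone.  As in
 * cache_purge_impl(), the removed entries are batched up and freed
 * only after the vnode lock is released.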
2325 */ 2326 void 2327 cache_purge_negative(struct vnode *vp) 2328 { 2329 TAILQ_HEAD(, namecache) ncps; 2330 struct namecache *ncp, *nnp; 2331 struct mtx *vlp; 2332 2333 CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); 2334 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2335 if (LIST_EMPTY(&vp->v_cache_src)) 2336 return; 2337 TAILQ_INIT(&ncps); 2338 vlp = VP2VNODELOCK(vp); 2339 mtx_lock(vlp); 2340 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2341 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2342 continue; 2343 cache_zap_negative_locked_vnode_kl(ncp, vp); 2344 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2345 } 2346 mtx_unlock(vlp); 2347 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2348 cache_free(ncp); 2349 } 2350 } 2351 2352 void 2353 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2354 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2355 { 2356 2357 ASSERT_VOP_IN_SEQC(fdvp); 2358 ASSERT_VOP_IN_SEQC(fvp); 2359 ASSERT_VOP_IN_SEQC(tdvp); 2360 if (tvp != NULL) 2361 ASSERT_VOP_IN_SEQC(tvp); 2362 2363 cache_purge(fvp); 2364 if (tvp != NULL) { 2365 cache_purge(tvp); 2366 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2367 ("%s: lingering negative entry", __func__)); 2368 } else { 2369 cache_remove_cnp(tdvp, tcnp); 2370 } 2371 } 2372 2373 /* 2374 * Flush all entries referencing a particular filesystem. 2375 */ 2376 void 2377 cache_purgevfs(struct mount *mp, bool force) 2378 { 2379 TAILQ_HEAD(, namecache) ncps; 2380 struct mtx *vlp1, *vlp2; 2381 struct rwlock *blp; 2382 struct nchashhead *bucket; 2383 struct namecache *ncp, *nnp; 2384 u_long i, j, n_nchash; 2385 int error; 2386 2387 /* Scan hash tables for applicable entries */ 2388 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2389 if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) 2390 return; 2391 TAILQ_INIT(&ncps); 2392 n_nchash = nchash + 1; 2393 vlp1 = vlp2 = NULL; 2394 for (i = 0; i < numbucketlocks; i++) { 2395 blp = (struct rwlock *)&bucketlocks[i]; 2396 rw_wlock(blp); 2397 for (j = i; j < n_nchash; j += numbucketlocks) { 2398 retry: 2399 bucket = &nchashtbl[j]; 2400 CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { 2401 cache_assert_bucket_locked(ncp, RA_WLOCKED); 2402 if (ncp->nc_dvp->v_mount != mp) 2403 continue; 2404 error = cache_zap_wlocked_bucket_kl(ncp, blp, 2405 &vlp1, &vlp2); 2406 if (error != 0) 2407 goto retry; 2408 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); 2409 } 2410 } 2411 rw_wunlock(blp); 2412 if (vlp1 == NULL && vlp2 == NULL) 2413 cache_maybe_yield(); 2414 } 2415 if (vlp1 != NULL) 2416 mtx_unlock(vlp1); 2417 if (vlp2 != NULL) 2418 mtx_unlock(vlp2); 2419 2420 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2421 cache_free(ncp); 2422 } 2423 } 2424 2425 /* 2426 * Perform canonical checks and cache lookup and pass on to filesystem 2427 * through the vop_cachedlookup only if needed. 
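 *
 * cache_lookup() returns 0 on a miss, in which case the real lookup
 * is performed via VOP_CACHEDLOOKUP; -1 on a positive hit with *vpp
 * already set; or an error such as ENOENT for a cached negative
 * entry, which is passed straight back to the caller.
 *
 * A filesystem opts in by routing its lookup through this function
 * and providing the uncached lookup separately.  Illustrative sketch
 * only (a hypothetical "foofs"; consult an existing filesystem for
 * the authoritative wiring):
 *
 *	struct vop_vector foofs_vnodeops = {
 *		.vop_default		= &default_vnodeops,
 *		.vop_lookup		= vfs_cache_lookup,
 *		.vop_cachedlookup	= foofs_lookup,
 *	};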
2428 */ 2429 2430 int 2431 vfs_cache_lookup(struct vop_lookup_args *ap) 2432 { 2433 struct vnode *dvp; 2434 int error; 2435 struct vnode **vpp = ap->a_vpp; 2436 struct componentname *cnp = ap->a_cnp; 2437 int flags = cnp->cn_flags; 2438 2439 *vpp = NULL; 2440 dvp = ap->a_dvp; 2441 2442 if (dvp->v_type != VDIR) 2443 return (ENOTDIR); 2444 2445 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2446 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2447 return (EROFS); 2448 2449 error = vn_dir_check_exec(dvp, cnp); 2450 if (error != 0) 2451 return (error); 2452 2453 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2454 if (error == 0) 2455 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2456 if (error == -1) 2457 return (0); 2458 return (error); 2459 } 2460 2461 /* Implementation of the getcwd syscall. */ 2462 int 2463 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2464 { 2465 char *buf, *retbuf; 2466 size_t buflen; 2467 int error; 2468 2469 buflen = uap->buflen; 2470 if (__predict_false(buflen < 2)) 2471 return (EINVAL); 2472 if (buflen > MAXPATHLEN) 2473 buflen = MAXPATHLEN; 2474 2475 buf = uma_zalloc(namei_zone, M_WAITOK); 2476 error = vn_getcwd(buf, &retbuf, &buflen); 2477 if (error == 0) 2478 error = copyout(retbuf, uap->buf, buflen); 2479 uma_zfree(namei_zone, buf); 2480 return (error); 2481 } 2482 2483 int 2484 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2485 { 2486 struct pwd *pwd; 2487 int error; 2488 2489 vfs_smr_enter(); 2490 pwd = pwd_get_smr(); 2491 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2492 buflen, false, 0); 2493 VFS_SMR_ASSERT_NOT_ENTERED(); 2494 if (error < 0) { 2495 pwd = pwd_hold(curthread); 2496 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2497 retbuf, buflen); 2498 pwd_drop(pwd); 2499 } 2500 2501 #ifdef KTRACE 2502 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2503 ktrnamei(*retbuf); 2504 #endif 2505 return (error); 2506 } 2507 2508 static int 2509 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2510 size_t size, int flags, enum uio_seg pathseg) 2511 { 2512 struct nameidata nd; 2513 char *retbuf, *freebuf; 2514 int error; 2515 2516 if (flags != 0) 2517 return (EINVAL); 2518 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2519 pathseg, path, fd, &cap_fstat_rights, td); 2520 if ((error = namei(&nd)) != 0) 2521 return (error); 2522 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2523 if (error == 0) { 2524 error = copyout(retbuf, buf, size); 2525 free(freebuf, M_TEMP); 2526 } 2527 NDFREE(&nd, 0); 2528 return (error); 2529 } 2530 2531 int 2532 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2533 { 2534 2535 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2536 uap->flags, UIO_USERSPACE)); 2537 } 2538 2539 /* 2540 * Retrieve the full filesystem path that correspond to a vnode from the name 2541 * cache (if available) 2542 */ 2543 int 2544 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2545 { 2546 struct pwd *pwd; 2547 char *buf; 2548 size_t buflen; 2549 int error; 2550 2551 if (__predict_false(vp == NULL)) 2552 return (EINVAL); 2553 2554 buflen = MAXPATHLEN; 2555 buf = malloc(buflen, M_TEMP, M_WAITOK); 2556 vfs_smr_enter(); 2557 pwd = pwd_get_smr(); 2558 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0); 2559 VFS_SMR_ASSERT_NOT_ENTERED(); 2560 if (error < 0) { 2561 pwd = pwd_hold(curthread); 2562 error = vn_fullpath_any(vp, pwd->pwd_rdir, 
buf, retbuf, &buflen); 2563 pwd_drop(pwd); 2564 } 2565 if (error == 0) 2566 *freebuf = buf; 2567 else 2568 free(buf, M_TEMP); 2569 return (error); 2570 } 2571 2572 /* 2573 * This function is similar to vn_fullpath, but it attempts to lookup the 2574 * pathname relative to the global root mount point. This is required for the 2575 * auditing sub-system, as audited pathnames must be absolute, relative to the 2576 * global root mount point. 2577 */ 2578 int 2579 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2580 { 2581 char *buf; 2582 size_t buflen; 2583 int error; 2584 2585 if (__predict_false(vp == NULL)) 2586 return (EINVAL); 2587 buflen = MAXPATHLEN; 2588 buf = malloc(buflen, M_TEMP, M_WAITOK); 2589 vfs_smr_enter(); 2590 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0); 2591 VFS_SMR_ASSERT_NOT_ENTERED(); 2592 if (error < 0) { 2593 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2594 } 2595 if (error == 0) 2596 *freebuf = buf; 2597 else 2598 free(buf, M_TEMP); 2599 return (error); 2600 } 2601 2602 static struct namecache * 2603 vn_dd_from_dst(struct vnode *vp) 2604 { 2605 struct namecache *ncp; 2606 2607 cache_assert_vnode_locked(vp); 2608 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2609 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2610 return (ncp); 2611 } 2612 return (NULL); 2613 } 2614 2615 int 2616 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2617 { 2618 struct vnode *dvp; 2619 struct namecache *ncp; 2620 struct mtx *vlp; 2621 int error; 2622 2623 vlp = VP2VNODELOCK(*vp); 2624 mtx_lock(vlp); 2625 ncp = (*vp)->v_cache_dd; 2626 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 2627 KASSERT(ncp == vn_dd_from_dst(*vp), 2628 ("%s: mismatch for dd entry (%p != %p)", __func__, 2629 ncp, vn_dd_from_dst(*vp))); 2630 } else { 2631 ncp = vn_dd_from_dst(*vp); 2632 } 2633 if (ncp != NULL) { 2634 if (*buflen < ncp->nc_nlen) { 2635 mtx_unlock(vlp); 2636 vrele(*vp); 2637 counter_u64_add(numfullpathfail4, 1); 2638 error = ENOMEM; 2639 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2640 vp, NULL); 2641 return (error); 2642 } 2643 *buflen -= ncp->nc_nlen; 2644 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2645 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2646 ncp->nc_name, vp); 2647 dvp = *vp; 2648 *vp = ncp->nc_dvp; 2649 vref(*vp); 2650 mtx_unlock(vlp); 2651 vrele(dvp); 2652 return (0); 2653 } 2654 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2655 2656 mtx_unlock(vlp); 2657 vn_lock(*vp, LK_SHARED | LK_RETRY); 2658 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2659 vput(*vp); 2660 if (error) { 2661 counter_u64_add(numfullpathfail2, 1); 2662 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2663 return (error); 2664 } 2665 2666 *vp = dvp; 2667 if (VN_IS_DOOMED(dvp)) { 2668 /* forced unmount */ 2669 vrele(dvp); 2670 error = ENOENT; 2671 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2672 return (error); 2673 } 2674 /* 2675 * *vp has its use count incremented still. 2676 */ 2677 2678 return (0); 2679 } 2680 2681 /* 2682 * Resolve a directory to a pathname. 2683 * 2684 * The name of the directory can always be found in the namecache or fetched 2685 * from the filesystem. There is also guaranteed to be only one parent, meaning 2686 * we can just follow vnodes up until we find the root. 2687 * 2688 * The vnode must be referenced. 
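 *
 * The path is assembled backwards, from the end of the buffer towards
 * its beginning: every component is copied in just ahead of what has
 * already been assembled and a '/' is prepended via buf[--buflen].
 * Sketch of the final layout:
 *
 *	buf:  [ <unused space> / u s r / l o c a l / b i n \0 ]
 *	                       ^
 *	                       start of the returned path
 *
 * On success *retbuf therefore points into the middle of buf rather
 * than at buf itself and *len is adjusted to cover only the used
 * portion (plus any addend supplied by the caller).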
2689 */ 2690 static int 2691 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2692 size_t *len, bool slash_prefixed, size_t addend) 2693 { 2694 #ifdef KDTRACE_HOOKS 2695 struct vnode *startvp = vp; 2696 #endif 2697 struct vnode *vp1; 2698 size_t buflen; 2699 int error; 2700 2701 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2702 VNPASS(vp->v_usecount > 0, vp); 2703 2704 buflen = *len; 2705 2706 if (!slash_prefixed) { 2707 MPASS(*len >= 2); 2708 buflen--; 2709 buf[buflen] = '\0'; 2710 } 2711 2712 error = 0; 2713 2714 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2715 counter_u64_add(numfullpathcalls, 1); 2716 while (vp != rdir && vp != rootvnode) { 2717 /* 2718 * The vp vnode must be already fully constructed, 2719 * since it is either found in namecache or obtained 2720 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2721 * without obtaining the vnode lock. 2722 */ 2723 if ((vp->v_vflag & VV_ROOT) != 0) { 2724 vn_lock(vp, LK_RETRY | LK_SHARED); 2725 2726 /* 2727 * With the vnode locked, check for races with 2728 * unmount, forced or not. Note that we 2729 * already verified that vp is not equal to 2730 * the root vnode, which means that 2731 * mnt_vnodecovered can be NULL only for the 2732 * case of unmount. 2733 */ 2734 if (VN_IS_DOOMED(vp) || 2735 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2736 vp1->v_mountedhere != vp->v_mount) { 2737 vput(vp); 2738 error = ENOENT; 2739 SDT_PROBE3(vfs, namecache, fullpath, return, 2740 error, vp, NULL); 2741 break; 2742 } 2743 2744 vref(vp1); 2745 vput(vp); 2746 vp = vp1; 2747 continue; 2748 } 2749 if (vp->v_type != VDIR) { 2750 vrele(vp); 2751 counter_u64_add(numfullpathfail1, 1); 2752 error = ENOTDIR; 2753 SDT_PROBE3(vfs, namecache, fullpath, return, 2754 error, vp, NULL); 2755 break; 2756 } 2757 error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen); 2758 if (error) 2759 break; 2760 if (buflen == 0) { 2761 vrele(vp); 2762 error = ENOMEM; 2763 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2764 startvp, NULL); 2765 break; 2766 } 2767 buf[--buflen] = '/'; 2768 slash_prefixed = true; 2769 } 2770 if (error) 2771 return (error); 2772 if (!slash_prefixed) { 2773 if (buflen == 0) { 2774 vrele(vp); 2775 counter_u64_add(numfullpathfail4, 1); 2776 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2777 startvp, NULL); 2778 return (ENOMEM); 2779 } 2780 buf[--buflen] = '/'; 2781 } 2782 counter_u64_add(numfullpathfound, 1); 2783 vrele(vp); 2784 2785 *retbuf = buf + buflen; 2786 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2787 *len -= buflen; 2788 *len += addend; 2789 return (0); 2790 } 2791 2792 /* 2793 * Resolve an arbitrary vnode to a pathname. 
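 *
 * The SMR variant below, vn_fullpath_any_smr(), attempts the walk
 * locklessly and returns a negative value whenever it cannot complete
 * the resolution; callers such as vn_getcwd() and vn_fullpath() then
 * retry with the locked vn_fullpath_any().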
2794 * 2795 * Note 2 caveats: 2796 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2797 * resolve to a different path than the one used to find it 2798 * - namecache is not mandatory, meaning names are not guaranteed to be added 2799 * (in which case resolving fails) 2800 */ 2801 static void __inline 2802 cache_rev_failed_impl(int *reason, int line) 2803 { 2804 2805 *reason = line; 2806 } 2807 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 2808 2809 static int 2810 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 2811 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend) 2812 { 2813 #ifdef KDTRACE_HOOKS 2814 struct vnode *startvp = vp; 2815 #endif 2816 struct vnode *tvp; 2817 struct mount *mp; 2818 struct namecache *ncp; 2819 size_t orig_buflen; 2820 int reason; 2821 int error; 2822 #ifdef KDTRACE_HOOKS 2823 int i; 2824 #endif 2825 seqc_t vp_seqc, tvp_seqc; 2826 u_char nc_flag; 2827 2828 VFS_SMR_ASSERT_ENTERED(); 2829 2830 if (!cache_fast_revlookup) { 2831 vfs_smr_exit(); 2832 return (-1); 2833 } 2834 2835 orig_buflen = *buflen; 2836 2837 MPASS(*buflen >= 2); 2838 2839 if (!slash_prefixed) { 2840 MPASS(*buflen >= 2); 2841 *buflen -= 1; 2842 buf[*buflen] = '\0'; 2843 } 2844 2845 if (vp == rdir || vp == rootvnode) { 2846 if (!slash_prefixed) { 2847 *buflen -= 1; 2848 buf[*buflen] = '/'; 2849 } 2850 goto out_ok; 2851 } 2852 2853 #ifdef KDTRACE_HOOKS 2854 i = 0; 2855 #endif 2856 error = -1; 2857 vp_seqc = vn_seqc_read_any(vp); 2858 if (seqc_in_modify(vp_seqc)) { 2859 cache_rev_failed(&reason); 2860 goto out_abort; 2861 } 2862 2863 for (;;) { 2864 #ifdef KDTRACE_HOOKS 2865 i++; 2866 #endif 2867 if ((vp->v_vflag & VV_ROOT) != 0) { 2868 mp = atomic_load_ptr(&vp->v_mount); 2869 if (mp == NULL) { 2870 cache_rev_failed(&reason); 2871 goto out_abort; 2872 } 2873 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 2874 tvp_seqc = vn_seqc_read_any(tvp); 2875 if (seqc_in_modify(tvp_seqc)) { 2876 cache_rev_failed(&reason); 2877 goto out_abort; 2878 } 2879 if (!vn_seqc_consistent(vp, vp_seqc)) { 2880 cache_rev_failed(&reason); 2881 goto out_abort; 2882 } 2883 vp = tvp; 2884 vp_seqc = tvp_seqc; 2885 continue; 2886 } 2887 ncp = atomic_load_ptr(&vp->v_cache_dd); 2888 if (ncp == NULL) { 2889 cache_rev_failed(&reason); 2890 goto out_abort; 2891 } 2892 nc_flag = atomic_load_char(&ncp->nc_flag); 2893 if ((nc_flag & NCF_ISDOTDOT) != 0) { 2894 cache_rev_failed(&reason); 2895 goto out_abort; 2896 } 2897 if (!cache_ncp_canuse(ncp)) { 2898 cache_rev_failed(&reason); 2899 goto out_abort; 2900 } 2901 if (ncp->nc_nlen >= *buflen) { 2902 cache_rev_failed(&reason); 2903 error = ENOMEM; 2904 goto out_abort; 2905 } 2906 *buflen -= ncp->nc_nlen; 2907 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2908 *buflen -= 1; 2909 buf[*buflen] = '/'; 2910 tvp = ncp->nc_dvp; 2911 tvp_seqc = vn_seqc_read_any(tvp); 2912 if (seqc_in_modify(tvp_seqc)) { 2913 cache_rev_failed(&reason); 2914 goto out_abort; 2915 } 2916 if (!vn_seqc_consistent(vp, vp_seqc)) { 2917 cache_rev_failed(&reason); 2918 goto out_abort; 2919 } 2920 vp = tvp; 2921 vp_seqc = tvp_seqc; 2922 if (vp == rdir || vp == rootvnode) 2923 break; 2924 } 2925 out_ok: 2926 vfs_smr_exit(); 2927 *retbuf = buf + *buflen; 2928 *buflen = orig_buflen - *buflen + addend; 2929 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 2930 return (0); 2931 2932 out_abort: 2933 *buflen = orig_buflen; 2934 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 2935 vfs_smr_exit(); 2936 
return (error); 2937 } 2938 2939 static int 2940 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2941 size_t *buflen) 2942 { 2943 size_t orig_buflen; 2944 bool slash_prefixed; 2945 int error; 2946 2947 if (*buflen < 2) 2948 return (EINVAL); 2949 2950 orig_buflen = *buflen; 2951 2952 vref(vp); 2953 slash_prefixed = false; 2954 if (vp->v_type != VDIR) { 2955 *buflen -= 1; 2956 buf[*buflen] = '\0'; 2957 error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen); 2958 if (error) 2959 return (error); 2960 if (*buflen == 0) { 2961 vrele(vp); 2962 return (ENOMEM); 2963 } 2964 *buflen -= 1; 2965 buf[*buflen] = '/'; 2966 slash_prefixed = true; 2967 } 2968 2969 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed, 2970 orig_buflen - *buflen)); 2971 } 2972 2973 /* 2974 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2975 * 2976 * Since the namecache does not track hardlinks, the caller is expected to first 2977 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 2978 * 2979 * Then we have 2 cases: 2980 * - if the found vnode is a directory, the path can be constructed just by 2981 * following names up the chain 2982 * - otherwise we populate the buffer with the saved name and start resolving 2983 * from the parent 2984 */ 2985 static int 2986 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 2987 size_t *buflen) 2988 { 2989 char *buf, *tmpbuf; 2990 struct pwd *pwd; 2991 struct componentname *cnp; 2992 struct vnode *vp; 2993 size_t addend; 2994 int error; 2995 bool slash_prefixed; 2996 enum vtype type; 2997 2998 if (*buflen < 2) 2999 return (EINVAL); 3000 if (*buflen > MAXPATHLEN) 3001 *buflen = MAXPATHLEN; 3002 3003 slash_prefixed = false; 3004 3005 buf = malloc(*buflen, M_TEMP, M_WAITOK); 3006 3007 addend = 0; 3008 vp = ndp->ni_vp; 3009 /* 3010 * Check for VBAD to work around the vp_crossmp bug in lookup(). 3011 * 3012 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 3013 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 3014 * If the type is VDIR (like in this very case) we can skip looking 3015 * at ni_dvp in the first place. However, since vnodes get passed here 3016 * unlocked the target may transition to doomed state (type == VBAD) 3017 * before we get to evaluate the condition. If this happens, we will 3018 * populate part of the buffer and descend to vn_fullpath_dir with 3019 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 3020 * 3021 * This should be atomic_load(&vp->v_type) but it is illegal to take 3022 * an address of a bit field, even if said field is sized to char. 3023 * Work around the problem by reading the value into a full-sized enum 3024 * and then re-reading it with atomic_load which will still prevent 3025 * the compiler from re-reading down the road.
3026 */ 3027 type = vp->v_type; 3028 type = atomic_load_int(&type); 3029 if (type == VBAD) { 3030 error = ENOENT; 3031 goto out_bad; 3032 } 3033 if (type != VDIR) { 3034 cnp = &ndp->ni_cnd; 3035 addend = cnp->cn_namelen + 2; 3036 if (*buflen < addend) { 3037 error = ENOMEM; 3038 goto out_bad; 3039 } 3040 *buflen -= addend; 3041 tmpbuf = buf + *buflen; 3042 tmpbuf[0] = '/'; 3043 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 3044 tmpbuf[addend - 1] = '\0'; 3045 slash_prefixed = true; 3046 vp = ndp->ni_dvp; 3047 } 3048 3049 vfs_smr_enter(); 3050 pwd = pwd_get_smr(); 3051 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3052 slash_prefixed, addend); 3053 VFS_SMR_ASSERT_NOT_ENTERED(); 3054 if (error < 0) { 3055 pwd = pwd_hold(curthread); 3056 vref(vp); 3057 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3058 slash_prefixed, addend); 3059 pwd_drop(pwd); 3060 if (error != 0) 3061 goto out_bad; 3062 } 3063 3064 *freebuf = buf; 3065 3066 return (0); 3067 out_bad: 3068 free(buf, M_TEMP); 3069 return (error); 3070 } 3071 3072 struct vnode * 3073 vn_dir_dd_ino(struct vnode *vp) 3074 { 3075 struct namecache *ncp; 3076 struct vnode *ddvp; 3077 struct mtx *vlp; 3078 enum vgetstate vs; 3079 3080 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3081 vlp = VP2VNODELOCK(vp); 3082 mtx_lock(vlp); 3083 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3084 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3085 continue; 3086 ddvp = ncp->nc_dvp; 3087 vs = vget_prep(ddvp); 3088 mtx_unlock(vlp); 3089 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3090 return (NULL); 3091 return (ddvp); 3092 } 3093 mtx_unlock(vlp); 3094 return (NULL); 3095 } 3096 3097 int 3098 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3099 { 3100 struct namecache *ncp; 3101 struct mtx *vlp; 3102 int l; 3103 3104 vlp = VP2VNODELOCK(vp); 3105 mtx_lock(vlp); 3106 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3107 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3108 break; 3109 if (ncp == NULL) { 3110 mtx_unlock(vlp); 3111 return (ENOENT); 3112 } 3113 l = min(ncp->nc_nlen, buflen - 1); 3114 memcpy(buf, ncp->nc_name, l); 3115 mtx_unlock(vlp); 3116 buf[l] = '\0'; 3117 return (0); 3118 } 3119 3120 /* 3121 * This function updates path string to vnode's full global path 3122 * and checks the size of the new path string against the pathlen argument. 3123 * 3124 * Requires a locked, referenced vnode. 3125 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3126 * 3127 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3128 * because it falls back to the ".." lookup if the namecache lookup fails. 3129 */ 3130 int 3131 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3132 u_int pathlen) 3133 { 3134 struct nameidata nd; 3135 struct vnode *vp1; 3136 char *rpath, *fbuf; 3137 int error; 3138 3139 ASSERT_VOP_ELOCKED(vp, __func__); 3140 3141 /* Construct global filesystem path from vp. */ 3142 VOP_UNLOCK(vp); 3143 error = vn_fullpath_global(vp, &rpath, &fbuf); 3144 3145 if (error != 0) { 3146 vrele(vp); 3147 return (error); 3148 } 3149 3150 if (strlen(rpath) >= pathlen) { 3151 vrele(vp); 3152 error = ENAMETOOLONG; 3153 goto out; 3154 } 3155 3156 /* 3157 * Re-lookup the vnode by path to detect a possible rename. 3158 * As a side effect, the vnode is relocked. 3159 * If vnode was renamed, return ENOENT. 
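 *
 * The lookup is done with LOCKLEAF, so a successful namei() hands
 * back a locked vnode.  If it resolves to a vnode other than vp, some
 * component was renamed (or replaced) while vp was unlocked above and
 * ENOENT is returned rather than a stale path.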
3160 */ 3161 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3162 UIO_SYSSPACE, path, td); 3163 error = namei(&nd); 3164 if (error != 0) { 3165 vrele(vp); 3166 goto out; 3167 } 3168 NDFREE(&nd, NDF_ONLY_PNBUF); 3169 vp1 = nd.ni_vp; 3170 vrele(vp); 3171 if (vp1 == vp) 3172 strcpy(path, rpath); 3173 else { 3174 vput(vp1); 3175 error = ENOENT; 3176 } 3177 3178 out: 3179 free(fbuf, M_TEMP); 3180 return (error); 3181 } 3182 3183 #ifdef DDB 3184 static void 3185 db_print_vpath(struct vnode *vp) 3186 { 3187 3188 while (vp != NULL) { 3189 db_printf("%p: ", vp); 3190 if (vp == rootvnode) { 3191 db_printf("/"); 3192 vp = NULL; 3193 } else { 3194 if (vp->v_vflag & VV_ROOT) { 3195 db_printf("<mount point>"); 3196 vp = vp->v_mount->mnt_vnodecovered; 3197 } else { 3198 struct namecache *ncp; 3199 char *ncn; 3200 int i; 3201 3202 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3203 if (ncp != NULL) { 3204 ncn = ncp->nc_name; 3205 for (i = 0; i < ncp->nc_nlen; i++) 3206 db_printf("%c", *ncn++); 3207 vp = ncp->nc_dvp; 3208 } else { 3209 vp = NULL; 3210 } 3211 } 3212 } 3213 db_printf("\n"); 3214 } 3215 3216 return; 3217 } 3218 3219 DB_SHOW_COMMAND(vpath, db_show_vpath) 3220 { 3221 struct vnode *vp; 3222 3223 if (!have_addr) { 3224 db_printf("usage: show vpath <struct vnode *>\n"); 3225 return; 3226 } 3227 3228 vp = (struct vnode *)addr; 3229 db_print_vpath(vp); 3230 } 3231 3232 #endif 3233 3234 static bool __read_frequently cache_fast_lookup = true; 3235 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3236 &cache_fast_lookup, 0, ""); 3237 3238 #define CACHE_FPL_FAILED -2020 3239 3240 static void 3241 cache_fpl_cleanup_cnp(struct componentname *cnp) 3242 { 3243 3244 uma_zfree(namei_zone, cnp->cn_pnbuf); 3245 #ifdef DIAGNOSTIC 3246 cnp->cn_pnbuf = NULL; 3247 cnp->cn_nameptr = NULL; 3248 #endif 3249 } 3250 3251 static void 3252 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3253 { 3254 struct componentname *cnp; 3255 3256 cnp = &ndp->ni_cnd; 3257 while (*(cnp->cn_nameptr) == '/') { 3258 cnp->cn_nameptr++; 3259 ndp->ni_pathlen--; 3260 } 3261 3262 *dpp = ndp->ni_rootdir; 3263 } 3264 3265 /* 3266 * Components of nameidata (or objects it can point to) which may 3267 * need restoring in case fast path lookup fails. 
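 *
 * A snapshot is taken with cache_fpl_checkpoint() before the fast
 * path starts consuming the pathname and is put back with
 * cache_fpl_restore() when the lookup is handed over to the regular
 * code, so that the slow path starts from pristine state.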
3268 */ 3269 struct nameidata_saved { 3270 long cn_namelen; 3271 char *cn_nameptr; 3272 size_t ni_pathlen; 3273 int cn_flags; 3274 }; 3275 3276 struct cache_fpl { 3277 struct nameidata *ndp; 3278 struct componentname *cnp; 3279 struct pwd *pwd; 3280 struct vnode *dvp; 3281 struct vnode *tvp; 3282 seqc_t dvp_seqc; 3283 seqc_t tvp_seqc; 3284 struct nameidata_saved snd; 3285 int line; 3286 enum cache_fpl_status status:8; 3287 bool in_smr; 3288 }; 3289 3290 static void 3291 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3292 { 3293 3294 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3295 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3296 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3297 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3298 } 3299 3300 static void 3301 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3302 { 3303 3304 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3305 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3306 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3307 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3308 } 3309 3310 #ifdef INVARIANTS 3311 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3312 struct cache_fpl *_fpl = (fpl); \ 3313 MPASS(_fpl->in_smr == true); \ 3314 VFS_SMR_ASSERT_ENTERED(); \ 3315 }) 3316 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3317 struct cache_fpl *_fpl = (fpl); \ 3318 MPASS(_fpl->in_smr == false); \ 3319 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3320 }) 3321 #else 3322 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3323 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3324 #endif 3325 3326 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3327 struct cache_fpl *_fpl = (fpl); \ 3328 vfs_smr_enter(); \ 3329 _fpl->in_smr = true; \ 3330 }) 3331 3332 #define cache_fpl_smr_enter(fpl) ({ \ 3333 struct cache_fpl *_fpl = (fpl); \ 3334 MPASS(_fpl->in_smr == false); \ 3335 vfs_smr_enter(); \ 3336 _fpl->in_smr = true; \ 3337 }) 3338 3339 #define cache_fpl_smr_exit(fpl) ({ \ 3340 struct cache_fpl *_fpl = (fpl); \ 3341 MPASS(_fpl->in_smr == true); \ 3342 vfs_smr_exit(); \ 3343 _fpl->in_smr = false; \ 3344 }) 3345 3346 static int 3347 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3348 { 3349 3350 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3351 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3352 ("%s: converting to abort from %d at %d, set at %d\n", 3353 __func__, fpl->status, line, fpl->line)); 3354 } 3355 fpl->status = CACHE_FPL_STATUS_ABORTED; 3356 fpl->line = line; 3357 return (CACHE_FPL_FAILED); 3358 } 3359 3360 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3361 3362 static int 3363 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3364 { 3365 3366 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3367 ("%s: setting to partial at %d, but already set to %d at %d\n", 3368 __func__, line, fpl->status, fpl->line)); 3369 cache_fpl_smr_assert_entered(fpl); 3370 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3371 fpl->line = line; 3372 return (CACHE_FPL_FAILED); 3373 } 3374 3375 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3376 3377 static int 3378 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3379 { 3380 3381 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3382 ("%s: setting to handled at %d, but already set to %d at %d\n", 3383 __func__, line, fpl->status, fpl->line)); 3384 cache_fpl_smr_assert_not_entered(fpl); 3385 MPASS(error != CACHE_FPL_FAILED); 3386 fpl->status = CACHE_FPL_STATUS_HANDLED; 3387 fpl->line = line; 3388 return (error); 
3389 } 3390 3391 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3392 3393 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3394 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ 3395 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3396 3397 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3398 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3399 3400 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3401 "supported and internal flags overlap"); 3402 3403 static bool 3404 cache_fpl_islastcn(struct nameidata *ndp) 3405 { 3406 3407 return (*ndp->ni_next == 0); 3408 } 3409 3410 static bool 3411 cache_fpl_isdotdot(struct componentname *cnp) 3412 { 3413 3414 if (cnp->cn_namelen == 2 && 3415 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3416 return (true); 3417 return (false); 3418 } 3419 3420 static bool 3421 cache_can_fplookup(struct cache_fpl *fpl) 3422 { 3423 struct nameidata *ndp; 3424 struct componentname *cnp; 3425 struct thread *td; 3426 3427 ndp = fpl->ndp; 3428 cnp = fpl->cnp; 3429 td = cnp->cn_thread; 3430 3431 if (!cache_fast_lookup) { 3432 cache_fpl_aborted(fpl); 3433 return (false); 3434 } 3435 #ifdef MAC 3436 if (mac_vnode_check_lookup_enabled()) { 3437 cache_fpl_aborted(fpl); 3438 return (false); 3439 } 3440 #endif 3441 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3442 cache_fpl_aborted(fpl); 3443 return (false); 3444 } 3445 if (ndp->ni_dirfd != AT_FDCWD) { 3446 cache_fpl_aborted(fpl); 3447 return (false); 3448 } 3449 if (IN_CAPABILITY_MODE(td)) { 3450 cache_fpl_aborted(fpl); 3451 return (false); 3452 } 3453 if (AUDITING_TD(td)) { 3454 cache_fpl_aborted(fpl); 3455 return (false); 3456 } 3457 if (ndp->ni_startdir != NULL) { 3458 cache_fpl_aborted(fpl); 3459 return (false); 3460 } 3461 return (true); 3462 } 3463 3464 static bool 3465 cache_fplookup_vnode_supported(struct vnode *vp) 3466 { 3467 3468 return (vp->v_type != VLNK); 3469 } 3470 3471 /* 3472 * Move a negative entry to the hot list. 3473 * 3474 * We have to take locks, but they may be contended and in the worst 3475 * case we may need to go off CPU. We don't want to spin within the 3476 * smr section and we can't block with it. Instead we are going to 3477 * look up the entry again. 3478 */ 3479 static int __noinline 3480 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3481 uint32_t hash) 3482 { 3483 struct componentname *cnp; 3484 struct namecache *ncp; 3485 struct neglist *neglist; 3486 struct negstate *negstate; 3487 struct vnode *dvp; 3488 u_char nc_flag; 3489 3490 cnp = fpl->cnp; 3491 dvp = fpl->dvp; 3492 3493 if (!vhold_smr(dvp)) 3494 return (cache_fpl_aborted(fpl)); 3495 3496 neglist = NCP2NEGLIST(oncp); 3497 cache_fpl_smr_exit(fpl); 3498 3499 mtx_lock(&ncneg_hot.nl_lock); 3500 mtx_lock(&neglist->nl_lock); 3501 /* 3502 * For hash iteration. 3503 */ 3504 cache_fpl_smr_enter(fpl); 3505 3506 /* 3507 * Avoid all surprises by only succeeding if we got the same entry and 3508 * bailing completely otherwise. 3509 * 3510 * In particular at this point there can be a new ncp which matches the 3511 * search but hashes to a different neglist. 3512 */ 3513 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3514 if (ncp == oncp) 3515 break; 3516 } 3517 3518 /* 3519 * No match to begin with. 3520 */ 3521 if (__predict_false(ncp == NULL)) { 3522 goto out_abort; 3523 } 3524 3525 /* 3526 * The newly found entry may be something different... 
3527 */ 3528 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3529 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 3530 goto out_abort; 3531 } 3532 3533 /* 3534 * ... and not even negative. 3535 */ 3536 nc_flag = atomic_load_char(&ncp->nc_flag); 3537 if ((nc_flag & NCF_NEGATIVE) == 0) { 3538 goto out_abort; 3539 } 3540 3541 if (__predict_false(!cache_ncp_canuse(ncp))) { 3542 goto out_abort; 3543 } 3544 3545 negstate = NCP2NEGSTATE(ncp); 3546 if ((negstate->neg_flag & NEG_HOT) == 0) { 3547 numhotneg++; 3548 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 3549 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 3550 negstate->neg_flag |= NEG_HOT; 3551 } 3552 3553 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 3554 counter_u64_add(numneghits, 1); 3555 cache_fpl_smr_exit(fpl); 3556 mtx_unlock(&neglist->nl_lock); 3557 mtx_unlock(&ncneg_hot.nl_lock); 3558 vdrop(dvp); 3559 return (cache_fpl_handled(fpl, ENOENT)); 3560 out_abort: 3561 cache_fpl_smr_exit(fpl); 3562 mtx_unlock(&neglist->nl_lock); 3563 mtx_unlock(&ncneg_hot.nl_lock); 3564 vdrop(dvp); 3565 return (cache_fpl_aborted(fpl)); 3566 } 3567 3568 /* 3569 * The target vnode is not supported, prepare for the slow path to take over. 3570 */ 3571 static int __noinline 3572 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3573 { 3574 struct nameidata *ndp; 3575 struct componentname *cnp; 3576 enum vgetstate dvs; 3577 struct vnode *dvp; 3578 struct pwd *pwd; 3579 seqc_t dvp_seqc; 3580 3581 ndp = fpl->ndp; 3582 cnp = fpl->cnp; 3583 dvp = fpl->dvp; 3584 dvp_seqc = fpl->dvp_seqc; 3585 3586 dvs = vget_prep_smr(dvp); 3587 if (__predict_false(dvs == VGET_NONE)) { 3588 cache_fpl_smr_exit(fpl); 3589 return (cache_fpl_aborted(fpl)); 3590 } 3591 3592 cache_fpl_smr_exit(fpl); 3593 3594 vget_finish_ref(dvp, dvs); 3595 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3596 vrele(dvp); 3597 return (cache_fpl_aborted(fpl)); 3598 } 3599 3600 pwd = pwd_hold(curthread); 3601 if (fpl->pwd != pwd) { 3602 vrele(dvp); 3603 pwd_drop(pwd); 3604 return (cache_fpl_aborted(fpl)); 3605 } 3606 3607 cache_fpl_restore(fpl, &fpl->snd); 3608 3609 ndp->ni_startdir = dvp; 3610 cnp->cn_flags |= MAKEENTRY; 3611 if (cache_fpl_islastcn(ndp)) 3612 cnp->cn_flags |= ISLASTCN; 3613 if (cache_fpl_isdotdot(cnp)) 3614 cnp->cn_flags |= ISDOTDOT; 3615 3616 return (0); 3617 } 3618 3619 static int 3620 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3621 { 3622 struct componentname *cnp; 3623 struct vnode *tvp; 3624 seqc_t tvp_seqc; 3625 int error, lkflags; 3626 3627 cnp = fpl->cnp; 3628 tvp = fpl->tvp; 3629 tvp_seqc = fpl->tvp_seqc; 3630 3631 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3632 lkflags = LK_SHARED; 3633 if ((cnp->cn_flags & LOCKSHARED) == 0) 3634 lkflags = LK_EXCLUSIVE; 3635 error = vget_finish(tvp, lkflags, tvs); 3636 if (__predict_false(error != 0)) { 3637 return (cache_fpl_aborted(fpl)); 3638 } 3639 } else { 3640 vget_finish_ref(tvp, tvs); 3641 } 3642 3643 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3644 if ((cnp->cn_flags & LOCKLEAF) != 0) 3645 vput(tvp); 3646 else 3647 vrele(tvp); 3648 return (cache_fpl_aborted(fpl)); 3649 } 3650 3651 return (cache_fpl_handled(fpl, 0)); 3652 } 3653 3654 /* 3655 * They want to possibly modify the state of the namecache. 3656 * 3657 * Don't try to match the API contract, just leave. 
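 *
 * Any nameiop other than LOOKUP (e.g. CREATE, DELETE or RENAME) may
 * need to lock vnodes and insert or remove entries, so the fast path
 * simply punts to the regular lookup via cache_fpl_partial().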
3658 * TODO: this leaves scalability on the table 3659 */ 3660 static int 3661 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3662 { 3663 struct componentname *cnp; 3664 3665 cnp = fpl->cnp; 3666 MPASS(cnp->cn_nameiop != LOOKUP); 3667 return (cache_fpl_partial(fpl)); 3668 } 3669 3670 static int __noinline 3671 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3672 { 3673 struct componentname *cnp; 3674 enum vgetstate dvs, tvs; 3675 struct vnode *dvp, *tvp; 3676 seqc_t dvp_seqc, tvp_seqc; 3677 int error; 3678 3679 cnp = fpl->cnp; 3680 dvp = fpl->dvp; 3681 dvp_seqc = fpl->dvp_seqc; 3682 tvp = fpl->tvp; 3683 tvp_seqc = fpl->tvp_seqc; 3684 3685 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3686 3687 /* 3688 * This is less efficient than it can be for simplicity. 3689 */ 3690 dvs = vget_prep_smr(dvp); 3691 if (__predict_false(dvs == VGET_NONE)) { 3692 return (cache_fpl_aborted(fpl)); 3693 } 3694 tvs = vget_prep_smr(tvp); 3695 if (__predict_false(tvs == VGET_NONE)) { 3696 cache_fpl_smr_exit(fpl); 3697 vget_abort(dvp, dvs); 3698 return (cache_fpl_aborted(fpl)); 3699 } 3700 3701 cache_fpl_smr_exit(fpl); 3702 3703 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3704 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3705 if (__predict_false(error != 0)) { 3706 vget_abort(tvp, tvs); 3707 return (cache_fpl_aborted(fpl)); 3708 } 3709 } else { 3710 vget_finish_ref(dvp, dvs); 3711 } 3712 3713 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3714 vget_abort(tvp, tvs); 3715 if ((cnp->cn_flags & LOCKPARENT) != 0) 3716 vput(dvp); 3717 else 3718 vrele(dvp); 3719 return (cache_fpl_aborted(fpl)); 3720 } 3721 3722 error = cache_fplookup_final_child(fpl, tvs); 3723 if (__predict_false(error != 0)) { 3724 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3725 if ((cnp->cn_flags & LOCKPARENT) != 0) 3726 vput(dvp); 3727 else 3728 vrele(dvp); 3729 return (error); 3730 } 3731 3732 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3733 return (0); 3734 } 3735 3736 static int 3737 cache_fplookup_final(struct cache_fpl *fpl) 3738 { 3739 struct componentname *cnp; 3740 enum vgetstate tvs; 3741 struct vnode *dvp, *tvp; 3742 seqc_t dvp_seqc, tvp_seqc; 3743 3744 cnp = fpl->cnp; 3745 dvp = fpl->dvp; 3746 dvp_seqc = fpl->dvp_seqc; 3747 tvp = fpl->tvp; 3748 tvp_seqc = fpl->tvp_seqc; 3749 3750 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3751 3752 if (cnp->cn_nameiop != LOOKUP) { 3753 return (cache_fplookup_final_modifying(fpl)); 3754 } 3755 3756 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3757 return (cache_fplookup_final_withparent(fpl)); 3758 3759 tvs = vget_prep_smr(tvp); 3760 if (__predict_false(tvs == VGET_NONE)) { 3761 return (cache_fpl_partial(fpl)); 3762 } 3763 3764 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3765 cache_fpl_smr_exit(fpl); 3766 vget_abort(tvp, tvs); 3767 return (cache_fpl_aborted(fpl)); 3768 } 3769 3770 cache_fpl_smr_exit(fpl); 3771 return (cache_fplookup_final_child(fpl, tvs)); 3772 } 3773 3774 static int __noinline 3775 cache_fplookup_dot(struct cache_fpl *fpl) 3776 { 3777 struct vnode *dvp; 3778 3779 dvp = fpl->dvp; 3780 3781 fpl->tvp = dvp; 3782 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3783 if (seqc_in_modify(fpl->tvp_seqc)) { 3784 return (cache_fpl_aborted(fpl)); 3785 } 3786 3787 counter_u64_add(dothits, 1); 3788 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3789 3790 return (0); 3791 } 3792 3793 static int __noinline 3794 cache_fplookup_dotdot(struct cache_fpl *fpl) 3795 { 3796 struct nameidata *ndp; 3797 struct componentname *cnp; 3798 struct namecache *ncp; 3799 struct vnode 
*dvp; 3800 struct prison *pr; 3801 u_char nc_flag; 3802 3803 ndp = fpl->ndp; 3804 cnp = fpl->cnp; 3805 dvp = fpl->dvp; 3806 3807 /* 3808 * XXX this is racy the same way regular lookup is 3809 */ 3810 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3811 pr = pr->pr_parent) 3812 if (dvp == pr->pr_root) 3813 break; 3814 3815 if (dvp == ndp->ni_rootdir || 3816 dvp == ndp->ni_topdir || 3817 dvp == rootvnode || 3818 pr != NULL) { 3819 fpl->tvp = dvp; 3820 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3821 if (seqc_in_modify(fpl->tvp_seqc)) { 3822 return (cache_fpl_aborted(fpl)); 3823 } 3824 return (0); 3825 } 3826 3827 if ((dvp->v_vflag & VV_ROOT) != 0) { 3828 /* 3829 * TODO 3830 * The opposite of climb mount is needed here. 3831 */ 3832 return (cache_fpl_aborted(fpl)); 3833 } 3834 3835 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3836 if (ncp == NULL) { 3837 return (cache_fpl_aborted(fpl)); 3838 } 3839 3840 nc_flag = atomic_load_char(&ncp->nc_flag); 3841 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3842 if ((nc_flag & NCF_NEGATIVE) != 0) 3843 return (cache_fpl_aborted(fpl)); 3844 fpl->tvp = ncp->nc_vp; 3845 } else { 3846 fpl->tvp = ncp->nc_dvp; 3847 } 3848 3849 if (__predict_false(!cache_ncp_canuse(ncp))) { 3850 return (cache_fpl_aborted(fpl)); 3851 } 3852 3853 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 3854 if (seqc_in_modify(fpl->tvp_seqc)) { 3855 return (cache_fpl_partial(fpl)); 3856 } 3857 3858 counter_u64_add(dotdothits, 1); 3859 return (0); 3860 } 3861 3862 static int 3863 cache_fplookup_next(struct cache_fpl *fpl) 3864 { 3865 struct componentname *cnp; 3866 struct namecache *ncp; 3867 struct negstate *negstate; 3868 struct vnode *dvp, *tvp; 3869 u_char nc_flag; 3870 uint32_t hash; 3871 bool neg_hot; 3872 3873 cnp = fpl->cnp; 3874 dvp = fpl->dvp; 3875 3876 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 3877 return (cache_fplookup_dot(fpl)); 3878 } 3879 3880 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3881 3882 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3883 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3884 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 3885 break; 3886 } 3887 3888 /* 3889 * If there is no entry we have to punt to the slow path to perform 3890 * actual lookup. Should there be nothing with this name a negative 3891 * entry will be created. 3892 */ 3893 if (__predict_false(ncp == NULL)) { 3894 return (cache_fpl_partial(fpl)); 3895 } 3896 3897 tvp = atomic_load_ptr(&ncp->nc_vp); 3898 nc_flag = atomic_load_char(&ncp->nc_flag); 3899 if ((nc_flag & NCF_NEGATIVE) != 0) { 3900 /* 3901 * If they want to create an entry we need to replace this one. 
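 *
 * For a plain LOOKUP the negative hit is consumed locklessly:
 * whiteout entries and entries failing cache_ncp_canuse() punt to the
 * slow path, cold entries are promoted to the hot list via
 * cache_fplookup_negative_promote(), and otherwise ENOENT is returned
 * without taking any locks.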
3902 */ 3903 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 3904 return (cache_fpl_partial(fpl)); 3905 } 3906 negstate = NCP2NEGSTATE(ncp); 3907 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0); 3908 if (__predict_false(!cache_ncp_canuse(ncp))) { 3909 return (cache_fpl_partial(fpl)); 3910 } 3911 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 3912 return (cache_fpl_partial(fpl)); 3913 } 3914 if (!neg_hot) { 3915 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 3916 } 3917 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 3918 ncp->nc_name); 3919 counter_u64_add(numneghits, 1); 3920 cache_fpl_smr_exit(fpl); 3921 return (cache_fpl_handled(fpl, ENOENT)); 3922 } 3923 3924 if (__predict_false(!cache_ncp_canuse(ncp))) { 3925 return (cache_fpl_partial(fpl)); 3926 } 3927 3928 fpl->tvp = tvp; 3929 fpl->tvp_seqc = vn_seqc_read_any(tvp); 3930 if (seqc_in_modify(fpl->tvp_seqc)) { 3931 return (cache_fpl_partial(fpl)); 3932 } 3933 3934 if (!cache_fplookup_vnode_supported(tvp)) { 3935 return (cache_fpl_partial(fpl)); 3936 } 3937 3938 counter_u64_add(numposhits, 1); 3939 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 3940 return (0); 3941 } 3942 3943 static bool 3944 cache_fplookup_mp_supported(struct mount *mp) 3945 { 3946 3947 if (mp == NULL) 3948 return (false); 3949 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 3950 return (false); 3951 return (true); 3952 } 3953 3954 /* 3955 * Walk up the mount stack (if any). 3956 * 3957 * Correctness is provided in the following ways: 3958 * - all vnodes are protected from freeing with SMR 3959 * - struct mount objects are type stable making them always safe to access 3960 * - stability of the particular mount is provided by busying it 3961 * - relationship between the vnode which is mounted on and the mount is 3962 * verified with the vnode sequence counter after busying 3963 * - association between root vnode of the mount and the mount is protected 3964 * by busy 3965 * 3966 * From that point on we can read the sequence counter of the root vnode 3967 * and get the next mount on the stack (if any) using the same protection. 3968 * 3969 * By the end of successful walk we are guaranteed the reached state was 3970 * indeed present at least at some point which matches the regular lookup. 
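 *
 * Roughly, the loop below amounts to the following (sketch only):
 *
 *	while ((mp = vp->v_mountedhere) != NULL) {
 *		enter the mount with vfs_op_thread_enter_crit(mp);
 *		re-check vp's seqc and that mp supports the fast path,
 *		    punting to the slow path on any mismatch;
 *		vp = mp->mnt_rootvnode;
 *		vp_seqc = vn_seqc_read_any(vp);
 *	}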
3971 */ 3972 static int __noinline 3973 cache_fplookup_climb_mount(struct cache_fpl *fpl) 3974 { 3975 struct mount *mp, *prev_mp; 3976 struct vnode *vp; 3977 seqc_t vp_seqc; 3978 3979 vp = fpl->tvp; 3980 vp_seqc = fpl->tvp_seqc; 3981 3982 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 3983 mp = atomic_load_ptr(&vp->v_mountedhere); 3984 if (mp == NULL) 3985 return (0); 3986 3987 prev_mp = NULL; 3988 for (;;) { 3989 if (!vfs_op_thread_enter_crit(mp)) { 3990 if (prev_mp != NULL) 3991 vfs_op_thread_exit_crit(prev_mp); 3992 return (cache_fpl_partial(fpl)); 3993 } 3994 if (prev_mp != NULL) 3995 vfs_op_thread_exit_crit(prev_mp); 3996 if (!vn_seqc_consistent(vp, vp_seqc)) { 3997 vfs_op_thread_exit_crit(mp); 3998 return (cache_fpl_partial(fpl)); 3999 } 4000 if (!cache_fplookup_mp_supported(mp)) { 4001 vfs_op_thread_exit_crit(mp); 4002 return (cache_fpl_partial(fpl)); 4003 } 4004 vp = atomic_load_ptr(&mp->mnt_rootvnode); 4005 if (vp == NULL || VN_IS_DOOMED(vp)) { 4006 vfs_op_thread_exit_crit(mp); 4007 return (cache_fpl_partial(fpl)); 4008 } 4009 vp_seqc = vn_seqc_read_any(vp); 4010 if (seqc_in_modify(vp_seqc)) { 4011 vfs_op_thread_exit_crit(mp); 4012 return (cache_fpl_partial(fpl)); 4013 } 4014 prev_mp = mp; 4015 mp = atomic_load_ptr(&vp->v_mountedhere); 4016 if (mp == NULL) 4017 break; 4018 } 4019 4020 vfs_op_thread_exit_crit(prev_mp); 4021 fpl->tvp = vp; 4022 fpl->tvp_seqc = vp_seqc; 4023 return (0); 4024 } 4025 4026 static bool 4027 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 4028 { 4029 struct mount *mp; 4030 struct vnode *vp; 4031 4032 vp = fpl->tvp; 4033 4034 /* 4035 * Hack: while this is a union, the pointer tends to be NULL so save on 4036 * a branch. 4037 */ 4038 mp = atomic_load_ptr(&vp->v_mountedhere); 4039 if (mp == NULL) 4040 return (false); 4041 if (vp->v_type == VDIR) 4042 return (true); 4043 return (false); 4044 } 4045 4046 /* 4047 * Parse the path. 4048 * 4049 * The code is mostly copy-pasted from regular lookup, see lookup(). 4050 * The structure is maintained along with comments for easier maintenance. 4051 * Deduplicating the code will become feasible after fast path lookup 4052 * becomes more feature-complete. 4053 */ 4054 static int 4055 cache_fplookup_parse(struct cache_fpl *fpl) 4056 { 4057 struct nameidata *ndp; 4058 struct componentname *cnp; 4059 char *cp; 4060 4061 ndp = fpl->ndp; 4062 cnp = fpl->cnp; 4063 4064 /* 4065 * Search a new directory. 4066 * 4067 * The last component of the filename is left accessible via 4068 * cnp->cn_nameptr for callers that need the name. Callers needing 4069 * the name set the SAVENAME flag. When done, they assume 4070 * responsibility for freeing the pathname buffer. 4071 */ 4072 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 4073 continue; 4074 cnp->cn_namelen = cp - cnp->cn_nameptr; 4075 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 4076 cache_fpl_smr_exit(fpl); 4077 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 4078 } 4079 ndp->ni_pathlen -= cnp->cn_namelen; 4080 KASSERT(ndp->ni_pathlen <= PATH_MAX, 4081 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 4082 ndp->ni_next = cp; 4083 4084 /* 4085 * Replace multiple slashes by a single slash and trailing slashes 4086 * by a null. This must be done before VOP_LOOKUP() because some 4087 * fs's don't know about trailing slashes. Remember if there were 4088 * trailing slashes to handle symlinks, existing non-directories 4089 * and non-existing files that won't be directories specially later. 
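 *
 * In the fast path a name such as "a//b" is therefore treated the
 * same as "a/b", while a trailing slash ("a/b/") is punted to the
 * regular lookup, because handling it the way lookup() does would
 * modify the pathname buffer read from userspace (see the TODO
 * below).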

/*
 * Parse the path.
 *
 * The code is mostly copy-pasted from the regular lookup, see lookup().
 * The structure is maintained along with the comments for easier maintenance.
 * Deduplicating the code will become feasible after fast path lookup
 * becomes more feature-complete.
 */
static int
cache_fplookup_parse(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	char *cp;

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	/*
	 * Search a new directory.
	 *
	 * The last component of the filename is left accessible via
	 * cnp->cn_nameptr for callers that need the name. Callers needing
	 * the name set the SAVENAME flag. When done, they assume
	 * responsibility for freeing the pathname buffer.
	 */
	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
		continue;
	cnp->cn_namelen = cp - cnp->cn_nameptr;
	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled(fpl, ENAMETOOLONG));
	}
	ndp->ni_pathlen -= cnp->cn_namelen;
	KASSERT(ndp->ni_pathlen <= PATH_MAX,
	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
	ndp->ni_next = cp;

	/*
	 * Replace multiple slashes by a single slash and trailing slashes
	 * by a null. This must be done before VOP_LOOKUP() because some
	 * fs's don't know about trailing slashes. Remember if there were
	 * trailing slashes to handle symlinks, existing non-directories
	 * and non-existing files that won't be directories specially later.
	 */
	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
		cp++;
		ndp->ni_pathlen--;
		if (*cp == '\0') {
			/*
			 * TODO
			 * The regular lookup performs the following:
			 *	*ndp->ni_next = '\0';
			 *	cnp->cn_flags |= TRAILINGSLASH;
			 *
			 * This is problematic since it modifies data read
			 * from userspace. If the fast path lookup were to
			 * abort, we would have to either restore the buffer
			 * or convey the flag. Since this is a corner case,
			 * just ignore it for simplicity.
			 */
			return (cache_fpl_partial(fpl));
		}
	}
	ndp->ni_next = cp;

	/*
	 * Check for degenerate name (e.g. / or "")
	 * which is a way of talking about a directory,
	 * e.g. like "/." or ".".
	 *
	 * TODO
	 * Another corner case handled by the regular lookup.
	 */
	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
		return (cache_fpl_partial(fpl));
	}
	return (0);
}

static void
cache_fplookup_parse_advance(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cnp->cn_nameptr = ndp->ni_next;
	while (*cnp->cn_nameptr == '/') {
		cnp->cn_nameptr++;
		ndp->ni_pathlen--;
	}
}
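
/*
 * Worked example for the two routines above: with the remaining path
 * "usr//bin", cache_fplookup_parse() sets cn_namelen to 3 ("usr") and leaves
 * ni_next pointing at the last slash of the "//" run, while
 * cache_fplookup_parse_advance() then moves cn_nameptr past the slashes so
 * the next component parsed is "bin". A trailing slash, as in "usr/", is
 * instead punted to the regular lookup as noted in the TODO above.
 */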

static int __noinline
cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
{

	switch (error) {
	case EAGAIN:
		/*
		 * Can happen when racing against vgone.
		 */
	case EOPNOTSUPP:
		cache_fpl_partial(fpl);
		break;
	default:
		/*
		 * See the API contract for VOP_FPLOOKUP_VEXEC.
		 */
		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_smr_exit(fpl);
			cache_fpl_handled(fpl, error);
		}
		break;
	}
	return (error);
}

static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct mount *mp;
	int error;

	error = CACHE_FPL_FAILED;
	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cache_fpl_checkpoint(fpl, &fpl->snd);

	fpl->dvp = dvp;
	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
	if (seqc_in_modify(fpl->dvp_seqc)) {
		cache_fpl_aborted(fpl);
		goto out;
	}
	mp = atomic_load_ptr(&fpl->dvp->v_mount);
	if (!cache_fplookup_mp_supported(mp)) {
		cache_fpl_aborted(fpl);
		goto out;
	}

	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

	for (;;) {
		error = cache_fplookup_parse(fpl);
		if (__predict_false(error != 0)) {
			break;
		}

		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
		if (__predict_false(error != 0)) {
			error = cache_fplookup_failed_vexec(fpl, error);
			break;
		}

		if (__predict_false(cache_fpl_isdotdot(cnp))) {
			error = cache_fplookup_dotdot(fpl);
			if (__predict_false(error != 0)) {
				break;
			}
		} else {
			error = cache_fplookup_next(fpl);
			if (__predict_false(error != 0)) {
				break;
			}

			VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

			if (cache_fplookup_need_climb_mount(fpl)) {
				error = cache_fplookup_climb_mount(fpl);
				if (__predict_false(error != 0)) {
					break;
				}
			}
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		if (cache_fpl_islastcn(ndp)) {
			error = cache_fplookup_final(fpl);
			break;
		}

		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
			break;
		}

		fpl->dvp = fpl->tvp;
		fpl->dvp_seqc = fpl->tvp_seqc;

		cache_fplookup_parse_advance(fpl);
		cache_fpl_checkpoint(fpl, &fpl->snd);
	}
out:
	switch (fpl->status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		cache_fpl_smr_assert_entered(fpl);
		return (cache_fplookup_partial_setup(fpl));
	case CACHE_FPL_STATUS_ABORTED:
		if (fpl->in_smr)
			cache_fpl_smr_exit(fpl);
		return (CACHE_FPL_FAILED);
	case CACHE_FPL_STATUS_HANDLED:
		MPASS(error != CACHE_FPL_FAILED);
		cache_fpl_smr_assert_not_entered(fpl);
		if (__predict_false(error != 0)) {
			ndp->ni_dvp = NULL;
			ndp->ni_vp = NULL;
			cache_fpl_cleanup_cnp(cnp);
			return (error);
		}
		ndp->ni_dvp = fpl->dvp;
		ndp->ni_vp = fpl->tvp;
		if (cnp->cn_flags & SAVENAME)
			cnp->cn_flags |= HASBUF;
		else
			cache_fpl_cleanup_cnp(cnp);
		return (error);
	}
}
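
/*
 * Illustrative trace of cache_fplookup_impl() above: resolving "a/b/c"
 * relative to dvp with every component cached, no mount points and no ".."
 * entries proceeds as follows:
 *
 *	parse "a" -> VOP_FPLOOKUP_VEXEC(dvp) -> next: tvp = a -> dvp = a
 *	parse "b" -> VOP_FPLOOKUP_VEXEC(dvp) -> next: tvp = b -> dvp = b
 *	parse "c" -> VOP_FPLOOKUP_VEXEC(dvp) -> next: tvp = c -> last component
 *		-> cache_fplookup_final()
 *
 * Any sequence counter mismatch, missing entry or otherwise unsupported
 * condition along the way downgrades the attempt to partial or aborted
 * instead of completing it here.
 */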

/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the
 * criteria outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 *	vn_lock(current);
 *	for (;;) {
 *		next = find();
 *		vn_lock(next);
 *		vn_unlock(current);
 *		current = next;
 *		if (last)
 *			break;
 *	}
 *	return (current);
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
 * we made the jump. This includes things like permissions, mount points, etc.
 * Counter modification is provided by enclosing relevant places in
 * vn_seqc_write_begin()/end() calls.
 *
 * Thus this translates to:
 *
 *	vfs_smr_enter();
 *	dvp_seqc = seqc_read_any(dvp);
 *	if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 *		abort();
 *	for (;;) {
 *		tvp = find();
 *		tvp_seqc = seqc_read_any(tvp);
 *		if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 *			abort();
 *		if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 *			abort();
 *		dvp = tvp; // we know nothing of importance has changed
 *		dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 *		if (last)
 *			break;
 *	}
 *	vget(); // secure the vnode
 *	if (!seqc_consistent(tvp, tvp_seqc)) // final check
 *		abort();
 *	// at this point we know nothing has changed for any parent<->child pair
 *	// as they were crossed during the lookup, meaning we matched the guarantee
 *	// of the locked variant
 *	return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection, which they must never exit
 * - EAGAIN can be returned to denote that the check could not be performed;
 *   it is always valid to return it
 * - if the sequence counter has not changed, the result must be valid
 * - if the sequence counter has changed, both false positives and false
 *   negatives are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks, vaccess_vexec_smr can be used
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning that, absent other
 *   means, it should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 */
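
/*
 * A sketch of a VOP_FPLOOKUP_VEXEC implementation following the contract
 * above, using only plain permission bits; "xxfs" and its node layout are
 * hypothetical, while vaccess_vexec_smr() and atomic_load_ptr() are the
 * helpers referred to in the comment:
 *
 *	static int
 *	xxfs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct xxfs_node *xp;
 *
 *		xp = atomic_load_ptr(&v->a_vp->v_data);
 *		if (__predict_false(xp == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(xp->xn_mode, xp->xn_uid, xp->xn_gid,
 *		    v->a_cred));
 *	}
 *
 * Whether xp can be safely dereferenced here (e.g. because it is allocated
 * from an SMR-enabled zone) is the filesystem's responsibility, per the
 * caveats above.
 */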

int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	struct nameidata_saved orig;
	int error;

	MPASS(ndp->ni_lcf == 0);

	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.ndp = ndp;
	fpl.cnp = &ndp->ni_cnd;
	MPASS(curthread == fpl.cnp->cn_thread);

	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
		MPASS(fpl.cnp->cn_nameiop != LOOKUP);

	if (!cache_can_fplookup(&fpl)) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		*status = fpl.status;
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint(&fpl, &orig);

	cache_fpl_smr_enter_initial(&fpl);
	pwd = pwd_get_smr();
	fpl.pwd = pwd;
	ndp->ni_rootdir = pwd->pwd_rdir;
	ndp->ni_topdir = pwd->pwd_jdir;

	cnp = fpl.cnp;
	cnp->cn_nameptr = cnp->cn_pnbuf;
	if (cnp->cn_pnbuf[0] == '/') {
		cache_fpl_handle_root(ndp, &dvp);
	} else {
		MPASS(ndp->ni_dirfd == AT_FDCWD);
		dvp = pwd->pwd_cdir;
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);

	error = cache_fplookup_impl(dvp, &fpl);
	cache_fpl_smr_assert_not_entered(&fpl);
	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);

	*status = fpl.status;
	switch (fpl.status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_HANDLED:
		SDT_PROBE3(vfs, namei, lookup, return, error,
		    (error == 0 ? ndp->ni_vp : NULL), true);
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		*pwdp = fpl.pwd;
		/*
		 * Status restored by cache_fplookup_partial_setup.
		 */
		break;
	case CACHE_FPL_STATUS_ABORTED:
		cache_fpl_restore(&fpl, &orig);
		break;
	}
	return (error);
}
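
/*
 * Caller-side sketch (the actual consumer is namei()): the fast path is
 * attempted first and the regular lookup serves as the fallback, roughly:
 *
 *	error = cache_fplookup(ndp, &status, &pwd);
 *	switch (status) {
 *	case CACHE_FPL_STATUS_HANDLED:
 *		return (error);		// fully serviced, including failures
 *	case CACHE_FPL_STATUS_PARTIAL:
 *		...			// continue the regular lookup from
 *					// the state prepared by partial_setup
 *	case CACHE_FPL_STATUS_ABORTED:
 *		...			// restart with the regular lookup
 *	}
 *
 * This only illustrates the meaning of the status value; see namei() for the
 * real handling.
 */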