1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ddb.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/capsicum.h> 46 #include <sys/counter.h> 47 #include <sys/filedesc.h> 48 #include <sys/fnv_hash.h> 49 #include <sys/kernel.h> 50 #include <sys/ktr.h> 51 #include <sys/lock.h> 52 #include <sys/malloc.h> 53 #include <sys/fcntl.h> 54 #include <sys/jail.h> 55 #include <sys/mount.h> 56 #include <sys/namei.h> 57 #include <sys/proc.h> 58 #include <sys/rwlock.h> 59 #include <sys/seqc.h> 60 #include <sys/sdt.h> 61 #include <sys/smr.h> 62 #include <sys/smp.h> 63 #include <sys/syscallsubr.h> 64 #include <sys/sysctl.h> 65 #include <sys/sysproto.h> 66 #include <sys/vnode.h> 67 #include <ck_queue.h> 68 #ifdef KTRACE 69 #include <sys/ktrace.h> 70 #endif 71 72 #include <sys/capsicum.h> 73 74 #include <security/audit/audit.h> 75 #include <security/mac/mac_framework.h> 76 77 #ifdef DDB 78 #include <ddb/ddb.h> 79 #endif 80 81 #include <vm/uma.h> 82 83 SDT_PROVIDER_DECLARE(vfs); 84 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 85 "struct vnode *"); 86 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 87 "char *"); 88 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 89 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 90 "char *", "struct vnode *"); 91 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 92 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 93 "struct vnode *", "char *"); 94 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 95 "struct vnode *"); 96 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 97 "struct vnode *", "char *"); 98 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 99 "char *"); 100 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", 101 "struct componentname *"); 102 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", 103 "struct componentname *"); 104 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 105 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 106 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 107 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 108 "struct vnode *"); 109 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 110 "char *"); 111 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *", 112 "char *"); 113 114 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); 115 SDT_PROBE_DECLARE(vfs, namei, lookup, entry); 116 SDT_PROBE_DECLARE(vfs, namei, lookup, return); 117 118 /* 119 * This structure describes the elements in the cache of recent 120 * names looked up by namei. 121 */ 122 struct negstate { 123 u_char neg_flag; 124 }; 125 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), 126 "the state must fit in a union with a pointer without growing it"); 127 128 struct namecache { 129 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 130 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 131 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */ 132 struct vnode *nc_dvp; /* vnode of parent of name */ 133 union { 134 struct vnode *nu_vp; /* vnode the name refers to */ 135 struct negstate nu_neg;/* negative entry state */ 136 } n_un; 137 u_char nc_flag; /* flag bits */ 138 u_char nc_nlen; /* length of name */ 139 char nc_name[0]; /* segment name + nul */ 140 }; 141 142 /* 143 * struct namecache_ts repeats struct namecache layout up to the 144 * nc_nlen member. 145 * struct namecache_ts is used in place of struct namecache when time(s) need 146 * to be stored. The nc_dotdottime field is used when a cache entry is mapping 147 * both a non-dotdot directory name plus dotdot for the directory's 148 * parent. 149 * 150 * See below for alignment requirement. 151 */ 152 struct namecache_ts { 153 struct timespec nc_time; /* timespec provided by fs */ 154 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 155 int nc_ticks; /* ticks value when entry was added */ 156 struct namecache nc_nc; 157 }; 158 159 /* 160 * At least mips n32 performs 64-bit accesses to timespec as found 161 * in namecache_ts and requires them to be aligned. Since others 162 * may be in the same spot suffer a little bit and enforce the 163 * alignment for everyone. Note this is a nop for 64-bit platforms. 164 */ 165 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) 166 #define CACHE_PATH_CUTOFF 39 167 168 #define CACHE_ZONE_SMALL_SIZE (sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1) 169 #define CACHE_ZONE_SMALL_TS_SIZE (sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1) 170 #define CACHE_ZONE_LARGE_SIZE (sizeof(struct namecache) + NAME_MAX + 1) 171 #define CACHE_ZONE_LARGE_TS_SIZE (sizeof(struct namecache_ts) + NAME_MAX + 1) 172 173 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 174 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 175 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 176 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 177 178 #define nc_vp n_un.nu_vp 179 #define nc_neg n_un.nu_neg 180 181 /* 182 * Flags in namecache.nc_flag 183 */ 184 #define NCF_WHITE 0x01 185 #define NCF_ISDOTDOT 0x02 186 #define NCF_TS 0x04 187 #define NCF_DTS 0x08 188 #define NCF_DVDROP 0x10 189 #define NCF_NEGATIVE 0x20 190 #define NCF_INVALID 0x40 191 #define NCF_WIP 0x80 192 193 /* 194 * Flags in negstate.neg_flag 195 */ 196 #define NEG_HOT 0x01 197 198 /* 199 * Mark an entry as invalid. 200 * 201 * This is called before it starts getting deconstructed. 202 */ 203 static void 204 cache_ncp_invalidate(struct namecache *ncp) 205 { 206 207 KASSERT((ncp->nc_flag & NCF_INVALID) == 0, 208 ("%s: entry %p already invalid", __func__, ncp)); 209 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); 210 atomic_thread_fence_rel(); 211 } 212 213 /* 214 * Check whether the entry can be safely used. 215 * 216 * All places which elide locks are supposed to call this after they are 217 * done with reading from an entry. 218 */ 219 static bool 220 cache_ncp_canuse(struct namecache *ncp) 221 { 222 223 atomic_thread_fence_acq(); 224 return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0); 225 } 226 227 /* 228 * Name caching works as follows: 229 * 230 * Names found by directory scans are retained in a cache 231 * for future reference. It is managed LRU, so frequently 232 * used names will hang around. Cache is indexed by hash value 233 * obtained from (dvp, name) where dvp refers to the directory 234 * containing name. 235 * 236 * If it is a "negative" entry, (i.e. for a name that is known NOT to 237 * exist) the vnode pointer will be NULL. 238 * 239 * Upon reaching the last segment of a path, if the reference 240 * is for DELETE, or NOCACHE is set (rewrite), and the 241 * name is located in the cache, it will be dropped. 242 * 243 * These locks are used (in the order in which they can be taken): 244 * NAME TYPE ROLE 245 * vnodelock mtx vnode lists and v_cache_dd field protection 246 * bucketlock rwlock for access to given set of hash buckets 247 * neglist mtx negative entry LRU management 248 * 249 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread 250 * shrinking the LRU list. 251 * 252 * It is legal to take multiple vnodelock and bucketlock locks. The locking 253 * order is lower address first. Both are recursive. 254 * 255 * "." lookups are lockless. 256 * 257 * ".." and vnode -> name lookups require vnodelock. 258 * 259 * name -> vnode lookup requires the relevant bucketlock to be held for reading. 260 * 261 * Insertions and removals of entries require involved vnodes and bucketlocks 262 * to be write-locked to prevent other threads from seeing the entry. 263 * 264 * Some lookups result in removal of the found entry (e.g. getting rid of a 265 * negative entry with the intent to create a positive one), which poses a 266 * problem when multiple threads reach the state. Similarly, two different 267 * threads can purge two different vnodes and try to remove the same name. 268 * 269 * If the already held vnode lock is lower than the second required lock, we 270 * can just take the other lock. However, in the opposite case, this could 271 * deadlock. As such, this is resolved by trylocking and if that fails unlocking 272 * the first node, locking everything in order and revalidating the state. 273 */ 274 275 VFS_SMR_DECLARE; 276 277 /* 278 * Structures associated with name caching. 279 */ 280 #define NCHHASH(hash) \ 281 (&nchashtbl[(hash) & nchash]) 282 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 283 static u_long __read_mostly nchash; /* size of hash table */ 284 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 285 "Size of namecache hash table"); 286 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ 287 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 288 "Ratio of negative namecache entries"); 289 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 290 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 291 u_int ncsizefactor = 2; 292 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, 293 "Size factor for namecache"); 294 static u_int __read_mostly ncpurgeminvnodes; 295 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0, 296 "Number of vnodes below which purgevfs ignores the request"); 297 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ 298 299 struct nchstats nchstats; /* cache effectiveness statistics */ 300 301 static struct mtx __exclusive_cache_line ncneg_shrink_lock; 302 303 struct neglist { 304 struct mtx nl_lock; 305 TAILQ_HEAD(, namecache) nl_list; 306 } __aligned(CACHE_LINE_SIZE); 307 308 static struct neglist __read_mostly *neglists; 309 static struct neglist ncneg_hot; 310 static u_long numhotneg; 311 312 #define ncneghash 3 313 #define numneglists (ncneghash + 1) 314 static inline struct neglist * 315 NCP2NEGLIST(struct namecache *ncp) 316 { 317 318 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 319 } 320 321 static inline struct negstate * 322 NCP2NEGSTATE(struct namecache *ncp) 323 { 324 325 MPASS(ncp->nc_flag & NCF_NEGATIVE); 326 return (&ncp->nc_neg); 327 } 328 329 #define numbucketlocks (ncbuckethash + 1) 330 static u_int __read_mostly ncbuckethash; 331 static struct rwlock_padalign __read_mostly *bucketlocks; 332 #define HASH2BUCKETLOCK(hash) \ 333 ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)])) 334 335 #define numvnodelocks (ncvnodehash + 1) 336 static u_int __read_mostly ncvnodehash; 337 static struct mtx __read_mostly *vnodelocks; 338 static inline struct mtx * 339 VP2VNODELOCK(struct vnode *vp) 340 { 341 342 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 343 } 344 345 /* 346 * UMA zones for the VFS cache. 347 * 348 * The small cache is used for entries with short names, which are the 349 * most common. The large cache is used for entries which are too big to 350 * fit in the small cache. 351 */ 352 static uma_zone_t __read_mostly cache_zone_small; 353 static uma_zone_t __read_mostly cache_zone_small_ts; 354 static uma_zone_t __read_mostly cache_zone_large; 355 static uma_zone_t __read_mostly cache_zone_large_ts; 356 357 static struct namecache * 358 cache_alloc(int len, int ts) 359 { 360 struct namecache_ts *ncp_ts; 361 struct namecache *ncp; 362 363 if (__predict_false(ts)) { 364 if (len <= CACHE_PATH_CUTOFF) 365 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); 366 else 367 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); 368 ncp = &ncp_ts->nc_nc; 369 } else { 370 if (len <= CACHE_PATH_CUTOFF) 371 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); 372 else 373 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); 374 } 375 return (ncp); 376 } 377 378 static void 379 cache_free(struct namecache *ncp) 380 { 381 struct namecache_ts *ncp_ts; 382 383 if (ncp == NULL) 384 return; 385 if ((ncp->nc_flag & NCF_DVDROP) != 0) 386 vdrop(ncp->nc_dvp); 387 if (__predict_false(ncp->nc_flag & NCF_TS)) { 388 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 389 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 390 uma_zfree_smr(cache_zone_small_ts, ncp_ts); 391 else 392 uma_zfree_smr(cache_zone_large_ts, ncp_ts); 393 } else { 394 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 395 uma_zfree_smr(cache_zone_small, ncp); 396 else 397 uma_zfree_smr(cache_zone_large, ncp); 398 } 399 } 400 401 static void 402 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 403 { 404 struct namecache_ts *ncp_ts; 405 406 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 407 (tsp == NULL && ticksp == NULL), 408 ("No NCF_TS")); 409 410 if (tsp == NULL && ticksp == NULL) 411 return; 412 413 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 414 if (tsp != NULL) 415 *tsp = ncp_ts->nc_time; 416 if (ticksp != NULL) 417 *ticksp = ncp_ts->nc_ticks; 418 } 419 420 #ifdef DEBUG_CACHE 421 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 422 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 423 "VFS namecache enabled"); 424 #endif 425 426 /* Export size information to userland */ 427 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 428 sizeof(struct namecache), "sizeof(struct namecache)"); 429 430 /* 431 * The new name cache statistics 432 */ 433 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 434 "Name cache statistics"); 435 #define STATNODE_ULONG(name, descr) \ 436 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); 437 #define STATNODE_COUNTER(name, descr) \ 438 static COUNTER_U64_DEFINE_EARLY(name); \ 439 SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \ 440 descr); 441 STATNODE_ULONG(numneg, "Number of negative cache entries"); 442 STATNODE_ULONG(numcache, "Number of cache entries"); 443 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held"); 444 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit"); 445 STATNODE_COUNTER(dothits, "Number of '.' hits"); 446 STATNODE_COUNTER(dotdothits, "Number of '..' hits"); 447 STATNODE_COUNTER(nummiss, "Number of cache misses"); 448 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 449 STATNODE_COUNTER(numposzaps, 450 "Number of cache hits (positive) we do not want to cache"); 451 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 452 STATNODE_COUNTER(numnegzaps, 453 "Number of cache hits (negative) we do not want to cache"); 454 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 455 /* These count for vn_getcwd(), too. */ 456 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 457 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 458 STATNODE_COUNTER(numfullpathfail2, 459 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 460 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 461 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 462 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 463 "Number of successful removals after relocking"); 464 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 465 "Number of times zap_and_exit failed to lock"); 466 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 467 "Number of times zap_and_exit failed to lock"); 468 static long cache_lock_vnodes_cel_3_failures; 469 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 470 "Number of times 3-way vnode locking failed"); 471 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 472 STATNODE_COUNTER(numneg_evicted, 473 "Number of negative entries evicted when adding a new entry"); 474 STATNODE_COUNTER(shrinking_skipped, 475 "Number of times shrinking was already in progress"); 476 477 static void cache_zap_locked(struct namecache *ncp); 478 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 479 char **freebuf, size_t *buflen); 480 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 481 char *buf, char **retbuf, size_t *buflen); 482 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 483 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 484 485 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 486 487 static int cache_yield; 488 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, 489 "Number of times cache called yield"); 490 491 static void __noinline 492 cache_maybe_yield(void) 493 { 494 495 if (should_yield()) { 496 cache_yield++; 497 kern_yield(PRI_USER); 498 } 499 } 500 501 static inline void 502 cache_assert_vlp_locked(struct mtx *vlp) 503 { 504 505 if (vlp != NULL) 506 mtx_assert(vlp, MA_OWNED); 507 } 508 509 static inline void 510 cache_assert_vnode_locked(struct vnode *vp) 511 { 512 struct mtx *vlp; 513 514 vlp = VP2VNODELOCK(vp); 515 cache_assert_vlp_locked(vlp); 516 } 517 518 /* 519 * TODO: With the value stored we can do better than computing the hash based 520 * on the address. The choice of FNV should also be revisited. 521 */ 522 static void 523 cache_prehash(struct vnode *vp) 524 { 525 526 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); 527 } 528 529 static uint32_t 530 cache_get_hash(char *name, u_char len, struct vnode *dvp) 531 { 532 533 return (fnv_32_buf(name, len, dvp->v_nchash)); 534 } 535 536 static inline struct nchashhead * 537 NCP2BUCKET(struct namecache *ncp) 538 { 539 uint32_t hash; 540 541 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 542 return (NCHHASH(hash)); 543 } 544 545 static inline struct rwlock * 546 NCP2BUCKETLOCK(struct namecache *ncp) 547 { 548 uint32_t hash; 549 550 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 551 return (HASH2BUCKETLOCK(hash)); 552 } 553 554 #ifdef INVARIANTS 555 static void 556 cache_assert_bucket_locked(struct namecache *ncp, int mode) 557 { 558 struct rwlock *blp; 559 560 blp = NCP2BUCKETLOCK(ncp); 561 rw_assert(blp, mode); 562 } 563 #else 564 #define cache_assert_bucket_locked(x, y) do { } while (0) 565 #endif 566 567 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 568 static void 569 _cache_sort_vnodes(void **p1, void **p2) 570 { 571 void *tmp; 572 573 MPASS(*p1 != NULL || *p2 != NULL); 574 575 if (*p1 > *p2) { 576 tmp = *p2; 577 *p2 = *p1; 578 *p1 = tmp; 579 } 580 } 581 582 static void 583 cache_lock_all_buckets(void) 584 { 585 u_int i; 586 587 for (i = 0; i < numbucketlocks; i++) 588 rw_wlock(&bucketlocks[i]); 589 } 590 591 static void 592 cache_unlock_all_buckets(void) 593 { 594 u_int i; 595 596 for (i = 0; i < numbucketlocks; i++) 597 rw_wunlock(&bucketlocks[i]); 598 } 599 600 static void 601 cache_lock_all_vnodes(void) 602 { 603 u_int i; 604 605 for (i = 0; i < numvnodelocks; i++) 606 mtx_lock(&vnodelocks[i]); 607 } 608 609 static void 610 cache_unlock_all_vnodes(void) 611 { 612 u_int i; 613 614 for (i = 0; i < numvnodelocks; i++) 615 mtx_unlock(&vnodelocks[i]); 616 } 617 618 static int 619 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 620 { 621 622 cache_sort_vnodes(&vlp1, &vlp2); 623 624 if (vlp1 != NULL) { 625 if (!mtx_trylock(vlp1)) 626 return (EAGAIN); 627 } 628 if (!mtx_trylock(vlp2)) { 629 if (vlp1 != NULL) 630 mtx_unlock(vlp1); 631 return (EAGAIN); 632 } 633 634 return (0); 635 } 636 637 static void 638 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 639 { 640 641 MPASS(vlp1 != NULL || vlp2 != NULL); 642 MPASS(vlp1 <= vlp2); 643 644 if (vlp1 != NULL) 645 mtx_lock(vlp1); 646 if (vlp2 != NULL) 647 mtx_lock(vlp2); 648 } 649 650 static void 651 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 652 { 653 654 MPASS(vlp1 != NULL || vlp2 != NULL); 655 656 if (vlp1 != NULL) 657 mtx_unlock(vlp1); 658 if (vlp2 != NULL) 659 mtx_unlock(vlp2); 660 } 661 662 static int 663 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 664 { 665 struct nchstats snap; 666 667 if (req->oldptr == NULL) 668 return (SYSCTL_OUT(req, 0, sizeof(snap))); 669 670 snap = nchstats; 671 snap.ncs_goodhits = counter_u64_fetch(numposhits); 672 snap.ncs_neghits = counter_u64_fetch(numneghits); 673 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 674 counter_u64_fetch(numnegzaps); 675 snap.ncs_miss = counter_u64_fetch(nummisszap) + 676 counter_u64_fetch(nummiss); 677 678 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 679 } 680 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 681 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 682 "VFS cache effectiveness statistics"); 683 684 #ifdef DIAGNOSTIC 685 /* 686 * Grab an atomic snapshot of the name cache hash chain lengths 687 */ 688 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 689 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 690 "hash table stats"); 691 692 static int 693 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 694 { 695 struct nchashhead *ncpp; 696 struct namecache *ncp; 697 int i, error, n_nchash, *cntbuf; 698 699 retry: 700 n_nchash = nchash + 1; /* nchash is max index, not count */ 701 if (req->oldptr == NULL) 702 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 703 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 704 cache_lock_all_buckets(); 705 if (n_nchash != nchash + 1) { 706 cache_unlock_all_buckets(); 707 free(cntbuf, M_TEMP); 708 goto retry; 709 } 710 /* Scan hash tables counting entries */ 711 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 712 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) 713 cntbuf[i]++; 714 cache_unlock_all_buckets(); 715 for (error = 0, i = 0; i < n_nchash; i++) 716 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 717 break; 718 free(cntbuf, M_TEMP); 719 return (error); 720 } 721 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 722 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 723 "nchash chain lengths"); 724 725 static int 726 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 727 { 728 int error; 729 struct nchashhead *ncpp; 730 struct namecache *ncp; 731 int n_nchash; 732 int count, maxlength, used, pct; 733 734 if (!req->oldptr) 735 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 736 737 cache_lock_all_buckets(); 738 n_nchash = nchash + 1; /* nchash is max index, not count */ 739 used = 0; 740 maxlength = 0; 741 742 /* Scan hash tables for applicable entries */ 743 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 744 count = 0; 745 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { 746 count++; 747 } 748 if (count) 749 used++; 750 if (maxlength < count) 751 maxlength = count; 752 } 753 n_nchash = nchash + 1; 754 cache_unlock_all_buckets(); 755 pct = (used * 100) / (n_nchash / 100); 756 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 757 if (error) 758 return (error); 759 error = SYSCTL_OUT(req, &used, sizeof(used)); 760 if (error) 761 return (error); 762 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 763 if (error) 764 return (error); 765 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 766 if (error) 767 return (error); 768 return (0); 769 } 770 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 771 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 772 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 773 #endif 774 775 /* 776 * Negative entries management 777 * 778 * A variation of LRU scheme is used. New entries are hashed into one of 779 * numneglists cold lists. Entries get promoted to the hot list on first hit. 780 * 781 * The shrinker will demote hot list head and evict from the cold list in a 782 * round-robin manner. 783 */ 784 static void 785 cache_negative_init(struct namecache *ncp) 786 { 787 struct negstate *negstate; 788 789 ncp->nc_flag |= NCF_NEGATIVE; 790 negstate = NCP2NEGSTATE(ncp); 791 negstate->neg_flag = 0; 792 } 793 794 static void 795 cache_negative_hit(struct namecache *ncp) 796 { 797 struct neglist *neglist; 798 struct negstate *negstate; 799 800 negstate = NCP2NEGSTATE(ncp); 801 if ((negstate->neg_flag & NEG_HOT) != 0) 802 return; 803 neglist = NCP2NEGLIST(ncp); 804 mtx_lock(&ncneg_hot.nl_lock); 805 mtx_lock(&neglist->nl_lock); 806 if ((negstate->neg_flag & NEG_HOT) == 0) { 807 numhotneg++; 808 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 809 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 810 negstate->neg_flag |= NEG_HOT; 811 } 812 mtx_unlock(&neglist->nl_lock); 813 mtx_unlock(&ncneg_hot.nl_lock); 814 } 815 816 static void 817 cache_negative_insert(struct namecache *ncp) 818 { 819 struct neglist *neglist; 820 821 MPASS(ncp->nc_flag & NCF_NEGATIVE); 822 cache_assert_bucket_locked(ncp, RA_WLOCKED); 823 neglist = NCP2NEGLIST(ncp); 824 mtx_lock(&neglist->nl_lock); 825 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 826 mtx_unlock(&neglist->nl_lock); 827 atomic_add_rel_long(&numneg, 1); 828 } 829 830 static void 831 cache_negative_remove(struct namecache *ncp) 832 { 833 struct neglist *neglist; 834 struct negstate *negstate; 835 bool hot_locked = false; 836 bool list_locked = false; 837 838 cache_assert_bucket_locked(ncp, RA_WLOCKED); 839 neglist = NCP2NEGLIST(ncp); 840 negstate = NCP2NEGSTATE(ncp); 841 if ((negstate->neg_flag & NEG_HOT) != 0) { 842 hot_locked = true; 843 mtx_lock(&ncneg_hot.nl_lock); 844 if ((negstate->neg_flag & NEG_HOT) == 0) { 845 list_locked = true; 846 mtx_lock(&neglist->nl_lock); 847 } 848 } else { 849 list_locked = true; 850 mtx_lock(&neglist->nl_lock); 851 /* 852 * We may be racing against promotion in lockless lookup. 853 */ 854 if ((negstate->neg_flag & NEG_HOT) != 0) { 855 mtx_unlock(&neglist->nl_lock); 856 hot_locked = true; 857 mtx_lock(&ncneg_hot.nl_lock); 858 mtx_lock(&neglist->nl_lock); 859 } 860 } 861 if ((negstate->neg_flag & NEG_HOT) != 0) { 862 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); 863 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 864 numhotneg--; 865 } else { 866 mtx_assert(&neglist->nl_lock, MA_OWNED); 867 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 868 } 869 if (list_locked) 870 mtx_unlock(&neglist->nl_lock); 871 if (hot_locked) 872 mtx_unlock(&ncneg_hot.nl_lock); 873 atomic_subtract_rel_long(&numneg, 1); 874 } 875 876 static void 877 cache_negative_shrink_select(struct namecache **ncpp, 878 struct neglist **neglistpp) 879 { 880 struct neglist *neglist; 881 struct namecache *ncp; 882 static u_int cycle; 883 u_int i; 884 885 *ncpp = ncp = NULL; 886 887 for (i = 0; i < numneglists; i++) { 888 neglist = &neglists[(cycle + i) % numneglists]; 889 if (TAILQ_FIRST(&neglist->nl_list) == NULL) 890 continue; 891 mtx_lock(&neglist->nl_lock); 892 ncp = TAILQ_FIRST(&neglist->nl_list); 893 if (ncp != NULL) 894 break; 895 mtx_unlock(&neglist->nl_lock); 896 } 897 898 *neglistpp = neglist; 899 *ncpp = ncp; 900 cycle++; 901 } 902 903 static void 904 cache_negative_zap_one(void) 905 { 906 struct namecache *ncp, *ncp2; 907 struct neglist *neglist; 908 struct negstate *negstate; 909 struct mtx *dvlp; 910 struct rwlock *blp; 911 912 if (mtx_owner(&ncneg_shrink_lock) != NULL || 913 !mtx_trylock(&ncneg_shrink_lock)) { 914 counter_u64_add(shrinking_skipped, 1); 915 return; 916 } 917 918 mtx_lock(&ncneg_hot.nl_lock); 919 ncp = TAILQ_FIRST(&ncneg_hot.nl_list); 920 if (ncp != NULL) { 921 neglist = NCP2NEGLIST(ncp); 922 negstate = NCP2NEGSTATE(ncp); 923 mtx_lock(&neglist->nl_lock); 924 MPASS((negstate->neg_flag & NEG_HOT) != 0); 925 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 926 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 927 negstate->neg_flag &= ~NEG_HOT; 928 numhotneg--; 929 mtx_unlock(&neglist->nl_lock); 930 } 931 mtx_unlock(&ncneg_hot.nl_lock); 932 933 cache_negative_shrink_select(&ncp, &neglist); 934 935 mtx_unlock(&ncneg_shrink_lock); 936 if (ncp == NULL) 937 return; 938 939 MPASS(ncp->nc_flag & NCF_NEGATIVE); 940 dvlp = VP2VNODELOCK(ncp->nc_dvp); 941 blp = NCP2BUCKETLOCK(ncp); 942 mtx_unlock(&neglist->nl_lock); 943 mtx_lock(dvlp); 944 rw_wlock(blp); 945 /* 946 * Enter SMR to safely check the negative list. 947 * Even if the found pointer matches, the entry may now be reallocated 948 * and used by a different vnode. 949 */ 950 vfs_smr_enter(); 951 ncp2 = TAILQ_FIRST(&neglist->nl_list); 952 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || 953 blp != NCP2BUCKETLOCK(ncp2)) { 954 vfs_smr_exit(); 955 ncp = NULL; 956 } else { 957 vfs_smr_exit(); 958 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp, 959 ncp->nc_name); 960 cache_zap_locked(ncp); 961 counter_u64_add(numneg_evicted, 1); 962 } 963 rw_wunlock(blp); 964 mtx_unlock(dvlp); 965 cache_free(ncp); 966 } 967 968 /* 969 * cache_zap_locked(): 970 * 971 * Removes a namecache entry from cache, whether it contains an actual 972 * pointer to a vnode or if it is just a negative cache entry. 973 */ 974 static void 975 cache_zap_locked(struct namecache *ncp) 976 { 977 struct nchashhead *ncpp; 978 979 if (!(ncp->nc_flag & NCF_NEGATIVE)) 980 cache_assert_vnode_locked(ncp->nc_vp); 981 cache_assert_vnode_locked(ncp->nc_dvp); 982 cache_assert_bucket_locked(ncp, RA_WLOCKED); 983 984 CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, 985 (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp); 986 987 cache_ncp_invalidate(ncp); 988 989 ncpp = NCP2BUCKET(ncp); 990 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); 991 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 992 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 993 ncp->nc_name, ncp->nc_vp); 994 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 995 if (ncp == ncp->nc_vp->v_cache_dd) { 996 vn_seqc_write_begin_unheld(ncp->nc_vp); 997 ncp->nc_vp->v_cache_dd = NULL; 998 vn_seqc_write_end(ncp->nc_vp); 999 } 1000 } else { 1001 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 1002 ncp->nc_name); 1003 cache_negative_remove(ncp); 1004 } 1005 if (ncp->nc_flag & NCF_ISDOTDOT) { 1006 if (ncp == ncp->nc_dvp->v_cache_dd) { 1007 vn_seqc_write_begin_unheld(ncp->nc_dvp); 1008 ncp->nc_dvp->v_cache_dd = NULL; 1009 vn_seqc_write_end(ncp->nc_dvp); 1010 } 1011 } else { 1012 LIST_REMOVE(ncp, nc_src); 1013 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 1014 ncp->nc_flag |= NCF_DVDROP; 1015 counter_u64_add(numcachehv, -1); 1016 } 1017 } 1018 atomic_subtract_rel_long(&numcache, 1); 1019 } 1020 1021 static void 1022 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 1023 { 1024 struct rwlock *blp; 1025 1026 MPASS(ncp->nc_dvp == vp); 1027 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1028 cache_assert_vnode_locked(vp); 1029 1030 blp = NCP2BUCKETLOCK(ncp); 1031 rw_wlock(blp); 1032 cache_zap_locked(ncp); 1033 rw_wunlock(blp); 1034 } 1035 1036 static bool 1037 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 1038 struct mtx **vlpp) 1039 { 1040 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1041 struct rwlock *blp; 1042 1043 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1044 cache_assert_vnode_locked(vp); 1045 1046 if (ncp->nc_flag & NCF_NEGATIVE) { 1047 if (*vlpp != NULL) { 1048 mtx_unlock(*vlpp); 1049 *vlpp = NULL; 1050 } 1051 cache_zap_negative_locked_vnode_kl(ncp, vp); 1052 return (true); 1053 } 1054 1055 pvlp = VP2VNODELOCK(vp); 1056 blp = NCP2BUCKETLOCK(ncp); 1057 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1058 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1059 1060 if (*vlpp == vlp1 || *vlpp == vlp2) { 1061 to_unlock = *vlpp; 1062 *vlpp = NULL; 1063 } else { 1064 if (*vlpp != NULL) { 1065 mtx_unlock(*vlpp); 1066 *vlpp = NULL; 1067 } 1068 cache_sort_vnodes(&vlp1, &vlp2); 1069 if (vlp1 == pvlp) { 1070 mtx_lock(vlp2); 1071 to_unlock = vlp2; 1072 } else { 1073 if (!mtx_trylock(vlp1)) 1074 goto out_relock; 1075 to_unlock = vlp1; 1076 } 1077 } 1078 rw_wlock(blp); 1079 cache_zap_locked(ncp); 1080 rw_wunlock(blp); 1081 if (to_unlock != NULL) 1082 mtx_unlock(to_unlock); 1083 return (true); 1084 1085 out_relock: 1086 mtx_unlock(vlp2); 1087 mtx_lock(vlp1); 1088 mtx_lock(vlp2); 1089 MPASS(*vlpp == NULL); 1090 *vlpp = vlp1; 1091 return (false); 1092 } 1093 1094 static int __noinline 1095 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) 1096 { 1097 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1098 struct rwlock *blp; 1099 int error = 0; 1100 1101 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1102 cache_assert_vnode_locked(vp); 1103 1104 pvlp = VP2VNODELOCK(vp); 1105 if (ncp->nc_flag & NCF_NEGATIVE) { 1106 cache_zap_negative_locked_vnode_kl(ncp, vp); 1107 goto out; 1108 } 1109 1110 blp = NCP2BUCKETLOCK(ncp); 1111 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1112 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1113 cache_sort_vnodes(&vlp1, &vlp2); 1114 if (vlp1 == pvlp) { 1115 mtx_lock(vlp2); 1116 to_unlock = vlp2; 1117 } else { 1118 if (!mtx_trylock(vlp1)) { 1119 error = EAGAIN; 1120 goto out; 1121 } 1122 to_unlock = vlp1; 1123 } 1124 rw_wlock(blp); 1125 cache_zap_locked(ncp); 1126 rw_wunlock(blp); 1127 mtx_unlock(to_unlock); 1128 out: 1129 mtx_unlock(pvlp); 1130 return (error); 1131 } 1132 1133 /* 1134 * If trylocking failed we can get here. We know enough to take all needed locks 1135 * in the right order and re-lookup the entry. 1136 */ 1137 static int 1138 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1139 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1140 struct rwlock *blp) 1141 { 1142 struct namecache *rncp; 1143 1144 cache_assert_bucket_locked(ncp, RA_UNLOCKED); 1145 1146 cache_sort_vnodes(&dvlp, &vlp); 1147 cache_lock_vnodes(dvlp, vlp); 1148 rw_wlock(blp); 1149 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1150 if (rncp == ncp && rncp->nc_dvp == dvp && 1151 rncp->nc_nlen == cnp->cn_namelen && 1152 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1153 break; 1154 } 1155 if (rncp != NULL) { 1156 cache_zap_locked(rncp); 1157 rw_wunlock(blp); 1158 cache_unlock_vnodes(dvlp, vlp); 1159 counter_u64_add(zap_and_exit_bucket_relock_success, 1); 1160 return (0); 1161 } 1162 1163 rw_wunlock(blp); 1164 cache_unlock_vnodes(dvlp, vlp); 1165 return (EAGAIN); 1166 } 1167 1168 static int __noinline 1169 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1170 uint32_t hash, struct rwlock *blp) 1171 { 1172 struct mtx *dvlp, *vlp; 1173 struct vnode *dvp; 1174 1175 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1176 1177 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1178 vlp = NULL; 1179 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1180 vlp = VP2VNODELOCK(ncp->nc_vp); 1181 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1182 cache_zap_locked(ncp); 1183 rw_wunlock(blp); 1184 cache_unlock_vnodes(dvlp, vlp); 1185 return (0); 1186 } 1187 1188 dvp = ncp->nc_dvp; 1189 rw_wunlock(blp); 1190 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1191 } 1192 1193 static int __noinline 1194 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1195 uint32_t hash, struct rwlock *blp) 1196 { 1197 struct mtx *dvlp, *vlp; 1198 struct vnode *dvp; 1199 1200 cache_assert_bucket_locked(ncp, RA_RLOCKED); 1201 1202 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1203 vlp = NULL; 1204 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1205 vlp = VP2VNODELOCK(ncp->nc_vp); 1206 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1207 rw_runlock(blp); 1208 rw_wlock(blp); 1209 cache_zap_locked(ncp); 1210 rw_wunlock(blp); 1211 cache_unlock_vnodes(dvlp, vlp); 1212 return (0); 1213 } 1214 1215 dvp = ncp->nc_dvp; 1216 rw_runlock(blp); 1217 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1218 } 1219 1220 static int 1221 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, 1222 struct mtx **vlpp1, struct mtx **vlpp2) 1223 { 1224 struct mtx *dvlp, *vlp; 1225 1226 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1227 1228 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1229 vlp = NULL; 1230 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1231 vlp = VP2VNODELOCK(ncp->nc_vp); 1232 cache_sort_vnodes(&dvlp, &vlp); 1233 1234 if (*vlpp1 == dvlp && *vlpp2 == vlp) { 1235 cache_zap_locked(ncp); 1236 cache_unlock_vnodes(dvlp, vlp); 1237 *vlpp1 = NULL; 1238 *vlpp2 = NULL; 1239 return (0); 1240 } 1241 1242 if (*vlpp1 != NULL) 1243 mtx_unlock(*vlpp1); 1244 if (*vlpp2 != NULL) 1245 mtx_unlock(*vlpp2); 1246 *vlpp1 = NULL; 1247 *vlpp2 = NULL; 1248 1249 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1250 cache_zap_locked(ncp); 1251 cache_unlock_vnodes(dvlp, vlp); 1252 return (0); 1253 } 1254 1255 rw_wunlock(blp); 1256 *vlpp1 = dvlp; 1257 *vlpp2 = vlp; 1258 if (*vlpp1 != NULL) 1259 mtx_lock(*vlpp1); 1260 mtx_lock(*vlpp2); 1261 rw_wlock(blp); 1262 return (EAGAIN); 1263 } 1264 1265 static void 1266 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) 1267 { 1268 1269 if (blp != NULL) { 1270 rw_runlock(blp); 1271 } else { 1272 mtx_unlock(vlp); 1273 } 1274 } 1275 1276 static int __noinline 1277 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1278 struct timespec *tsp, int *ticksp) 1279 { 1280 int ltype; 1281 1282 *vpp = dvp; 1283 CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", 1284 dvp, cnp->cn_nameptr); 1285 counter_u64_add(dothits, 1); 1286 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1287 if (tsp != NULL) 1288 timespecclear(tsp); 1289 if (ticksp != NULL) 1290 *ticksp = ticks; 1291 vrefact(*vpp); 1292 /* 1293 * When we lookup "." we still can be asked to lock it 1294 * differently... 1295 */ 1296 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1297 if (ltype != VOP_ISLOCKED(*vpp)) { 1298 if (ltype == LK_EXCLUSIVE) { 1299 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1300 if (VN_IS_DOOMED((*vpp))) { 1301 /* forced unmount */ 1302 vrele(*vpp); 1303 *vpp = NULL; 1304 return (ENOENT); 1305 } 1306 } else 1307 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1308 } 1309 return (-1); 1310 } 1311 1312 static __noinline int 1313 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) 1314 { 1315 struct namecache *ncp; 1316 struct rwlock *blp; 1317 struct mtx *dvlp, *dvlp2; 1318 uint32_t hash; 1319 int error; 1320 1321 if (cnp->cn_namelen == 2 && 1322 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1323 dvlp = VP2VNODELOCK(dvp); 1324 dvlp2 = NULL; 1325 mtx_lock(dvlp); 1326 retry_dotdot: 1327 ncp = dvp->v_cache_dd; 1328 if (ncp == NULL) { 1329 mtx_unlock(dvlp); 1330 if (dvlp2 != NULL) 1331 mtx_unlock(dvlp2); 1332 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1333 return (0); 1334 } 1335 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1336 if (ncp->nc_dvp != dvp) 1337 panic("dvp %p v_cache_dd %p\n", dvp, ncp); 1338 if (!cache_zap_locked_vnode_kl2(ncp, 1339 dvp, &dvlp2)) 1340 goto retry_dotdot; 1341 MPASS(dvp->v_cache_dd == NULL); 1342 mtx_unlock(dvlp); 1343 if (dvlp2 != NULL) 1344 mtx_unlock(dvlp2); 1345 cache_free(ncp); 1346 } else { 1347 vn_seqc_write_begin(dvp); 1348 dvp->v_cache_dd = NULL; 1349 vn_seqc_write_end(dvp); 1350 mtx_unlock(dvlp); 1351 if (dvlp2 != NULL) 1352 mtx_unlock(dvlp2); 1353 } 1354 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1355 return (1); 1356 } 1357 1358 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1359 blp = HASH2BUCKETLOCK(hash); 1360 retry: 1361 if (CK_SLIST_EMPTY(NCHHASH(hash))) 1362 goto out_no_entry; 1363 1364 rw_wlock(blp); 1365 1366 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1367 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1368 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1369 break; 1370 } 1371 1372 /* We failed to find an entry */ 1373 if (ncp == NULL) { 1374 rw_wunlock(blp); 1375 goto out_no_entry; 1376 } 1377 1378 error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp); 1379 if (__predict_false(error != 0)) { 1380 zap_and_exit_bucket_fail++; 1381 cache_maybe_yield(); 1382 goto retry; 1383 } 1384 counter_u64_add(numposzaps, 1); 1385 cache_free(ncp); 1386 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1387 return (1); 1388 out_no_entry: 1389 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1390 counter_u64_add(nummisszap, 1); 1391 return (0); 1392 } 1393 1394 /** 1395 * Lookup a name in the name cache 1396 * 1397 * # Arguments 1398 * 1399 * - dvp: Parent directory in which to search. 1400 * - vpp: Return argument. Will contain desired vnode on cache hit. 1401 * - cnp: Parameters of the name search. The most interesting bits of 1402 * the cn_flags field have the following meanings: 1403 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1404 * it up. 1405 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1406 * - tsp: Return storage for cache timestamp. On a successful (positive 1407 * or negative) lookup, tsp will be filled with any timespec that 1408 * was stored when this cache entry was created. However, it will 1409 * be clear for "." entries. 1410 * - ticks: Return storage for alternate cache timestamp. On a successful 1411 * (positive or negative) lookup, it will contain the ticks value 1412 * that was current when the cache entry was created, unless cnp 1413 * was ".". 1414 * 1415 * # Returns 1416 * 1417 * - -1: A positive cache hit. vpp will contain the desired vnode. 1418 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1419 * to a forced unmount. vpp will not be modified. If the entry 1420 * is a whiteout, then the ISWHITEOUT flag will be set in 1421 * cnp->cn_flags. 1422 * - 0: A cache miss. vpp will not be modified. 1423 * 1424 * # Locking 1425 * 1426 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1427 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1428 * lock is not recursively acquired. 1429 */ 1430 int 1431 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1432 struct timespec *tsp, int *ticksp) 1433 { 1434 struct namecache_ts *ncp_ts; 1435 struct namecache *ncp; 1436 struct negstate *negstate; 1437 struct rwlock *blp; 1438 struct mtx *dvlp; 1439 uint32_t hash; 1440 enum vgetstate vs; 1441 int error, ltype; 1442 bool try_smr, doing_smr, whiteout; 1443 1444 #ifdef DEBUG_CACHE 1445 if (__predict_false(!doingcache)) { 1446 cnp->cn_flags &= ~MAKEENTRY; 1447 return (0); 1448 } 1449 #endif 1450 1451 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) 1452 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1453 1454 if ((cnp->cn_flags & MAKEENTRY) == 0) { 1455 cache_remove_cnp(dvp, cnp); 1456 return (0); 1457 } 1458 1459 try_smr = true; 1460 if (cnp->cn_nameiop == CREATE) 1461 try_smr = false; 1462 retry: 1463 doing_smr = false; 1464 blp = NULL; 1465 dvlp = NULL; 1466 error = 0; 1467 if (cnp->cn_namelen == 2 && 1468 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1469 counter_u64_add(dotdothits, 1); 1470 dvlp = VP2VNODELOCK(dvp); 1471 mtx_lock(dvlp); 1472 ncp = dvp->v_cache_dd; 1473 if (ncp == NULL) { 1474 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1475 "..", NULL); 1476 mtx_unlock(dvlp); 1477 return (0); 1478 } 1479 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1480 if (ncp->nc_flag & NCF_NEGATIVE) 1481 *vpp = NULL; 1482 else 1483 *vpp = ncp->nc_vp; 1484 } else 1485 *vpp = ncp->nc_dvp; 1486 /* Return failure if negative entry was found. */ 1487 if (*vpp == NULL) 1488 goto negative_success; 1489 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", 1490 dvp, cnp->cn_nameptr, *vpp); 1491 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", 1492 *vpp); 1493 cache_out_ts(ncp, tsp, ticksp); 1494 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1495 NCF_DTS && tsp != NULL) { 1496 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1497 *tsp = ncp_ts->nc_dotdottime; 1498 } 1499 goto success; 1500 } 1501 1502 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1503 retry_hashed: 1504 if (try_smr) { 1505 vfs_smr_enter(); 1506 doing_smr = true; 1507 try_smr = false; 1508 } else { 1509 blp = HASH2BUCKETLOCK(hash); 1510 rw_rlock(blp); 1511 } 1512 1513 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1514 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1515 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1516 break; 1517 } 1518 1519 /* We failed to find an entry */ 1520 if (__predict_false(ncp == NULL)) { 1521 if (doing_smr) 1522 vfs_smr_exit(); 1523 else 1524 rw_runlock(blp); 1525 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1526 NULL); 1527 counter_u64_add(nummiss, 1); 1528 return (0); 1529 } 1530 1531 if (ncp->nc_flag & NCF_NEGATIVE) 1532 goto negative_success; 1533 1534 /* We found a "positive" match, return the vnode */ 1535 counter_u64_add(numposhits, 1); 1536 *vpp = ncp->nc_vp; 1537 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", 1538 dvp, cnp->cn_nameptr, *vpp, ncp); 1539 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, 1540 *vpp); 1541 cache_out_ts(ncp, tsp, ticksp); 1542 success: 1543 /* 1544 * On success we return a locked and ref'd vnode as per the lookup 1545 * protocol. 1546 */ 1547 MPASS(dvp != *vpp); 1548 ltype = 0; /* silence gcc warning */ 1549 if (cnp->cn_flags & ISDOTDOT) { 1550 ltype = VOP_ISLOCKED(dvp); 1551 VOP_UNLOCK(dvp); 1552 } 1553 if (doing_smr) { 1554 if (!cache_ncp_canuse(ncp)) { 1555 vfs_smr_exit(); 1556 *vpp = NULL; 1557 goto retry; 1558 } 1559 vs = vget_prep_smr(*vpp); 1560 vfs_smr_exit(); 1561 if (__predict_false(vs == VGET_NONE)) { 1562 *vpp = NULL; 1563 goto retry; 1564 } 1565 } else { 1566 vs = vget_prep(*vpp); 1567 cache_lookup_unlock(blp, dvlp); 1568 } 1569 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1570 if (cnp->cn_flags & ISDOTDOT) { 1571 vn_lock(dvp, ltype | LK_RETRY); 1572 if (VN_IS_DOOMED(dvp)) { 1573 if (error == 0) 1574 vput(*vpp); 1575 *vpp = NULL; 1576 return (ENOENT); 1577 } 1578 } 1579 if (error) { 1580 *vpp = NULL; 1581 goto retry; 1582 } 1583 if ((cnp->cn_flags & ISLASTCN) && 1584 (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { 1585 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); 1586 } 1587 return (-1); 1588 1589 negative_success: 1590 /* We found a negative match, and want to create it, so purge */ 1591 if (cnp->cn_nameiop == CREATE) { 1592 MPASS(!doing_smr); 1593 counter_u64_add(numnegzaps, 1); 1594 goto zap_and_exit; 1595 } 1596 1597 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 1598 cache_out_ts(ncp, tsp, ticksp); 1599 counter_u64_add(numneghits, 1); 1600 whiteout = (ncp->nc_flag & NCF_WHITE); 1601 1602 if (doing_smr) { 1603 /* 1604 * We need to take locks to promote an entry. 1605 */ 1606 negstate = NCP2NEGSTATE(ncp); 1607 if ((negstate->neg_flag & NEG_HOT) == 0 || 1608 !cache_ncp_canuse(ncp)) { 1609 vfs_smr_exit(); 1610 doing_smr = false; 1611 goto retry_hashed; 1612 } 1613 vfs_smr_exit(); 1614 } else { 1615 cache_negative_hit(ncp); 1616 cache_lookup_unlock(blp, dvlp); 1617 } 1618 if (whiteout) 1619 cnp->cn_flags |= ISWHITEOUT; 1620 return (ENOENT); 1621 1622 zap_and_exit: 1623 MPASS(!doing_smr); 1624 if (blp != NULL) 1625 error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp); 1626 else 1627 error = cache_zap_locked_vnode(ncp, dvp); 1628 if (__predict_false(error != 0)) { 1629 zap_and_exit_bucket_fail2++; 1630 cache_maybe_yield(); 1631 goto retry; 1632 } 1633 cache_free(ncp); 1634 return (0); 1635 } 1636 1637 struct celockstate { 1638 struct mtx *vlp[3]; 1639 struct rwlock *blp[2]; 1640 }; 1641 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1642 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1643 1644 static inline void 1645 cache_celockstate_init(struct celockstate *cel) 1646 { 1647 1648 bzero(cel, sizeof(*cel)); 1649 } 1650 1651 static void 1652 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1653 struct vnode *dvp) 1654 { 1655 struct mtx *vlp1, *vlp2; 1656 1657 MPASS(cel->vlp[0] == NULL); 1658 MPASS(cel->vlp[1] == NULL); 1659 MPASS(cel->vlp[2] == NULL); 1660 1661 MPASS(vp != NULL || dvp != NULL); 1662 1663 vlp1 = VP2VNODELOCK(vp); 1664 vlp2 = VP2VNODELOCK(dvp); 1665 cache_sort_vnodes(&vlp1, &vlp2); 1666 1667 if (vlp1 != NULL) { 1668 mtx_lock(vlp1); 1669 cel->vlp[0] = vlp1; 1670 } 1671 mtx_lock(vlp2); 1672 cel->vlp[1] = vlp2; 1673 } 1674 1675 static void 1676 cache_unlock_vnodes_cel(struct celockstate *cel) 1677 { 1678 1679 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1680 1681 if (cel->vlp[0] != NULL) 1682 mtx_unlock(cel->vlp[0]); 1683 if (cel->vlp[1] != NULL) 1684 mtx_unlock(cel->vlp[1]); 1685 if (cel->vlp[2] != NULL) 1686 mtx_unlock(cel->vlp[2]); 1687 } 1688 1689 static bool 1690 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1691 { 1692 struct mtx *vlp; 1693 bool ret; 1694 1695 cache_assert_vlp_locked(cel->vlp[0]); 1696 cache_assert_vlp_locked(cel->vlp[1]); 1697 MPASS(cel->vlp[2] == NULL); 1698 1699 MPASS(vp != NULL); 1700 vlp = VP2VNODELOCK(vp); 1701 1702 ret = true; 1703 if (vlp >= cel->vlp[1]) { 1704 mtx_lock(vlp); 1705 } else { 1706 if (mtx_trylock(vlp)) 1707 goto out; 1708 cache_lock_vnodes_cel_3_failures++; 1709 cache_unlock_vnodes_cel(cel); 1710 if (vlp < cel->vlp[0]) { 1711 mtx_lock(vlp); 1712 mtx_lock(cel->vlp[0]); 1713 mtx_lock(cel->vlp[1]); 1714 } else { 1715 if (cel->vlp[0] != NULL) 1716 mtx_lock(cel->vlp[0]); 1717 mtx_lock(vlp); 1718 mtx_lock(cel->vlp[1]); 1719 } 1720 ret = false; 1721 } 1722 out: 1723 cel->vlp[2] = vlp; 1724 return (ret); 1725 } 1726 1727 static void 1728 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, 1729 struct rwlock *blp2) 1730 { 1731 1732 MPASS(cel->blp[0] == NULL); 1733 MPASS(cel->blp[1] == NULL); 1734 1735 cache_sort_vnodes(&blp1, &blp2); 1736 1737 if (blp1 != NULL) { 1738 rw_wlock(blp1); 1739 cel->blp[0] = blp1; 1740 } 1741 rw_wlock(blp2); 1742 cel->blp[1] = blp2; 1743 } 1744 1745 static void 1746 cache_unlock_buckets_cel(struct celockstate *cel) 1747 { 1748 1749 if (cel->blp[0] != NULL) 1750 rw_wunlock(cel->blp[0]); 1751 rw_wunlock(cel->blp[1]); 1752 } 1753 1754 /* 1755 * Lock part of the cache affected by the insertion. 1756 * 1757 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1758 * However, insertion can result in removal of an old entry. In this 1759 * case we have an additional vnode and bucketlock pair to lock. If the 1760 * entry is negative, ncelock is locked instead of the vnode. 1761 * 1762 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1763 * preserving the locking order (smaller address first). 1764 */ 1765 static void 1766 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1767 uint32_t hash) 1768 { 1769 struct namecache *ncp; 1770 struct rwlock *blps[2]; 1771 1772 blps[0] = HASH2BUCKETLOCK(hash); 1773 for (;;) { 1774 blps[1] = NULL; 1775 cache_lock_vnodes_cel(cel, dvp, vp); 1776 if (vp == NULL || vp->v_type != VDIR) 1777 break; 1778 ncp = vp->v_cache_dd; 1779 if (ncp == NULL) 1780 break; 1781 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1782 break; 1783 MPASS(ncp->nc_dvp == vp); 1784 blps[1] = NCP2BUCKETLOCK(ncp); 1785 if (ncp->nc_flag & NCF_NEGATIVE) 1786 break; 1787 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1788 break; 1789 /* 1790 * All vnodes got re-locked. Re-validate the state and if 1791 * nothing changed we are done. Otherwise restart. 1792 */ 1793 if (ncp == vp->v_cache_dd && 1794 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1795 blps[1] == NCP2BUCKETLOCK(ncp) && 1796 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1797 break; 1798 cache_unlock_vnodes_cel(cel); 1799 cel->vlp[0] = NULL; 1800 cel->vlp[1] = NULL; 1801 cel->vlp[2] = NULL; 1802 } 1803 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1804 } 1805 1806 static void 1807 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1808 uint32_t hash) 1809 { 1810 struct namecache *ncp; 1811 struct rwlock *blps[2]; 1812 1813 blps[0] = HASH2BUCKETLOCK(hash); 1814 for (;;) { 1815 blps[1] = NULL; 1816 cache_lock_vnodes_cel(cel, dvp, vp); 1817 ncp = dvp->v_cache_dd; 1818 if (ncp == NULL) 1819 break; 1820 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1821 break; 1822 MPASS(ncp->nc_dvp == dvp); 1823 blps[1] = NCP2BUCKETLOCK(ncp); 1824 if (ncp->nc_flag & NCF_NEGATIVE) 1825 break; 1826 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1827 break; 1828 if (ncp == dvp->v_cache_dd && 1829 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1830 blps[1] == NCP2BUCKETLOCK(ncp) && 1831 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1832 break; 1833 cache_unlock_vnodes_cel(cel); 1834 cel->vlp[0] = NULL; 1835 cel->vlp[1] = NULL; 1836 cel->vlp[2] = NULL; 1837 } 1838 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1839 } 1840 1841 static void 1842 cache_enter_unlock(struct celockstate *cel) 1843 { 1844 1845 cache_unlock_buckets_cel(cel); 1846 cache_unlock_vnodes_cel(cel); 1847 } 1848 1849 static void __noinline 1850 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 1851 struct componentname *cnp) 1852 { 1853 struct celockstate cel; 1854 struct namecache *ncp; 1855 uint32_t hash; 1856 int len; 1857 1858 if (dvp->v_cache_dd == NULL) 1859 return; 1860 len = cnp->cn_namelen; 1861 cache_celockstate_init(&cel); 1862 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1863 cache_enter_lock_dd(&cel, dvp, vp, hash); 1864 vn_seqc_write_begin(dvp); 1865 ncp = dvp->v_cache_dd; 1866 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 1867 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 1868 cache_zap_locked(ncp); 1869 } else { 1870 ncp = NULL; 1871 } 1872 dvp->v_cache_dd = NULL; 1873 vn_seqc_write_end(dvp); 1874 cache_enter_unlock(&cel); 1875 cache_free(ncp); 1876 } 1877 1878 /* 1879 * Add an entry to the cache. 1880 */ 1881 void 1882 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 1883 struct timespec *tsp, struct timespec *dtsp) 1884 { 1885 struct celockstate cel; 1886 struct namecache *ncp, *n2, *ndd; 1887 struct namecache_ts *ncp_ts, *n2_ts; 1888 struct nchashhead *ncpp; 1889 uint32_t hash; 1890 int flag; 1891 int len; 1892 u_long lnumcache; 1893 1894 CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); 1895 VNPASS(!VN_IS_DOOMED(dvp), dvp); 1896 VNPASS(dvp->v_type != VNON, dvp); 1897 if (vp != NULL) { 1898 VNPASS(!VN_IS_DOOMED(vp), vp); 1899 VNPASS(vp->v_type != VNON, vp); 1900 } 1901 1902 #ifdef DEBUG_CACHE 1903 if (__predict_false(!doingcache)) 1904 return; 1905 #endif 1906 1907 flag = 0; 1908 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1909 if (cnp->cn_namelen == 1) 1910 return; 1911 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 1912 cache_enter_dotdot_prep(dvp, vp, cnp); 1913 flag = NCF_ISDOTDOT; 1914 } 1915 } 1916 1917 /* 1918 * Avoid blowout in namecache entries. 1919 */ 1920 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1921 if (__predict_false(lnumcache >= ncsize)) { 1922 atomic_add_long(&numcache, -1); 1923 counter_u64_add(numdrops, 1); 1924 return; 1925 } 1926 1927 cache_celockstate_init(&cel); 1928 ndd = NULL; 1929 ncp_ts = NULL; 1930 1931 /* 1932 * Calculate the hash key and setup as much of the new 1933 * namecache entry as possible before acquiring the lock. 1934 */ 1935 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1936 ncp->nc_flag = flag | NCF_WIP; 1937 ncp->nc_vp = vp; 1938 if (vp == NULL) 1939 cache_negative_init(ncp); 1940 ncp->nc_dvp = dvp; 1941 if (tsp != NULL) { 1942 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1943 ncp_ts->nc_time = *tsp; 1944 ncp_ts->nc_ticks = ticks; 1945 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1946 if (dtsp != NULL) { 1947 ncp_ts->nc_dotdottime = *dtsp; 1948 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1949 } 1950 } 1951 len = ncp->nc_nlen = cnp->cn_namelen; 1952 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1953 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 1954 ncp->nc_name[len] = '\0'; 1955 cache_enter_lock(&cel, dvp, vp, hash); 1956 1957 /* 1958 * See if this vnode or negative entry is already in the cache 1959 * with this name. This can happen with concurrent lookups of 1960 * the same path name. 1961 */ 1962 ncpp = NCHHASH(hash); 1963 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 1964 if (n2->nc_dvp == dvp && 1965 n2->nc_nlen == cnp->cn_namelen && 1966 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1967 MPASS(cache_ncp_canuse(n2)); 1968 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 1969 KASSERT(vp == NULL, 1970 ("%s: found entry pointing to a different vnode (%p != %p)", 1971 __func__, NULL, vp)); 1972 else 1973 KASSERT(n2->nc_vp == vp, 1974 ("%s: found entry pointing to a different vnode (%p != %p)", 1975 __func__, n2->nc_vp, vp)); 1976 if (tsp != NULL) { 1977 KASSERT((n2->nc_flag & NCF_TS) != 0, 1978 ("no NCF_TS")); 1979 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1980 n2_ts->nc_time = ncp_ts->nc_time; 1981 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1982 if (dtsp != NULL) { 1983 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1984 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1985 } 1986 } 1987 goto out_unlock_free; 1988 } 1989 } 1990 1991 if (flag == NCF_ISDOTDOT) { 1992 /* 1993 * See if we are trying to add .. entry, but some other lookup 1994 * has populated v_cache_dd pointer already. 1995 */ 1996 if (dvp->v_cache_dd != NULL) 1997 goto out_unlock_free; 1998 KASSERT(vp == NULL || vp->v_type == VDIR, 1999 ("wrong vnode type %p", vp)); 2000 vn_seqc_write_begin(dvp); 2001 dvp->v_cache_dd = ncp; 2002 vn_seqc_write_end(dvp); 2003 } 2004 2005 if (vp != NULL) { 2006 if (vp->v_type == VDIR) { 2007 if (flag != NCF_ISDOTDOT) { 2008 /* 2009 * For this case, the cache entry maps both the 2010 * directory name in it and the name ".." for the 2011 * directory's parent. 2012 */ 2013 vn_seqc_write_begin(vp); 2014 if ((ndd = vp->v_cache_dd) != NULL) { 2015 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2016 cache_zap_locked(ndd); 2017 else 2018 ndd = NULL; 2019 } 2020 vp->v_cache_dd = ncp; 2021 vn_seqc_write_end(vp); 2022 } 2023 } else { 2024 if (vp->v_cache_dd != NULL) { 2025 vn_seqc_write_begin(vp); 2026 vp->v_cache_dd = NULL; 2027 vn_seqc_write_end(vp); 2028 } 2029 } 2030 } 2031 2032 if (flag != NCF_ISDOTDOT) { 2033 if (LIST_EMPTY(&dvp->v_cache_src)) { 2034 vhold(dvp); 2035 counter_u64_add(numcachehv, 1); 2036 } 2037 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2038 } 2039 2040 /* 2041 * If the entry is "negative", we place it into the 2042 * "negative" cache queue, otherwise, we place it into the 2043 * destination vnode's cache entries queue. 2044 */ 2045 if (vp != NULL) { 2046 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2047 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2048 vp); 2049 } else { 2050 if (cnp->cn_flags & ISWHITEOUT) 2051 ncp->nc_flag |= NCF_WHITE; 2052 cache_negative_insert(ncp); 2053 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2054 ncp->nc_name); 2055 } 2056 2057 /* 2058 * Insert the new namecache entry into the appropriate chain 2059 * within the cache entries table. 2060 */ 2061 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2062 2063 atomic_thread_fence_rel(); 2064 /* 2065 * Mark the entry as fully constructed. 2066 * It is immutable past this point until its removal. 2067 */ 2068 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2069 2070 cache_enter_unlock(&cel); 2071 if (numneg * ncnegfactor > lnumcache) 2072 cache_negative_zap_one(); 2073 cache_free(ndd); 2074 return; 2075 out_unlock_free: 2076 cache_enter_unlock(&cel); 2077 atomic_add_long(&numcache, -1); 2078 cache_free(ncp); 2079 return; 2080 } 2081 2082 static u_int 2083 cache_roundup_2(u_int val) 2084 { 2085 u_int res; 2086 2087 for (res = 1; res <= val; res <<= 1) 2088 continue; 2089 2090 return (res); 2091 } 2092 2093 static struct nchashhead * 2094 nchinittbl(u_long elements, u_long *hashmask) 2095 { 2096 struct nchashhead *hashtbl; 2097 u_long hashsize, i; 2098 2099 hashsize = cache_roundup_2(elements) / 2; 2100 2101 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2102 for (i = 0; i < hashsize; i++) 2103 CK_SLIST_INIT(&hashtbl[i]); 2104 *hashmask = hashsize - 1; 2105 return (hashtbl); 2106 } 2107 2108 static void 2109 ncfreetbl(struct nchashhead *hashtbl) 2110 { 2111 2112 free(hashtbl, M_VFSCACHE); 2113 } 2114 2115 /* 2116 * Name cache initialization, from vfs_init() when we are booting 2117 */ 2118 static void 2119 nchinit(void *dummy __unused) 2120 { 2121 u_int i; 2122 2123 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2124 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2125 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2126 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2127 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2128 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2129 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2130 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2131 2132 VFS_SMR_ZONE_SET(cache_zone_small); 2133 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2134 VFS_SMR_ZONE_SET(cache_zone_large); 2135 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2136 2137 ncsize = desiredvnodes * ncsizefactor; 2138 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2139 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2140 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2141 ncbuckethash = 7; 2142 if (ncbuckethash > nchash) 2143 ncbuckethash = nchash; 2144 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2145 M_WAITOK | M_ZERO); 2146 for (i = 0; i < numbucketlocks; i++) 2147 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 2148 ncvnodehash = ncbuckethash; 2149 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2150 M_WAITOK | M_ZERO); 2151 for (i = 0; i < numvnodelocks; i++) 2152 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2153 ncpurgeminvnodes = numbucketlocks * 2; 2154 2155 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 2156 M_WAITOK | M_ZERO); 2157 for (i = 0; i < numneglists; i++) { 2158 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2159 TAILQ_INIT(&neglists[i].nl_list); 2160 } 2161 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 2162 TAILQ_INIT(&ncneg_hot.nl_list); 2163 2164 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2165 } 2166 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2167 2168 void 2169 cache_vnode_init(struct vnode *vp) 2170 { 2171 2172 LIST_INIT(&vp->v_cache_src); 2173 TAILQ_INIT(&vp->v_cache_dst); 2174 vp->v_cache_dd = NULL; 2175 cache_prehash(vp); 2176 } 2177 2178 void 2179 cache_changesize(u_long newmaxvnodes) 2180 { 2181 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2182 u_long new_nchash, old_nchash; 2183 struct namecache *ncp; 2184 uint32_t hash; 2185 u_long newncsize; 2186 int i; 2187 2188 newncsize = newmaxvnodes * ncsizefactor; 2189 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2190 if (newmaxvnodes < numbucketlocks) 2191 newmaxvnodes = numbucketlocks; 2192 2193 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2194 /* If same hash table size, nothing to do */ 2195 if (nchash == new_nchash) { 2196 ncfreetbl(new_nchashtbl); 2197 return; 2198 } 2199 /* 2200 * Move everything from the old hash table to the new table. 2201 * None of the namecache entries in the table can be removed 2202 * because to do so, they have to be removed from the hash table. 2203 */ 2204 cache_lock_all_vnodes(); 2205 cache_lock_all_buckets(); 2206 old_nchashtbl = nchashtbl; 2207 old_nchash = nchash; 2208 nchashtbl = new_nchashtbl; 2209 nchash = new_nchash; 2210 for (i = 0; i <= old_nchash; i++) { 2211 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2212 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2213 ncp->nc_dvp); 2214 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2215 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2216 } 2217 } 2218 ncsize = newncsize; 2219 cache_unlock_all_buckets(); 2220 cache_unlock_all_vnodes(); 2221 ncfreetbl(old_nchashtbl); 2222 } 2223 2224 /* 2225 * Invalidate all entries from and to a particular vnode. 2226 */ 2227 static void 2228 cache_purge_impl(struct vnode *vp) 2229 { 2230 TAILQ_HEAD(, namecache) ncps; 2231 struct namecache *ncp, *nnp; 2232 struct mtx *vlp, *vlp2; 2233 2234 TAILQ_INIT(&ncps); 2235 vlp = VP2VNODELOCK(vp); 2236 vlp2 = NULL; 2237 mtx_assert(vlp, MA_OWNED); 2238 retry: 2239 while (!LIST_EMPTY(&vp->v_cache_src)) { 2240 ncp = LIST_FIRST(&vp->v_cache_src); 2241 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2242 goto retry; 2243 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2244 } 2245 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2246 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2247 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2248 goto retry; 2249 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2250 } 2251 ncp = vp->v_cache_dd; 2252 if (ncp != NULL) { 2253 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2254 ("lost dotdot link")); 2255 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2256 goto retry; 2257 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2258 } 2259 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2260 mtx_unlock(vlp); 2261 if (vlp2 != NULL) 2262 mtx_unlock(vlp2); 2263 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2264 cache_free(ncp); 2265 } 2266 } 2267 2268 void 2269 cache_purge(struct vnode *vp) 2270 { 2271 struct mtx *vlp; 2272 2273 SDT_PROBE1(vfs, namecache, purge, done, vp); 2274 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2275 vp->v_cache_dd == NULL) 2276 return; 2277 vlp = VP2VNODELOCK(vp); 2278 mtx_lock(vlp); 2279 cache_purge_impl(vp); 2280 } 2281 2282 /* 2283 * Only to be used by vgone. 2284 */ 2285 void 2286 cache_purge_vgone(struct vnode *vp) 2287 { 2288 struct mtx *vlp; 2289 2290 VNPASS(VN_IS_DOOMED(vp), vp); 2291 vlp = VP2VNODELOCK(vp); 2292 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2293 vp->v_cache_dd == NULL)) { 2294 mtx_lock(vlp); 2295 cache_purge_impl(vp); 2296 mtx_assert(vlp, MA_NOTOWNED); 2297 return; 2298 } 2299 2300 /* 2301 * All the NULL pointer state we found above may be transient. 2302 * Serialize against a possible thread doing cache_purge. 2303 */ 2304 mtx_wait_unlocked(vlp); 2305 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2306 vp->v_cache_dd == NULL)) { 2307 mtx_lock(vlp); 2308 cache_purge_impl(vp); 2309 mtx_assert(vlp, MA_NOTOWNED); 2310 return; 2311 } 2312 return; 2313 } 2314 2315 /* 2316 * Invalidate all negative entries for a particular directory vnode. 2317 */ 2318 void 2319 cache_purge_negative(struct vnode *vp) 2320 { 2321 TAILQ_HEAD(, namecache) ncps; 2322 struct namecache *ncp, *nnp; 2323 struct mtx *vlp; 2324 2325 CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); 2326 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2327 if (LIST_EMPTY(&vp->v_cache_src)) 2328 return; 2329 TAILQ_INIT(&ncps); 2330 vlp = VP2VNODELOCK(vp); 2331 mtx_lock(vlp); 2332 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2333 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2334 continue; 2335 cache_zap_negative_locked_vnode_kl(ncp, vp); 2336 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2337 } 2338 mtx_unlock(vlp); 2339 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2340 cache_free(ncp); 2341 } 2342 } 2343 2344 void 2345 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2346 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2347 { 2348 2349 ASSERT_VOP_IN_SEQC(fdvp); 2350 ASSERT_VOP_IN_SEQC(fvp); 2351 ASSERT_VOP_IN_SEQC(tdvp); 2352 if (tvp != NULL) 2353 ASSERT_VOP_IN_SEQC(tvp); 2354 2355 cache_purge(fvp); 2356 if (tvp != NULL) { 2357 cache_purge(tvp); 2358 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2359 ("%s: lingering negative entry", __func__)); 2360 } else { 2361 cache_remove_cnp(tdvp, tcnp); 2362 } 2363 } 2364 2365 /* 2366 * Flush all entries referencing a particular filesystem. 2367 */ 2368 void 2369 cache_purgevfs(struct mount *mp, bool force) 2370 { 2371 TAILQ_HEAD(, namecache) ncps; 2372 struct mtx *vlp1, *vlp2; 2373 struct rwlock *blp; 2374 struct nchashhead *bucket; 2375 struct namecache *ncp, *nnp; 2376 u_long i, j, n_nchash; 2377 int error; 2378 2379 /* Scan hash tables for applicable entries */ 2380 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2381 if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) 2382 return; 2383 TAILQ_INIT(&ncps); 2384 n_nchash = nchash + 1; 2385 vlp1 = vlp2 = NULL; 2386 for (i = 0; i < numbucketlocks; i++) { 2387 blp = (struct rwlock *)&bucketlocks[i]; 2388 rw_wlock(blp); 2389 for (j = i; j < n_nchash; j += numbucketlocks) { 2390 retry: 2391 bucket = &nchashtbl[j]; 2392 CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { 2393 cache_assert_bucket_locked(ncp, RA_WLOCKED); 2394 if (ncp->nc_dvp->v_mount != mp) 2395 continue; 2396 error = cache_zap_wlocked_bucket_kl(ncp, blp, 2397 &vlp1, &vlp2); 2398 if (error != 0) 2399 goto retry; 2400 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); 2401 } 2402 } 2403 rw_wunlock(blp); 2404 if (vlp1 == NULL && vlp2 == NULL) 2405 cache_maybe_yield(); 2406 } 2407 if (vlp1 != NULL) 2408 mtx_unlock(vlp1); 2409 if (vlp2 != NULL) 2410 mtx_unlock(vlp2); 2411 2412 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2413 cache_free(ncp); 2414 } 2415 } 2416 2417 /* 2418 * Perform canonical checks and cache lookup and pass on to filesystem 2419 * through the vop_cachedlookup only if needed. 2420 */ 2421 2422 int 2423 vfs_cache_lookup(struct vop_lookup_args *ap) 2424 { 2425 struct vnode *dvp; 2426 int error; 2427 struct vnode **vpp = ap->a_vpp; 2428 struct componentname *cnp = ap->a_cnp; 2429 int flags = cnp->cn_flags; 2430 2431 *vpp = NULL; 2432 dvp = ap->a_dvp; 2433 2434 if (dvp->v_type != VDIR) 2435 return (ENOTDIR); 2436 2437 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2438 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2439 return (EROFS); 2440 2441 error = vn_dir_check_exec(dvp, cnp); 2442 if (error != 0) 2443 return (error); 2444 2445 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2446 if (error == 0) 2447 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2448 if (error == -1) 2449 return (0); 2450 return (error); 2451 } 2452 2453 /* Implementation of the getcwd syscall. */ 2454 int 2455 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2456 { 2457 char *buf, *retbuf; 2458 size_t buflen; 2459 int error; 2460 2461 buflen = uap->buflen; 2462 if (__predict_false(buflen < 2)) 2463 return (EINVAL); 2464 if (buflen > MAXPATHLEN) 2465 buflen = MAXPATHLEN; 2466 2467 buf = uma_zalloc(namei_zone, M_WAITOK); 2468 error = vn_getcwd(td, buf, &retbuf, &buflen); 2469 if (error == 0) 2470 error = copyout(retbuf, uap->buf, buflen); 2471 uma_zfree(namei_zone, buf); 2472 return (error); 2473 } 2474 2475 int 2476 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen) 2477 { 2478 struct pwd *pwd; 2479 int error; 2480 2481 pwd = pwd_hold(td); 2482 error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen); 2483 pwd_drop(pwd); 2484 2485 #ifdef KTRACE 2486 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2487 ktrnamei(*retbuf); 2488 #endif 2489 return (error); 2490 } 2491 2492 static int 2493 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2494 size_t size, int flags, enum uio_seg pathseg) 2495 { 2496 struct nameidata nd; 2497 char *retbuf, *freebuf; 2498 int error; 2499 2500 if (flags != 0) 2501 return (EINVAL); 2502 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2503 pathseg, path, fd, &cap_fstat_rights, td); 2504 if ((error = namei(&nd)) != 0) 2505 return (error); 2506 error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size); 2507 if (error == 0) { 2508 error = copyout(retbuf, buf, size); 2509 free(freebuf, M_TEMP); 2510 } 2511 NDFREE(&nd, 0); 2512 return (error); 2513 } 2514 2515 int 2516 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2517 { 2518 2519 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2520 uap->flags, UIO_USERSPACE)); 2521 } 2522 2523 /* 2524 * Retrieve the full filesystem path that correspond to a vnode from the name 2525 * cache (if available) 2526 */ 2527 int 2528 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) 2529 { 2530 struct pwd *pwd; 2531 char *buf; 2532 size_t buflen; 2533 int error; 2534 2535 if (__predict_false(vn == NULL)) 2536 return (EINVAL); 2537 2538 buflen = MAXPATHLEN; 2539 buf = malloc(buflen, M_TEMP, M_WAITOK); 2540 pwd = pwd_hold(td); 2541 error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen); 2542 pwd_drop(pwd); 2543 2544 if (!error) 2545 *freebuf = buf; 2546 else 2547 free(buf, M_TEMP); 2548 return (error); 2549 } 2550 2551 /* 2552 * This function is similar to vn_fullpath, but it attempts to lookup the 2553 * pathname relative to the global root mount point. This is required for the 2554 * auditing sub-system, as audited pathnames must be absolute, relative to the 2555 * global root mount point. 2556 */ 2557 int 2558 vn_fullpath_global(struct thread *td, struct vnode *vn, 2559 char **retbuf, char **freebuf) 2560 { 2561 char *buf; 2562 size_t buflen; 2563 int error; 2564 2565 if (__predict_false(vn == NULL)) 2566 return (EINVAL); 2567 buflen = MAXPATHLEN; 2568 buf = malloc(buflen, M_TEMP, M_WAITOK); 2569 error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen); 2570 if (!error) 2571 *freebuf = buf; 2572 else 2573 free(buf, M_TEMP); 2574 return (error); 2575 } 2576 2577 int 2578 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2579 { 2580 struct vnode *dvp; 2581 struct namecache *ncp; 2582 struct mtx *vlp; 2583 int error; 2584 2585 vlp = VP2VNODELOCK(*vp); 2586 mtx_lock(vlp); 2587 TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { 2588 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2589 break; 2590 } 2591 if (ncp != NULL) { 2592 if (*buflen < ncp->nc_nlen) { 2593 mtx_unlock(vlp); 2594 vrele(*vp); 2595 counter_u64_add(numfullpathfail4, 1); 2596 error = ENOMEM; 2597 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2598 vp, NULL); 2599 return (error); 2600 } 2601 *buflen -= ncp->nc_nlen; 2602 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2603 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2604 ncp->nc_name, vp); 2605 dvp = *vp; 2606 *vp = ncp->nc_dvp; 2607 vref(*vp); 2608 mtx_unlock(vlp); 2609 vrele(dvp); 2610 return (0); 2611 } 2612 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2613 2614 mtx_unlock(vlp); 2615 vn_lock(*vp, LK_SHARED | LK_RETRY); 2616 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2617 vput(*vp); 2618 if (error) { 2619 counter_u64_add(numfullpathfail2, 1); 2620 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2621 return (error); 2622 } 2623 2624 *vp = dvp; 2625 if (VN_IS_DOOMED(dvp)) { 2626 /* forced unmount */ 2627 vrele(dvp); 2628 error = ENOENT; 2629 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2630 return (error); 2631 } 2632 /* 2633 * *vp has its use count incremented still. 2634 */ 2635 2636 return (0); 2637 } 2638 2639 /* 2640 * Resolve a directory to a pathname. 2641 * 2642 * The name of the directory can always be found in the namecache or fetched 2643 * from the filesystem. There is also guaranteed to be only one parent, meaning 2644 * we can just follow vnodes up until we find the root. 2645 * 2646 * The vnode must be referenced. 2647 */ 2648 static int 2649 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 2650 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend) 2651 { 2652 #ifdef KDTRACE_HOOKS 2653 struct vnode *startvp = vp; 2654 #endif 2655 struct vnode *vp1; 2656 size_t buflen; 2657 int error; 2658 2659 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2660 VNPASS(vp->v_usecount > 0, vp); 2661 2662 buflen = *len; 2663 2664 if (!slash_prefixed) { 2665 MPASS(*len >= 2); 2666 buflen--; 2667 buf[buflen] = '\0'; 2668 } 2669 2670 error = 0; 2671 2672 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2673 counter_u64_add(numfullpathcalls, 1); 2674 while (vp != rdir && vp != rootvnode) { 2675 /* 2676 * The vp vnode must be already fully constructed, 2677 * since it is either found in namecache or obtained 2678 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2679 * without obtaining the vnode lock. 2680 */ 2681 if ((vp->v_vflag & VV_ROOT) != 0) { 2682 vn_lock(vp, LK_RETRY | LK_SHARED); 2683 2684 /* 2685 * With the vnode locked, check for races with 2686 * unmount, forced or not. Note that we 2687 * already verified that vp is not equal to 2688 * the root vnode, which means that 2689 * mnt_vnodecovered can be NULL only for the 2690 * case of unmount. 2691 */ 2692 if (VN_IS_DOOMED(vp) || 2693 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2694 vp1->v_mountedhere != vp->v_mount) { 2695 vput(vp); 2696 error = ENOENT; 2697 SDT_PROBE3(vfs, namecache, fullpath, return, 2698 error, vp, NULL); 2699 break; 2700 } 2701 2702 vref(vp1); 2703 vput(vp); 2704 vp = vp1; 2705 continue; 2706 } 2707 if (vp->v_type != VDIR) { 2708 vrele(vp); 2709 counter_u64_add(numfullpathfail1, 1); 2710 error = ENOTDIR; 2711 SDT_PROBE3(vfs, namecache, fullpath, return, 2712 error, vp, NULL); 2713 break; 2714 } 2715 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); 2716 if (error) 2717 break; 2718 if (buflen == 0) { 2719 vrele(vp); 2720 error = ENOMEM; 2721 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2722 startvp, NULL); 2723 break; 2724 } 2725 buf[--buflen] = '/'; 2726 slash_prefixed = true; 2727 } 2728 if (error) 2729 return (error); 2730 if (!slash_prefixed) { 2731 if (buflen == 0) { 2732 vrele(vp); 2733 counter_u64_add(numfullpathfail4, 1); 2734 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2735 startvp, NULL); 2736 return (ENOMEM); 2737 } 2738 buf[--buflen] = '/'; 2739 } 2740 counter_u64_add(numfullpathfound, 1); 2741 vrele(vp); 2742 2743 *retbuf = buf + buflen; 2744 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2745 *len -= buflen; 2746 *len += addend; 2747 return (0); 2748 } 2749 2750 /* 2751 * Resolve an arbitrary vnode to a pathname. 2752 * 2753 * Note 2 caveats: 2754 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2755 * resolve to a different path than the one used to find it 2756 * - namecache is not mandatory, meaning names are not guaranteed to be added 2757 * (in which case resolving fails) 2758 */ 2759 static int 2760 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 2761 char *buf, char **retbuf, size_t *buflen) 2762 { 2763 size_t orig_buflen; 2764 bool slash_prefixed; 2765 int error; 2766 2767 if (*buflen < 2) 2768 return (EINVAL); 2769 2770 orig_buflen = *buflen; 2771 2772 vref(vp); 2773 slash_prefixed = false; 2774 if (vp->v_type != VDIR) { 2775 *buflen -= 1; 2776 buf[*buflen] = '\0'; 2777 error = vn_vptocnp(&vp, td->td_ucred, buf, buflen); 2778 if (error) 2779 return (error); 2780 if (*buflen == 0) { 2781 vrele(vp); 2782 return (ENOMEM); 2783 } 2784 *buflen -= 1; 2785 buf[*buflen] = '/'; 2786 slash_prefixed = true; 2787 } 2788 2789 return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed, 2790 orig_buflen - *buflen)); 2791 } 2792 2793 /* 2794 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2795 * 2796 * Since the namecache does not track handlings, the caller is expected to first 2797 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 2798 * 2799 * Then we have 2 cases: 2800 * - if the found vnode is a directory, the path can be constructed just by 2801 * fullowing names up the chain 2802 * - otherwise we populate the buffer with the saved name and start resolving 2803 * from the parent 2804 */ 2805 static int 2806 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 2807 char **freebuf, size_t *buflen) 2808 { 2809 char *buf, *tmpbuf; 2810 struct pwd *pwd; 2811 struct componentname *cnp; 2812 struct vnode *vp; 2813 size_t addend; 2814 int error; 2815 bool slash_prefixed; 2816 enum vtype type; 2817 2818 if (*buflen < 2) 2819 return (EINVAL); 2820 if (*buflen > MAXPATHLEN) 2821 *buflen = MAXPATHLEN; 2822 2823 slash_prefixed = false; 2824 2825 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2826 pwd = pwd_hold(td); 2827 2828 addend = 0; 2829 vp = ndp->ni_vp; 2830 /* 2831 * Check for VBAD to work around the vp_crossmp bug in lookup(). 2832 * 2833 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 2834 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 2835 * If the type is VDIR (like in this very case) we can skip looking 2836 * at ni_dvp in the first place. However, since vnodes get passed here 2837 * unlocked the target may transition to doomed state (type == VBAD) 2838 * before we get to evaluate the condition. If this happens, we will 2839 * populate part of the buffer and descend to vn_fullpath_dir with 2840 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 2841 * 2842 * This should be atomic_load(&vp->v_type) but it is ilegal to take 2843 * an address of a bit field, even if said field is sized to char. 2844 * Work around the problem by reading the value into a full-sized enum 2845 * and then re-reading it with atomic_load which will still prevent 2846 * the compiler from re-reading down the road. 2847 */ 2848 type = vp->v_type; 2849 type = atomic_load_int(&type); 2850 if (type == VBAD) { 2851 error = ENOENT; 2852 goto out_bad; 2853 } 2854 if (type != VDIR) { 2855 cnp = &ndp->ni_cnd; 2856 addend = cnp->cn_namelen + 2; 2857 if (*buflen < addend) { 2858 error = ENOMEM; 2859 goto out_bad; 2860 } 2861 *buflen -= addend; 2862 tmpbuf = buf + *buflen; 2863 tmpbuf[0] = '/'; 2864 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 2865 tmpbuf[addend - 1] = '\0'; 2866 slash_prefixed = true; 2867 vp = ndp->ni_dvp; 2868 } 2869 2870 vref(vp); 2871 error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen, 2872 slash_prefixed, addend); 2873 if (error != 0) 2874 goto out_bad; 2875 2876 pwd_drop(pwd); 2877 *freebuf = buf; 2878 2879 return (0); 2880 out_bad: 2881 pwd_drop(pwd); 2882 free(buf, M_TEMP); 2883 return (error); 2884 } 2885 2886 struct vnode * 2887 vn_dir_dd_ino(struct vnode *vp) 2888 { 2889 struct namecache *ncp; 2890 struct vnode *ddvp; 2891 struct mtx *vlp; 2892 enum vgetstate vs; 2893 2894 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 2895 vlp = VP2VNODELOCK(vp); 2896 mtx_lock(vlp); 2897 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 2898 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 2899 continue; 2900 ddvp = ncp->nc_dvp; 2901 vs = vget_prep(ddvp); 2902 mtx_unlock(vlp); 2903 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 2904 return (NULL); 2905 return (ddvp); 2906 } 2907 mtx_unlock(vlp); 2908 return (NULL); 2909 } 2910 2911 int 2912 vn_commname(struct vnode *vp, char *buf, u_int buflen) 2913 { 2914 struct namecache *ncp; 2915 struct mtx *vlp; 2916 int l; 2917 2918 vlp = VP2VNODELOCK(vp); 2919 mtx_lock(vlp); 2920 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 2921 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2922 break; 2923 if (ncp == NULL) { 2924 mtx_unlock(vlp); 2925 return (ENOENT); 2926 } 2927 l = min(ncp->nc_nlen, buflen - 1); 2928 memcpy(buf, ncp->nc_name, l); 2929 mtx_unlock(vlp); 2930 buf[l] = '\0'; 2931 return (0); 2932 } 2933 2934 /* 2935 * This function updates path string to vnode's full global path 2936 * and checks the size of the new path string against the pathlen argument. 2937 * 2938 * Requires a locked, referenced vnode. 2939 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 2940 * 2941 * If vp is a directory, the call to vn_fullpath_global() always succeeds 2942 * because it falls back to the ".." lookup if the namecache lookup fails. 2943 */ 2944 int 2945 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 2946 u_int pathlen) 2947 { 2948 struct nameidata nd; 2949 struct vnode *vp1; 2950 char *rpath, *fbuf; 2951 int error; 2952 2953 ASSERT_VOP_ELOCKED(vp, __func__); 2954 2955 /* Construct global filesystem path from vp. */ 2956 VOP_UNLOCK(vp); 2957 error = vn_fullpath_global(td, vp, &rpath, &fbuf); 2958 2959 if (error != 0) { 2960 vrele(vp); 2961 return (error); 2962 } 2963 2964 if (strlen(rpath) >= pathlen) { 2965 vrele(vp); 2966 error = ENAMETOOLONG; 2967 goto out; 2968 } 2969 2970 /* 2971 * Re-lookup the vnode by path to detect a possible rename. 2972 * As a side effect, the vnode is relocked. 2973 * If vnode was renamed, return ENOENT. 2974 */ 2975 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 2976 UIO_SYSSPACE, path, td); 2977 error = namei(&nd); 2978 if (error != 0) { 2979 vrele(vp); 2980 goto out; 2981 } 2982 NDFREE(&nd, NDF_ONLY_PNBUF); 2983 vp1 = nd.ni_vp; 2984 vrele(vp); 2985 if (vp1 == vp) 2986 strcpy(path, rpath); 2987 else { 2988 vput(vp1); 2989 error = ENOENT; 2990 } 2991 2992 out: 2993 free(fbuf, M_TEMP); 2994 return (error); 2995 } 2996 2997 #ifdef DDB 2998 static void 2999 db_print_vpath(struct vnode *vp) 3000 { 3001 3002 while (vp != NULL) { 3003 db_printf("%p: ", vp); 3004 if (vp == rootvnode) { 3005 db_printf("/"); 3006 vp = NULL; 3007 } else { 3008 if (vp->v_vflag & VV_ROOT) { 3009 db_printf("<mount point>"); 3010 vp = vp->v_mount->mnt_vnodecovered; 3011 } else { 3012 struct namecache *ncp; 3013 char *ncn; 3014 int i; 3015 3016 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3017 if (ncp != NULL) { 3018 ncn = ncp->nc_name; 3019 for (i = 0; i < ncp->nc_nlen; i++) 3020 db_printf("%c", *ncn++); 3021 vp = ncp->nc_dvp; 3022 } else { 3023 vp = NULL; 3024 } 3025 } 3026 } 3027 db_printf("\n"); 3028 } 3029 3030 return; 3031 } 3032 3033 DB_SHOW_COMMAND(vpath, db_show_vpath) 3034 { 3035 struct vnode *vp; 3036 3037 if (!have_addr) { 3038 db_printf("usage: show vpath <struct vnode *>\n"); 3039 return; 3040 } 3041 3042 vp = (struct vnode *)addr; 3043 db_print_vpath(vp); 3044 } 3045 3046 #endif 3047 3048 static bool __read_frequently cache_fast_lookup = true; 3049 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3050 &cache_fast_lookup, 0, ""); 3051 3052 #define CACHE_FPL_FAILED -2020 3053 3054 static void 3055 cache_fpl_cleanup_cnp(struct componentname *cnp) 3056 { 3057 3058 uma_zfree(namei_zone, cnp->cn_pnbuf); 3059 #ifdef DIAGNOSTIC 3060 cnp->cn_pnbuf = NULL; 3061 cnp->cn_nameptr = NULL; 3062 #endif 3063 } 3064 3065 static void 3066 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3067 { 3068 struct componentname *cnp; 3069 3070 cnp = &ndp->ni_cnd; 3071 while (*(cnp->cn_nameptr) == '/') { 3072 cnp->cn_nameptr++; 3073 ndp->ni_pathlen--; 3074 } 3075 3076 *dpp = ndp->ni_rootdir; 3077 } 3078 3079 /* 3080 * Components of nameidata (or objects it can point to) which may 3081 * need restoring in case fast path lookup fails. 3082 */ 3083 struct nameidata_saved { 3084 long cn_namelen; 3085 char *cn_nameptr; 3086 size_t ni_pathlen; 3087 int cn_flags; 3088 }; 3089 3090 struct cache_fpl { 3091 struct nameidata *ndp; 3092 struct componentname *cnp; 3093 struct pwd *pwd; 3094 struct vnode *dvp; 3095 struct vnode *tvp; 3096 seqc_t dvp_seqc; 3097 seqc_t tvp_seqc; 3098 struct nameidata_saved snd; 3099 int line; 3100 enum cache_fpl_status status:8; 3101 bool in_smr; 3102 }; 3103 3104 static void 3105 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3106 { 3107 3108 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3109 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3110 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3111 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3112 } 3113 3114 static void 3115 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3116 { 3117 3118 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3119 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3120 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3121 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3122 } 3123 3124 #ifdef INVARIANTS 3125 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3126 struct cache_fpl *_fpl = (fpl); \ 3127 MPASS(_fpl->in_smr == true); \ 3128 VFS_SMR_ASSERT_ENTERED(); \ 3129 }) 3130 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3131 struct cache_fpl *_fpl = (fpl); \ 3132 MPASS(_fpl->in_smr == false); \ 3133 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3134 }) 3135 #else 3136 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3137 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3138 #endif 3139 3140 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3141 struct cache_fpl *_fpl = (fpl); \ 3142 vfs_smr_enter(); \ 3143 _fpl->in_smr = true; \ 3144 }) 3145 3146 #define cache_fpl_smr_enter(fpl) ({ \ 3147 struct cache_fpl *_fpl = (fpl); \ 3148 MPASS(_fpl->in_smr == false); \ 3149 vfs_smr_enter(); \ 3150 _fpl->in_smr = true; \ 3151 }) 3152 3153 #define cache_fpl_smr_exit(fpl) ({ \ 3154 struct cache_fpl *_fpl = (fpl); \ 3155 MPASS(_fpl->in_smr == true); \ 3156 vfs_smr_exit(); \ 3157 _fpl->in_smr = false; \ 3158 }) 3159 3160 static int 3161 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3162 { 3163 3164 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3165 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3166 ("%s: converting to abort from %d at %d, set at %d\n", 3167 __func__, fpl->status, line, fpl->line)); 3168 } 3169 fpl->status = CACHE_FPL_STATUS_ABORTED; 3170 fpl->line = line; 3171 return (CACHE_FPL_FAILED); 3172 } 3173 3174 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3175 3176 static int 3177 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3178 { 3179 3180 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3181 ("%s: setting to partial at %d, but already set to %d at %d\n", 3182 __func__, line, fpl->status, fpl->line)); 3183 cache_fpl_smr_assert_entered(fpl); 3184 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3185 fpl->line = line; 3186 return (CACHE_FPL_FAILED); 3187 } 3188 3189 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3190 3191 static int 3192 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3193 { 3194 3195 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3196 ("%s: setting to handled at %d, but already set to %d at %d\n", 3197 __func__, line, fpl->status, fpl->line)); 3198 cache_fpl_smr_assert_not_entered(fpl); 3199 MPASS(error != CACHE_FPL_FAILED); 3200 fpl->status = CACHE_FPL_STATUS_HANDLED; 3201 fpl->line = line; 3202 return (error); 3203 } 3204 3205 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3206 3207 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3208 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ 3209 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3210 3211 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3212 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3213 3214 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3215 "supported and internal flags overlap"); 3216 3217 static bool 3218 cache_fpl_islastcn(struct nameidata *ndp) 3219 { 3220 3221 return (*ndp->ni_next == 0); 3222 } 3223 3224 static bool 3225 cache_fpl_isdotdot(struct componentname *cnp) 3226 { 3227 3228 if (cnp->cn_namelen == 2 && 3229 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3230 return (true); 3231 return (false); 3232 } 3233 3234 static bool 3235 cache_can_fplookup(struct cache_fpl *fpl) 3236 { 3237 struct nameidata *ndp; 3238 struct componentname *cnp; 3239 struct thread *td; 3240 3241 ndp = fpl->ndp; 3242 cnp = fpl->cnp; 3243 td = cnp->cn_thread; 3244 3245 if (!cache_fast_lookup) { 3246 cache_fpl_aborted(fpl); 3247 return (false); 3248 } 3249 #ifdef MAC 3250 if (mac_vnode_check_lookup_enabled()) { 3251 cache_fpl_aborted(fpl); 3252 return (false); 3253 } 3254 #endif 3255 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3256 cache_fpl_aborted(fpl); 3257 return (false); 3258 } 3259 if (ndp->ni_dirfd != AT_FDCWD) { 3260 cache_fpl_aborted(fpl); 3261 return (false); 3262 } 3263 if (IN_CAPABILITY_MODE(td)) { 3264 cache_fpl_aborted(fpl); 3265 return (false); 3266 } 3267 if (AUDITING_TD(td)) { 3268 cache_fpl_aborted(fpl); 3269 return (false); 3270 } 3271 if (ndp->ni_startdir != NULL) { 3272 cache_fpl_aborted(fpl); 3273 return (false); 3274 } 3275 return (true); 3276 } 3277 3278 static bool 3279 cache_fplookup_vnode_supported(struct vnode *vp) 3280 { 3281 3282 return (vp->v_type != VLNK); 3283 } 3284 3285 /* 3286 * Move a negative entry to the hot list. 3287 * 3288 * We have to take locks, but they may be contended and in the worst 3289 * case we may need to go off CPU. We don't want to spin within the 3290 * smr section and we can't block with it. Instead we are going to 3291 * look up the entry again. 3292 */ 3293 static int __noinline 3294 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3295 uint32_t hash) 3296 { 3297 struct componentname *cnp; 3298 struct namecache *ncp; 3299 struct neglist *neglist; 3300 struct negstate *negstate; 3301 struct vnode *dvp; 3302 u_char nc_flag; 3303 3304 cnp = fpl->cnp; 3305 dvp = fpl->dvp; 3306 3307 if (!vhold_smr(dvp)) 3308 return (cache_fpl_aborted(fpl)); 3309 3310 neglist = NCP2NEGLIST(oncp); 3311 cache_fpl_smr_exit(fpl); 3312 3313 mtx_lock(&ncneg_hot.nl_lock); 3314 mtx_lock(&neglist->nl_lock); 3315 /* 3316 * For hash iteration. 3317 */ 3318 cache_fpl_smr_enter(fpl); 3319 3320 /* 3321 * Avoid all surprises by only succeeding if we got the same entry and 3322 * bailing completely otherwise. 3323 * 3324 * In particular at this point there can be a new ncp which matches the 3325 * search but hashes to a different neglist. 3326 */ 3327 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3328 if (ncp == oncp) 3329 break; 3330 } 3331 3332 /* 3333 * No match to begin with. 3334 */ 3335 if (__predict_false(ncp == NULL)) { 3336 goto out_abort; 3337 } 3338 3339 /* 3340 * The newly found entry may be something different... 3341 */ 3342 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3343 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 3344 goto out_abort; 3345 } 3346 3347 /* 3348 * ... and not even negative. 3349 */ 3350 nc_flag = atomic_load_char(&ncp->nc_flag); 3351 if ((nc_flag & NCF_NEGATIVE) == 0) { 3352 goto out_abort; 3353 } 3354 3355 if (__predict_false(!cache_ncp_canuse(ncp))) { 3356 goto out_abort; 3357 } 3358 3359 negstate = NCP2NEGSTATE(ncp); 3360 if ((negstate->neg_flag & NEG_HOT) == 0) { 3361 numhotneg++; 3362 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 3363 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 3364 negstate->neg_flag |= NEG_HOT; 3365 } 3366 3367 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 3368 counter_u64_add(numneghits, 1); 3369 cache_fpl_smr_exit(fpl); 3370 mtx_unlock(&neglist->nl_lock); 3371 mtx_unlock(&ncneg_hot.nl_lock); 3372 vdrop(dvp); 3373 return (cache_fpl_handled(fpl, ENOENT)); 3374 out_abort: 3375 cache_fpl_smr_exit(fpl); 3376 mtx_unlock(&neglist->nl_lock); 3377 mtx_unlock(&ncneg_hot.nl_lock); 3378 vdrop(dvp); 3379 return (cache_fpl_aborted(fpl)); 3380 } 3381 3382 /* 3383 * The target vnode is not supported, prepare for the slow path to take over. 3384 */ 3385 static int __noinline 3386 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3387 { 3388 struct nameidata *ndp; 3389 struct componentname *cnp; 3390 enum vgetstate dvs; 3391 struct vnode *dvp; 3392 struct pwd *pwd; 3393 seqc_t dvp_seqc; 3394 3395 ndp = fpl->ndp; 3396 cnp = fpl->cnp; 3397 dvp = fpl->dvp; 3398 dvp_seqc = fpl->dvp_seqc; 3399 3400 dvs = vget_prep_smr(dvp); 3401 if (__predict_false(dvs == VGET_NONE)) { 3402 cache_fpl_smr_exit(fpl); 3403 return (cache_fpl_aborted(fpl)); 3404 } 3405 3406 cache_fpl_smr_exit(fpl); 3407 3408 vget_finish_ref(dvp, dvs); 3409 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3410 vrele(dvp); 3411 return (cache_fpl_aborted(fpl)); 3412 } 3413 3414 pwd = pwd_hold(curthread); 3415 if (fpl->pwd != pwd) { 3416 vrele(dvp); 3417 pwd_drop(pwd); 3418 return (cache_fpl_aborted(fpl)); 3419 } 3420 3421 cache_fpl_restore(fpl, &fpl->snd); 3422 3423 ndp->ni_startdir = dvp; 3424 cnp->cn_flags |= MAKEENTRY; 3425 if (cache_fpl_islastcn(ndp)) 3426 cnp->cn_flags |= ISLASTCN; 3427 if (cache_fpl_isdotdot(cnp)) 3428 cnp->cn_flags |= ISDOTDOT; 3429 3430 return (0); 3431 } 3432 3433 static int 3434 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3435 { 3436 struct componentname *cnp; 3437 struct vnode *tvp; 3438 seqc_t tvp_seqc; 3439 int error, lkflags; 3440 3441 cnp = fpl->cnp; 3442 tvp = fpl->tvp; 3443 tvp_seqc = fpl->tvp_seqc; 3444 3445 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3446 lkflags = LK_SHARED; 3447 if ((cnp->cn_flags & LOCKSHARED) == 0) 3448 lkflags = LK_EXCLUSIVE; 3449 error = vget_finish(tvp, lkflags, tvs); 3450 if (__predict_false(error != 0)) { 3451 return (cache_fpl_aborted(fpl)); 3452 } 3453 } else { 3454 vget_finish_ref(tvp, tvs); 3455 } 3456 3457 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3458 if ((cnp->cn_flags & LOCKLEAF) != 0) 3459 vput(tvp); 3460 else 3461 vrele(tvp); 3462 return (cache_fpl_aborted(fpl)); 3463 } 3464 3465 return (cache_fpl_handled(fpl, 0)); 3466 } 3467 3468 /* 3469 * They want to possibly modify the state of the namecache. 3470 * 3471 * Don't try to match the API contract, just leave. 3472 * TODO: this leaves scalability on the table 3473 */ 3474 static int 3475 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3476 { 3477 struct componentname *cnp; 3478 3479 cnp = fpl->cnp; 3480 MPASS(cnp->cn_nameiop != LOOKUP); 3481 return (cache_fpl_partial(fpl)); 3482 } 3483 3484 static int __noinline 3485 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3486 { 3487 struct componentname *cnp; 3488 enum vgetstate dvs, tvs; 3489 struct vnode *dvp, *tvp; 3490 seqc_t dvp_seqc, tvp_seqc; 3491 int error; 3492 3493 cnp = fpl->cnp; 3494 dvp = fpl->dvp; 3495 dvp_seqc = fpl->dvp_seqc; 3496 tvp = fpl->tvp; 3497 tvp_seqc = fpl->tvp_seqc; 3498 3499 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3500 3501 /* 3502 * This is less efficient than it can be for simplicity. 3503 */ 3504 dvs = vget_prep_smr(dvp); 3505 if (__predict_false(dvs == VGET_NONE)) { 3506 return (cache_fpl_aborted(fpl)); 3507 } 3508 tvs = vget_prep_smr(tvp); 3509 if (__predict_false(tvs == VGET_NONE)) { 3510 cache_fpl_smr_exit(fpl); 3511 vget_abort(dvp, dvs); 3512 return (cache_fpl_aborted(fpl)); 3513 } 3514 3515 cache_fpl_smr_exit(fpl); 3516 3517 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3518 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3519 if (__predict_false(error != 0)) { 3520 vget_abort(tvp, tvs); 3521 return (cache_fpl_aborted(fpl)); 3522 } 3523 } else { 3524 vget_finish_ref(dvp, dvs); 3525 } 3526 3527 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3528 vget_abort(tvp, tvs); 3529 if ((cnp->cn_flags & LOCKPARENT) != 0) 3530 vput(dvp); 3531 else 3532 vrele(dvp); 3533 return (cache_fpl_aborted(fpl)); 3534 } 3535 3536 error = cache_fplookup_final_child(fpl, tvs); 3537 if (__predict_false(error != 0)) { 3538 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3539 if ((cnp->cn_flags & LOCKPARENT) != 0) 3540 vput(dvp); 3541 else 3542 vrele(dvp); 3543 return (error); 3544 } 3545 3546 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3547 return (0); 3548 } 3549 3550 static int 3551 cache_fplookup_final(struct cache_fpl *fpl) 3552 { 3553 struct componentname *cnp; 3554 enum vgetstate tvs; 3555 struct vnode *dvp, *tvp; 3556 seqc_t dvp_seqc, tvp_seqc; 3557 3558 cnp = fpl->cnp; 3559 dvp = fpl->dvp; 3560 dvp_seqc = fpl->dvp_seqc; 3561 tvp = fpl->tvp; 3562 tvp_seqc = fpl->tvp_seqc; 3563 3564 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3565 3566 if (cnp->cn_nameiop != LOOKUP) { 3567 return (cache_fplookup_final_modifying(fpl)); 3568 } 3569 3570 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3571 return (cache_fplookup_final_withparent(fpl)); 3572 3573 tvs = vget_prep_smr(tvp); 3574 if (__predict_false(tvs == VGET_NONE)) { 3575 return (cache_fpl_partial(fpl)); 3576 } 3577 3578 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3579 cache_fpl_smr_exit(fpl); 3580 vget_abort(tvp, tvs); 3581 return (cache_fpl_aborted(fpl)); 3582 } 3583 3584 cache_fpl_smr_exit(fpl); 3585 return (cache_fplookup_final_child(fpl, tvs)); 3586 } 3587 3588 static int __noinline 3589 cache_fplookup_dot(struct cache_fpl *fpl) 3590 { 3591 struct vnode *dvp; 3592 3593 dvp = fpl->dvp; 3594 3595 fpl->tvp = dvp; 3596 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3597 if (seqc_in_modify(fpl->tvp_seqc)) { 3598 return (cache_fpl_aborted(fpl)); 3599 } 3600 3601 counter_u64_add(dothits, 1); 3602 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3603 3604 return (0); 3605 } 3606 3607 static int __noinline 3608 cache_fplookup_dotdot(struct cache_fpl *fpl) 3609 { 3610 struct nameidata *ndp; 3611 struct componentname *cnp; 3612 struct namecache *ncp; 3613 struct vnode *dvp; 3614 struct prison *pr; 3615 u_char nc_flag; 3616 3617 ndp = fpl->ndp; 3618 cnp = fpl->cnp; 3619 dvp = fpl->dvp; 3620 3621 /* 3622 * XXX this is racy the same way regular lookup is 3623 */ 3624 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3625 pr = pr->pr_parent) 3626 if (dvp == pr->pr_root) 3627 break; 3628 3629 if (dvp == ndp->ni_rootdir || 3630 dvp == ndp->ni_topdir || 3631 dvp == rootvnode || 3632 pr != NULL) { 3633 fpl->tvp = dvp; 3634 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3635 if (seqc_in_modify(fpl->tvp_seqc)) { 3636 return (cache_fpl_aborted(fpl)); 3637 } 3638 return (0); 3639 } 3640 3641 if ((dvp->v_vflag & VV_ROOT) != 0) { 3642 /* 3643 * TODO 3644 * The opposite of climb mount is needed here. 3645 */ 3646 return (cache_fpl_aborted(fpl)); 3647 } 3648 3649 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3650 if (ncp == NULL) { 3651 return (cache_fpl_aborted(fpl)); 3652 } 3653 3654 nc_flag = atomic_load_char(&ncp->nc_flag); 3655 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3656 if ((nc_flag & NCF_NEGATIVE) != 0) 3657 return (cache_fpl_aborted(fpl)); 3658 fpl->tvp = ncp->nc_vp; 3659 } else { 3660 fpl->tvp = ncp->nc_dvp; 3661 } 3662 3663 if (__predict_false(!cache_ncp_canuse(ncp))) { 3664 return (cache_fpl_aborted(fpl)); 3665 } 3666 3667 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 3668 if (seqc_in_modify(fpl->tvp_seqc)) { 3669 return (cache_fpl_partial(fpl)); 3670 } 3671 3672 counter_u64_add(dotdothits, 1); 3673 return (0); 3674 } 3675 3676 static int 3677 cache_fplookup_next(struct cache_fpl *fpl) 3678 { 3679 struct componentname *cnp; 3680 struct namecache *ncp; 3681 struct negstate *negstate; 3682 struct vnode *dvp, *tvp; 3683 u_char nc_flag; 3684 uint32_t hash; 3685 bool neg_hot; 3686 3687 cnp = fpl->cnp; 3688 dvp = fpl->dvp; 3689 3690 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 3691 return (cache_fplookup_dot(fpl)); 3692 } 3693 3694 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3695 3696 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3697 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3698 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 3699 break; 3700 } 3701 3702 /* 3703 * If there is no entry we have to punt to the slow path to perform 3704 * actual lookup. Should there be nothing with this name a negative 3705 * entry will be created. 3706 */ 3707 if (__predict_false(ncp == NULL)) { 3708 return (cache_fpl_partial(fpl)); 3709 } 3710 3711 tvp = atomic_load_ptr(&ncp->nc_vp); 3712 nc_flag = atomic_load_char(&ncp->nc_flag); 3713 if ((nc_flag & NCF_NEGATIVE) != 0) { 3714 /* 3715 * If they want to create an entry we need to replace this one. 3716 */ 3717 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 3718 return (cache_fpl_partial(fpl)); 3719 } 3720 negstate = NCP2NEGSTATE(ncp); 3721 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0); 3722 if (__predict_false(!cache_ncp_canuse(ncp))) { 3723 return (cache_fpl_partial(fpl)); 3724 } 3725 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 3726 return (cache_fpl_partial(fpl)); 3727 } 3728 if (!neg_hot) { 3729 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 3730 } 3731 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 3732 ncp->nc_name); 3733 counter_u64_add(numneghits, 1); 3734 cache_fpl_smr_exit(fpl); 3735 return (cache_fpl_handled(fpl, ENOENT)); 3736 } 3737 3738 if (__predict_false(!cache_ncp_canuse(ncp))) { 3739 return (cache_fpl_partial(fpl)); 3740 } 3741 3742 fpl->tvp = tvp; 3743 fpl->tvp_seqc = vn_seqc_read_any(tvp); 3744 if (seqc_in_modify(fpl->tvp_seqc)) { 3745 return (cache_fpl_partial(fpl)); 3746 } 3747 3748 if (!cache_fplookup_vnode_supported(tvp)) { 3749 return (cache_fpl_partial(fpl)); 3750 } 3751 3752 counter_u64_add(numposhits, 1); 3753 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 3754 return (0); 3755 } 3756 3757 static bool 3758 cache_fplookup_mp_supported(struct mount *mp) 3759 { 3760 3761 if (mp == NULL) 3762 return (false); 3763 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 3764 return (false); 3765 return (true); 3766 } 3767 3768 /* 3769 * Walk up the mount stack (if any). 3770 * 3771 * Correctness is provided in the following ways: 3772 * - all vnodes are protected from freeing with SMR 3773 * - struct mount objects are type stable making them always safe to access 3774 * - stability of the particular mount is provided by busying it 3775 * - relationship between the vnode which is mounted on and the mount is 3776 * verified with the vnode sequence counter after busying 3777 * - association between root vnode of the mount and the mount is protected 3778 * by busy 3779 * 3780 * From that point on we can read the sequence counter of the root vnode 3781 * and get the next mount on the stack (if any) using the same protection. 3782 * 3783 * By the end of successful walk we are guaranteed the reached state was 3784 * indeed present at least at some point which matches the regular lookup. 3785 */ 3786 static int __noinline 3787 cache_fplookup_climb_mount(struct cache_fpl *fpl) 3788 { 3789 struct mount *mp, *prev_mp; 3790 struct vnode *vp; 3791 seqc_t vp_seqc; 3792 3793 vp = fpl->tvp; 3794 vp_seqc = fpl->tvp_seqc; 3795 3796 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 3797 mp = atomic_load_ptr(&vp->v_mountedhere); 3798 if (mp == NULL) 3799 return (0); 3800 3801 prev_mp = NULL; 3802 for (;;) { 3803 if (!vfs_op_thread_enter_crit(mp)) { 3804 if (prev_mp != NULL) 3805 vfs_op_thread_exit_crit(prev_mp); 3806 return (cache_fpl_partial(fpl)); 3807 } 3808 if (prev_mp != NULL) 3809 vfs_op_thread_exit_crit(prev_mp); 3810 if (!vn_seqc_consistent(vp, vp_seqc)) { 3811 vfs_op_thread_exit_crit(mp); 3812 return (cache_fpl_partial(fpl)); 3813 } 3814 if (!cache_fplookup_mp_supported(mp)) { 3815 vfs_op_thread_exit_crit(mp); 3816 return (cache_fpl_partial(fpl)); 3817 } 3818 vp = atomic_load_ptr(&mp->mnt_rootvnode); 3819 if (vp == NULL || VN_IS_DOOMED(vp)) { 3820 vfs_op_thread_exit_crit(mp); 3821 return (cache_fpl_partial(fpl)); 3822 } 3823 vp_seqc = vn_seqc_read_any(vp); 3824 if (seqc_in_modify(vp_seqc)) { 3825 vfs_op_thread_exit_crit(mp); 3826 return (cache_fpl_partial(fpl)); 3827 } 3828 prev_mp = mp; 3829 mp = atomic_load_ptr(&vp->v_mountedhere); 3830 if (mp == NULL) 3831 break; 3832 } 3833 3834 vfs_op_thread_exit_crit(prev_mp); 3835 fpl->tvp = vp; 3836 fpl->tvp_seqc = vp_seqc; 3837 return (0); 3838 } 3839 3840 static bool 3841 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 3842 { 3843 struct mount *mp; 3844 struct vnode *vp; 3845 3846 vp = fpl->tvp; 3847 3848 /* 3849 * Hack: while this is a union, the pointer tends to be NULL so save on 3850 * a branch. 3851 */ 3852 mp = atomic_load_ptr(&vp->v_mountedhere); 3853 if (mp == NULL) 3854 return (false); 3855 if (vp->v_type == VDIR) 3856 return (true); 3857 return (false); 3858 } 3859 3860 /* 3861 * Parse the path. 3862 * 3863 * The code is mostly copy-pasted from regular lookup, see lookup(). 3864 * The structure is maintained along with comments for easier maintenance. 3865 * Deduplicating the code will become feasible after fast path lookup 3866 * becomes more feature-complete. 3867 */ 3868 static int 3869 cache_fplookup_parse(struct cache_fpl *fpl) 3870 { 3871 struct nameidata *ndp; 3872 struct componentname *cnp; 3873 char *cp; 3874 3875 ndp = fpl->ndp; 3876 cnp = fpl->cnp; 3877 3878 /* 3879 * Search a new directory. 3880 * 3881 * The last component of the filename is left accessible via 3882 * cnp->cn_nameptr for callers that need the name. Callers needing 3883 * the name set the SAVENAME flag. When done, they assume 3884 * responsibility for freeing the pathname buffer. 3885 */ 3886 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 3887 continue; 3888 cnp->cn_namelen = cp - cnp->cn_nameptr; 3889 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 3890 cache_fpl_smr_exit(fpl); 3891 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 3892 } 3893 ndp->ni_pathlen -= cnp->cn_namelen; 3894 KASSERT(ndp->ni_pathlen <= PATH_MAX, 3895 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 3896 ndp->ni_next = cp; 3897 3898 /* 3899 * Replace multiple slashes by a single slash and trailing slashes 3900 * by a null. This must be done before VOP_LOOKUP() because some 3901 * fs's don't know about trailing slashes. Remember if there were 3902 * trailing slashes to handle symlinks, existing non-directories 3903 * and non-existing files that won't be directories specially later. 3904 */ 3905 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 3906 cp++; 3907 ndp->ni_pathlen--; 3908 if (*cp == '\0') { 3909 /* 3910 * TODO 3911 * Regular lookup performs the following: 3912 * *ndp->ni_next = '\0'; 3913 * cnp->cn_flags |= TRAILINGSLASH; 3914 * 3915 * Which is problematic since it modifies data read 3916 * from userspace. Then if fast path lookup was to 3917 * abort we would have to either restore it or convey 3918 * the flag. Since this is a corner case just ignore 3919 * it for simplicity. 3920 */ 3921 return (cache_fpl_partial(fpl)); 3922 } 3923 } 3924 ndp->ni_next = cp; 3925 3926 /* 3927 * Check for degenerate name (e.g. / or "") 3928 * which is a way of talking about a directory, 3929 * e.g. like "/." or ".". 3930 * 3931 * TODO 3932 * Another corner case handled by the regular lookup 3933 */ 3934 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 3935 return (cache_fpl_partial(fpl)); 3936 } 3937 return (0); 3938 } 3939 3940 static void 3941 cache_fplookup_parse_advance(struct cache_fpl *fpl) 3942 { 3943 struct nameidata *ndp; 3944 struct componentname *cnp; 3945 3946 ndp = fpl->ndp; 3947 cnp = fpl->cnp; 3948 3949 cnp->cn_nameptr = ndp->ni_next; 3950 while (*cnp->cn_nameptr == '/') { 3951 cnp->cn_nameptr++; 3952 ndp->ni_pathlen--; 3953 } 3954 } 3955 3956 static int __noinline 3957 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 3958 { 3959 3960 switch (error) { 3961 case EAGAIN: 3962 /* 3963 * Can happen when racing against vgone. 3964 * */ 3965 case EOPNOTSUPP: 3966 cache_fpl_partial(fpl); 3967 break; 3968 default: 3969 /* 3970 * See the API contract for VOP_FPLOOKUP_VEXEC. 3971 */ 3972 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 3973 error = cache_fpl_aborted(fpl); 3974 } else { 3975 cache_fpl_smr_exit(fpl); 3976 cache_fpl_handled(fpl, error); 3977 } 3978 break; 3979 } 3980 return (error); 3981 } 3982 3983 static int 3984 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 3985 { 3986 struct nameidata *ndp; 3987 struct componentname *cnp; 3988 struct mount *mp; 3989 int error; 3990 3991 error = CACHE_FPL_FAILED; 3992 ndp = fpl->ndp; 3993 cnp = fpl->cnp; 3994 3995 cache_fpl_checkpoint(fpl, &fpl->snd); 3996 3997 fpl->dvp = dvp; 3998 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 3999 if (seqc_in_modify(fpl->dvp_seqc)) { 4000 cache_fpl_aborted(fpl); 4001 goto out; 4002 } 4003 mp = atomic_load_ptr(&fpl->dvp->v_mount); 4004 if (!cache_fplookup_mp_supported(mp)) { 4005 cache_fpl_aborted(fpl); 4006 goto out; 4007 } 4008 4009 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 4010 4011 for (;;) { 4012 error = cache_fplookup_parse(fpl); 4013 if (__predict_false(error != 0)) { 4014 break; 4015 } 4016 4017 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 4018 4019 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); 4020 if (__predict_false(error != 0)) { 4021 error = cache_fplookup_failed_vexec(fpl, error); 4022 break; 4023 } 4024 4025 if (__predict_false(cache_fpl_isdotdot(cnp))) { 4026 error = cache_fplookup_dotdot(fpl); 4027 if (__predict_false(error != 0)) { 4028 break; 4029 } 4030 } else { 4031 error = cache_fplookup_next(fpl); 4032 if (__predict_false(error != 0)) { 4033 break; 4034 } 4035 4036 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 4037 4038 if (cache_fplookup_need_climb_mount(fpl)) { 4039 error = cache_fplookup_climb_mount(fpl); 4040 if (__predict_false(error != 0)) { 4041 break; 4042 } 4043 } 4044 } 4045 4046 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 4047 4048 if (cache_fpl_islastcn(ndp)) { 4049 error = cache_fplookup_final(fpl); 4050 break; 4051 } 4052 4053 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 4054 error = cache_fpl_aborted(fpl); 4055 break; 4056 } 4057 4058 fpl->dvp = fpl->tvp; 4059 fpl->dvp_seqc = fpl->tvp_seqc; 4060 4061 cache_fplookup_parse_advance(fpl); 4062 cache_fpl_checkpoint(fpl, &fpl->snd); 4063 } 4064 out: 4065 switch (fpl->status) { 4066 case CACHE_FPL_STATUS_UNSET: 4067 __assert_unreachable(); 4068 break; 4069 case CACHE_FPL_STATUS_PARTIAL: 4070 cache_fpl_smr_assert_entered(fpl); 4071 return (cache_fplookup_partial_setup(fpl)); 4072 case CACHE_FPL_STATUS_ABORTED: 4073 if (fpl->in_smr) 4074 cache_fpl_smr_exit(fpl); 4075 return (CACHE_FPL_FAILED); 4076 case CACHE_FPL_STATUS_HANDLED: 4077 MPASS(error != CACHE_FPL_FAILED); 4078 cache_fpl_smr_assert_not_entered(fpl); 4079 if (__predict_false(error != 0)) { 4080 ndp->ni_dvp = NULL; 4081 ndp->ni_vp = NULL; 4082 cache_fpl_cleanup_cnp(cnp); 4083 return (error); 4084 } 4085 ndp->ni_dvp = fpl->dvp; 4086 ndp->ni_vp = fpl->tvp; 4087 if (cnp->cn_flags & SAVENAME) 4088 cnp->cn_flags |= HASBUF; 4089 else 4090 cache_fpl_cleanup_cnp(cnp); 4091 return (error); 4092 } 4093 } 4094 4095 /* 4096 * Fast path lookup protected with SMR and sequence counters. 4097 * 4098 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 4099 * 4100 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria 4101 * outlined below. 4102 * 4103 * Traditional vnode lookup conceptually looks like this: 4104 * 4105 * vn_lock(current); 4106 * for (;;) { 4107 * next = find(); 4108 * vn_lock(next); 4109 * vn_unlock(current); 4110 * current = next; 4111 * if (last) 4112 * break; 4113 * } 4114 * return (current); 4115 * 4116 * Each jump to the next vnode is safe memory-wise and atomic with respect to 4117 * any modifications thanks to holding respective locks. 4118 * 4119 * The same guarantee can be provided with a combination of safe memory 4120 * reclamation and sequence counters instead. If all operations which affect 4121 * the relationship between the current vnode and the one we are looking for 4122 * also modify the counter, we can verify whether all the conditions held as 4123 * we made the jump. This includes things like permissions, mount points etc. 4124 * Counter modification is provided by enclosing relevant places in 4125 * vn_seqc_write_begin()/end() calls. 4126 * 4127 * Thus this translates to: 4128 * 4129 * vfs_smr_enter(); 4130 * dvp_seqc = seqc_read_any(dvp); 4131 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode 4132 * abort(); 4133 * for (;;) { 4134 * tvp = find(); 4135 * tvp_seqc = seqc_read_any(tvp); 4136 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode 4137 * abort(); 4138 * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode 4139 * abort(); 4140 * dvp = tvp; // we know nothing of importance has changed 4141 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration 4142 * if (last) 4143 * break; 4144 * } 4145 * vget(); // secure the vnode 4146 * if (!seqc_consistent(tvp, tvp_seqc) // final check 4147 * abort(); 4148 * // at this point we know nothing has changed for any parent<->child pair 4149 * // as they were crossed during the lookup, meaning we matched the guarantee 4150 * // of the locked variant 4151 * return (tvp); 4152 * 4153 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: 4154 * - they are called while within vfs_smr protection which they must never exit 4155 * - EAGAIN can be returned to denote checking could not be performed, it is 4156 * always valid to return it 4157 * - if the sequence counter has not changed the result must be valid 4158 * - if the sequence counter has changed both false positives and false negatives 4159 * are permitted (since the result will be rejected later) 4160 * - for simple cases of unix permission checks vaccess_vexec_smr can be used 4161 * 4162 * Caveats to watch out for: 4163 * - vnodes are passed unlocked and unreferenced with nothing stopping 4164 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised 4165 * to use atomic_load_ptr to fetch it. 4166 * - the aforementioned object can also get freed, meaning absent other means it 4167 * should be protected with vfs_smr 4168 * - either safely checking permissions as they are modified or guaranteeing 4169 * their stability is left to the routine 4170 */ 4171 int 4172 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, 4173 struct pwd **pwdp) 4174 { 4175 struct cache_fpl fpl; 4176 struct pwd *pwd; 4177 struct vnode *dvp; 4178 struct componentname *cnp; 4179 struct nameidata_saved orig; 4180 int error; 4181 4182 MPASS(ndp->ni_lcf == 0); 4183 4184 fpl.status = CACHE_FPL_STATUS_UNSET; 4185 fpl.ndp = ndp; 4186 fpl.cnp = &ndp->ni_cnd; 4187 MPASS(curthread == fpl.cnp->cn_thread); 4188 4189 if ((fpl.cnp->cn_flags & SAVESTART) != 0) 4190 MPASS(fpl.cnp->cn_nameiop != LOOKUP); 4191 4192 if (!cache_can_fplookup(&fpl)) { 4193 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 4194 *status = fpl.status; 4195 return (EOPNOTSUPP); 4196 } 4197 4198 cache_fpl_checkpoint(&fpl, &orig); 4199 4200 cache_fpl_smr_enter_initial(&fpl); 4201 pwd = pwd_get_smr(); 4202 fpl.pwd = pwd; 4203 ndp->ni_rootdir = pwd->pwd_rdir; 4204 ndp->ni_topdir = pwd->pwd_jdir; 4205 4206 cnp = fpl.cnp; 4207 cnp->cn_nameptr = cnp->cn_pnbuf; 4208 if (cnp->cn_pnbuf[0] == '/') { 4209 cache_fpl_handle_root(ndp, &dvp); 4210 } else { 4211 MPASS(ndp->ni_dirfd == AT_FDCWD); 4212 dvp = pwd->pwd_cdir; 4213 } 4214 4215 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); 4216 4217 error = cache_fplookup_impl(dvp, &fpl); 4218 cache_fpl_smr_assert_not_entered(&fpl); 4219 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 4220 4221 *status = fpl.status; 4222 switch (fpl.status) { 4223 case CACHE_FPL_STATUS_UNSET: 4224 __assert_unreachable(); 4225 break; 4226 case CACHE_FPL_STATUS_HANDLED: 4227 SDT_PROBE3(vfs, namei, lookup, return, error, 4228 (error == 0 ? ndp->ni_vp : NULL), true); 4229 break; 4230 case CACHE_FPL_STATUS_PARTIAL: 4231 *pwdp = fpl.pwd; 4232 /* 4233 * Status restored by cache_fplookup_partial_setup. 4234 */ 4235 break; 4236 case CACHE_FPL_STATUS_ABORTED: 4237 cache_fpl_restore(&fpl, &orig); 4238 break; 4239 } 4240 return (error); 4241 } 4242