/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
		struct negstate nu_neg;	/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name and dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct timespec	nc_time;	/* timespec provided by fs */
	struct timespec	nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned.  Since other
 * architectures may have the same requirement, pay the small price
 * and enforce the alignment for everyone.  Note this is a nop for
 * 64-bit platforms.
 */
#define	CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
#define	CACHE_PATH_CUTOFF	39

#define CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
#define CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

#define	nc_vp		n_un.nu_vp
#define	nc_neg		n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define	NEG_HOT		0x01

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
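 *
 * As a rough sketch, a lockless consumer mirrors what cache_lookup() does:
 * the fields of interest are read under vfs_smr_enter() and are only
 * trusted if the entry is still valid afterwards, e.g.
 *
 *	vfs_smr_enter();
 *	ncp = <entry found on the hash chain>;
 *	<read nc_vp, nc_flag and timestamps as needed>;
 *	if (!cache_ncp_canuse(ncp))
 *		<discard the copies and fall back to the locked path>;
 *	vfs_smr_exit();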
 */
static bool
cache_ncp_canuse(struct namecache *ncp)
{

	atomic_thread_fence_acq();
	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
}

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  The cache is indexed by a hash value
 * obtained from (dvp, name), where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry (i.e. for a name that is known NOT to
 * exist), the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	rwlock	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks.  The locking
 * order is lower address first.  Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be write-locked to prevent other threads from seeing the entry.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the same state.  Similarly, two
 * different threads can purge two different vnodes and try to remove the
 * same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case this could
 * deadlock.  Instead, the conflict is resolved by trylocking: if that fails,
 * the first lock is dropped, everything is locked in order and the state is
 * revalidated.
 */

VFS_SMR_DECLARE;

/*
 * Structures associated with name caching.
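 *
 * For example, resolving a (dvp, name) pair against these structures
 * boils down to the following (see cache_lookup() for the full protocol,
 * including the SMR-based lockless variant):
 *
 *	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 *	blp = HASH2BUCKETLOCK(hash);
 *	rw_rlock(blp);
 *	CK_SLIST_FOREACH(ncp, NCHHASH(hash), nc_hash) {
 *		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 *		    bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen) == 0)
 *			break;
 *	}
 *	rw_runlock(blp);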
 */
#define	NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncpurgeminvnodes;
SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
    "Number of vnodes below which purgevfs ignores the request");
static u_int __read_mostly	ncsize;	/* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static struct mtx __exclusive_cache_line	ncneg_shrink_lock;

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;
static struct neglist ncneg_hot;
static u_long numhotneg;

#define	ncneghash	3
#define	numneglists	(ncneghash + 1)
static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct rwlock_padalign __read_mostly  *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly  *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
351 */ 352 static uma_zone_t __read_mostly cache_zone_small; 353 static uma_zone_t __read_mostly cache_zone_small_ts; 354 static uma_zone_t __read_mostly cache_zone_large; 355 static uma_zone_t __read_mostly cache_zone_large_ts; 356 357 static struct namecache * 358 cache_alloc(int len, int ts) 359 { 360 struct namecache_ts *ncp_ts; 361 struct namecache *ncp; 362 363 if (__predict_false(ts)) { 364 if (len <= CACHE_PATH_CUTOFF) 365 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); 366 else 367 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); 368 ncp = &ncp_ts->nc_nc; 369 } else { 370 if (len <= CACHE_PATH_CUTOFF) 371 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); 372 else 373 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); 374 } 375 return (ncp); 376 } 377 378 static void 379 cache_free(struct namecache *ncp) 380 { 381 struct namecache_ts *ncp_ts; 382 383 if (ncp == NULL) 384 return; 385 if ((ncp->nc_flag & NCF_DVDROP) != 0) 386 vdrop(ncp->nc_dvp); 387 if (__predict_false(ncp->nc_flag & NCF_TS)) { 388 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 389 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 390 uma_zfree_smr(cache_zone_small_ts, ncp_ts); 391 else 392 uma_zfree_smr(cache_zone_large_ts, ncp_ts); 393 } else { 394 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 395 uma_zfree_smr(cache_zone_small, ncp); 396 else 397 uma_zfree_smr(cache_zone_large, ncp); 398 } 399 } 400 401 static void 402 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 403 { 404 struct namecache_ts *ncp_ts; 405 406 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 407 (tsp == NULL && ticksp == NULL), 408 ("No NCF_TS")); 409 410 if (tsp == NULL && ticksp == NULL) 411 return; 412 413 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 414 if (tsp != NULL) 415 *tsp = ncp_ts->nc_time; 416 if (ticksp != NULL) 417 *ticksp = ncp_ts->nc_ticks; 418 } 419 420 #ifdef DEBUG_CACHE 421 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 422 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 423 "VFS namecache enabled"); 424 #endif 425 426 /* Export size information to userland */ 427 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 428 sizeof(struct namecache), "sizeof(struct namecache)"); 429 430 /* 431 * The new name cache statistics 432 */ 433 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 434 "Name cache statistics"); 435 #define STATNODE_ULONG(name, descr) \ 436 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); 437 #define STATNODE_COUNTER(name, descr) \ 438 static COUNTER_U64_DEFINE_EARLY(name); \ 439 SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \ 440 descr); 441 STATNODE_ULONG(numneg, "Number of negative cache entries"); 442 STATNODE_ULONG(numcache, "Number of cache entries"); 443 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held"); 444 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit"); 445 STATNODE_COUNTER(dothits, "Number of '.' hits"); 446 STATNODE_COUNTER(dotdothits, "Number of '..' 
hits"); 447 STATNODE_COUNTER(nummiss, "Number of cache misses"); 448 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 449 STATNODE_COUNTER(numposzaps, 450 "Number of cache hits (positive) we do not want to cache"); 451 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 452 STATNODE_COUNTER(numnegzaps, 453 "Number of cache hits (negative) we do not want to cache"); 454 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 455 /* These count for vn_getcwd(), too. */ 456 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 457 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 458 STATNODE_COUNTER(numfullpathfail2, 459 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 460 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 461 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 462 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 463 "Number of successful removals after relocking"); 464 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 465 "Number of times zap_and_exit failed to lock"); 466 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 467 "Number of times zap_and_exit failed to lock"); 468 static long cache_lock_vnodes_cel_3_failures; 469 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 470 "Number of times 3-way vnode locking failed"); 471 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 472 STATNODE_COUNTER(numneg_evicted, 473 "Number of negative entries evicted when adding a new entry"); 474 STATNODE_COUNTER(shrinking_skipped, 475 "Number of times shrinking was already in progress"); 476 477 static void cache_zap_locked(struct namecache *ncp); 478 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 479 char **freebuf, size_t *buflen); 480 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 481 char *buf, char **retbuf, size_t *buflen); 482 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 483 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 484 485 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 486 487 static int cache_yield; 488 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, 489 "Number of times cache called yield"); 490 491 static void __noinline 492 cache_maybe_yield(void) 493 { 494 495 if (should_yield()) { 496 cache_yield++; 497 kern_yield(PRI_USER); 498 } 499 } 500 501 static inline void 502 cache_assert_vlp_locked(struct mtx *vlp) 503 { 504 505 if (vlp != NULL) 506 mtx_assert(vlp, MA_OWNED); 507 } 508 509 static inline void 510 cache_assert_vnode_locked(struct vnode *vp) 511 { 512 struct mtx *vlp; 513 514 vlp = VP2VNODELOCK(vp); 515 cache_assert_vlp_locked(vlp); 516 } 517 518 /* 519 * TODO: With the value stored we can do better than computing the hash based 520 * on the address. The choice of FNV should also be revisited. 
521 */ 522 static void 523 cache_prehash(struct vnode *vp) 524 { 525 526 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); 527 } 528 529 static uint32_t 530 cache_get_hash(char *name, u_char len, struct vnode *dvp) 531 { 532 533 return (fnv_32_buf(name, len, dvp->v_nchash)); 534 } 535 536 static inline struct nchashhead * 537 NCP2BUCKET(struct namecache *ncp) 538 { 539 uint32_t hash; 540 541 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 542 return (NCHHASH(hash)); 543 } 544 545 static inline struct rwlock * 546 NCP2BUCKETLOCK(struct namecache *ncp) 547 { 548 uint32_t hash; 549 550 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 551 return (HASH2BUCKETLOCK(hash)); 552 } 553 554 #ifdef INVARIANTS 555 static void 556 cache_assert_bucket_locked(struct namecache *ncp, int mode) 557 { 558 struct rwlock *blp; 559 560 blp = NCP2BUCKETLOCK(ncp); 561 rw_assert(blp, mode); 562 } 563 #else 564 #define cache_assert_bucket_locked(x, y) do { } while (0) 565 #endif 566 567 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 568 static void 569 _cache_sort_vnodes(void **p1, void **p2) 570 { 571 void *tmp; 572 573 MPASS(*p1 != NULL || *p2 != NULL); 574 575 if (*p1 > *p2) { 576 tmp = *p2; 577 *p2 = *p1; 578 *p1 = tmp; 579 } 580 } 581 582 static void 583 cache_lock_all_buckets(void) 584 { 585 u_int i; 586 587 for (i = 0; i < numbucketlocks; i++) 588 rw_wlock(&bucketlocks[i]); 589 } 590 591 static void 592 cache_unlock_all_buckets(void) 593 { 594 u_int i; 595 596 for (i = 0; i < numbucketlocks; i++) 597 rw_wunlock(&bucketlocks[i]); 598 } 599 600 static void 601 cache_lock_all_vnodes(void) 602 { 603 u_int i; 604 605 for (i = 0; i < numvnodelocks; i++) 606 mtx_lock(&vnodelocks[i]); 607 } 608 609 static void 610 cache_unlock_all_vnodes(void) 611 { 612 u_int i; 613 614 for (i = 0; i < numvnodelocks; i++) 615 mtx_unlock(&vnodelocks[i]); 616 } 617 618 static int 619 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 620 { 621 622 cache_sort_vnodes(&vlp1, &vlp2); 623 624 if (vlp1 != NULL) { 625 if (!mtx_trylock(vlp1)) 626 return (EAGAIN); 627 } 628 if (!mtx_trylock(vlp2)) { 629 if (vlp1 != NULL) 630 mtx_unlock(vlp1); 631 return (EAGAIN); 632 } 633 634 return (0); 635 } 636 637 static void 638 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 639 { 640 641 MPASS(vlp1 != NULL || vlp2 != NULL); 642 MPASS(vlp1 <= vlp2); 643 644 if (vlp1 != NULL) 645 mtx_lock(vlp1); 646 if (vlp2 != NULL) 647 mtx_lock(vlp2); 648 } 649 650 static void 651 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 652 { 653 654 MPASS(vlp1 != NULL || vlp2 != NULL); 655 656 if (vlp1 != NULL) 657 mtx_unlock(vlp1); 658 if (vlp2 != NULL) 659 mtx_unlock(vlp2); 660 } 661 662 static int 663 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 664 { 665 struct nchstats snap; 666 667 if (req->oldptr == NULL) 668 return (SYSCTL_OUT(req, 0, sizeof(snap))); 669 670 snap = nchstats; 671 snap.ncs_goodhits = counter_u64_fetch(numposhits); 672 snap.ncs_neghits = counter_u64_fetch(numneghits); 673 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 674 counter_u64_fetch(numnegzaps); 675 snap.ncs_miss = counter_u64_fetch(nummisszap) + 676 counter_u64_fetch(nummiss); 677 678 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 679 } 680 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 681 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 682 "VFS cache effectiveness statistics"); 683 684 #ifdef DIAGNOSTIC 685 /* 686 * Grab an atomic snapshot of the name cache hash chain lengths 687 */ 688 
static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 689 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 690 "hash table stats"); 691 692 static int 693 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 694 { 695 struct nchashhead *ncpp; 696 struct namecache *ncp; 697 int i, error, n_nchash, *cntbuf; 698 699 retry: 700 n_nchash = nchash + 1; /* nchash is max index, not count */ 701 if (req->oldptr == NULL) 702 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 703 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 704 cache_lock_all_buckets(); 705 if (n_nchash != nchash + 1) { 706 cache_unlock_all_buckets(); 707 free(cntbuf, M_TEMP); 708 goto retry; 709 } 710 /* Scan hash tables counting entries */ 711 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 712 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) 713 cntbuf[i]++; 714 cache_unlock_all_buckets(); 715 for (error = 0, i = 0; i < n_nchash; i++) 716 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 717 break; 718 free(cntbuf, M_TEMP); 719 return (error); 720 } 721 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 722 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 723 "nchash chain lengths"); 724 725 static int 726 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 727 { 728 int error; 729 struct nchashhead *ncpp; 730 struct namecache *ncp; 731 int n_nchash; 732 int count, maxlength, used, pct; 733 734 if (!req->oldptr) 735 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 736 737 cache_lock_all_buckets(); 738 n_nchash = nchash + 1; /* nchash is max index, not count */ 739 used = 0; 740 maxlength = 0; 741 742 /* Scan hash tables for applicable entries */ 743 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 744 count = 0; 745 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { 746 count++; 747 } 748 if (count) 749 used++; 750 if (maxlength < count) 751 maxlength = count; 752 } 753 n_nchash = nchash + 1; 754 cache_unlock_all_buckets(); 755 pct = (used * 100) / (n_nchash / 100); 756 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 757 if (error) 758 return (error); 759 error = SYSCTL_OUT(req, &used, sizeof(used)); 760 if (error) 761 return (error); 762 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 763 if (error) 764 return (error); 765 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 766 if (error) 767 return (error); 768 return (0); 769 } 770 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 771 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 772 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 773 #endif 774 775 /* 776 * Negative entries management 777 * 778 * A variation of LRU scheme is used. New entries are hashed into one of 779 * numneglists cold lists. Entries get promoted to the hot list on first hit. 780 * 781 * The shrinker will demote hot list head and evict from the cold list in a 782 * round-robin manner. 
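 *
 * In short:
 *
 *	insert:	appended to the tail of the cold list picked by NCP2NEGLIST()
 *	hit:	moved to the tail of ncneg_hot and NEG_HOT gets set
 *	shrink:	the head of ncneg_hot is demoted back to its cold list, then
 *		the head of the next cold list in the rotation is evicted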
783 */ 784 static void 785 cache_negative_init(struct namecache *ncp) 786 { 787 struct negstate *negstate; 788 789 ncp->nc_flag |= NCF_NEGATIVE; 790 negstate = NCP2NEGSTATE(ncp); 791 negstate->neg_flag = 0; 792 } 793 794 static void 795 cache_negative_hit(struct namecache *ncp) 796 { 797 struct neglist *neglist; 798 struct negstate *negstate; 799 800 negstate = NCP2NEGSTATE(ncp); 801 if ((negstate->neg_flag & NEG_HOT) != 0) 802 return; 803 neglist = NCP2NEGLIST(ncp); 804 mtx_lock(&ncneg_hot.nl_lock); 805 mtx_lock(&neglist->nl_lock); 806 if ((negstate->neg_flag & NEG_HOT) == 0) { 807 numhotneg++; 808 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 809 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 810 negstate->neg_flag |= NEG_HOT; 811 } 812 mtx_unlock(&neglist->nl_lock); 813 mtx_unlock(&ncneg_hot.nl_lock); 814 } 815 816 static void 817 cache_negative_insert(struct namecache *ncp) 818 { 819 struct neglist *neglist; 820 821 MPASS(ncp->nc_flag & NCF_NEGATIVE); 822 cache_assert_bucket_locked(ncp, RA_WLOCKED); 823 neglist = NCP2NEGLIST(ncp); 824 mtx_lock(&neglist->nl_lock); 825 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 826 mtx_unlock(&neglist->nl_lock); 827 atomic_add_rel_long(&numneg, 1); 828 } 829 830 static void 831 cache_negative_remove(struct namecache *ncp) 832 { 833 struct neglist *neglist; 834 struct negstate *negstate; 835 bool hot_locked = false; 836 bool list_locked = false; 837 838 cache_assert_bucket_locked(ncp, RA_WLOCKED); 839 neglist = NCP2NEGLIST(ncp); 840 negstate = NCP2NEGSTATE(ncp); 841 if ((negstate->neg_flag & NEG_HOT) != 0) { 842 hot_locked = true; 843 mtx_lock(&ncneg_hot.nl_lock); 844 if ((negstate->neg_flag & NEG_HOT) == 0) { 845 list_locked = true; 846 mtx_lock(&neglist->nl_lock); 847 } 848 } else { 849 list_locked = true; 850 mtx_lock(&neglist->nl_lock); 851 /* 852 * We may be racing against promotion in lockless lookup. 
853 */ 854 if ((negstate->neg_flag & NEG_HOT) != 0) { 855 mtx_unlock(&neglist->nl_lock); 856 hot_locked = true; 857 mtx_lock(&ncneg_hot.nl_lock); 858 mtx_lock(&neglist->nl_lock); 859 } 860 } 861 if ((negstate->neg_flag & NEG_HOT) != 0) { 862 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); 863 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 864 numhotneg--; 865 } else { 866 mtx_assert(&neglist->nl_lock, MA_OWNED); 867 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 868 } 869 if (list_locked) 870 mtx_unlock(&neglist->nl_lock); 871 if (hot_locked) 872 mtx_unlock(&ncneg_hot.nl_lock); 873 atomic_subtract_rel_long(&numneg, 1); 874 } 875 876 static void 877 cache_negative_shrink_select(struct namecache **ncpp, 878 struct neglist **neglistpp) 879 { 880 struct neglist *neglist; 881 struct namecache *ncp; 882 static u_int cycle; 883 u_int i; 884 885 *ncpp = ncp = NULL; 886 887 for (i = 0; i < numneglists; i++) { 888 neglist = &neglists[(cycle + i) % numneglists]; 889 if (TAILQ_FIRST(&neglist->nl_list) == NULL) 890 continue; 891 mtx_lock(&neglist->nl_lock); 892 ncp = TAILQ_FIRST(&neglist->nl_list); 893 if (ncp != NULL) 894 break; 895 mtx_unlock(&neglist->nl_lock); 896 } 897 898 *neglistpp = neglist; 899 *ncpp = ncp; 900 cycle++; 901 } 902 903 static void 904 cache_negative_zap_one(void) 905 { 906 struct namecache *ncp, *ncp2; 907 struct neglist *neglist; 908 struct negstate *negstate; 909 struct mtx *dvlp; 910 struct rwlock *blp; 911 912 if (mtx_owner(&ncneg_shrink_lock) != NULL || 913 !mtx_trylock(&ncneg_shrink_lock)) { 914 counter_u64_add(shrinking_skipped, 1); 915 return; 916 } 917 918 mtx_lock(&ncneg_hot.nl_lock); 919 ncp = TAILQ_FIRST(&ncneg_hot.nl_list); 920 if (ncp != NULL) { 921 neglist = NCP2NEGLIST(ncp); 922 negstate = NCP2NEGSTATE(ncp); 923 mtx_lock(&neglist->nl_lock); 924 MPASS((negstate->neg_flag & NEG_HOT) != 0); 925 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 926 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 927 negstate->neg_flag &= ~NEG_HOT; 928 numhotneg--; 929 mtx_unlock(&neglist->nl_lock); 930 } 931 mtx_unlock(&ncneg_hot.nl_lock); 932 933 cache_negative_shrink_select(&ncp, &neglist); 934 935 mtx_unlock(&ncneg_shrink_lock); 936 if (ncp == NULL) 937 return; 938 939 MPASS(ncp->nc_flag & NCF_NEGATIVE); 940 dvlp = VP2VNODELOCK(ncp->nc_dvp); 941 blp = NCP2BUCKETLOCK(ncp); 942 mtx_unlock(&neglist->nl_lock); 943 mtx_lock(dvlp); 944 rw_wlock(blp); 945 /* 946 * Enter SMR to safely check the negative list. 947 * Even if the found pointer matches, the entry may now be reallocated 948 * and used by a different vnode. 949 */ 950 vfs_smr_enter(); 951 ncp2 = TAILQ_FIRST(&neglist->nl_list); 952 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || 953 blp != NCP2BUCKETLOCK(ncp2)) { 954 vfs_smr_exit(); 955 ncp = NULL; 956 } else { 957 vfs_smr_exit(); 958 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp, 959 ncp->nc_name); 960 cache_zap_locked(ncp); 961 counter_u64_add(numneg_evicted, 1); 962 } 963 rw_wunlock(blp); 964 mtx_unlock(dvlp); 965 cache_free(ncp); 966 } 967 968 /* 969 * cache_zap_locked(): 970 * 971 * Removes a namecache entry from cache, whether it contains an actual 972 * pointer to a vnode or if it is just a negative cache entry. 
973 */ 974 static void 975 cache_zap_locked(struct namecache *ncp) 976 { 977 struct nchashhead *ncpp; 978 979 if (!(ncp->nc_flag & NCF_NEGATIVE)) 980 cache_assert_vnode_locked(ncp->nc_vp); 981 cache_assert_vnode_locked(ncp->nc_dvp); 982 cache_assert_bucket_locked(ncp, RA_WLOCKED); 983 984 CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, 985 (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp); 986 987 cache_ncp_invalidate(ncp); 988 989 ncpp = NCP2BUCKET(ncp); 990 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); 991 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 992 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 993 ncp->nc_name, ncp->nc_vp); 994 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 995 if (ncp == ncp->nc_vp->v_cache_dd) { 996 vn_seqc_write_begin_unheld(ncp->nc_vp); 997 ncp->nc_vp->v_cache_dd = NULL; 998 vn_seqc_write_end(ncp->nc_vp); 999 } 1000 } else { 1001 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 1002 ncp->nc_name); 1003 cache_negative_remove(ncp); 1004 } 1005 if (ncp->nc_flag & NCF_ISDOTDOT) { 1006 if (ncp == ncp->nc_dvp->v_cache_dd) { 1007 vn_seqc_write_begin_unheld(ncp->nc_dvp); 1008 ncp->nc_dvp->v_cache_dd = NULL; 1009 vn_seqc_write_end(ncp->nc_dvp); 1010 } 1011 } else { 1012 LIST_REMOVE(ncp, nc_src); 1013 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 1014 ncp->nc_flag |= NCF_DVDROP; 1015 counter_u64_add(numcachehv, -1); 1016 } 1017 } 1018 atomic_subtract_rel_long(&numcache, 1); 1019 } 1020 1021 static void 1022 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 1023 { 1024 struct rwlock *blp; 1025 1026 MPASS(ncp->nc_dvp == vp); 1027 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1028 cache_assert_vnode_locked(vp); 1029 1030 blp = NCP2BUCKETLOCK(ncp); 1031 rw_wlock(blp); 1032 cache_zap_locked(ncp); 1033 rw_wunlock(blp); 1034 } 1035 1036 static bool 1037 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 1038 struct mtx **vlpp) 1039 { 1040 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1041 struct rwlock *blp; 1042 1043 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1044 cache_assert_vnode_locked(vp); 1045 1046 if (ncp->nc_flag & NCF_NEGATIVE) { 1047 if (*vlpp != NULL) { 1048 mtx_unlock(*vlpp); 1049 *vlpp = NULL; 1050 } 1051 cache_zap_negative_locked_vnode_kl(ncp, vp); 1052 return (true); 1053 } 1054 1055 pvlp = VP2VNODELOCK(vp); 1056 blp = NCP2BUCKETLOCK(ncp); 1057 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1058 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1059 1060 if (*vlpp == vlp1 || *vlpp == vlp2) { 1061 to_unlock = *vlpp; 1062 *vlpp = NULL; 1063 } else { 1064 if (*vlpp != NULL) { 1065 mtx_unlock(*vlpp); 1066 *vlpp = NULL; 1067 } 1068 cache_sort_vnodes(&vlp1, &vlp2); 1069 if (vlp1 == pvlp) { 1070 mtx_lock(vlp2); 1071 to_unlock = vlp2; 1072 } else { 1073 if (!mtx_trylock(vlp1)) 1074 goto out_relock; 1075 to_unlock = vlp1; 1076 } 1077 } 1078 rw_wlock(blp); 1079 cache_zap_locked(ncp); 1080 rw_wunlock(blp); 1081 if (to_unlock != NULL) 1082 mtx_unlock(to_unlock); 1083 return (true); 1084 1085 out_relock: 1086 mtx_unlock(vlp2); 1087 mtx_lock(vlp1); 1088 mtx_lock(vlp2); 1089 MPASS(*vlpp == NULL); 1090 *vlpp = vlp1; 1091 return (false); 1092 } 1093 1094 static int __noinline 1095 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) 1096 { 1097 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1098 struct rwlock *blp; 1099 int error = 0; 1100 1101 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1102 cache_assert_vnode_locked(vp); 1103 1104 pvlp = VP2VNODELOCK(vp); 1105 if (ncp->nc_flag & NCF_NEGATIVE) { 1106 cache_zap_negative_locked_vnode_kl(ncp, vp); 1107 
goto out; 1108 } 1109 1110 blp = NCP2BUCKETLOCK(ncp); 1111 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1112 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1113 cache_sort_vnodes(&vlp1, &vlp2); 1114 if (vlp1 == pvlp) { 1115 mtx_lock(vlp2); 1116 to_unlock = vlp2; 1117 } else { 1118 if (!mtx_trylock(vlp1)) { 1119 error = EAGAIN; 1120 goto out; 1121 } 1122 to_unlock = vlp1; 1123 } 1124 rw_wlock(blp); 1125 cache_zap_locked(ncp); 1126 rw_wunlock(blp); 1127 mtx_unlock(to_unlock); 1128 out: 1129 mtx_unlock(pvlp); 1130 return (error); 1131 } 1132 1133 /* 1134 * If trylocking failed we can get here. We know enough to take all needed locks 1135 * in the right order and re-lookup the entry. 1136 */ 1137 static int 1138 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1139 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1140 struct rwlock *blp) 1141 { 1142 struct namecache *rncp; 1143 1144 cache_assert_bucket_locked(ncp, RA_UNLOCKED); 1145 1146 cache_sort_vnodes(&dvlp, &vlp); 1147 cache_lock_vnodes(dvlp, vlp); 1148 rw_wlock(blp); 1149 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1150 if (rncp == ncp && rncp->nc_dvp == dvp && 1151 rncp->nc_nlen == cnp->cn_namelen && 1152 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1153 break; 1154 } 1155 if (rncp != NULL) { 1156 cache_zap_locked(rncp); 1157 rw_wunlock(blp); 1158 cache_unlock_vnodes(dvlp, vlp); 1159 counter_u64_add(zap_and_exit_bucket_relock_success, 1); 1160 return (0); 1161 } 1162 1163 rw_wunlock(blp); 1164 cache_unlock_vnodes(dvlp, vlp); 1165 return (EAGAIN); 1166 } 1167 1168 static int __noinline 1169 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1170 uint32_t hash, struct rwlock *blp) 1171 { 1172 struct mtx *dvlp, *vlp; 1173 struct vnode *dvp; 1174 1175 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1176 1177 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1178 vlp = NULL; 1179 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1180 vlp = VP2VNODELOCK(ncp->nc_vp); 1181 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1182 cache_zap_locked(ncp); 1183 rw_wunlock(blp); 1184 cache_unlock_vnodes(dvlp, vlp); 1185 return (0); 1186 } 1187 1188 dvp = ncp->nc_dvp; 1189 rw_wunlock(blp); 1190 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1191 } 1192 1193 static int __noinline 1194 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1195 uint32_t hash, struct rwlock *blp) 1196 { 1197 struct mtx *dvlp, *vlp; 1198 struct vnode *dvp; 1199 1200 cache_assert_bucket_locked(ncp, RA_RLOCKED); 1201 1202 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1203 vlp = NULL; 1204 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1205 vlp = VP2VNODELOCK(ncp->nc_vp); 1206 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1207 rw_runlock(blp); 1208 rw_wlock(blp); 1209 cache_zap_locked(ncp); 1210 rw_wunlock(blp); 1211 cache_unlock_vnodes(dvlp, vlp); 1212 return (0); 1213 } 1214 1215 dvp = ncp->nc_dvp; 1216 rw_runlock(blp); 1217 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1218 } 1219 1220 static int 1221 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, 1222 struct mtx **vlpp1, struct mtx **vlpp2) 1223 { 1224 struct mtx *dvlp, *vlp; 1225 1226 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1227 1228 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1229 vlp = NULL; 1230 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1231 vlp = VP2VNODELOCK(ncp->nc_vp); 1232 cache_sort_vnodes(&dvlp, &vlp); 1233 1234 if (*vlpp1 == dvlp && *vlpp2 == vlp) { 1235 cache_zap_locked(ncp); 1236 cache_unlock_vnodes(dvlp, vlp); 1237 *vlpp1 = 
NULL; 1238 *vlpp2 = NULL; 1239 return (0); 1240 } 1241 1242 if (*vlpp1 != NULL) 1243 mtx_unlock(*vlpp1); 1244 if (*vlpp2 != NULL) 1245 mtx_unlock(*vlpp2); 1246 *vlpp1 = NULL; 1247 *vlpp2 = NULL; 1248 1249 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1250 cache_zap_locked(ncp); 1251 cache_unlock_vnodes(dvlp, vlp); 1252 return (0); 1253 } 1254 1255 rw_wunlock(blp); 1256 *vlpp1 = dvlp; 1257 *vlpp2 = vlp; 1258 if (*vlpp1 != NULL) 1259 mtx_lock(*vlpp1); 1260 mtx_lock(*vlpp2); 1261 rw_wlock(blp); 1262 return (EAGAIN); 1263 } 1264 1265 static void 1266 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) 1267 { 1268 1269 if (blp != NULL) { 1270 rw_runlock(blp); 1271 } else { 1272 mtx_unlock(vlp); 1273 } 1274 } 1275 1276 static int __noinline 1277 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1278 struct timespec *tsp, int *ticksp) 1279 { 1280 int ltype; 1281 1282 *vpp = dvp; 1283 CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", 1284 dvp, cnp->cn_nameptr); 1285 counter_u64_add(dothits, 1); 1286 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1287 if (tsp != NULL) 1288 timespecclear(tsp); 1289 if (ticksp != NULL) 1290 *ticksp = ticks; 1291 vrefact(*vpp); 1292 /* 1293 * When we lookup "." we still can be asked to lock it 1294 * differently... 1295 */ 1296 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1297 if (ltype != VOP_ISLOCKED(*vpp)) { 1298 if (ltype == LK_EXCLUSIVE) { 1299 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1300 if (VN_IS_DOOMED((*vpp))) { 1301 /* forced unmount */ 1302 vrele(*vpp); 1303 *vpp = NULL; 1304 return (ENOENT); 1305 } 1306 } else 1307 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1308 } 1309 return (-1); 1310 } 1311 1312 static __noinline int 1313 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) 1314 { 1315 struct namecache *ncp; 1316 struct rwlock *blp; 1317 struct mtx *dvlp, *dvlp2; 1318 uint32_t hash; 1319 int error; 1320 1321 if (cnp->cn_namelen == 2 && 1322 cnp->cn_nameptr[0] == '.' 
&& cnp->cn_nameptr[1] == '.') { 1323 dvlp = VP2VNODELOCK(dvp); 1324 dvlp2 = NULL; 1325 mtx_lock(dvlp); 1326 retry_dotdot: 1327 ncp = dvp->v_cache_dd; 1328 if (ncp == NULL) { 1329 mtx_unlock(dvlp); 1330 if (dvlp2 != NULL) 1331 mtx_unlock(dvlp2); 1332 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1333 return (0); 1334 } 1335 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1336 if (ncp->nc_dvp != dvp) 1337 panic("dvp %p v_cache_dd %p\n", dvp, ncp); 1338 if (!cache_zap_locked_vnode_kl2(ncp, 1339 dvp, &dvlp2)) 1340 goto retry_dotdot; 1341 MPASS(dvp->v_cache_dd == NULL); 1342 mtx_unlock(dvlp); 1343 if (dvlp2 != NULL) 1344 mtx_unlock(dvlp2); 1345 cache_free(ncp); 1346 } else { 1347 vn_seqc_write_begin(dvp); 1348 dvp->v_cache_dd = NULL; 1349 vn_seqc_write_end(dvp); 1350 mtx_unlock(dvlp); 1351 if (dvlp2 != NULL) 1352 mtx_unlock(dvlp2); 1353 } 1354 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1355 return (1); 1356 } 1357 1358 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1359 blp = HASH2BUCKETLOCK(hash); 1360 retry: 1361 if (CK_SLIST_EMPTY(NCHHASH(hash))) 1362 goto out_no_entry; 1363 1364 rw_wlock(blp); 1365 1366 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1367 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1368 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1369 break; 1370 } 1371 1372 /* We failed to find an entry */ 1373 if (ncp == NULL) { 1374 rw_wunlock(blp); 1375 goto out_no_entry; 1376 } 1377 1378 error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp); 1379 if (__predict_false(error != 0)) { 1380 zap_and_exit_bucket_fail++; 1381 cache_maybe_yield(); 1382 goto retry; 1383 } 1384 counter_u64_add(numposzaps, 1); 1385 cache_free(ncp); 1386 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1387 return (1); 1388 out_no_entry: 1389 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1390 counter_u64_add(nummisszap, 1); 1391 return (0); 1392 } 1393 1394 /** 1395 * Lookup a name in the name cache 1396 * 1397 * # Arguments 1398 * 1399 * - dvp: Parent directory in which to search. 1400 * - vpp: Return argument. Will contain desired vnode on cache hit. 1401 * - cnp: Parameters of the name search. The most interesting bits of 1402 * the cn_flags field have the following meanings: 1403 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1404 * it up. 1405 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1406 * - tsp: Return storage for cache timestamp. On a successful (positive 1407 * or negative) lookup, tsp will be filled with any timespec that 1408 * was stored when this cache entry was created. However, it will 1409 * be clear for "." entries. 1410 * - ticks: Return storage for alternate cache timestamp. On a successful 1411 * (positive or negative) lookup, it will contain the ticks value 1412 * that was current when the cache entry was created, unless cnp 1413 * was ".". 1414 * 1415 * # Returns 1416 * 1417 * - -1: A positive cache hit. vpp will contain the desired vnode. 1418 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1419 * to a forced unmount. vpp will not be modified. If the entry 1420 * is a whiteout, then the ISWHITEOUT flag will be set in 1421 * cnp->cn_flags. 1422 * - 0: A cache miss. vpp will not be modified. 1423 * 1424 * # Locking 1425 * 1426 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1427 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1428 * lock is not recursively acquired. 
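 *
 * # Example
 *
 * A typical caller dispatches on the return value, roughly:
 *
 *	error = cache_lookup(dvp, &vp, cnp, NULL, NULL);
 *	if (error == 0)
 *		<miss: fall back to VOP_LOOKUP() against dvp>;
 *	else if (error == -1)
 *		<positive hit: vp is locked and referenced>;
 *	else
 *		<negative hit or failure: return error (e.g. ENOENT)>;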
1429 */ 1430 int 1431 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1432 struct timespec *tsp, int *ticksp) 1433 { 1434 struct namecache_ts *ncp_ts; 1435 struct namecache *ncp; 1436 struct negstate *negstate; 1437 struct rwlock *blp; 1438 struct mtx *dvlp; 1439 uint32_t hash; 1440 enum vgetstate vs; 1441 int error, ltype; 1442 bool try_smr, doing_smr, whiteout; 1443 1444 #ifdef DEBUG_CACHE 1445 if (__predict_false(!doingcache)) { 1446 cnp->cn_flags &= ~MAKEENTRY; 1447 return (0); 1448 } 1449 #endif 1450 1451 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) 1452 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1453 1454 if ((cnp->cn_flags & MAKEENTRY) == 0) { 1455 cache_remove_cnp(dvp, cnp); 1456 return (0); 1457 } 1458 1459 try_smr = true; 1460 if (cnp->cn_nameiop == CREATE) 1461 try_smr = false; 1462 retry: 1463 doing_smr = false; 1464 blp = NULL; 1465 dvlp = NULL; 1466 error = 0; 1467 if (cnp->cn_namelen == 2 && 1468 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1469 counter_u64_add(dotdothits, 1); 1470 dvlp = VP2VNODELOCK(dvp); 1471 mtx_lock(dvlp); 1472 ncp = dvp->v_cache_dd; 1473 if (ncp == NULL) { 1474 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1475 "..", NULL); 1476 mtx_unlock(dvlp); 1477 return (0); 1478 } 1479 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1480 if (ncp->nc_flag & NCF_NEGATIVE) 1481 *vpp = NULL; 1482 else 1483 *vpp = ncp->nc_vp; 1484 } else 1485 *vpp = ncp->nc_dvp; 1486 /* Return failure if negative entry was found. */ 1487 if (*vpp == NULL) 1488 goto negative_success; 1489 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", 1490 dvp, cnp->cn_nameptr, *vpp); 1491 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", 1492 *vpp); 1493 cache_out_ts(ncp, tsp, ticksp); 1494 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1495 NCF_DTS && tsp != NULL) { 1496 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1497 *tsp = ncp_ts->nc_dotdottime; 1498 } 1499 goto success; 1500 } 1501 1502 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1503 retry_hashed: 1504 if (try_smr) { 1505 vfs_smr_enter(); 1506 doing_smr = true; 1507 try_smr = false; 1508 } else { 1509 blp = HASH2BUCKETLOCK(hash); 1510 rw_rlock(blp); 1511 } 1512 1513 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1514 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1515 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1516 break; 1517 } 1518 1519 /* We failed to find an entry */ 1520 if (__predict_false(ncp == NULL)) { 1521 if (doing_smr) 1522 vfs_smr_exit(); 1523 else 1524 rw_runlock(blp); 1525 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1526 NULL); 1527 counter_u64_add(nummiss, 1); 1528 return (0); 1529 } 1530 1531 if (ncp->nc_flag & NCF_NEGATIVE) 1532 goto negative_success; 1533 1534 /* We found a "positive" match, return the vnode */ 1535 counter_u64_add(numposhits, 1); 1536 *vpp = ncp->nc_vp; 1537 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", 1538 dvp, cnp->cn_nameptr, *vpp, ncp); 1539 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, 1540 *vpp); 1541 cache_out_ts(ncp, tsp, ticksp); 1542 success: 1543 /* 1544 * On success we return a locked and ref'd vnode as per the lookup 1545 * protocol. 
1546 */ 1547 MPASS(dvp != *vpp); 1548 ltype = 0; /* silence gcc warning */ 1549 if (cnp->cn_flags & ISDOTDOT) { 1550 ltype = VOP_ISLOCKED(dvp); 1551 VOP_UNLOCK(dvp); 1552 } 1553 if (doing_smr) { 1554 if (!cache_ncp_canuse(ncp)) { 1555 vfs_smr_exit(); 1556 *vpp = NULL; 1557 goto retry; 1558 } 1559 vs = vget_prep_smr(*vpp); 1560 vfs_smr_exit(); 1561 if (__predict_false(vs == VGET_NONE)) { 1562 *vpp = NULL; 1563 goto retry; 1564 } 1565 } else { 1566 vs = vget_prep(*vpp); 1567 cache_lookup_unlock(blp, dvlp); 1568 } 1569 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1570 if (cnp->cn_flags & ISDOTDOT) { 1571 vn_lock(dvp, ltype | LK_RETRY); 1572 if (VN_IS_DOOMED(dvp)) { 1573 if (error == 0) 1574 vput(*vpp); 1575 *vpp = NULL; 1576 return (ENOENT); 1577 } 1578 } 1579 if (error) { 1580 *vpp = NULL; 1581 goto retry; 1582 } 1583 if ((cnp->cn_flags & ISLASTCN) && 1584 (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { 1585 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); 1586 } 1587 return (-1); 1588 1589 negative_success: 1590 /* We found a negative match, and want to create it, so purge */ 1591 if (cnp->cn_nameiop == CREATE) { 1592 MPASS(!doing_smr); 1593 counter_u64_add(numnegzaps, 1); 1594 goto zap_and_exit; 1595 } 1596 1597 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 1598 cache_out_ts(ncp, tsp, ticksp); 1599 counter_u64_add(numneghits, 1); 1600 whiteout = (ncp->nc_flag & NCF_WHITE); 1601 1602 if (doing_smr) { 1603 /* 1604 * We need to take locks to promote an entry. 1605 */ 1606 negstate = NCP2NEGSTATE(ncp); 1607 if ((negstate->neg_flag & NEG_HOT) == 0 || 1608 !cache_ncp_canuse(ncp)) { 1609 vfs_smr_exit(); 1610 doing_smr = false; 1611 goto retry_hashed; 1612 } 1613 vfs_smr_exit(); 1614 } else { 1615 cache_negative_hit(ncp); 1616 cache_lookup_unlock(blp, dvlp); 1617 } 1618 if (whiteout) 1619 cnp->cn_flags |= ISWHITEOUT; 1620 return (ENOENT); 1621 1622 zap_and_exit: 1623 MPASS(!doing_smr); 1624 if (blp != NULL) 1625 error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp); 1626 else 1627 error = cache_zap_locked_vnode(ncp, dvp); 1628 if (__predict_false(error != 0)) { 1629 zap_and_exit_bucket_fail2++; 1630 cache_maybe_yield(); 1631 goto retry; 1632 } 1633 cache_free(ncp); 1634 return (0); 1635 } 1636 1637 struct celockstate { 1638 struct mtx *vlp[3]; 1639 struct rwlock *blp[2]; 1640 }; 1641 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1642 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1643 1644 static inline void 1645 cache_celockstate_init(struct celockstate *cel) 1646 { 1647 1648 bzero(cel, sizeof(*cel)); 1649 } 1650 1651 static void 1652 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1653 struct vnode *dvp) 1654 { 1655 struct mtx *vlp1, *vlp2; 1656 1657 MPASS(cel->vlp[0] == NULL); 1658 MPASS(cel->vlp[1] == NULL); 1659 MPASS(cel->vlp[2] == NULL); 1660 1661 MPASS(vp != NULL || dvp != NULL); 1662 1663 vlp1 = VP2VNODELOCK(vp); 1664 vlp2 = VP2VNODELOCK(dvp); 1665 cache_sort_vnodes(&vlp1, &vlp2); 1666 1667 if (vlp1 != NULL) { 1668 mtx_lock(vlp1); 1669 cel->vlp[0] = vlp1; 1670 } 1671 mtx_lock(vlp2); 1672 cel->vlp[1] = vlp2; 1673 } 1674 1675 static void 1676 cache_unlock_vnodes_cel(struct celockstate *cel) 1677 { 1678 1679 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1680 1681 if (cel->vlp[0] != NULL) 1682 mtx_unlock(cel->vlp[0]); 1683 if (cel->vlp[1] != NULL) 1684 mtx_unlock(cel->vlp[1]); 1685 if (cel->vlp[2] != NULL) 1686 mtx_unlock(cel->vlp[2]); 1687 } 1688 1689 static bool 1690 cache_lock_vnodes_cel_3(struct celockstate *cel, struct 
vnode *vp) 1691 { 1692 struct mtx *vlp; 1693 bool ret; 1694 1695 cache_assert_vlp_locked(cel->vlp[0]); 1696 cache_assert_vlp_locked(cel->vlp[1]); 1697 MPASS(cel->vlp[2] == NULL); 1698 1699 MPASS(vp != NULL); 1700 vlp = VP2VNODELOCK(vp); 1701 1702 ret = true; 1703 if (vlp >= cel->vlp[1]) { 1704 mtx_lock(vlp); 1705 } else { 1706 if (mtx_trylock(vlp)) 1707 goto out; 1708 cache_lock_vnodes_cel_3_failures++; 1709 cache_unlock_vnodes_cel(cel); 1710 if (vlp < cel->vlp[0]) { 1711 mtx_lock(vlp); 1712 mtx_lock(cel->vlp[0]); 1713 mtx_lock(cel->vlp[1]); 1714 } else { 1715 if (cel->vlp[0] != NULL) 1716 mtx_lock(cel->vlp[0]); 1717 mtx_lock(vlp); 1718 mtx_lock(cel->vlp[1]); 1719 } 1720 ret = false; 1721 } 1722 out: 1723 cel->vlp[2] = vlp; 1724 return (ret); 1725 } 1726 1727 static void 1728 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, 1729 struct rwlock *blp2) 1730 { 1731 1732 MPASS(cel->blp[0] == NULL); 1733 MPASS(cel->blp[1] == NULL); 1734 1735 cache_sort_vnodes(&blp1, &blp2); 1736 1737 if (blp1 != NULL) { 1738 rw_wlock(blp1); 1739 cel->blp[0] = blp1; 1740 } 1741 rw_wlock(blp2); 1742 cel->blp[1] = blp2; 1743 } 1744 1745 static void 1746 cache_unlock_buckets_cel(struct celockstate *cel) 1747 { 1748 1749 if (cel->blp[0] != NULL) 1750 rw_wunlock(cel->blp[0]); 1751 rw_wunlock(cel->blp[1]); 1752 } 1753 1754 /* 1755 * Lock part of the cache affected by the insertion. 1756 * 1757 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1758 * However, insertion can result in removal of an old entry. In this 1759 * case we have an additional vnode and bucketlock pair to lock. If the 1760 * entry is negative, ncelock is locked instead of the vnode. 1761 * 1762 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1763 * preserving the locking order (smaller address first). 1764 */ 1765 static void 1766 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1767 uint32_t hash) 1768 { 1769 struct namecache *ncp; 1770 struct rwlock *blps[2]; 1771 1772 blps[0] = HASH2BUCKETLOCK(hash); 1773 for (;;) { 1774 blps[1] = NULL; 1775 cache_lock_vnodes_cel(cel, dvp, vp); 1776 if (vp == NULL || vp->v_type != VDIR) 1777 break; 1778 ncp = vp->v_cache_dd; 1779 if (ncp == NULL) 1780 break; 1781 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1782 break; 1783 MPASS(ncp->nc_dvp == vp); 1784 blps[1] = NCP2BUCKETLOCK(ncp); 1785 if (ncp->nc_flag & NCF_NEGATIVE) 1786 break; 1787 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1788 break; 1789 /* 1790 * All vnodes got re-locked. Re-validate the state and if 1791 * nothing changed we are done. Otherwise restart. 
1792 */ 1793 if (ncp == vp->v_cache_dd && 1794 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1795 blps[1] == NCP2BUCKETLOCK(ncp) && 1796 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1797 break; 1798 cache_unlock_vnodes_cel(cel); 1799 cel->vlp[0] = NULL; 1800 cel->vlp[1] = NULL; 1801 cel->vlp[2] = NULL; 1802 } 1803 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1804 } 1805 1806 static void 1807 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1808 uint32_t hash) 1809 { 1810 struct namecache *ncp; 1811 struct rwlock *blps[2]; 1812 1813 blps[0] = HASH2BUCKETLOCK(hash); 1814 for (;;) { 1815 blps[1] = NULL; 1816 cache_lock_vnodes_cel(cel, dvp, vp); 1817 ncp = dvp->v_cache_dd; 1818 if (ncp == NULL) 1819 break; 1820 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1821 break; 1822 MPASS(ncp->nc_dvp == dvp); 1823 blps[1] = NCP2BUCKETLOCK(ncp); 1824 if (ncp->nc_flag & NCF_NEGATIVE) 1825 break; 1826 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1827 break; 1828 if (ncp == dvp->v_cache_dd && 1829 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1830 blps[1] == NCP2BUCKETLOCK(ncp) && 1831 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1832 break; 1833 cache_unlock_vnodes_cel(cel); 1834 cel->vlp[0] = NULL; 1835 cel->vlp[1] = NULL; 1836 cel->vlp[2] = NULL; 1837 } 1838 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1839 } 1840 1841 static void 1842 cache_enter_unlock(struct celockstate *cel) 1843 { 1844 1845 cache_unlock_buckets_cel(cel); 1846 cache_unlock_vnodes_cel(cel); 1847 } 1848 1849 static void __noinline 1850 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 1851 struct componentname *cnp) 1852 { 1853 struct celockstate cel; 1854 struct namecache *ncp; 1855 uint32_t hash; 1856 int len; 1857 1858 if (dvp->v_cache_dd == NULL) 1859 return; 1860 len = cnp->cn_namelen; 1861 cache_celockstate_init(&cel); 1862 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1863 cache_enter_lock_dd(&cel, dvp, vp, hash); 1864 vn_seqc_write_begin(dvp); 1865 ncp = dvp->v_cache_dd; 1866 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 1867 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 1868 cache_zap_locked(ncp); 1869 } else { 1870 ncp = NULL; 1871 } 1872 dvp->v_cache_dd = NULL; 1873 vn_seqc_write_end(dvp); 1874 cache_enter_unlock(&cel); 1875 cache_free(ncp); 1876 } 1877 1878 /* 1879 * Add an entry to the cache. 1880 */ 1881 void 1882 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 1883 struct timespec *tsp, struct timespec *dtsp) 1884 { 1885 struct celockstate cel; 1886 struct namecache *ncp, *n2, *ndd; 1887 struct namecache_ts *ncp_ts, *n2_ts; 1888 struct nchashhead *ncpp; 1889 uint32_t hash; 1890 int flag; 1891 int len; 1892 u_long lnumcache; 1893 1894 CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); 1895 VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp, 1896 ("cache_enter: Adding a doomed vnode")); 1897 VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp, 1898 ("cache_enter: Doomed vnode used as src")); 1899 1900 #ifdef DEBUG_CACHE 1901 if (__predict_false(!doingcache)) 1902 return; 1903 #endif 1904 1905 flag = 0; 1906 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1907 if (cnp->cn_namelen == 1) 1908 return; 1909 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 1910 cache_enter_dotdot_prep(dvp, vp, cnp); 1911 flag = NCF_ISDOTDOT; 1912 } 1913 } 1914 1915 /* 1916 * Avoid blowout in namecache entries. 
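 * The cap is ncsize, computed in nchinit() and cache_changesize() as
 * desiredvnodes * ncsizefactor; e.g. with the default ncsizefactor of 2
 * the cache may hold up to twice desiredvnodes entries.  Once numcache
 * reaches the cap, new entries are simply dropped and accounted for in
 * the numdrops counter.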
1917 */ 1918 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1919 if (__predict_false(lnumcache >= ncsize)) { 1920 atomic_add_long(&numcache, -1); 1921 counter_u64_add(numdrops, 1); 1922 return; 1923 } 1924 1925 cache_celockstate_init(&cel); 1926 ndd = NULL; 1927 ncp_ts = NULL; 1928 1929 /* 1930 * Calculate the hash key and setup as much of the new 1931 * namecache entry as possible before acquiring the lock. 1932 */ 1933 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1934 ncp->nc_flag = flag | NCF_WIP; 1935 ncp->nc_vp = vp; 1936 if (vp == NULL) 1937 cache_negative_init(ncp); 1938 ncp->nc_dvp = dvp; 1939 if (tsp != NULL) { 1940 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1941 ncp_ts->nc_time = *tsp; 1942 ncp_ts->nc_ticks = ticks; 1943 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1944 if (dtsp != NULL) { 1945 ncp_ts->nc_dotdottime = *dtsp; 1946 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1947 } 1948 } 1949 len = ncp->nc_nlen = cnp->cn_namelen; 1950 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1951 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 1952 ncp->nc_name[len] = '\0'; 1953 cache_enter_lock(&cel, dvp, vp, hash); 1954 1955 /* 1956 * See if this vnode or negative entry is already in the cache 1957 * with this name. This can happen with concurrent lookups of 1958 * the same path name. 1959 */ 1960 ncpp = NCHHASH(hash); 1961 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 1962 if (n2->nc_dvp == dvp && 1963 n2->nc_nlen == cnp->cn_namelen && 1964 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1965 MPASS(cache_ncp_canuse(n2)); 1966 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 1967 KASSERT(vp == NULL, 1968 ("%s: found entry pointing to a different vnode (%p != %p)", 1969 __func__, NULL, vp)); 1970 else 1971 KASSERT(n2->nc_vp == vp, 1972 ("%s: found entry pointing to a different vnode (%p != %p)", 1973 __func__, n2->nc_vp, vp)); 1974 if (tsp != NULL) { 1975 KASSERT((n2->nc_flag & NCF_TS) != 0, 1976 ("no NCF_TS")); 1977 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1978 n2_ts->nc_time = ncp_ts->nc_time; 1979 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1980 if (dtsp != NULL) { 1981 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1982 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1983 } 1984 } 1985 goto out_unlock_free; 1986 } 1987 } 1988 1989 if (flag == NCF_ISDOTDOT) { 1990 /* 1991 * See if we are trying to add .. entry, but some other lookup 1992 * has populated v_cache_dd pointer already. 1993 */ 1994 if (dvp->v_cache_dd != NULL) 1995 goto out_unlock_free; 1996 KASSERT(vp == NULL || vp->v_type == VDIR, 1997 ("wrong vnode type %p", vp)); 1998 vn_seqc_write_begin(dvp); 1999 dvp->v_cache_dd = ncp; 2000 vn_seqc_write_end(dvp); 2001 } 2002 2003 if (vp != NULL) { 2004 if (vp->v_type == VDIR) { 2005 if (flag != NCF_ISDOTDOT) { 2006 /* 2007 * For this case, the cache entry maps both the 2008 * directory name in it and the name ".." for the 2009 * directory's parent. 
2010 */ 2011 vn_seqc_write_begin(vp); 2012 if ((ndd = vp->v_cache_dd) != NULL) { 2013 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2014 cache_zap_locked(ndd); 2015 else 2016 ndd = NULL; 2017 } 2018 vp->v_cache_dd = ncp; 2019 vn_seqc_write_end(vp); 2020 } 2021 } else { 2022 if (vp->v_cache_dd != NULL) { 2023 vn_seqc_write_begin(vp); 2024 vp->v_cache_dd = NULL; 2025 vn_seqc_write_end(vp); 2026 } 2027 } 2028 } 2029 2030 if (flag != NCF_ISDOTDOT) { 2031 if (LIST_EMPTY(&dvp->v_cache_src)) { 2032 vhold(dvp); 2033 counter_u64_add(numcachehv, 1); 2034 } 2035 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2036 } 2037 2038 /* 2039 * If the entry is "negative", we place it into the 2040 * "negative" cache queue, otherwise, we place it into the 2041 * destination vnode's cache entries queue. 2042 */ 2043 if (vp != NULL) { 2044 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2045 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2046 vp); 2047 } else { 2048 if (cnp->cn_flags & ISWHITEOUT) 2049 ncp->nc_flag |= NCF_WHITE; 2050 cache_negative_insert(ncp); 2051 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2052 ncp->nc_name); 2053 } 2054 2055 /* 2056 * Insert the new namecache entry into the appropriate chain 2057 * within the cache entries table. 2058 */ 2059 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2060 2061 atomic_thread_fence_rel(); 2062 /* 2063 * Mark the entry as fully constructed. 2064 * It is immutable past this point until its removal. 2065 */ 2066 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2067 2068 cache_enter_unlock(&cel); 2069 if (numneg * ncnegfactor > lnumcache) 2070 cache_negative_zap_one(); 2071 cache_free(ndd); 2072 return; 2073 out_unlock_free: 2074 cache_enter_unlock(&cel); 2075 atomic_add_long(&numcache, -1); 2076 cache_free(ncp); 2077 return; 2078 } 2079 2080 static u_int 2081 cache_roundup_2(u_int val) 2082 { 2083 u_int res; 2084 2085 for (res = 1; res <= val; res <<= 1) 2086 continue; 2087 2088 return (res); 2089 } 2090 2091 static struct nchashhead * 2092 nchinittbl(u_long elements, u_long *hashmask) 2093 { 2094 struct nchashhead *hashtbl; 2095 u_long hashsize, i; 2096 2097 hashsize = cache_roundup_2(elements) / 2; 2098 2099 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2100 for (i = 0; i < hashsize; i++) 2101 CK_SLIST_INIT(&hashtbl[i]); 2102 *hashmask = hashsize - 1; 2103 return (hashtbl); 2104 } 2105 2106 static void 2107 ncfreetbl(struct nchashhead *hashtbl) 2108 { 2109 2110 free(hashtbl, M_VFSCACHE); 2111 } 2112 2113 /* 2114 * Name cache initialization, from vfs_init() when we are booting 2115 */ 2116 static void 2117 nchinit(void *dummy __unused) 2118 { 2119 u_int i; 2120 2121 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2122 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2123 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2124 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2125 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2126 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2127 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2128 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2129 2130 VFS_SMR_ZONE_SET(cache_zone_small); 2131 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2132 VFS_SMR_ZONE_SET(cache_zone_large); 2133 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2134 2135 ncsize = desiredvnodes * ncsizefactor; 2136 nchashtbl = nchinittbl(desiredvnodes * 2, 
&nchash); 2137 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2138 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2139 ncbuckethash = 7; 2140 if (ncbuckethash > nchash) 2141 ncbuckethash = nchash; 2142 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2143 M_WAITOK | M_ZERO); 2144 for (i = 0; i < numbucketlocks; i++) 2145 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 2146 ncvnodehash = ncbuckethash; 2147 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2148 M_WAITOK | M_ZERO); 2149 for (i = 0; i < numvnodelocks; i++) 2150 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2151 ncpurgeminvnodes = numbucketlocks * 2; 2152 2153 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 2154 M_WAITOK | M_ZERO); 2155 for (i = 0; i < numneglists; i++) { 2156 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2157 TAILQ_INIT(&neglists[i].nl_list); 2158 } 2159 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 2160 TAILQ_INIT(&ncneg_hot.nl_list); 2161 2162 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2163 } 2164 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2165 2166 void 2167 cache_vnode_init(struct vnode *vp) 2168 { 2169 2170 LIST_INIT(&vp->v_cache_src); 2171 TAILQ_INIT(&vp->v_cache_dst); 2172 vp->v_cache_dd = NULL; 2173 cache_prehash(vp); 2174 } 2175 2176 void 2177 cache_changesize(u_long newmaxvnodes) 2178 { 2179 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2180 u_long new_nchash, old_nchash; 2181 struct namecache *ncp; 2182 uint32_t hash; 2183 u_long newncsize; 2184 int i; 2185 2186 newncsize = newmaxvnodes * ncsizefactor; 2187 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2188 if (newmaxvnodes < numbucketlocks) 2189 newmaxvnodes = numbucketlocks; 2190 2191 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2192 /* If same hash table size, nothing to do */ 2193 if (nchash == new_nchash) { 2194 ncfreetbl(new_nchashtbl); 2195 return; 2196 } 2197 /* 2198 * Move everything from the old hash table to the new table. 2199 * None of the namecache entries in the table can be removed 2200 * because to do so, they have to be removed from the hash table. 2201 */ 2202 cache_lock_all_vnodes(); 2203 cache_lock_all_buckets(); 2204 old_nchashtbl = nchashtbl; 2205 old_nchash = nchash; 2206 nchashtbl = new_nchashtbl; 2207 nchash = new_nchash; 2208 for (i = 0; i <= old_nchash; i++) { 2209 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2210 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2211 ncp->nc_dvp); 2212 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2213 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2214 } 2215 } 2216 ncsize = newncsize; 2217 cache_unlock_all_buckets(); 2218 cache_unlock_all_vnodes(); 2219 ncfreetbl(old_nchashtbl); 2220 } 2221 2222 /* 2223 * Invalidate all entries from and to a particular vnode. 
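 * Zapped entries are collected on a local list while the vnode lock is
 * held and only freed after it is dropped.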
2224 */ 2225 static void 2226 cache_purge_impl(struct vnode *vp) 2227 { 2228 TAILQ_HEAD(, namecache) ncps; 2229 struct namecache *ncp, *nnp; 2230 struct mtx *vlp, *vlp2; 2231 2232 TAILQ_INIT(&ncps); 2233 vlp = VP2VNODELOCK(vp); 2234 vlp2 = NULL; 2235 mtx_assert(vlp, MA_OWNED); 2236 retry: 2237 while (!LIST_EMPTY(&vp->v_cache_src)) { 2238 ncp = LIST_FIRST(&vp->v_cache_src); 2239 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2240 goto retry; 2241 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2242 } 2243 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2244 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2245 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2246 goto retry; 2247 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2248 } 2249 ncp = vp->v_cache_dd; 2250 if (ncp != NULL) { 2251 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2252 ("lost dotdot link")); 2253 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2254 goto retry; 2255 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2256 } 2257 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2258 mtx_unlock(vlp); 2259 if (vlp2 != NULL) 2260 mtx_unlock(vlp2); 2261 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2262 cache_free(ncp); 2263 } 2264 } 2265 2266 void 2267 cache_purge(struct vnode *vp) 2268 { 2269 struct mtx *vlp; 2270 2271 SDT_PROBE1(vfs, namecache, purge, done, vp); 2272 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2273 vp->v_cache_dd == NULL) 2274 return; 2275 vlp = VP2VNODELOCK(vp); 2276 mtx_lock(vlp); 2277 cache_purge_impl(vp); 2278 } 2279 2280 /* 2281 * Only to be used by vgone. 2282 */ 2283 void 2284 cache_purge_vgone(struct vnode *vp) 2285 { 2286 struct mtx *vlp; 2287 2288 VNPASS(VN_IS_DOOMED(vp), vp); 2289 vlp = VP2VNODELOCK(vp); 2290 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2291 vp->v_cache_dd == NULL)) { 2292 mtx_lock(vlp); 2293 cache_purge_impl(vp); 2294 mtx_assert(vlp, MA_NOTOWNED); 2295 return; 2296 } 2297 2298 /* 2299 * All the NULL pointer state we found above may be transient. 2300 * Serialize against a possible thread doing cache_purge. 2301 */ 2302 mtx_wait_unlocked(vlp); 2303 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2304 vp->v_cache_dd == NULL)) { 2305 mtx_lock(vlp); 2306 cache_purge_impl(vp); 2307 mtx_assert(vlp, MA_NOTOWNED); 2308 return; 2309 } 2310 return; 2311 } 2312 2313 /* 2314 * Invalidate all negative entries for a particular directory vnode. 
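 * Positive entries originating from the directory are left untouched.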
2315 */ 2316 void 2317 cache_purge_negative(struct vnode *vp) 2318 { 2319 TAILQ_HEAD(, namecache) ncps; 2320 struct namecache *ncp, *nnp; 2321 struct mtx *vlp; 2322 2323 CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); 2324 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2325 if (LIST_EMPTY(&vp->v_cache_src)) 2326 return; 2327 TAILQ_INIT(&ncps); 2328 vlp = VP2VNODELOCK(vp); 2329 mtx_lock(vlp); 2330 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2331 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2332 continue; 2333 cache_zap_negative_locked_vnode_kl(ncp, vp); 2334 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2335 } 2336 mtx_unlock(vlp); 2337 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2338 cache_free(ncp); 2339 } 2340 } 2341 2342 void 2343 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2344 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2345 { 2346 2347 ASSERT_VOP_IN_SEQC(fdvp); 2348 ASSERT_VOP_IN_SEQC(fvp); 2349 ASSERT_VOP_IN_SEQC(tdvp); 2350 if (tvp != NULL) 2351 ASSERT_VOP_IN_SEQC(tvp); 2352 2353 cache_purge(fvp); 2354 if (tvp != NULL) { 2355 cache_purge(tvp); 2356 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2357 ("%s: lingering negative entry", __func__)); 2358 } else { 2359 cache_remove_cnp(tdvp, tcnp); 2360 } 2361 } 2362 2363 /* 2364 * Flush all entries referencing a particular filesystem. 2365 */ 2366 void 2367 cache_purgevfs(struct mount *mp, bool force) 2368 { 2369 TAILQ_HEAD(, namecache) ncps; 2370 struct mtx *vlp1, *vlp2; 2371 struct rwlock *blp; 2372 struct nchashhead *bucket; 2373 struct namecache *ncp, *nnp; 2374 u_long i, j, n_nchash; 2375 int error; 2376 2377 /* Scan hash tables for applicable entries */ 2378 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2379 if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) 2380 return; 2381 TAILQ_INIT(&ncps); 2382 n_nchash = nchash + 1; 2383 vlp1 = vlp2 = NULL; 2384 for (i = 0; i < numbucketlocks; i++) { 2385 blp = (struct rwlock *)&bucketlocks[i]; 2386 rw_wlock(blp); 2387 for (j = i; j < n_nchash; j += numbucketlocks) { 2388 retry: 2389 bucket = &nchashtbl[j]; 2390 CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { 2391 cache_assert_bucket_locked(ncp, RA_WLOCKED); 2392 if (ncp->nc_dvp->v_mount != mp) 2393 continue; 2394 error = cache_zap_wlocked_bucket_kl(ncp, blp, 2395 &vlp1, &vlp2); 2396 if (error != 0) 2397 goto retry; 2398 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); 2399 } 2400 } 2401 rw_wunlock(blp); 2402 if (vlp1 == NULL && vlp2 == NULL) 2403 cache_maybe_yield(); 2404 } 2405 if (vlp1 != NULL) 2406 mtx_unlock(vlp1); 2407 if (vlp2 != NULL) 2408 mtx_unlock(vlp2); 2409 2410 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2411 cache_free(ncp); 2412 } 2413 } 2414 2415 /* 2416 * Perform canonical checks and cache lookup and pass on to filesystem 2417 * through the vop_cachedlookup only if needed. 
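 * A cache miss (0 from cache_lookup()) falls through to VOP_CACHEDLOOKUP(),
 * a hit (-1) returns with the vnode in *vpp, and any other result (e.g.
 * ENOENT from a negative entry) is returned as is.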
2418 */ 2419 2420 int 2421 vfs_cache_lookup(struct vop_lookup_args *ap) 2422 { 2423 struct vnode *dvp; 2424 int error; 2425 struct vnode **vpp = ap->a_vpp; 2426 struct componentname *cnp = ap->a_cnp; 2427 int flags = cnp->cn_flags; 2428 2429 *vpp = NULL; 2430 dvp = ap->a_dvp; 2431 2432 if (dvp->v_type != VDIR) 2433 return (ENOTDIR); 2434 2435 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2436 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2437 return (EROFS); 2438 2439 error = vn_dir_check_exec(dvp, cnp); 2440 if (error != 0) 2441 return (error); 2442 2443 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2444 if (error == 0) 2445 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2446 if (error == -1) 2447 return (0); 2448 return (error); 2449 } 2450 2451 /* Implementation of the getcwd syscall. */ 2452 int 2453 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2454 { 2455 char *buf, *retbuf; 2456 size_t buflen; 2457 int error; 2458 2459 buflen = uap->buflen; 2460 if (__predict_false(buflen < 2)) 2461 return (EINVAL); 2462 if (buflen > MAXPATHLEN) 2463 buflen = MAXPATHLEN; 2464 2465 buf = uma_zalloc(namei_zone, M_WAITOK); 2466 error = vn_getcwd(td, buf, &retbuf, &buflen); 2467 if (error == 0) 2468 error = copyout(retbuf, uap->buf, buflen); 2469 uma_zfree(namei_zone, buf); 2470 return (error); 2471 } 2472 2473 int 2474 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen) 2475 { 2476 struct pwd *pwd; 2477 int error; 2478 2479 pwd = pwd_hold(td); 2480 error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen); 2481 pwd_drop(pwd); 2482 2483 #ifdef KTRACE 2484 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2485 ktrnamei(*retbuf); 2486 #endif 2487 return (error); 2488 } 2489 2490 static int 2491 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2492 size_t size, int flags, enum uio_seg pathseg) 2493 { 2494 struct nameidata nd; 2495 char *retbuf, *freebuf; 2496 int error; 2497 2498 if (flags != 0) 2499 return (EINVAL); 2500 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2501 pathseg, path, fd, &cap_fstat_rights, td); 2502 if ((error = namei(&nd)) != 0) 2503 return (error); 2504 error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size); 2505 if (error == 0) { 2506 error = copyout(retbuf, buf, size); 2507 free(freebuf, M_TEMP); 2508 } 2509 NDFREE(&nd, 0); 2510 return (error); 2511 } 2512 2513 int 2514 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2515 { 2516 2517 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2518 uap->flags, UIO_USERSPACE)); 2519 } 2520 2521 /* 2522 * Retrieve the full filesystem path that correspond to a vnode from the name 2523 * cache (if available) 2524 */ 2525 int 2526 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) 2527 { 2528 struct pwd *pwd; 2529 char *buf; 2530 size_t buflen; 2531 int error; 2532 2533 if (__predict_false(vn == NULL)) 2534 return (EINVAL); 2535 2536 buflen = MAXPATHLEN; 2537 buf = malloc(buflen, M_TEMP, M_WAITOK); 2538 pwd = pwd_hold(td); 2539 error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen); 2540 pwd_drop(pwd); 2541 2542 if (!error) 2543 *freebuf = buf; 2544 else 2545 free(buf, M_TEMP); 2546 return (error); 2547 } 2548 2549 /* 2550 * This function is similar to vn_fullpath, but it attempts to lookup the 2551 * pathname relative to the global root mount point. 
This is required for the 2552 * auditing sub-system, as audited pathnames must be absolute, relative to the 2553 * global root mount point. 2554 */ 2555 int 2556 vn_fullpath_global(struct thread *td, struct vnode *vn, 2557 char **retbuf, char **freebuf) 2558 { 2559 char *buf; 2560 size_t buflen; 2561 int error; 2562 2563 if (__predict_false(vn == NULL)) 2564 return (EINVAL); 2565 buflen = MAXPATHLEN; 2566 buf = malloc(buflen, M_TEMP, M_WAITOK); 2567 error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen); 2568 if (!error) 2569 *freebuf = buf; 2570 else 2571 free(buf, M_TEMP); 2572 return (error); 2573 } 2574 2575 int 2576 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2577 { 2578 struct vnode *dvp; 2579 struct namecache *ncp; 2580 struct mtx *vlp; 2581 int error; 2582 2583 vlp = VP2VNODELOCK(*vp); 2584 mtx_lock(vlp); 2585 TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { 2586 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2587 break; 2588 } 2589 if (ncp != NULL) { 2590 if (*buflen < ncp->nc_nlen) { 2591 mtx_unlock(vlp); 2592 vrele(*vp); 2593 counter_u64_add(numfullpathfail4, 1); 2594 error = ENOMEM; 2595 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2596 vp, NULL); 2597 return (error); 2598 } 2599 *buflen -= ncp->nc_nlen; 2600 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2601 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2602 ncp->nc_name, vp); 2603 dvp = *vp; 2604 *vp = ncp->nc_dvp; 2605 vref(*vp); 2606 mtx_unlock(vlp); 2607 vrele(dvp); 2608 return (0); 2609 } 2610 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2611 2612 mtx_unlock(vlp); 2613 vn_lock(*vp, LK_SHARED | LK_RETRY); 2614 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2615 vput(*vp); 2616 if (error) { 2617 counter_u64_add(numfullpathfail2, 1); 2618 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2619 return (error); 2620 } 2621 2622 *vp = dvp; 2623 if (VN_IS_DOOMED(dvp)) { 2624 /* forced unmount */ 2625 vrele(dvp); 2626 error = ENOENT; 2627 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2628 return (error); 2629 } 2630 /* 2631 * *vp has its use count incremented still. 2632 */ 2633 2634 return (0); 2635 } 2636 2637 /* 2638 * Resolve a directory to a pathname. 2639 * 2640 * The name of the directory can always be found in the namecache or fetched 2641 * from the filesystem. There is also guaranteed to be only one parent, meaning 2642 * we can just follow vnodes up until we find the root. 2643 * 2644 * The vnode must be referenced. 2645 */ 2646 static int 2647 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 2648 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend) 2649 { 2650 #ifdef KDTRACE_HOOKS 2651 struct vnode *startvp = vp; 2652 #endif 2653 struct vnode *vp1; 2654 size_t buflen; 2655 int error; 2656 2657 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2658 VNPASS(vp->v_usecount > 0, vp); 2659 2660 buflen = *len; 2661 2662 if (!slash_prefixed) { 2663 MPASS(*len >= 2); 2664 buflen--; 2665 buf[buflen] = '\0'; 2666 } 2667 2668 error = 0; 2669 2670 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2671 counter_u64_add(numfullpathcalls, 1); 2672 while (vp != rdir && vp != rootvnode) { 2673 /* 2674 * The vp vnode must be already fully constructed, 2675 * since it is either found in namecache or obtained 2676 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2677 * without obtaining the vnode lock. 
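 * A root vnode is crossed by stepping to the covered vnode of its mount,
 * see below.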
2678 */ 2679 if ((vp->v_vflag & VV_ROOT) != 0) { 2680 vn_lock(vp, LK_RETRY | LK_SHARED); 2681 2682 /* 2683 * With the vnode locked, check for races with 2684 * unmount, forced or not. Note that we 2685 * already verified that vp is not equal to 2686 * the root vnode, which means that 2687 * mnt_vnodecovered can be NULL only for the 2688 * case of unmount. 2689 */ 2690 if (VN_IS_DOOMED(vp) || 2691 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2692 vp1->v_mountedhere != vp->v_mount) { 2693 vput(vp); 2694 error = ENOENT; 2695 SDT_PROBE3(vfs, namecache, fullpath, return, 2696 error, vp, NULL); 2697 break; 2698 } 2699 2700 vref(vp1); 2701 vput(vp); 2702 vp = vp1; 2703 continue; 2704 } 2705 if (vp->v_type != VDIR) { 2706 vrele(vp); 2707 counter_u64_add(numfullpathfail1, 1); 2708 error = ENOTDIR; 2709 SDT_PROBE3(vfs, namecache, fullpath, return, 2710 error, vp, NULL); 2711 break; 2712 } 2713 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); 2714 if (error) 2715 break; 2716 if (buflen == 0) { 2717 vrele(vp); 2718 error = ENOMEM; 2719 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2720 startvp, NULL); 2721 break; 2722 } 2723 buf[--buflen] = '/'; 2724 slash_prefixed = true; 2725 } 2726 if (error) 2727 return (error); 2728 if (!slash_prefixed) { 2729 if (buflen == 0) { 2730 vrele(vp); 2731 counter_u64_add(numfullpathfail4, 1); 2732 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2733 startvp, NULL); 2734 return (ENOMEM); 2735 } 2736 buf[--buflen] = '/'; 2737 } 2738 counter_u64_add(numfullpathfound, 1); 2739 vrele(vp); 2740 2741 *retbuf = buf + buflen; 2742 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2743 *len -= buflen; 2744 *len += addend; 2745 return (0); 2746 } 2747 2748 /* 2749 * Resolve an arbitrary vnode to a pathname. 2750 * 2751 * Note 2 caveats: 2752 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2753 * resolve to a different path than the one used to find it 2754 * - namecache is not mandatory, meaning names are not guaranteed to be added 2755 * (in which case resolving fails) 2756 */ 2757 static int 2758 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 2759 char *buf, char **retbuf, size_t *buflen) 2760 { 2761 size_t orig_buflen; 2762 bool slash_prefixed; 2763 int error; 2764 2765 if (*buflen < 2) 2766 return (EINVAL); 2767 2768 orig_buflen = *buflen; 2769 2770 vref(vp); 2771 slash_prefixed = false; 2772 if (vp->v_type != VDIR) { 2773 *buflen -= 1; 2774 buf[*buflen] = '\0'; 2775 error = vn_vptocnp(&vp, td->td_ucred, buf, buflen); 2776 if (error) 2777 return (error); 2778 if (*buflen == 0) { 2779 vrele(vp); 2780 return (ENOMEM); 2781 } 2782 *buflen -= 1; 2783 buf[*buflen] = '/'; 2784 slash_prefixed = true; 2785 } 2786 2787 return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed, 2788 orig_buflen - *buflen)); 2789 } 2790 2791 /* 2792 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2793 * 2794 * Since the namecache does not track handlings, the caller is expected to first 2795 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 
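 * A minimal sketch of the expected calling pattern, modelled on
 * kern___realpathat() above (the exact NDINIT flavour and flags are up to
 * the caller):
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT, UIO_USERSPACE,
 *	    path, td);
 *	if ((error = namei(&nd)) == 0)
 *		error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf,
 *		    &buflen);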
2796 * 2797 * Then we have 2 cases: 2798 * - if the found vnode is a directory, the path can be constructed just by 2799 * following names up the chain 2800 * - otherwise we populate the buffer with the saved name and start resolving 2801 * from the parent 2802 */ 2803 static int 2804 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 2805 char **freebuf, size_t *buflen) 2806 { 2807 char *buf, *tmpbuf; 2808 struct pwd *pwd; 2809 struct componentname *cnp; 2810 struct vnode *vp; 2811 size_t addend; 2812 int error; 2813 bool slash_prefixed; 2814 enum vtype type; 2815 2816 if (*buflen < 2) 2817 return (EINVAL); 2818 if (*buflen > MAXPATHLEN) 2819 *buflen = MAXPATHLEN; 2820 2821 slash_prefixed = false; 2822 2823 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2824 pwd = pwd_hold(td); 2825 2826 addend = 0; 2827 vp = ndp->ni_vp; 2828 /* 2829 * Check for VBAD to work around the vp_crossmp bug in lookup(). 2830 * 2831 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 2832 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 2833 * If the type is VDIR (like in this very case) we can skip looking 2834 * at ni_dvp in the first place. However, since vnodes get passed here 2835 * unlocked the target may transition to doomed state (type == VBAD) 2836 * before we get to evaluate the condition. If this happens, we will 2837 * populate part of the buffer and descend to vn_fullpath_dir with 2838 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 2839 * 2840 * This should be atomic_load(&vp->v_type) but it is illegal to take 2841 * the address of a bit field, even if said field is sized to char. 2842 * Work around the problem by reading the value into a full-sized enum 2843 * and then re-reading it with atomic_load which will still prevent 2844 * the compiler from re-reading down the road.
2845 */ 2846 type = vp->v_type; 2847 type = atomic_load_int(&type); 2848 if (type == VBAD) { 2849 error = ENOENT; 2850 goto out_bad; 2851 } 2852 if (type != VDIR) { 2853 cnp = &ndp->ni_cnd; 2854 addend = cnp->cn_namelen + 2; 2855 if (*buflen < addend) { 2856 error = ENOMEM; 2857 goto out_bad; 2858 } 2859 *buflen -= addend; 2860 tmpbuf = buf + *buflen; 2861 tmpbuf[0] = '/'; 2862 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 2863 tmpbuf[addend - 1] = '\0'; 2864 slash_prefixed = true; 2865 vp = ndp->ni_dvp; 2866 } 2867 2868 vref(vp); 2869 error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen, 2870 slash_prefixed, addend); 2871 if (error != 0) 2872 goto out_bad; 2873 2874 pwd_drop(pwd); 2875 *freebuf = buf; 2876 2877 return (0); 2878 out_bad: 2879 pwd_drop(pwd); 2880 free(buf, M_TEMP); 2881 return (error); 2882 } 2883 2884 struct vnode * 2885 vn_dir_dd_ino(struct vnode *vp) 2886 { 2887 struct namecache *ncp; 2888 struct vnode *ddvp; 2889 struct mtx *vlp; 2890 enum vgetstate vs; 2891 2892 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 2893 vlp = VP2VNODELOCK(vp); 2894 mtx_lock(vlp); 2895 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 2896 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 2897 continue; 2898 ddvp = ncp->nc_dvp; 2899 vs = vget_prep(ddvp); 2900 mtx_unlock(vlp); 2901 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 2902 return (NULL); 2903 return (ddvp); 2904 } 2905 mtx_unlock(vlp); 2906 return (NULL); 2907 } 2908 2909 int 2910 vn_commname(struct vnode *vp, char *buf, u_int buflen) 2911 { 2912 struct namecache *ncp; 2913 struct mtx *vlp; 2914 int l; 2915 2916 vlp = VP2VNODELOCK(vp); 2917 mtx_lock(vlp); 2918 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 2919 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2920 break; 2921 if (ncp == NULL) { 2922 mtx_unlock(vlp); 2923 return (ENOENT); 2924 } 2925 l = min(ncp->nc_nlen, buflen - 1); 2926 memcpy(buf, ncp->nc_name, l); 2927 mtx_unlock(vlp); 2928 buf[l] = '\0'; 2929 return (0); 2930 } 2931 2932 /* 2933 * This function updates path string to vnode's full global path 2934 * and checks the size of the new path string against the pathlen argument. 2935 * 2936 * Requires a locked, referenced vnode. 2937 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 2938 * 2939 * If vp is a directory, the call to vn_fullpath_global() always succeeds 2940 * because it falls back to the ".." lookup if the namecache lookup fails. 2941 */ 2942 int 2943 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 2944 u_int pathlen) 2945 { 2946 struct nameidata nd; 2947 struct vnode *vp1; 2948 char *rpath, *fbuf; 2949 int error; 2950 2951 ASSERT_VOP_ELOCKED(vp, __func__); 2952 2953 /* Construct global filesystem path from vp. */ 2954 VOP_UNLOCK(vp); 2955 error = vn_fullpath_global(td, vp, &rpath, &fbuf); 2956 2957 if (error != 0) { 2958 vrele(vp); 2959 return (error); 2960 } 2961 2962 if (strlen(rpath) >= pathlen) { 2963 vrele(vp); 2964 error = ENAMETOOLONG; 2965 goto out; 2966 } 2967 2968 /* 2969 * Re-lookup the vnode by path to detect a possible rename. 2970 * As a side effect, the vnode is relocked. 2971 * If vnode was renamed, return ENOENT. 
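 * The vnode returned by the second lookup is compared against vp; a
 * mismatch means the path no longer names this vnode.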
2972 */ 2973 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 2974 UIO_SYSSPACE, path, td); 2975 error = namei(&nd); 2976 if (error != 0) { 2977 vrele(vp); 2978 goto out; 2979 } 2980 NDFREE(&nd, NDF_ONLY_PNBUF); 2981 vp1 = nd.ni_vp; 2982 vrele(vp); 2983 if (vp1 == vp) 2984 strcpy(path, rpath); 2985 else { 2986 vput(vp1); 2987 error = ENOENT; 2988 } 2989 2990 out: 2991 free(fbuf, M_TEMP); 2992 return (error); 2993 } 2994 2995 #ifdef DDB 2996 static void 2997 db_print_vpath(struct vnode *vp) 2998 { 2999 3000 while (vp != NULL) { 3001 db_printf("%p: ", vp); 3002 if (vp == rootvnode) { 3003 db_printf("/"); 3004 vp = NULL; 3005 } else { 3006 if (vp->v_vflag & VV_ROOT) { 3007 db_printf("<mount point>"); 3008 vp = vp->v_mount->mnt_vnodecovered; 3009 } else { 3010 struct namecache *ncp; 3011 char *ncn; 3012 int i; 3013 3014 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3015 if (ncp != NULL) { 3016 ncn = ncp->nc_name; 3017 for (i = 0; i < ncp->nc_nlen; i++) 3018 db_printf("%c", *ncn++); 3019 vp = ncp->nc_dvp; 3020 } else { 3021 vp = NULL; 3022 } 3023 } 3024 } 3025 db_printf("\n"); 3026 } 3027 3028 return; 3029 } 3030 3031 DB_SHOW_COMMAND(vpath, db_show_vpath) 3032 { 3033 struct vnode *vp; 3034 3035 if (!have_addr) { 3036 db_printf("usage: show vpath <struct vnode *>\n"); 3037 return; 3038 } 3039 3040 vp = (struct vnode *)addr; 3041 db_print_vpath(vp); 3042 } 3043 3044 #endif 3045 3046 static bool __read_frequently cache_fast_lookup = true; 3047 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3048 &cache_fast_lookup, 0, ""); 3049 3050 #define CACHE_FPL_FAILED -2020 3051 3052 static void 3053 cache_fpl_cleanup_cnp(struct componentname *cnp) 3054 { 3055 3056 uma_zfree(namei_zone, cnp->cn_pnbuf); 3057 #ifdef DIAGNOSTIC 3058 cnp->cn_pnbuf = NULL; 3059 cnp->cn_nameptr = NULL; 3060 #endif 3061 } 3062 3063 static void 3064 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3065 { 3066 struct componentname *cnp; 3067 3068 cnp = &ndp->ni_cnd; 3069 while (*(cnp->cn_nameptr) == '/') { 3070 cnp->cn_nameptr++; 3071 ndp->ni_pathlen--; 3072 } 3073 3074 *dpp = ndp->ni_rootdir; 3075 } 3076 3077 /* 3078 * Components of nameidata (or objects it can point to) which may 3079 * need restoring in case fast path lookup fails. 
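 * See cache_fpl_checkpoint() and cache_fpl_restore() below.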
3080 */ 3081 struct nameidata_saved { 3082 long cn_namelen; 3083 char *cn_nameptr; 3084 size_t ni_pathlen; 3085 int cn_flags; 3086 }; 3087 3088 struct cache_fpl { 3089 struct nameidata *ndp; 3090 struct componentname *cnp; 3091 struct pwd *pwd; 3092 struct vnode *dvp; 3093 struct vnode *tvp; 3094 seqc_t dvp_seqc; 3095 seqc_t tvp_seqc; 3096 struct nameidata_saved snd; 3097 int line; 3098 enum cache_fpl_status status:8; 3099 bool in_smr; 3100 }; 3101 3102 static void 3103 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3104 { 3105 3106 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3107 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3108 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3109 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3110 } 3111 3112 static void 3113 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3114 { 3115 3116 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3117 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3118 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3119 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3120 } 3121 3122 #ifdef INVARIANTS 3123 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3124 struct cache_fpl *_fpl = (fpl); \ 3125 MPASS(_fpl->in_smr == true); \ 3126 VFS_SMR_ASSERT_ENTERED(); \ 3127 }) 3128 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3129 struct cache_fpl *_fpl = (fpl); \ 3130 MPASS(_fpl->in_smr == false); \ 3131 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3132 }) 3133 #else 3134 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3135 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3136 #endif 3137 3138 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3139 struct cache_fpl *_fpl = (fpl); \ 3140 vfs_smr_enter(); \ 3141 _fpl->in_smr = true; \ 3142 }) 3143 3144 #define cache_fpl_smr_enter(fpl) ({ \ 3145 struct cache_fpl *_fpl = (fpl); \ 3146 MPASS(_fpl->in_smr == false); \ 3147 vfs_smr_enter(); \ 3148 _fpl->in_smr = true; \ 3149 }) 3150 3151 #define cache_fpl_smr_exit(fpl) ({ \ 3152 struct cache_fpl *_fpl = (fpl); \ 3153 MPASS(_fpl->in_smr == true); \ 3154 vfs_smr_exit(); \ 3155 _fpl->in_smr = false; \ 3156 }) 3157 3158 static int 3159 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3160 { 3161 3162 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3163 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3164 ("%s: converting to abort from %d at %d, set at %d\n", 3165 __func__, fpl->status, line, fpl->line)); 3166 } 3167 fpl->status = CACHE_FPL_STATUS_ABORTED; 3168 fpl->line = line; 3169 return (CACHE_FPL_FAILED); 3170 } 3171 3172 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3173 3174 static int 3175 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3176 { 3177 3178 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3179 ("%s: setting to partial at %d, but already set to %d at %d\n", 3180 __func__, line, fpl->status, fpl->line)); 3181 cache_fpl_smr_assert_entered(fpl); 3182 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3183 fpl->line = line; 3184 return (CACHE_FPL_FAILED); 3185 } 3186 3187 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3188 3189 static int 3190 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3191 { 3192 3193 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3194 ("%s: setting to handled at %d, but already set to %d at %d\n", 3195 __func__, line, fpl->status, fpl->line)); 3196 cache_fpl_smr_assert_not_entered(fpl); 3197 MPASS(error != CACHE_FPL_FAILED); 3198 fpl->status = CACHE_FPL_STATUS_HANDLED; 3199 fpl->line = line; 3200 return (error); 
3201 } 3202 3203 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3204 3205 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3206 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ 3207 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3208 3209 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3210 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3211 3212 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3213 "supported and internal flags overlap"); 3214 3215 static bool 3216 cache_fpl_islastcn(struct nameidata *ndp) 3217 { 3218 3219 return (*ndp->ni_next == 0); 3220 } 3221 3222 static bool 3223 cache_fpl_isdotdot(struct componentname *cnp) 3224 { 3225 3226 if (cnp->cn_namelen == 2 && 3227 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3228 return (true); 3229 return (false); 3230 } 3231 3232 static bool 3233 cache_can_fplookup(struct cache_fpl *fpl) 3234 { 3235 struct nameidata *ndp; 3236 struct componentname *cnp; 3237 struct thread *td; 3238 3239 ndp = fpl->ndp; 3240 cnp = fpl->cnp; 3241 td = cnp->cn_thread; 3242 3243 if (!cache_fast_lookup) { 3244 cache_fpl_aborted(fpl); 3245 return (false); 3246 } 3247 #ifdef MAC 3248 if (mac_vnode_check_lookup_enabled()) { 3249 cache_fpl_aborted(fpl); 3250 return (false); 3251 } 3252 #endif 3253 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3254 cache_fpl_aborted(fpl); 3255 return (false); 3256 } 3257 if (ndp->ni_dirfd != AT_FDCWD) { 3258 cache_fpl_aborted(fpl); 3259 return (false); 3260 } 3261 if (IN_CAPABILITY_MODE(td)) { 3262 cache_fpl_aborted(fpl); 3263 return (false); 3264 } 3265 if (AUDITING_TD(td)) { 3266 cache_fpl_aborted(fpl); 3267 return (false); 3268 } 3269 if (ndp->ni_startdir != NULL) { 3270 cache_fpl_aborted(fpl); 3271 return (false); 3272 } 3273 return (true); 3274 } 3275 3276 static bool 3277 cache_fplookup_vnode_supported(struct vnode *vp) 3278 { 3279 3280 return (vp->v_type != VLNK); 3281 } 3282 3283 /* 3284 * Move a negative entry to the hot list. 3285 * 3286 * We have to take locks, but they may be contended and in the worst 3287 * case we may need to go off CPU. We don't want to spin within the 3288 * smr section and we can't block with it. Instead we are going to 3289 * look up the entry again. 3290 */ 3291 static int __noinline 3292 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3293 uint32_t hash) 3294 { 3295 struct componentname *cnp; 3296 struct namecache *ncp; 3297 struct neglist *neglist; 3298 struct negstate *negstate; 3299 struct vnode *dvp; 3300 u_char nc_flag; 3301 3302 cnp = fpl->cnp; 3303 dvp = fpl->dvp; 3304 3305 if (!vhold_smr(dvp)) 3306 return (cache_fpl_aborted(fpl)); 3307 3308 neglist = NCP2NEGLIST(oncp); 3309 cache_fpl_smr_exit(fpl); 3310 3311 mtx_lock(&ncneg_hot.nl_lock); 3312 mtx_lock(&neglist->nl_lock); 3313 /* 3314 * For hash iteration. 3315 */ 3316 cache_fpl_smr_enter(fpl); 3317 3318 /* 3319 * Avoid all surprises by only succeeding if we got the same entry and 3320 * bailing completely otherwise. 3321 * 3322 * In particular at this point there can be a new ncp which matches the 3323 * search but hashes to a different neglist. 3324 */ 3325 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3326 if (ncp == oncp) 3327 break; 3328 } 3329 3330 /* 3331 * No match to begin with. 3332 */ 3333 if (__predict_false(ncp == NULL)) { 3334 goto out_abort; 3335 } 3336 3337 /* 3338 * The newly found entry may be something different... 
3339 */ 3340 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3341 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 3342 goto out_abort; 3343 } 3344 3345 /* 3346 * ... and not even negative. 3347 */ 3348 nc_flag = atomic_load_char(&ncp->nc_flag); 3349 if ((nc_flag & NCF_NEGATIVE) == 0) { 3350 goto out_abort; 3351 } 3352 3353 if (__predict_false(!cache_ncp_canuse(ncp))) { 3354 goto out_abort; 3355 } 3356 3357 negstate = NCP2NEGSTATE(ncp); 3358 if ((negstate->neg_flag & NEG_HOT) == 0) { 3359 numhotneg++; 3360 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 3361 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 3362 negstate->neg_flag |= NEG_HOT; 3363 } 3364 3365 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 3366 counter_u64_add(numneghits, 1); 3367 cache_fpl_smr_exit(fpl); 3368 mtx_unlock(&neglist->nl_lock); 3369 mtx_unlock(&ncneg_hot.nl_lock); 3370 vdrop(dvp); 3371 return (cache_fpl_handled(fpl, ENOENT)); 3372 out_abort: 3373 cache_fpl_smr_exit(fpl); 3374 mtx_unlock(&neglist->nl_lock); 3375 mtx_unlock(&ncneg_hot.nl_lock); 3376 vdrop(dvp); 3377 return (cache_fpl_aborted(fpl)); 3378 } 3379 3380 /* 3381 * The target vnode is not supported, prepare for the slow path to take over. 3382 */ 3383 static int __noinline 3384 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3385 { 3386 struct nameidata *ndp; 3387 struct componentname *cnp; 3388 enum vgetstate dvs; 3389 struct vnode *dvp; 3390 struct pwd *pwd; 3391 seqc_t dvp_seqc; 3392 3393 ndp = fpl->ndp; 3394 cnp = fpl->cnp; 3395 dvp = fpl->dvp; 3396 dvp_seqc = fpl->dvp_seqc; 3397 3398 dvs = vget_prep_smr(dvp); 3399 if (__predict_false(dvs == VGET_NONE)) { 3400 cache_fpl_smr_exit(fpl); 3401 return (cache_fpl_aborted(fpl)); 3402 } 3403 3404 cache_fpl_smr_exit(fpl); 3405 3406 vget_finish_ref(dvp, dvs); 3407 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3408 vrele(dvp); 3409 return (cache_fpl_aborted(fpl)); 3410 } 3411 3412 pwd = pwd_hold(curthread); 3413 if (fpl->pwd != pwd) { 3414 vrele(dvp); 3415 pwd_drop(pwd); 3416 return (cache_fpl_aborted(fpl)); 3417 } 3418 3419 cache_fpl_restore(fpl, &fpl->snd); 3420 3421 ndp->ni_startdir = dvp; 3422 cnp->cn_flags |= MAKEENTRY; 3423 if (cache_fpl_islastcn(ndp)) 3424 cnp->cn_flags |= ISLASTCN; 3425 if (cache_fpl_isdotdot(cnp)) 3426 cnp->cn_flags |= ISDOTDOT; 3427 3428 return (0); 3429 } 3430 3431 static int 3432 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3433 { 3434 struct componentname *cnp; 3435 struct vnode *tvp; 3436 seqc_t tvp_seqc; 3437 int error, lkflags; 3438 3439 cnp = fpl->cnp; 3440 tvp = fpl->tvp; 3441 tvp_seqc = fpl->tvp_seqc; 3442 3443 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3444 lkflags = LK_SHARED; 3445 if ((cnp->cn_flags & LOCKSHARED) == 0) 3446 lkflags = LK_EXCLUSIVE; 3447 error = vget_finish(tvp, lkflags, tvs); 3448 if (__predict_false(error != 0)) { 3449 return (cache_fpl_aborted(fpl)); 3450 } 3451 } else { 3452 vget_finish_ref(tvp, tvs); 3453 } 3454 3455 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3456 if ((cnp->cn_flags & LOCKLEAF) != 0) 3457 vput(tvp); 3458 else 3459 vrele(tvp); 3460 return (cache_fpl_aborted(fpl)); 3461 } 3462 3463 return (cache_fpl_handled(fpl, 0)); 3464 } 3465 3466 /* 3467 * They want to possibly modify the state of the namecache. 3468 * 3469 * Don't try to match the API contract, just leave. 
3470 * TODO: this leaves scalability on the table 3471 */ 3472 static int 3473 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3474 { 3475 struct componentname *cnp; 3476 3477 cnp = fpl->cnp; 3478 MPASS(cnp->cn_nameiop != LOOKUP); 3479 return (cache_fpl_partial(fpl)); 3480 } 3481 3482 static int __noinline 3483 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3484 { 3485 struct componentname *cnp; 3486 enum vgetstate dvs, tvs; 3487 struct vnode *dvp, *tvp; 3488 seqc_t dvp_seqc, tvp_seqc; 3489 int error; 3490 3491 cnp = fpl->cnp; 3492 dvp = fpl->dvp; 3493 dvp_seqc = fpl->dvp_seqc; 3494 tvp = fpl->tvp; 3495 tvp_seqc = fpl->tvp_seqc; 3496 3497 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3498 3499 /* 3500 * This is less efficient than it can be for simplicity. 3501 */ 3502 dvs = vget_prep_smr(dvp); 3503 if (__predict_false(dvs == VGET_NONE)) { 3504 return (cache_fpl_aborted(fpl)); 3505 } 3506 tvs = vget_prep_smr(tvp); 3507 if (__predict_false(tvs == VGET_NONE)) { 3508 cache_fpl_smr_exit(fpl); 3509 vget_abort(dvp, dvs); 3510 return (cache_fpl_aborted(fpl)); 3511 } 3512 3513 cache_fpl_smr_exit(fpl); 3514 3515 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3516 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3517 if (__predict_false(error != 0)) { 3518 vget_abort(tvp, tvs); 3519 return (cache_fpl_aborted(fpl)); 3520 } 3521 } else { 3522 vget_finish_ref(dvp, dvs); 3523 } 3524 3525 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3526 vget_abort(tvp, tvs); 3527 if ((cnp->cn_flags & LOCKPARENT) != 0) 3528 vput(dvp); 3529 else 3530 vrele(dvp); 3531 return (cache_fpl_aborted(fpl)); 3532 } 3533 3534 error = cache_fplookup_final_child(fpl, tvs); 3535 if (__predict_false(error != 0)) { 3536 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3537 if ((cnp->cn_flags & LOCKPARENT) != 0) 3538 vput(dvp); 3539 else 3540 vrele(dvp); 3541 return (error); 3542 } 3543 3544 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3545 return (0); 3546 } 3547 3548 static int 3549 cache_fplookup_final(struct cache_fpl *fpl) 3550 { 3551 struct componentname *cnp; 3552 enum vgetstate tvs; 3553 struct vnode *dvp, *tvp; 3554 seqc_t dvp_seqc, tvp_seqc; 3555 3556 cnp = fpl->cnp; 3557 dvp = fpl->dvp; 3558 dvp_seqc = fpl->dvp_seqc; 3559 tvp = fpl->tvp; 3560 tvp_seqc = fpl->tvp_seqc; 3561 3562 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3563 3564 if (cnp->cn_nameiop != LOOKUP) { 3565 return (cache_fplookup_final_modifying(fpl)); 3566 } 3567 3568 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3569 return (cache_fplookup_final_withparent(fpl)); 3570 3571 tvs = vget_prep_smr(tvp); 3572 if (__predict_false(tvs == VGET_NONE)) { 3573 return (cache_fpl_partial(fpl)); 3574 } 3575 3576 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3577 cache_fpl_smr_exit(fpl); 3578 vget_abort(tvp, tvs); 3579 return (cache_fpl_aborted(fpl)); 3580 } 3581 3582 cache_fpl_smr_exit(fpl); 3583 return (cache_fplookup_final_child(fpl, tvs)); 3584 } 3585 3586 static int __noinline 3587 cache_fplookup_dot(struct cache_fpl *fpl) 3588 { 3589 struct vnode *dvp; 3590 3591 dvp = fpl->dvp; 3592 3593 fpl->tvp = dvp; 3594 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3595 if (seqc_in_modify(fpl->tvp_seqc)) { 3596 return (cache_fpl_aborted(fpl)); 3597 } 3598 3599 counter_u64_add(dothits, 1); 3600 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3601 3602 return (0); 3603 } 3604 3605 static int __noinline 3606 cache_fplookup_dotdot(struct cache_fpl *fpl) 3607 { 3608 struct nameidata *ndp; 3609 struct componentname *cnp; 3610 struct namecache *ncp; 3611 struct vnode 
*dvp; 3612 struct prison *pr; 3613 u_char nc_flag; 3614 3615 ndp = fpl->ndp; 3616 cnp = fpl->cnp; 3617 dvp = fpl->dvp; 3618 3619 /* 3620 * XXX this is racy the same way regular lookup is 3621 */ 3622 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3623 pr = pr->pr_parent) 3624 if (dvp == pr->pr_root) 3625 break; 3626 3627 if (dvp == ndp->ni_rootdir || 3628 dvp == ndp->ni_topdir || 3629 dvp == rootvnode || 3630 pr != NULL) { 3631 fpl->tvp = dvp; 3632 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3633 if (seqc_in_modify(fpl->tvp_seqc)) { 3634 return (cache_fpl_aborted(fpl)); 3635 } 3636 return (0); 3637 } 3638 3639 if ((dvp->v_vflag & VV_ROOT) != 0) { 3640 /* 3641 * TODO 3642 * The opposite of climb mount is needed here. 3643 */ 3644 return (cache_fpl_aborted(fpl)); 3645 } 3646 3647 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3648 if (ncp == NULL) { 3649 return (cache_fpl_aborted(fpl)); 3650 } 3651 3652 nc_flag = atomic_load_char(&ncp->nc_flag); 3653 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3654 if ((nc_flag & NCF_NEGATIVE) != 0) 3655 return (cache_fpl_aborted(fpl)); 3656 fpl->tvp = ncp->nc_vp; 3657 } else { 3658 fpl->tvp = ncp->nc_dvp; 3659 } 3660 3661 if (__predict_false(!cache_ncp_canuse(ncp))) { 3662 return (cache_fpl_aborted(fpl)); 3663 } 3664 3665 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 3666 if (seqc_in_modify(fpl->tvp_seqc)) { 3667 return (cache_fpl_partial(fpl)); 3668 } 3669 3670 counter_u64_add(dotdothits, 1); 3671 return (0); 3672 } 3673 3674 static int 3675 cache_fplookup_next(struct cache_fpl *fpl) 3676 { 3677 struct componentname *cnp; 3678 struct namecache *ncp; 3679 struct negstate *negstate; 3680 struct vnode *dvp, *tvp; 3681 u_char nc_flag; 3682 uint32_t hash; 3683 bool neg_hot; 3684 3685 cnp = fpl->cnp; 3686 dvp = fpl->dvp; 3687 3688 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 3689 return (cache_fplookup_dot(fpl)); 3690 } 3691 3692 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3693 3694 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3695 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3696 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 3697 break; 3698 } 3699 3700 /* 3701 * If there is no entry we have to punt to the slow path to perform 3702 * actual lookup. Should there be nothing with this name a negative 3703 * entry will be created. 3704 */ 3705 if (__predict_false(ncp == NULL)) { 3706 return (cache_fpl_partial(fpl)); 3707 } 3708 3709 tvp = atomic_load_ptr(&ncp->nc_vp); 3710 nc_flag = atomic_load_char(&ncp->nc_flag); 3711 if ((nc_flag & NCF_NEGATIVE) != 0) { 3712 /* 3713 * If they want to create an entry we need to replace this one. 
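 * Replacement is left to the slow path, hence the partial return below.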
3714 */ 3715 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 3716 return (cache_fpl_partial(fpl)); 3717 } 3718 negstate = NCP2NEGSTATE(ncp); 3719 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0); 3720 if (__predict_false(!cache_ncp_canuse(ncp))) { 3721 return (cache_fpl_partial(fpl)); 3722 } 3723 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 3724 return (cache_fpl_partial(fpl)); 3725 } 3726 if (!neg_hot) { 3727 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 3728 } 3729 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 3730 ncp->nc_name); 3731 counter_u64_add(numneghits, 1); 3732 cache_fpl_smr_exit(fpl); 3733 return (cache_fpl_handled(fpl, ENOENT)); 3734 } 3735 3736 if (__predict_false(!cache_ncp_canuse(ncp))) { 3737 return (cache_fpl_partial(fpl)); 3738 } 3739 3740 fpl->tvp = tvp; 3741 fpl->tvp_seqc = vn_seqc_read_any(tvp); 3742 if (seqc_in_modify(fpl->tvp_seqc)) { 3743 return (cache_fpl_partial(fpl)); 3744 } 3745 3746 if (!cache_fplookup_vnode_supported(tvp)) { 3747 return (cache_fpl_partial(fpl)); 3748 } 3749 3750 counter_u64_add(numposhits, 1); 3751 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 3752 return (0); 3753 } 3754 3755 static bool 3756 cache_fplookup_mp_supported(struct mount *mp) 3757 { 3758 3759 if (mp == NULL) 3760 return (false); 3761 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 3762 return (false); 3763 return (true); 3764 } 3765 3766 /* 3767 * Walk up the mount stack (if any). 3768 * 3769 * Correctness is provided in the following ways: 3770 * - all vnodes are protected from freeing with SMR 3771 * - struct mount objects are type stable making them always safe to access 3772 * - stability of the particular mount is provided by busying it 3773 * - relationship between the vnode which is mounted on and the mount is 3774 * verified with the vnode sequence counter after busying 3775 * - association between root vnode of the mount and the mount is protected 3776 * by busy 3777 * 3778 * From that point on we can read the sequence counter of the root vnode 3779 * and get the next mount on the stack (if any) using the same protection. 3780 * 3781 * By the end of successful walk we are guaranteed the reached state was 3782 * indeed present at least at some point which matches the regular lookup. 
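 * If any of the checks fail, the walk bails out with cache_fpl_partial()
 * and defers to the regular lookup.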
3783 */ 3784 static int __noinline 3785 cache_fplookup_climb_mount(struct cache_fpl *fpl) 3786 { 3787 struct mount *mp, *prev_mp; 3788 struct vnode *vp; 3789 seqc_t vp_seqc; 3790 3791 vp = fpl->tvp; 3792 vp_seqc = fpl->tvp_seqc; 3793 3794 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 3795 mp = atomic_load_ptr(&vp->v_mountedhere); 3796 if (mp == NULL) 3797 return (0); 3798 3799 prev_mp = NULL; 3800 for (;;) { 3801 if (!vfs_op_thread_enter_crit(mp)) { 3802 if (prev_mp != NULL) 3803 vfs_op_thread_exit_crit(prev_mp); 3804 return (cache_fpl_partial(fpl)); 3805 } 3806 if (prev_mp != NULL) 3807 vfs_op_thread_exit_crit(prev_mp); 3808 if (!vn_seqc_consistent(vp, vp_seqc)) { 3809 vfs_op_thread_exit_crit(mp); 3810 return (cache_fpl_partial(fpl)); 3811 } 3812 if (!cache_fplookup_mp_supported(mp)) { 3813 vfs_op_thread_exit_crit(mp); 3814 return (cache_fpl_partial(fpl)); 3815 } 3816 vp = atomic_load_ptr(&mp->mnt_rootvnode); 3817 if (vp == NULL || VN_IS_DOOMED(vp)) { 3818 vfs_op_thread_exit_crit(mp); 3819 return (cache_fpl_partial(fpl)); 3820 } 3821 vp_seqc = vn_seqc_read_any(vp); 3822 if (seqc_in_modify(vp_seqc)) { 3823 vfs_op_thread_exit_crit(mp); 3824 return (cache_fpl_partial(fpl)); 3825 } 3826 prev_mp = mp; 3827 mp = atomic_load_ptr(&vp->v_mountedhere); 3828 if (mp == NULL) 3829 break; 3830 } 3831 3832 vfs_op_thread_exit_crit(prev_mp); 3833 fpl->tvp = vp; 3834 fpl->tvp_seqc = vp_seqc; 3835 return (0); 3836 } 3837 3838 static bool 3839 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 3840 { 3841 struct mount *mp; 3842 struct vnode *vp; 3843 3844 vp = fpl->tvp; 3845 3846 /* 3847 * Hack: while this is a union, the pointer tends to be NULL so save on 3848 * a branch. 3849 */ 3850 mp = atomic_load_ptr(&vp->v_mountedhere); 3851 if (mp == NULL) 3852 return (false); 3853 if (vp->v_type == VDIR) 3854 return (true); 3855 return (false); 3856 } 3857 3858 /* 3859 * Parse the path. 3860 * 3861 * The code is mostly copy-pasted from regular lookup, see lookup(). 3862 * The structure is maintained along with comments for easier maintenance. 3863 * Deduplicating the code will become feasible after fast path lookup 3864 * becomes more feature-complete. 3865 */ 3866 static int 3867 cache_fplookup_parse(struct cache_fpl *fpl) 3868 { 3869 struct nameidata *ndp; 3870 struct componentname *cnp; 3871 char *cp; 3872 3873 ndp = fpl->ndp; 3874 cnp = fpl->cnp; 3875 3876 /* 3877 * Search a new directory. 3878 * 3879 * The last component of the filename is left accessible via 3880 * cnp->cn_nameptr for callers that need the name. Callers needing 3881 * the name set the SAVENAME flag. When done, they assume 3882 * responsibility for freeing the pathname buffer. 3883 */ 3884 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 3885 continue; 3886 cnp->cn_namelen = cp - cnp->cn_nameptr; 3887 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 3888 cache_fpl_smr_exit(fpl); 3889 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 3890 } 3891 ndp->ni_pathlen -= cnp->cn_namelen; 3892 KASSERT(ndp->ni_pathlen <= PATH_MAX, 3893 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 3894 ndp->ni_next = cp; 3895 3896 /* 3897 * Replace multiple slashes by a single slash and trailing slashes 3898 * by a null. This must be done before VOP_LOOKUP() because some 3899 * fs's don't know about trailing slashes. Remember if there were 3900 * trailing slashes to handle symlinks, existing non-directories 3901 * and non-existing files that won't be directories specially later. 
3902 */ 3903 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 3904 cp++; 3905 ndp->ni_pathlen--; 3906 if (*cp == '\0') { 3907 /* 3908 * TODO 3909 * Regular lookup performs the following: 3910 * *ndp->ni_next = '\0'; 3911 * cnp->cn_flags |= TRAILINGSLASH; 3912 * 3913 * Which is problematic since it modifies data read 3914 * from userspace. Then if fast path lookup was to 3915 * abort we would have to either restore it or convey 3916 * the flag. Since this is a corner case just ignore 3917 * it for simplicity. 3918 */ 3919 return (cache_fpl_partial(fpl)); 3920 } 3921 } 3922 ndp->ni_next = cp; 3923 3924 /* 3925 * Check for degenerate name (e.g. / or "") 3926 * which is a way of talking about a directory, 3927 * e.g. like "/." or ".". 3928 * 3929 * TODO 3930 * Another corner case handled by the regular lookup 3931 */ 3932 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 3933 return (cache_fpl_partial(fpl)); 3934 } 3935 return (0); 3936 } 3937 3938 static void 3939 cache_fplookup_parse_advance(struct cache_fpl *fpl) 3940 { 3941 struct nameidata *ndp; 3942 struct componentname *cnp; 3943 3944 ndp = fpl->ndp; 3945 cnp = fpl->cnp; 3946 3947 cnp->cn_nameptr = ndp->ni_next; 3948 while (*cnp->cn_nameptr == '/') { 3949 cnp->cn_nameptr++; 3950 ndp->ni_pathlen--; 3951 } 3952 } 3953 3954 static int __noinline 3955 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 3956 { 3957 3958 switch (error) { 3959 case EAGAIN: 3960 /* 3961 * Can happen when racing against vgone. 3962 * */ 3963 case EOPNOTSUPP: 3964 cache_fpl_partial(fpl); 3965 break; 3966 default: 3967 /* 3968 * See the API contract for VOP_FPLOOKUP_VEXEC. 3969 */ 3970 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 3971 error = cache_fpl_aborted(fpl); 3972 } else { 3973 cache_fpl_smr_exit(fpl); 3974 cache_fpl_handled(fpl, error); 3975 } 3976 break; 3977 } 3978 return (error); 3979 } 3980 3981 static int 3982 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 3983 { 3984 struct nameidata *ndp; 3985 struct componentname *cnp; 3986 struct mount *mp; 3987 int error; 3988 3989 error = CACHE_FPL_FAILED; 3990 ndp = fpl->ndp; 3991 cnp = fpl->cnp; 3992 3993 cache_fpl_checkpoint(fpl, &fpl->snd); 3994 3995 fpl->dvp = dvp; 3996 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 3997 if (seqc_in_modify(fpl->dvp_seqc)) { 3998 cache_fpl_aborted(fpl); 3999 goto out; 4000 } 4001 mp = atomic_load_ptr(&fpl->dvp->v_mount); 4002 if (!cache_fplookup_mp_supported(mp)) { 4003 cache_fpl_aborted(fpl); 4004 goto out; 4005 } 4006 4007 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 4008 4009 for (;;) { 4010 error = cache_fplookup_parse(fpl); 4011 if (__predict_false(error != 0)) { 4012 break; 4013 } 4014 4015 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 4016 4017 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); 4018 if (__predict_false(error != 0)) { 4019 error = cache_fplookup_failed_vexec(fpl, error); 4020 break; 4021 } 4022 4023 if (__predict_false(cache_fpl_isdotdot(cnp))) { 4024 error = cache_fplookup_dotdot(fpl); 4025 if (__predict_false(error != 0)) { 4026 break; 4027 } 4028 } else { 4029 error = cache_fplookup_next(fpl); 4030 if (__predict_false(error != 0)) { 4031 break; 4032 } 4033 4034 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 4035 4036 if (cache_fplookup_need_climb_mount(fpl)) { 4037 error = cache_fplookup_climb_mount(fpl); 4038 if (__predict_false(error != 0)) { 4039 break; 4040 } 4041 } 4042 } 4043 4044 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 4045 4046 if (cache_fpl_islastcn(ndp)) { 4047 
error = cache_fplookup_final(fpl); 4048 break; 4049 } 4050 4051 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 4052 error = cache_fpl_aborted(fpl); 4053 break; 4054 } 4055 4056 fpl->dvp = fpl->tvp; 4057 fpl->dvp_seqc = fpl->tvp_seqc; 4058 4059 cache_fplookup_parse_advance(fpl); 4060 cache_fpl_checkpoint(fpl, &fpl->snd); 4061 } 4062 out: 4063 switch (fpl->status) { 4064 case CACHE_FPL_STATUS_UNSET: 4065 __assert_unreachable(); 4066 break; 4067 case CACHE_FPL_STATUS_PARTIAL: 4068 cache_fpl_smr_assert_entered(fpl); 4069 return (cache_fplookup_partial_setup(fpl)); 4070 case CACHE_FPL_STATUS_ABORTED: 4071 if (fpl->in_smr) 4072 cache_fpl_smr_exit(fpl); 4073 return (CACHE_FPL_FAILED); 4074 case CACHE_FPL_STATUS_HANDLED: 4075 MPASS(error != CACHE_FPL_FAILED); 4076 cache_fpl_smr_assert_not_entered(fpl); 4077 if (__predict_false(error != 0)) { 4078 ndp->ni_dvp = NULL; 4079 ndp->ni_vp = NULL; 4080 cache_fpl_cleanup_cnp(cnp); 4081 return (error); 4082 } 4083 ndp->ni_dvp = fpl->dvp; 4084 ndp->ni_vp = fpl->tvp; 4085 if (cnp->cn_flags & SAVENAME) 4086 cnp->cn_flags |= HASBUF; 4087 else 4088 cache_fpl_cleanup_cnp(cnp); 4089 return (error); 4090 } 4091 } 4092 4093 /* 4094 * Fast path lookup protected with SMR and sequence counters. 4095 * 4096 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 4097 * 4098 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria 4099 * outlined below. 4100 * 4101 * Traditional vnode lookup conceptually looks like this: 4102 * 4103 * vn_lock(current); 4104 * for (;;) { 4105 * next = find(); 4106 * vn_lock(next); 4107 * vn_unlock(current); 4108 * current = next; 4109 * if (last) 4110 * break; 4111 * } 4112 * return (current); 4113 * 4114 * Each jump to the next vnode is safe memory-wise and atomic with respect to 4115 * any modifications thanks to holding respective locks. 4116 * 4117 * The same guarantee can be provided with a combination of safe memory 4118 * reclamation and sequence counters instead. If all operations which affect 4119 * the relationship between the current vnode and the one we are looking for 4120 * also modify the counter, we can verify whether all the conditions held as 4121 * we made the jump. This includes things like permissions, mount points etc. 4122 * Counter modification is provided by enclosing relevant places in 4123 * vn_seqc_write_begin()/end() calls. 
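 * (cache_enter_time() and cache_enter_dotdot_prep() above are examples of
 * such writers.)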
 *
 * Thus this translates to:
 *
 *	vfs_smr_enter();
 *	dvp_seqc = seqc_read_any(dvp);
 *	if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 *		abort();
 *	for (;;) {
 *		tvp = find();
 *		tvp_seqc = seqc_read_any(tvp);
 *		if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 *			abort();
 *		if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 *			abort();
 *		dvp = tvp; // we know nothing of importance has changed
 *		dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 *		if (last)
 *			break;
 *	}
 *	vget(); // secure the vnode
 *	if (!seqc_consistent(tvp, tvp_seqc)) // final check
 *		abort();
 *	// at this point we know nothing has changed for any parent<->child pair
 *	// as they were crossed during the lookup, meaning we matched the guarantee
 *	// of the locked variant
 *	return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection, which they must never exit
 * - EAGAIN can be returned to denote that the check could not be performed;
 *   it is always valid to return it
 * - if the sequence counter has not changed, the result must be valid
 * - if the sequence counter has changed, both false positives and false
 *   negatives are permitted (since the result will be rejected later)
 * - for simple cases of Unix permission checks, vaccess_vexec_smr() can be used
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning that, absent other
 *   means, it should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 *
 * An illustrative sketch of such a routine can be found after cache_fplookup()
 * below.
 */
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	struct nameidata_saved orig;
	int error;

	MPASS(ndp->ni_lcf == 0);

	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.ndp = ndp;
	fpl.cnp = &ndp->ni_cnd;
	MPASS(curthread == fpl.cnp->cn_thread);

	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
		MPASS(fpl.cnp->cn_nameiop != LOOKUP);

	if (!cache_can_fplookup(&fpl)) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		*status = fpl.status;
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint(&fpl, &orig);

	cache_fpl_smr_enter_initial(&fpl);
	pwd = pwd_get_smr();
	fpl.pwd = pwd;
	ndp->ni_rootdir = pwd->pwd_rdir;
	ndp->ni_topdir = pwd->pwd_jdir;

	cnp = fpl.cnp;
	cnp->cn_nameptr = cnp->cn_pnbuf;
	if (cnp->cn_pnbuf[0] == '/') {
		cache_fpl_handle_root(ndp, &dvp);
	} else {
		MPASS(ndp->ni_dirfd == AT_FDCWD);
		dvp = pwd->pwd_cdir;
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);

	error = cache_fplookup_impl(dvp, &fpl);
	cache_fpl_smr_assert_not_entered(&fpl);
	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);

	*status = fpl.status;
	switch (fpl.status) {
	case CACHE_FPL_STATUS_UNSET:
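		/*
		 * cache_fplookup_impl() always settles on a status before
		 * returning, making this case unreachable.
		 */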
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_HANDLED:
		SDT_PROBE3(vfs, namei, lookup, return, error,
		    (error == 0 ? ndp->ni_vp : NULL), true);
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		*pwdp = fpl.pwd;
		/*
		 * Status restored by cache_fplookup_partial_setup.
		 */
		break;
	case CACHE_FPL_STATUS_ABORTED:
		cache_fpl_restore(&fpl, &orig);
		break;
	}
	return (error);
}
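
/*
 * Illustrative sketch of a VOP_FPLOOKUP_VEXEC routine meeting the contract
 * described above cache_fplookup().  It is not taken from any particular
 * filesystem: "struct myfs_node" and its n_mode/n_uid/n_gid fields are
 * hypothetical stand-ins for a filesystem's in-memory inode, and only
 * vaccess_vexec_smr(), atomic_load_ptr() and the rules spelled out above are
 * relied upon.  A filesystem opting in would additionally set MNTK_FPLOOKUP on
 * the mount point and fence relevant modifications with
 * vn_seqc_write_begin()/end().
 *
 *	static int
 *	myfs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct vnode *vp;
 *		struct myfs_node *np;
 *
 *		vp = v->a_vp;
 *
 *		// Called within vfs_smr protection; the vnode is neither
 *		// locked nor referenced, so ->v_data may be ripped out from
 *		// under us.
 *		np = atomic_load_ptr(&vp->v_data);
 *		if (__predict_false(np == NULL))
 *			return (EAGAIN);
 *
 *		// Plain unix permission check which is safe to perform in
 *		// this context.
 *		return (vaccess_vexec_smr(np->n_mode, np->n_uid, np->n_gid,
 *		    v->a_cred));
 *	}
 */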