/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
		struct negstate nu_neg;	/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name and dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct timespec	nc_time;	/* timespec provided by fs */
	struct timespec	nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned.  Since other platforms
 * may be in the same spot, suffer a little bit and enforce the
 * alignment for everyone.  Note this is a nop for 64-bit platforms.
 */
#define	CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
#define	CACHE_PATH_CUTOFF	39

#define	CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define	CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define	CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
#define	CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

#define	nc_vp	n_un.nu_vp
#define	nc_neg	n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define	NEG_HOT		0x01

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
static bool
cache_ncp_canuse(struct namecache *ncp)
{

	atomic_thread_fence_acq();
	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
}

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	rwlock	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks.  The locking
 * order is lower address first.  Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be write-locked to prevent other threads from seeing the entry.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the same state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case, this could
 * deadlock.  As such, this is resolved by trylocking and, if that fails,
 * unlocking the first node, locking everything in order and revalidating the
 * state.
 */

VFS_SMR_DECLARE;

/*
 * Structures associated with name caching.
 */
#define	NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncpurgeminvnodes;
SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
    "Number of vnodes below which purgevfs ignores the request");
static u_int __read_mostly	ncsize;	/* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static struct mtx __exclusive_cache_line	ncneg_shrink_lock;

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;
static struct neglist ncneg_hot;
static u_long numhotneg;

#define	ncneghash	3
#define	numneglists	(ncneghash + 1)
static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

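/*
 * Bucket locks: one rwlock covers a set of hash chains; their number is
 * computed at boot in nchinit() and indexed with the same mask as the hash
 * table.
 */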
#define	numbucketlocks	(ncbuckethash + 1)
static u_int __read_mostly	ncbuckethash;
static struct rwlock_padalign __read_mostly	*bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks	(ncvnodehash + 1)
static u_int __read_mostly	ncvnodehash;
static struct mtx __read_mostly	*vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

static struct namecache *
cache_alloc(int len, int ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	if (ncp == NULL)
		return;
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL && ticksp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	if (tsp != NULL)
		*tsp = ncp_ts->nc_time;
	if (ticksp != NULL)
		*ticksp = ncp_ts->nc_ticks;
}

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");
#define	STATNODE_ULONG(name, descr)					\
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define	STATNODE_COUNTER(name, descr)					\
	static COUNTER_U64_DEFINE_EARLY(name);				\
	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
	    descr);
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
STATNODE_COUNTER(nummiss, "Number of cache misses");
STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
/* These count for vn_getcwd(), too. */
STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
    "Number of successful removals after relocking");
static long zap_and_exit_bucket_fail;
STATNODE_ULONG(zap_and_exit_bucket_fail,
    "Number of times zap_and_exit failed to lock");
static long zap_and_exit_bucket_fail2;
STATNODE_ULONG(zap_and_exit_bucket_fail2,
    "Number of times zap_and_exit failed to lock");
static long cache_lock_vnodes_cel_3_failures;
STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");
STATNODE_ULONG(numhotneg, "Number of hot negative entries");
STATNODE_COUNTER(numneg_evicted,
    "Number of negative entries evicted when adding a new entry");
STATNODE_COUNTER(shrinking_skipped,
    "Number of times shrinking was already in progress");

static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
    char **freebuf, size_t *buflen);
static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, size_t *buflen);
static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

static int cache_yield;
SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
    "Number of times cache called yield");

static void __noinline
cache_maybe_yield(void)
{

	if (should_yield()) {
		cache_yield++;
		kern_yield(PRI_USER);
	}
}

static inline void
cache_assert_vlp_locked(struct mtx *vlp)
{

	if (vlp != NULL)
		mtx_assert(vlp, MA_OWNED);
}

static inline void
cache_assert_vnode_locked(struct vnode *vp)
{
	struct mtx *vlp;

	vlp = VP2VNODELOCK(vp);
	cache_assert_vlp_locked(vlp);
}

/*
 * TODO: With the value stored we can do better than computing the hash based
 * on the address.  The choice of FNV should also be revisited.
 */
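/*
 * Hashing: each vnode gets a seed (v_nchash) derived from its address at
 * initialization time; per-entry hashes mix the name into the parent
 * directory's seed.
 */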
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}

static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct rwlock *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp, int mode)
{
	struct rwlock *blp;

	blp = NCP2BUCKETLOCK(ncp);
	rw_assert(blp, mode);
}
#else
#define	cache_assert_bucket_locked(x, y) do { } while (0)
#endif

#define	cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wlock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wunlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
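/*
 * debug.hashstat.rawnchash dumps the length of every hash chain, while
 * debug.hashstat.nchash reports the bucket count, the number of used buckets,
 * the longest chain and a usage percentage.
 */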
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * A variation of the LRU scheme is used.  New entries are hashed into one of
 * numneglists cold lists.  Entries get promoted to the hot list on first hit.
 *
 * The shrinker will demote the hot list head and evict from the cold list in a
 * round-robin manner.
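 *
 * Promotion happens in cache_negative_hit(); demotion of the hot list head
 * and eviction from a cold list happen in cache_negative_zap_one().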
 */
static void
cache_negative_init(struct namecache *ncp)
{
	struct negstate *negstate;

	ncp->nc_flag |= NCF_NEGATIVE;
	negstate = NCP2NEGSTATE(ncp);
	negstate->neg_flag = 0;
}

static void
cache_negative_hit(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;

	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0)
		return;
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&ncneg_hot.nl_lock);
	mtx_lock(&neglist->nl_lock);
	if ((negstate->neg_flag & NEG_HOT) == 0) {
		numhotneg++;
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
		negstate->neg_flag |= NEG_HOT;
	}
	mtx_unlock(&neglist->nl_lock);
	mtx_unlock(&ncneg_hot.nl_lock);
}

static void
cache_negative_insert(struct namecache *ncp)
{
	struct neglist *neglist;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&neglist->nl_lock);
	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
	mtx_unlock(&neglist->nl_lock);
	atomic_add_rel_long(&numneg, 1);
}

static void
cache_negative_remove(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;
	bool hot_locked = false;
	bool list_locked = false;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	neglist = NCP2NEGLIST(ncp);
	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		hot_locked = true;
		mtx_lock(&ncneg_hot.nl_lock);
		if ((negstate->neg_flag & NEG_HOT) == 0) {
			list_locked = true;
			mtx_lock(&neglist->nl_lock);
		}
	} else {
		list_locked = true;
		mtx_lock(&neglist->nl_lock);
		/*
		 * We may be racing against promotion in lockless lookup.
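		 * If the entry turned hot after the check above, drop the
		 * cold list lock and retake it together with the hot list
		 * lock.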
		 */
		if ((negstate->neg_flag & NEG_HOT) != 0) {
			mtx_unlock(&neglist->nl_lock);
			hot_locked = true;
			mtx_lock(&ncneg_hot.nl_lock);
			mtx_lock(&neglist->nl_lock);
		}
	}
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		numhotneg--;
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
	}
	if (list_locked)
		mtx_unlock(&neglist->nl_lock);
	if (hot_locked)
		mtx_unlock(&ncneg_hot.nl_lock);
	atomic_subtract_rel_long(&numneg, 1);
}

static void
cache_negative_shrink_select(struct namecache **ncpp,
    struct neglist **neglistpp)
{
	struct neglist *neglist;
	struct namecache *ncp;
	static u_int cycle;
	u_int i;

	*ncpp = ncp = NULL;

	for (i = 0; i < numneglists; i++) {
		neglist = &neglists[(cycle + i) % numneglists];
		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
			continue;
		mtx_lock(&neglist->nl_lock);
		ncp = TAILQ_FIRST(&neglist->nl_list);
		if (ncp != NULL)
			break;
		mtx_unlock(&neglist->nl_lock);
	}

	*neglistpp = neglist;
	*ncpp = ncp;
	cycle++;
}

static void
cache_negative_zap_one(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *neglist;
	struct negstate *negstate;
	struct mtx *dvlp;
	struct rwlock *blp;

	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
	    !mtx_trylock(&ncneg_shrink_lock)) {
		counter_u64_add(shrinking_skipped, 1);
		return;
	}

	mtx_lock(&ncneg_hot.nl_lock);
	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
	if (ncp != NULL) {
		neglist = NCP2NEGLIST(ncp);
		negstate = NCP2NEGSTATE(ncp);
		mtx_lock(&neglist->nl_lock);
		MPASS((negstate->neg_flag & NEG_HOT) != 0);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
		negstate->neg_flag &= ~NEG_HOT;
		numhotneg--;
		mtx_unlock(&neglist->nl_lock);
	}
	mtx_unlock(&ncneg_hot.nl_lock);

	cache_negative_shrink_select(&ncp, &neglist);

	mtx_unlock(&ncneg_shrink_lock);
	if (ncp == NULL)
		return;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	blp = NCP2BUCKETLOCK(ncp);
	mtx_unlock(&neglist->nl_lock);
	mtx_lock(dvlp);
	rw_wlock(blp);
	/*
	 * Enter SMR to safely check the negative list.
	 * Even if the found pointer matches, the entry may now be reallocated
	 * and used by a different vnode.
	 */
	vfs_smr_enter();
	ncp2 = TAILQ_FIRST(&neglist->nl_list);
	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
	    blp != NCP2BUCKETLOCK(ncp2)) {
		vfs_smr_exit();
		ncp = NULL;
	} else {
		vfs_smr_exit();
		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(numneg_evicted, 1);
	}
	rw_wunlock(blp);
	mtx_unlock(dvlp);
	cache_free(ncp);
}

/*
 * cache_zap_locked():
 *
 * Removes a namecache entry from cache, whether it contains an actual
 * pointer to a vnode or if it is just a negative cache entry.
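 *
 * Both the relevant bucket lock (write-locked) and the vnode lock(s) of the
 * involved vnode(s) must be held; see the assertions below.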
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_vp);
			ncp->nc_vp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_vp);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_negative_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_dvp);
			ncp->nc_dvp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_dvp);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			counter_u64_add(numcachehv, -1);
		}
	}
	atomic_subtract_rel_long(&numcache, 1);
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct rwlock *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	rw_wlock(blp);
	cache_zap_locked(ncp);
	rw_wunlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	rw_wlock(blp);
	cache_zap_locked(ncp);
	rw_wunlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

static int __noinline
cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;
	int error = 0;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	pvlp = VP2VNODELOCK(vp);
	if (ncp->nc_flag & NCF_NEGATIVE) {
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		goto out;
	}

	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);
	cache_sort_vnodes(&vlp1, &vlp2);
	if (vlp1 == pvlp) {
		mtx_lock(vlp2);
		to_unlock = vlp2;
	} else {
		if (!mtx_trylock(vlp1)) {
			error = EAGAIN;
			goto out;
		}
		to_unlock = vlp1;
	}
	rw_wlock(blp);
	cache_zap_locked(ncp);
	rw_wunlock(blp);
	mtx_unlock(to_unlock);
out:
	mtx_unlock(pvlp);
	return (error);
}

/*
 * If trylocking failed we can get here.  We know enough to take all needed
 * locks in the right order and re-lookup the entry.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct rwlock *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_locked(ncp, RA_UNLOCKED);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	rw_wlock(blp);
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
		return (0);
	}

	rw_wunlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct rwlock *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	rw_wunlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static int __noinline
cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct rwlock *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp, RA_RLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		rw_runlock(blp);
		rw_wlock(blp);
		cache_zap_locked(ncp);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	rw_runlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static int
cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
    struct mtx **vlpp1, struct mtx **vlpp2)
{
	struct mtx *dvlp, *vlp;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	cache_sort_vnodes(&dvlp, &vlp);

	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
		cache_zap_locked(ncp);
		cache_unlock_vnodes(dvlp, vlp);
		*vlpp1 = NULL;
		*vlpp2 = NULL;
		return (0);
	}

	if (*vlpp1 != NULL)
		mtx_unlock(*vlpp1);
	if (*vlpp2 != NULL)
		mtx_unlock(*vlpp2);
	*vlpp1 = NULL;
	*vlpp2 = NULL;

	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	rw_wunlock(blp);
	*vlpp1 = dvlp;
	*vlpp2 = vlp;
	if (*vlpp1 != NULL)
		mtx_lock(*vlpp1);
	mtx_lock(*vlpp2);
	rw_wlock(blp);
	return (EAGAIN);
}

static void
cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
{

	if (blp != NULL) {
		rw_runlock(blp);
	} else {
		mtx_unlock(vlp);
	}
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
	    dvp, cnp->cn_nameptr);
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static __noinline int
cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp, struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct rwlock *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

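	/*
	 * ".." is tracked via dvp->v_cache_dd rather than found through the
	 * hash table, so handle its removal under the vnode lock.
	 */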
	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		counter_u64_add(dotdothits, 1);
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
			    "..", NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_dvp != dvp)
				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
			if (!cache_zap_locked_vnode_kl2(ncp,
			    dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			vn_seqc_write_begin(dvp);
			dvp->v_cache_dd = NULL;
			vn_seqc_write_end(dvp);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	rw_wlock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		rw_wunlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail++;
		cache_maybe_yield();
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	cache_free(ncp);
	return (0);
out_no_entry:
	SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
	counter_u64_add(nummisszap, 1);
	return (0);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it will
 *		be clear for "." entries.
 * - ticksp:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  vpp will not be modified.  If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
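 *
 * # Example
 *
 * An illustrative sketch (not taken verbatim from any filesystem) of how a
 * VOP_LOOKUP implementation might consult the cache before doing a real
 * directory scan:
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	if (error == -1)	// positive hit; *vpp is locked and ref'd
 *		return (0);
 *	if (error == ENOENT)	// negative hit (or dvp was doomed)
 *		return (ENOENT);
 *	// error == 0: cache miss; scan the directory and cache_enter()
 *	// the result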
 */
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct negstate *negstate;
	struct rwlock *blp;
	struct mtx *dvlp;
	uint32_t hash;
	enum vgetstate vs;
	int error, ltype;
	bool try_smr, doing_smr, whiteout;

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));

	if ((cnp->cn_flags & MAKEENTRY) == 0)
		return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));

	try_smr = true;
	if (cnp->cn_nameiop == CREATE)
		try_smr = false;
retry:
	doing_smr = false;
	blp = NULL;
	dvlp = NULL;
	error = 0;
	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		counter_u64_add(dotdothits, 1);
		dvlp = VP2VNODELOCK(dvp);
		mtx_lock(dvlp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
			    "..", NULL);
			mtx_unlock(dvlp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_flag & NCF_NEGATIVE)
				*vpp = NULL;
			else
				*vpp = ncp->nc_vp;
		} else
			*vpp = ncp->nc_dvp;
		/* Return failure if negative entry was found. */
		if (*vpp == NULL)
			goto negative_success;
		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
		    dvp, cnp->cn_nameptr, *vpp);
		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
		    *vpp);
		cache_out_ts(ncp, tsp, ticksp);
		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
		    NCF_DTS && tsp != NULL) {
			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
			*tsp = ncp_ts->nc_dotdottime;
		}
		goto success;
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
retry_hashed:
	if (try_smr) {
		vfs_smr_enter();
		doing_smr = true;
		try_smr = false;
	} else {
		blp = HASH2BUCKETLOCK(hash);
		rw_rlock(blp);
	}

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (__predict_false(ncp == NULL)) {
		if (doing_smr)
			vfs_smr_exit();
		else
			rw_runlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	/* We found a "positive" match, return the vnode */
	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
	    dvp, cnp->cn_nameptr, *vpp, ncp);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
	    *vpp);
	cache_out_ts(ncp, tsp, ticksp);
success:
	/*
	 * On success we return a locked and ref'd vnode as per the lookup
	 * protocol.
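	 *
	 * In the SMR case the entry must still pass cache_ncp_canuse() and
	 * the vnode has to be acquired with vget_prep_smr() before the SMR
	 * section is left.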
	 */
	MPASS(dvp != *vpp);
	ltype = 0;	/* silence gcc warning */
	if (cnp->cn_flags & ISDOTDOT) {
		ltype = VOP_ISLOCKED(dvp);
		VOP_UNLOCK(dvp);
	}
	if (doing_smr) {
		if (!cache_ncp_canuse(ncp)) {
			vfs_smr_exit();
			*vpp = NULL;
			goto retry;
		}
		vs = vget_prep_smr(*vpp);
		vfs_smr_exit();
		if (__predict_false(vs == VGET_NONE)) {
			*vpp = NULL;
			goto retry;
		}
	} else {
		vs = vget_prep(*vpp);
		cache_lookup_unlock(blp, dvlp);
	}
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (cnp->cn_flags & ISDOTDOT) {
		vn_lock(dvp, ltype | LK_RETRY);
		if (VN_IS_DOOMED(dvp)) {
			if (error == 0)
				vput(*vpp);
			*vpp = NULL;
			return (ENOENT);
		}
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
	}
	return (-1);

negative_success:
	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		MPASS(!doing_smr);
		counter_u64_add(numnegzaps, 1);
		goto zap_and_exit;
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);

	if (doing_smr) {
		/*
		 * We need to take locks to promote an entry.
		 */
		negstate = NCP2NEGSTATE(ncp);
		if ((negstate->neg_flag & NEG_HOT) == 0 ||
		    !cache_ncp_canuse(ncp)) {
			vfs_smr_exit();
			doing_smr = false;
			goto retry_hashed;
		}
		vfs_smr_exit();
	} else {
		cache_negative_hit(ncp);
		cache_lookup_unlock(blp, dvlp);
	}
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);

zap_and_exit:
	MPASS(!doing_smr);
	if (blp != NULL)
		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
	else
		error = cache_zap_locked_vnode(ncp, dvp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail2++;
		cache_maybe_yield();
		goto retry;
	}
	cache_free(ncp);
	return (0);
}

struct celockstate {
	struct mtx *vlp[3];
	struct rwlock *blp[2];
};
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));

static inline void
cache_celockstate_init(struct celockstate *cel)
{

	bzero(cel, sizeof(*cel));
}

static void
cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
    struct vnode *dvp)
{
	struct mtx *vlp1, *vlp2;

	MPASS(cel->vlp[0] == NULL);
	MPASS(cel->vlp[1] == NULL);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL || dvp != NULL);

	vlp1 = VP2VNODELOCK(vp);
	vlp2 = VP2VNODELOCK(dvp);
	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		mtx_lock(vlp1);
		cel->vlp[0] = vlp1;
	}
	mtx_lock(vlp2);
	cel->vlp[1] = vlp2;
}

static void
cache_unlock_vnodes_cel(struct celockstate *cel)
{

	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);

	if (cel->vlp[0] != NULL)
		mtx_unlock(cel->vlp[0]);
	if (cel->vlp[1] != NULL)
		mtx_unlock(cel->vlp[1]);
	if (cel->vlp[2] != NULL)
		mtx_unlock(cel->vlp[2]);
}

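/*
 * Lock the vnode lock of the third vnode involved in an insertion (see
 * cache_enter_lock()), keeping the smaller-address-first order.  Returns
 * false if the locks already held had to be dropped and re-acquired, in
 * which case the caller must revalidate its state.
 */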
static bool
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
{
	struct mtx *vlp;
	bool ret;

	cache_assert_vlp_locked(cel->vlp[0]);
	cache_assert_vlp_locked(cel->vlp[1]);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL);
	vlp = VP2VNODELOCK(vp);

	ret = true;
	if (vlp >= cel->vlp[1]) {
		mtx_lock(vlp);
	} else {
		if (mtx_trylock(vlp))
			goto out;
		cache_lock_vnodes_cel_3_failures++;
		cache_unlock_vnodes_cel(cel);
		if (vlp < cel->vlp[0]) {
			mtx_lock(vlp);
			mtx_lock(cel->vlp[0]);
			mtx_lock(cel->vlp[1]);
		} else {
			if (cel->vlp[0] != NULL)
				mtx_lock(cel->vlp[0]);
			mtx_lock(vlp);
			mtx_lock(cel->vlp[1]);
		}
		ret = false;
	}
out:
	cel->vlp[2] = vlp;
	return (ret);
}

static void
cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
    struct rwlock *blp2)
{

	MPASS(cel->blp[0] == NULL);
	MPASS(cel->blp[1] == NULL);

	cache_sort_vnodes(&blp1, &blp2);

	if (blp1 != NULL) {
		rw_wlock(blp1);
		cel->blp[0] = blp1;
	}
	rw_wlock(blp2);
	cel->blp[1] = blp2;
}

static void
cache_unlock_buckets_cel(struct celockstate *cel)
{

	if (cel->blp[0] != NULL)
		rw_wunlock(cel->blp[0]);
	rw_wunlock(cel->blp[1]);
}

/*
 * Lock part of the cache affected by the insertion.
 *
 * This means vnodelocks for dvp, vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry.  In this
 * case we have an additional vnode and bucketlock pair to lock.  If the
 * entry is negative, ncelock is locked instead of the vnode.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 * preserving the locking order (smaller address first).
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct rwlock *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		if (vp == NULL || vp->v_type != VDIR)
			break;
		ncp = vp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == vp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		/*
		 * All vnodes got re-locked.  Re-validate the state and if
		 * nothing changed we are done.  Otherwise restart.
		 */
		if (ncp == vp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct rwlock *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == dvp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		if (ncp == dvp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}

static void __noinline
cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp)
{
	struct celockstate cel;
	struct namecache *ncp;
	uint32_t hash;
	int len;

	if (dvp->v_cache_dd == NULL)
		return;
	len = cnp->cn_namelen;
	cache_celockstate_init(&cel);
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	cache_enter_lock_dd(&cel, dvp, vp, hash);
	vn_seqc_write_begin(dvp);
	ncp = dvp->v_cache_dd;
	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
		cache_zap_locked(ncp);
	} else {
		ncp = NULL;
	}
	dvp->v_cache_dd = NULL;
	vn_seqc_write_end(dvp);
	cache_enter_unlock(&cel);
	cache_free(ncp);
}

/*
 * Add an entry to the cache.
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
	struct celockstate cel;
	struct namecache *ncp, *n2, *ndd;
	struct namecache_ts *ncp_ts, *n2_ts;
	struct nchashhead *ncpp;
	uint32_t hash;
	int flag;
	int len;
	u_long lnumcache;

	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
	VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp,
	    ("cache_enter: Adding a doomed vnode"));
	VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp,
	    ("cache_enter: Doomed vnode used as src"));

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache))
		return;
#endif

	flag = 0;
	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return;
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			cache_enter_dotdot_prep(dvp, vp, cnp);
			flag = NCF_ISDOTDOT;
		}
	}

	/*
	 * Avoid blowout in namecache entries.
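	 *
	 * The limit is ncsize (derived from desiredvnodes and ncsizefactor);
	 * once it is reached the new entry is simply dropped and numdrops is
	 * bumped.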
	 */
	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
	if (__predict_false(lnumcache >= ncsize)) {
		atomic_add_long(&numcache, -1);
		counter_u64_add(numdrops, 1);
		return;
	}

	cache_celockstate_init(&cel);
	ndd = NULL;
	ncp_ts = NULL;

	/*
	 * Calculate the hash key and setup as much of the new
	 * namecache entry as possible before acquiring the lock.
	 */
	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
	ncp->nc_flag = flag | NCF_WIP;
	ncp->nc_vp = vp;
	if (vp == NULL)
		cache_negative_init(ncp);
	ncp->nc_dvp = dvp;
	if (tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		ncp_ts->nc_time = *tsp;
		ncp_ts->nc_ticks = ticks;
		ncp_ts->nc_nc.nc_flag |= NCF_TS;
		if (dtsp != NULL) {
			ncp_ts->nc_dotdottime = *dtsp;
			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
		}
	}
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	memcpy(ncp->nc_name, cnp->cn_nameptr, len);
	ncp->nc_name[len] = '\0';
	cache_enter_lock(&cel, dvp, vp, hash);

	/*
	 * See if this vnode or negative entry is already in the cache
	 * with this name.  This can happen with concurrent lookups of
	 * the same path name.
	 */
	ncpp = NCHHASH(hash);
	CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
		if (n2->nc_dvp == dvp &&
		    n2->nc_nlen == cnp->cn_namelen &&
		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
			MPASS(cache_ncp_canuse(n2));
			if ((n2->nc_flag & NCF_NEGATIVE) != 0)
				KASSERT(vp == NULL,
				    ("%s: found entry pointing to a different vnode (%p != %p)",
				    __func__, NULL, vp));
			else
				KASSERT(n2->nc_vp == vp,
				    ("%s: found entry pointing to a different vnode (%p != %p)",
				    __func__, n2->nc_vp, vp));
			if (tsp != NULL) {
				KASSERT((n2->nc_flag & NCF_TS) != 0,
				    ("no NCF_TS"));
				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
				n2_ts->nc_time = ncp_ts->nc_time;
				n2_ts->nc_ticks = ncp_ts->nc_ticks;
				if (dtsp != NULL) {
					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
					n2_ts->nc_nc.nc_flag |= NCF_DTS;
				}
			}
			goto out_unlock_free;
		}
	}

	if (flag == NCF_ISDOTDOT) {
		/*
		 * See if we are trying to add .. entry, but some other lookup
		 * has populated v_cache_dd pointer already.
		 */
		if (dvp->v_cache_dd != NULL)
			goto out_unlock_free;
		KASSERT(vp == NULL || vp->v_type == VDIR,
		    ("wrong vnode type %p", vp));
		vn_seqc_write_begin(dvp);
		dvp->v_cache_dd = ncp;
		vn_seqc_write_end(dvp);
	}

	if (vp != NULL) {
		if (vp->v_type == VDIR) {
			if (flag != NCF_ISDOTDOT) {
				/*
				 * For this case, the cache entry maps both the
				 * directory name in it and the name ".." for the
				 * directory's parent.
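				 * Any previously cached ".." entry for this
				 * directory is zapped and replaced below.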
2005 */ 2006 vn_seqc_write_begin(vp); 2007 if ((ndd = vp->v_cache_dd) != NULL) { 2008 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2009 cache_zap_locked(ndd); 2010 else 2011 ndd = NULL; 2012 } 2013 vp->v_cache_dd = ncp; 2014 vn_seqc_write_end(vp); 2015 } 2016 } else { 2017 if (vp->v_cache_dd != NULL) { 2018 vn_seqc_write_begin(vp); 2019 vp->v_cache_dd = NULL; 2020 vn_seqc_write_end(vp); 2021 } 2022 } 2023 } 2024 2025 if (flag != NCF_ISDOTDOT) { 2026 if (LIST_EMPTY(&dvp->v_cache_src)) { 2027 vhold(dvp); 2028 counter_u64_add(numcachehv, 1); 2029 } 2030 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2031 } 2032 2033 /* 2034 * If the entry is "negative", we place it into the 2035 * "negative" cache queue, otherwise, we place it into the 2036 * destination vnode's cache entries queue. 2037 */ 2038 if (vp != NULL) { 2039 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2040 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2041 vp); 2042 } else { 2043 if (cnp->cn_flags & ISWHITEOUT) 2044 ncp->nc_flag |= NCF_WHITE; 2045 cache_negative_insert(ncp); 2046 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2047 ncp->nc_name); 2048 } 2049 2050 /* 2051 * Insert the new namecache entry into the appropriate chain 2052 * within the cache entries table. 2053 */ 2054 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2055 2056 atomic_thread_fence_rel(); 2057 /* 2058 * Mark the entry as fully constructed. 2059 * It is immutable past this point until its removal. 2060 */ 2061 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2062 2063 cache_enter_unlock(&cel); 2064 if (numneg * ncnegfactor > lnumcache) 2065 cache_negative_zap_one(); 2066 cache_free(ndd); 2067 return; 2068 out_unlock_free: 2069 cache_enter_unlock(&cel); 2070 atomic_add_long(&numcache, -1); 2071 cache_free(ncp); 2072 return; 2073 } 2074 2075 static u_int 2076 cache_roundup_2(u_int val) 2077 { 2078 u_int res; 2079 2080 for (res = 1; res <= val; res <<= 1) 2081 continue; 2082 2083 return (res); 2084 } 2085 2086 static struct nchashhead * 2087 nchinittbl(u_long elements, u_long *hashmask) 2088 { 2089 struct nchashhead *hashtbl; 2090 u_long hashsize, i; 2091 2092 hashsize = cache_roundup_2(elements) / 2; 2093 2094 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2095 for (i = 0; i < hashsize; i++) 2096 CK_SLIST_INIT(&hashtbl[i]); 2097 *hashmask = hashsize - 1; 2098 return (hashtbl); 2099 } 2100 2101 static void 2102 ncfreetbl(struct nchashhead *hashtbl) 2103 { 2104 2105 free(hashtbl, M_VFSCACHE); 2106 } 2107 2108 /* 2109 * Name cache initialization, from vfs_init() when we are booting 2110 */ 2111 static void 2112 nchinit(void *dummy __unused) 2113 { 2114 u_int i; 2115 2116 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2117 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2118 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2119 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2120 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2121 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2122 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2123 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2124 2125 VFS_SMR_ZONE_SET(cache_zone_small); 2126 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2127 VFS_SMR_ZONE_SET(cache_zone_large); 2128 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2129 2130 ncsize = desiredvnodes * ncsizefactor; 2131 nchashtbl = nchinittbl(desiredvnodes * 2, 
&nchash); 2132 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2133 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2134 ncbuckethash = 7; 2135 if (ncbuckethash > nchash) 2136 ncbuckethash = nchash; 2137 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2138 M_WAITOK | M_ZERO); 2139 for (i = 0; i < numbucketlocks; i++) 2140 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 2141 ncvnodehash = ncbuckethash; 2142 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2143 M_WAITOK | M_ZERO); 2144 for (i = 0; i < numvnodelocks; i++) 2145 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2146 ncpurgeminvnodes = numbucketlocks * 2; 2147 2148 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 2149 M_WAITOK | M_ZERO); 2150 for (i = 0; i < numneglists; i++) { 2151 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2152 TAILQ_INIT(&neglists[i].nl_list); 2153 } 2154 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 2155 TAILQ_INIT(&ncneg_hot.nl_list); 2156 2157 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2158 } 2159 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2160 2161 void 2162 cache_vnode_init(struct vnode *vp) 2163 { 2164 2165 LIST_INIT(&vp->v_cache_src); 2166 TAILQ_INIT(&vp->v_cache_dst); 2167 vp->v_cache_dd = NULL; 2168 cache_prehash(vp); 2169 } 2170 2171 void 2172 cache_changesize(u_long newmaxvnodes) 2173 { 2174 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2175 u_long new_nchash, old_nchash; 2176 struct namecache *ncp; 2177 uint32_t hash; 2178 u_long newncsize; 2179 int i; 2180 2181 newncsize = newmaxvnodes * ncsizefactor; 2182 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2183 if (newmaxvnodes < numbucketlocks) 2184 newmaxvnodes = numbucketlocks; 2185 2186 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2187 /* If same hash table size, nothing to do */ 2188 if (nchash == new_nchash) { 2189 ncfreetbl(new_nchashtbl); 2190 return; 2191 } 2192 /* 2193 * Move everything from the old hash table to the new table. 2194 * None of the namecache entries in the table can be removed 2195 * because to do so, they have to be removed from the hash table. 2196 */ 2197 cache_lock_all_vnodes(); 2198 cache_lock_all_buckets(); 2199 old_nchashtbl = nchashtbl; 2200 old_nchash = nchash; 2201 nchashtbl = new_nchashtbl; 2202 nchash = new_nchash; 2203 for (i = 0; i <= old_nchash; i++) { 2204 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2205 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2206 ncp->nc_dvp); 2207 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2208 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2209 } 2210 } 2211 ncsize = newncsize; 2212 cache_unlock_all_buckets(); 2213 cache_unlock_all_vnodes(); 2214 ncfreetbl(old_nchashtbl); 2215 } 2216 2217 /* 2218 * Invalidate all entries from and to a particular vnode. 
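 * Entries are unhooked under the vnode list lock(s) and batched on the
 * local ncps list; the cache_free() calls are performed only after all
 * of the locks have been dropped.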
2219 */ 2220 static void 2221 cache_purge_impl(struct vnode *vp) 2222 { 2223 TAILQ_HEAD(, namecache) ncps; 2224 struct namecache *ncp, *nnp; 2225 struct mtx *vlp, *vlp2; 2226 2227 TAILQ_INIT(&ncps); 2228 vlp = VP2VNODELOCK(vp); 2229 vlp2 = NULL; 2230 mtx_assert(vlp, MA_OWNED); 2231 retry: 2232 while (!LIST_EMPTY(&vp->v_cache_src)) { 2233 ncp = LIST_FIRST(&vp->v_cache_src); 2234 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2235 goto retry; 2236 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2237 } 2238 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2239 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2240 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2241 goto retry; 2242 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2243 } 2244 ncp = vp->v_cache_dd; 2245 if (ncp != NULL) { 2246 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2247 ("lost dotdot link")); 2248 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2249 goto retry; 2250 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2251 } 2252 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2253 mtx_unlock(vlp); 2254 if (vlp2 != NULL) 2255 mtx_unlock(vlp2); 2256 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2257 cache_free(ncp); 2258 } 2259 } 2260 2261 void 2262 cache_purge(struct vnode *vp) 2263 { 2264 struct mtx *vlp; 2265 2266 SDT_PROBE1(vfs, namecache, purge, done, vp); 2267 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2268 vp->v_cache_dd == NULL) 2269 return; 2270 vlp = VP2VNODELOCK(vp); 2271 mtx_lock(vlp); 2272 cache_purge_impl(vp); 2273 } 2274 2275 /* 2276 * Only to be used by vgone. 2277 */ 2278 void 2279 cache_purge_vgone(struct vnode *vp) 2280 { 2281 struct mtx *vlp; 2282 2283 VNPASS(VN_IS_DOOMED(vp), vp); 2284 vlp = VP2VNODELOCK(vp); 2285 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2286 vp->v_cache_dd == NULL)) { 2287 mtx_lock(vlp); 2288 cache_purge_impl(vp); 2289 mtx_assert(vlp, MA_NOTOWNED); 2290 return; 2291 } 2292 2293 /* 2294 * All the NULL pointer state we found above may be transient. 2295 * Serialize against a possible thread doing cache_purge. 2296 */ 2297 mtx_wait_unlocked(vlp); 2298 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2299 vp->v_cache_dd == NULL)) { 2300 mtx_lock(vlp); 2301 cache_purge_impl(vp); 2302 mtx_assert(vlp, MA_NOTOWNED); 2303 return; 2304 } 2305 return; 2306 } 2307 2308 /* 2309 * Invalidate all negative entries for a particular directory vnode. 2310 */ 2311 void 2312 cache_purge_negative(struct vnode *vp) 2313 { 2314 TAILQ_HEAD(, namecache) ncps; 2315 struct namecache *ncp, *nnp; 2316 struct mtx *vlp; 2317 2318 CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); 2319 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2320 if (LIST_EMPTY(&vp->v_cache_src)) 2321 return; 2322 TAILQ_INIT(&ncps); 2323 vlp = VP2VNODELOCK(vp); 2324 mtx_lock(vlp); 2325 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2326 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2327 continue; 2328 cache_zap_negative_locked_vnode_kl(ncp, vp); 2329 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2330 } 2331 mtx_unlock(vlp); 2332 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2333 cache_free(ncp); 2334 } 2335 } 2336 2337 /* 2338 * Flush all entries referencing a particular filesystem. 
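 * Unless force is set, mounts with no more than ncpurgeminvnodes
 * vnodes are skipped: scanning every hash chain is not worth it for
 * them and their entries are reclaimed through regular per-vnode
 * purging instead.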
2339 */ 2340 void 2341 cache_purgevfs(struct mount *mp, bool force) 2342 { 2343 TAILQ_HEAD(, namecache) ncps; 2344 struct mtx *vlp1, *vlp2; 2345 struct rwlock *blp; 2346 struct nchashhead *bucket; 2347 struct namecache *ncp, *nnp; 2348 u_long i, j, n_nchash; 2349 int error; 2350 2351 /* Scan hash tables for applicable entries */ 2352 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2353 if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) 2354 return; 2355 TAILQ_INIT(&ncps); 2356 n_nchash = nchash + 1; 2357 vlp1 = vlp2 = NULL; 2358 for (i = 0; i < numbucketlocks; i++) { 2359 blp = (struct rwlock *)&bucketlocks[i]; 2360 rw_wlock(blp); 2361 for (j = i; j < n_nchash; j += numbucketlocks) { 2362 retry: 2363 bucket = &nchashtbl[j]; 2364 CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { 2365 cache_assert_bucket_locked(ncp, RA_WLOCKED); 2366 if (ncp->nc_dvp->v_mount != mp) 2367 continue; 2368 error = cache_zap_wlocked_bucket_kl(ncp, blp, 2369 &vlp1, &vlp2); 2370 if (error != 0) 2371 goto retry; 2372 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); 2373 } 2374 } 2375 rw_wunlock(blp); 2376 if (vlp1 == NULL && vlp2 == NULL) 2377 cache_maybe_yield(); 2378 } 2379 if (vlp1 != NULL) 2380 mtx_unlock(vlp1); 2381 if (vlp2 != NULL) 2382 mtx_unlock(vlp2); 2383 2384 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2385 cache_free(ncp); 2386 } 2387 } 2388 2389 /* 2390 * Perform canonical checks and cache lookup and pass on to filesystem 2391 * through the vop_cachedlookup only if needed. 2392 */ 2393 2394 int 2395 vfs_cache_lookup(struct vop_lookup_args *ap) 2396 { 2397 struct vnode *dvp; 2398 int error; 2399 struct vnode **vpp = ap->a_vpp; 2400 struct componentname *cnp = ap->a_cnp; 2401 int flags = cnp->cn_flags; 2402 2403 *vpp = NULL; 2404 dvp = ap->a_dvp; 2405 2406 if (dvp->v_type != VDIR) 2407 return (ENOTDIR); 2408 2409 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2410 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2411 return (EROFS); 2412 2413 error = vn_dir_check_exec(dvp, cnp); 2414 if (error != 0) 2415 return (error); 2416 2417 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2418 if (error == 0) 2419 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2420 if (error == -1) 2421 return (0); 2422 return (error); 2423 } 2424 2425 /* Implementation of the getcwd syscall. 
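 * The user-supplied length must be at least 2 and is clamped to
 * MAXPATHLEN.  The path itself is assembled by vn_getcwd() into a
 * pathname buffer allocated from namei_zone and copied out on success.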
*/ 2426 int 2427 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2428 { 2429 char *buf, *retbuf; 2430 size_t buflen; 2431 int error; 2432 2433 buflen = uap->buflen; 2434 if (__predict_false(buflen < 2)) 2435 return (EINVAL); 2436 if (buflen > MAXPATHLEN) 2437 buflen = MAXPATHLEN; 2438 2439 buf = uma_zalloc(namei_zone, M_WAITOK); 2440 error = vn_getcwd(td, buf, &retbuf, &buflen); 2441 if (error == 0) 2442 error = copyout(retbuf, uap->buf, buflen); 2443 uma_zfree(namei_zone, buf); 2444 return (error); 2445 } 2446 2447 int 2448 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen) 2449 { 2450 struct pwd *pwd; 2451 int error; 2452 2453 pwd = pwd_hold(td); 2454 error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen); 2455 pwd_drop(pwd); 2456 2457 #ifdef KTRACE 2458 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2459 ktrnamei(*retbuf); 2460 #endif 2461 return (error); 2462 } 2463 2464 static int 2465 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2466 size_t size, int flags, enum uio_seg pathseg) 2467 { 2468 struct nameidata nd; 2469 char *retbuf, *freebuf; 2470 int error; 2471 2472 if (flags != 0) 2473 return (EINVAL); 2474 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2475 pathseg, path, fd, &cap_fstat_rights, td); 2476 if ((error = namei(&nd)) != 0) 2477 return (error); 2478 error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size); 2479 if (error == 0) { 2480 error = copyout(retbuf, buf, size); 2481 free(freebuf, M_TEMP); 2482 } 2483 NDFREE(&nd, 0); 2484 return (error); 2485 } 2486 2487 int 2488 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2489 { 2490 2491 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2492 uap->flags, UIO_USERSPACE)); 2493 } 2494 2495 /* 2496 * Retrieve the full filesystem path that correspond to a vnode from the name 2497 * cache (if available) 2498 */ 2499 int 2500 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) 2501 { 2502 struct pwd *pwd; 2503 char *buf; 2504 size_t buflen; 2505 int error; 2506 2507 if (__predict_false(vn == NULL)) 2508 return (EINVAL); 2509 2510 buflen = MAXPATHLEN; 2511 buf = malloc(buflen, M_TEMP, M_WAITOK); 2512 pwd = pwd_hold(td); 2513 error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen); 2514 pwd_drop(pwd); 2515 2516 if (!error) 2517 *freebuf = buf; 2518 else 2519 free(buf, M_TEMP); 2520 return (error); 2521 } 2522 2523 /* 2524 * This function is similar to vn_fullpath, but it attempts to lookup the 2525 * pathname relative to the global root mount point. This is required for the 2526 * auditing sub-system, as audited pathnames must be absolute, relative to the 2527 * global root mount point. 
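 *
 * As with vn_fullpath(), on success *retbuf points into a temporary
 * buffer returned via *freebuf which the caller must release, roughly:
 *
 *	if (vn_fullpath_global(td, vp, &retbuf, &freebuf) == 0) {
 *		// ... consume retbuf ...
 *		free(freebuf, M_TEMP);
 *	}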
2528 */ 2529 int 2530 vn_fullpath_global(struct thread *td, struct vnode *vn, 2531 char **retbuf, char **freebuf) 2532 { 2533 char *buf; 2534 size_t buflen; 2535 int error; 2536 2537 if (__predict_false(vn == NULL)) 2538 return (EINVAL); 2539 buflen = MAXPATHLEN; 2540 buf = malloc(buflen, M_TEMP, M_WAITOK); 2541 error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen); 2542 if (!error) 2543 *freebuf = buf; 2544 else 2545 free(buf, M_TEMP); 2546 return (error); 2547 } 2548 2549 int 2550 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2551 { 2552 struct vnode *dvp; 2553 struct namecache *ncp; 2554 struct mtx *vlp; 2555 int error; 2556 2557 vlp = VP2VNODELOCK(*vp); 2558 mtx_lock(vlp); 2559 TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { 2560 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2561 break; 2562 } 2563 if (ncp != NULL) { 2564 if (*buflen < ncp->nc_nlen) { 2565 mtx_unlock(vlp); 2566 vrele(*vp); 2567 counter_u64_add(numfullpathfail4, 1); 2568 error = ENOMEM; 2569 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2570 vp, NULL); 2571 return (error); 2572 } 2573 *buflen -= ncp->nc_nlen; 2574 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2575 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2576 ncp->nc_name, vp); 2577 dvp = *vp; 2578 *vp = ncp->nc_dvp; 2579 vref(*vp); 2580 mtx_unlock(vlp); 2581 vrele(dvp); 2582 return (0); 2583 } 2584 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2585 2586 mtx_unlock(vlp); 2587 vn_lock(*vp, LK_SHARED | LK_RETRY); 2588 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2589 vput(*vp); 2590 if (error) { 2591 counter_u64_add(numfullpathfail2, 1); 2592 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2593 return (error); 2594 } 2595 2596 *vp = dvp; 2597 if (VN_IS_DOOMED(dvp)) { 2598 /* forced unmount */ 2599 vrele(dvp); 2600 error = ENOENT; 2601 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2602 return (error); 2603 } 2604 /* 2605 * *vp has its use count incremented still. 2606 */ 2607 2608 return (0); 2609 } 2610 2611 /* 2612 * Resolve a directory to a pathname. 2613 * 2614 * The name of the directory can always be found in the namecache or fetched 2615 * from the filesystem. There is also guaranteed to be only one parent, meaning 2616 * we can just follow vnodes up until we find the root. 2617 * 2618 * The vnode must be referenced. 2619 */ 2620 static int 2621 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 2622 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend) 2623 { 2624 #ifdef KDTRACE_HOOKS 2625 struct vnode *startvp = vp; 2626 #endif 2627 struct vnode *vp1; 2628 size_t buflen; 2629 int error; 2630 2631 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2632 VNPASS(vp->v_usecount > 0, vp); 2633 2634 buflen = *len; 2635 2636 if (!slash_prefixed) { 2637 MPASS(*len >= 2); 2638 buflen--; 2639 buf[buflen] = '\0'; 2640 } 2641 2642 error = 0; 2643 2644 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2645 counter_u64_add(numfullpathcalls, 1); 2646 while (vp != rdir && vp != rootvnode) { 2647 /* 2648 * The vp vnode must be already fully constructed, 2649 * since it is either found in namecache or obtained 2650 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2651 * without obtaining the vnode lock. 2652 */ 2653 if ((vp->v_vflag & VV_ROOT) != 0) { 2654 vn_lock(vp, LK_RETRY | LK_SHARED); 2655 2656 /* 2657 * With the vnode locked, check for races with 2658 * unmount, forced or not. 
Note that we 2659 * already verified that vp is not equal to 2660 * the root vnode, which means that 2661 * mnt_vnodecovered can be NULL only for the 2662 * case of unmount. 2663 */ 2664 if (VN_IS_DOOMED(vp) || 2665 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2666 vp1->v_mountedhere != vp->v_mount) { 2667 vput(vp); 2668 error = ENOENT; 2669 SDT_PROBE3(vfs, namecache, fullpath, return, 2670 error, vp, NULL); 2671 break; 2672 } 2673 2674 vref(vp1); 2675 vput(vp); 2676 vp = vp1; 2677 continue; 2678 } 2679 if (vp->v_type != VDIR) { 2680 vrele(vp); 2681 counter_u64_add(numfullpathfail1, 1); 2682 error = ENOTDIR; 2683 SDT_PROBE3(vfs, namecache, fullpath, return, 2684 error, vp, NULL); 2685 break; 2686 } 2687 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); 2688 if (error) 2689 break; 2690 if (buflen == 0) { 2691 vrele(vp); 2692 error = ENOMEM; 2693 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2694 startvp, NULL); 2695 break; 2696 } 2697 buf[--buflen] = '/'; 2698 slash_prefixed = true; 2699 } 2700 if (error) 2701 return (error); 2702 if (!slash_prefixed) { 2703 if (buflen == 0) { 2704 vrele(vp); 2705 counter_u64_add(numfullpathfail4, 1); 2706 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2707 startvp, NULL); 2708 return (ENOMEM); 2709 } 2710 buf[--buflen] = '/'; 2711 } 2712 counter_u64_add(numfullpathfound, 1); 2713 vrele(vp); 2714 2715 *retbuf = buf + buflen; 2716 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2717 *len -= buflen; 2718 *len += addend; 2719 return (0); 2720 } 2721 2722 /* 2723 * Resolve an arbitrary vnode to a pathname. 2724 * 2725 * Note 2 caveats: 2726 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2727 * resolve to a different path than the one used to find it 2728 * - namecache is not mandatory, meaning names are not guaranteed to be added 2729 * (in which case resolving fails) 2730 */ 2731 static int 2732 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 2733 char *buf, char **retbuf, size_t *buflen) 2734 { 2735 size_t orig_buflen; 2736 bool slash_prefixed; 2737 int error; 2738 2739 if (*buflen < 2) 2740 return (EINVAL); 2741 2742 orig_buflen = *buflen; 2743 2744 vref(vp); 2745 slash_prefixed = false; 2746 if (vp->v_type != VDIR) { 2747 *buflen -= 1; 2748 buf[*buflen] = '\0'; 2749 error = vn_vptocnp(&vp, td->td_ucred, buf, buflen); 2750 if (error) 2751 return (error); 2752 if (*buflen == 0) { 2753 vrele(vp); 2754 return (ENOMEM); 2755 } 2756 *buflen -= 1; 2757 buf[*buflen] = '/'; 2758 slash_prefixed = true; 2759 } 2760 2761 return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed, 2762 orig_buflen - *buflen)); 2763 } 2764 2765 /* 2766 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2767 * 2768 * Since the namecache does not track handlings, the caller is expected to first 2769 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 
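 * (kern___realpathat() earlier in this file is the in-tree example of
 * the expected calling sequence: NDINIT_ATRIGHTS() with
 * FOLLOW | SAVENAME | WANTPARENT, namei(), then this function.)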
2770 * 2771 * Then we have 2 cases: 2772 * - if the found vnode is a directory, the path can be constructed just by 2773 * fullowing names up the chain 2774 * - otherwise we populate the buffer with the saved name and start resolving 2775 * from the parent 2776 */ 2777 static int 2778 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 2779 char **freebuf, size_t *buflen) 2780 { 2781 char *buf, *tmpbuf; 2782 struct pwd *pwd; 2783 struct componentname *cnp; 2784 struct vnode *vp; 2785 size_t addend; 2786 int error; 2787 bool slash_prefixed; 2788 2789 if (*buflen < 2) 2790 return (EINVAL); 2791 if (*buflen > MAXPATHLEN) 2792 *buflen = MAXPATHLEN; 2793 2794 slash_prefixed = false; 2795 2796 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2797 pwd = pwd_hold(td); 2798 2799 addend = 0; 2800 vp = ndp->ni_vp; 2801 if (vp->v_type != VDIR) { 2802 cnp = &ndp->ni_cnd; 2803 addend = cnp->cn_namelen + 2; 2804 if (*buflen < addend) { 2805 error = ENOMEM; 2806 goto out_bad; 2807 } 2808 *buflen -= addend; 2809 tmpbuf = buf + *buflen; 2810 tmpbuf[0] = '/'; 2811 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 2812 tmpbuf[addend - 1] = '\0'; 2813 slash_prefixed = true; 2814 vp = ndp->ni_dvp; 2815 } 2816 2817 vref(vp); 2818 error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen, 2819 slash_prefixed, addend); 2820 if (error != 0) 2821 goto out_bad; 2822 2823 pwd_drop(pwd); 2824 *freebuf = buf; 2825 2826 return (0); 2827 out_bad: 2828 pwd_drop(pwd); 2829 free(buf, M_TEMP); 2830 return (error); 2831 } 2832 2833 struct vnode * 2834 vn_dir_dd_ino(struct vnode *vp) 2835 { 2836 struct namecache *ncp; 2837 struct vnode *ddvp; 2838 struct mtx *vlp; 2839 enum vgetstate vs; 2840 2841 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 2842 vlp = VP2VNODELOCK(vp); 2843 mtx_lock(vlp); 2844 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 2845 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 2846 continue; 2847 ddvp = ncp->nc_dvp; 2848 vs = vget_prep(ddvp); 2849 mtx_unlock(vlp); 2850 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 2851 return (NULL); 2852 return (ddvp); 2853 } 2854 mtx_unlock(vlp); 2855 return (NULL); 2856 } 2857 2858 int 2859 vn_commname(struct vnode *vp, char *buf, u_int buflen) 2860 { 2861 struct namecache *ncp; 2862 struct mtx *vlp; 2863 int l; 2864 2865 vlp = VP2VNODELOCK(vp); 2866 mtx_lock(vlp); 2867 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 2868 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2869 break; 2870 if (ncp == NULL) { 2871 mtx_unlock(vlp); 2872 return (ENOENT); 2873 } 2874 l = min(ncp->nc_nlen, buflen - 1); 2875 memcpy(buf, ncp->nc_name, l); 2876 mtx_unlock(vlp); 2877 buf[l] = '\0'; 2878 return (0); 2879 } 2880 2881 /* 2882 * This function updates path string to vnode's full global path 2883 * and checks the size of the new path string against the pathlen argument. 2884 * 2885 * Requires a locked, referenced vnode. 2886 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 2887 * 2888 * If vp is a directory, the call to vn_fullpath_global() always succeeds 2889 * because it falls back to the ".." lookup if the namecache lookup fails. 2890 */ 2891 int 2892 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 2893 u_int pathlen) 2894 { 2895 struct nameidata nd; 2896 struct vnode *vp1; 2897 char *rpath, *fbuf; 2898 int error; 2899 2900 ASSERT_VOP_ELOCKED(vp, __func__); 2901 2902 /* Construct global filesystem path from vp. 
*/ 2903 VOP_UNLOCK(vp); 2904 error = vn_fullpath_global(td, vp, &rpath, &fbuf); 2905 2906 if (error != 0) { 2907 vrele(vp); 2908 return (error); 2909 } 2910 2911 if (strlen(rpath) >= pathlen) { 2912 vrele(vp); 2913 error = ENAMETOOLONG; 2914 goto out; 2915 } 2916 2917 /* 2918 * Re-lookup the vnode by path to detect a possible rename. 2919 * As a side effect, the vnode is relocked. 2920 * If vnode was renamed, return ENOENT. 2921 */ 2922 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 2923 UIO_SYSSPACE, path, td); 2924 error = namei(&nd); 2925 if (error != 0) { 2926 vrele(vp); 2927 goto out; 2928 } 2929 NDFREE(&nd, NDF_ONLY_PNBUF); 2930 vp1 = nd.ni_vp; 2931 vrele(vp); 2932 if (vp1 == vp) 2933 strcpy(path, rpath); 2934 else { 2935 vput(vp1); 2936 error = ENOENT; 2937 } 2938 2939 out: 2940 free(fbuf, M_TEMP); 2941 return (error); 2942 } 2943 2944 #ifdef DDB 2945 static void 2946 db_print_vpath(struct vnode *vp) 2947 { 2948 2949 while (vp != NULL) { 2950 db_printf("%p: ", vp); 2951 if (vp == rootvnode) { 2952 db_printf("/"); 2953 vp = NULL; 2954 } else { 2955 if (vp->v_vflag & VV_ROOT) { 2956 db_printf("<mount point>"); 2957 vp = vp->v_mount->mnt_vnodecovered; 2958 } else { 2959 struct namecache *ncp; 2960 char *ncn; 2961 int i; 2962 2963 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2964 if (ncp != NULL) { 2965 ncn = ncp->nc_name; 2966 for (i = 0; i < ncp->nc_nlen; i++) 2967 db_printf("%c", *ncn++); 2968 vp = ncp->nc_dvp; 2969 } else { 2970 vp = NULL; 2971 } 2972 } 2973 } 2974 db_printf("\n"); 2975 } 2976 2977 return; 2978 } 2979 2980 DB_SHOW_COMMAND(vpath, db_show_vpath) 2981 { 2982 struct vnode *vp; 2983 2984 if (!have_addr) { 2985 db_printf("usage: show vpath <struct vnode *>\n"); 2986 return; 2987 } 2988 2989 vp = (struct vnode *)addr; 2990 db_print_vpath(vp); 2991 } 2992 2993 #endif 2994 2995 static bool __read_frequently cache_fast_lookup = true; 2996 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 2997 &cache_fast_lookup, 0, ""); 2998 2999 #define CACHE_FPL_FAILED -2020 3000 3001 static void 3002 cache_fpl_cleanup_cnp(struct componentname *cnp) 3003 { 3004 3005 uma_zfree(namei_zone, cnp->cn_pnbuf); 3006 #ifdef DIAGNOSTIC 3007 cnp->cn_pnbuf = NULL; 3008 cnp->cn_nameptr = NULL; 3009 #endif 3010 } 3011 3012 static void 3013 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3014 { 3015 struct componentname *cnp; 3016 3017 cnp = &ndp->ni_cnd; 3018 while (*(cnp->cn_nameptr) == '/') { 3019 cnp->cn_nameptr++; 3020 ndp->ni_pathlen--; 3021 } 3022 3023 *dpp = ndp->ni_rootdir; 3024 } 3025 3026 /* 3027 * Components of nameidata (or objects it can point to) which may 3028 * need restoring in case fast path lookup fails. 
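 * cache_fpl_checkpoint() snapshots these fields and cache_fpl_restore()
 * puts them back, so that when the fast path punts the slow path (or a
 * retry) sees the nameidata exactly as regular lookup would expect it.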
3029 */ 3030 struct nameidata_saved { 3031 long cn_namelen; 3032 char *cn_nameptr; 3033 size_t ni_pathlen; 3034 int cn_flags; 3035 }; 3036 3037 struct cache_fpl { 3038 struct nameidata *ndp; 3039 struct componentname *cnp; 3040 struct pwd *pwd; 3041 struct vnode *dvp; 3042 struct vnode *tvp; 3043 seqc_t dvp_seqc; 3044 seqc_t tvp_seqc; 3045 struct nameidata_saved snd; 3046 int line; 3047 enum cache_fpl_status status:8; 3048 bool in_smr; 3049 }; 3050 3051 static void 3052 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3053 { 3054 3055 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3056 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3057 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3058 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3059 } 3060 3061 static void 3062 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3063 { 3064 3065 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3066 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3067 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3068 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3069 } 3070 3071 #ifdef INVARIANTS 3072 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3073 struct cache_fpl *_fpl = (fpl); \ 3074 MPASS(_fpl->in_smr == true); \ 3075 VFS_SMR_ASSERT_ENTERED(); \ 3076 }) 3077 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3078 struct cache_fpl *_fpl = (fpl); \ 3079 MPASS(_fpl->in_smr == false); \ 3080 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3081 }) 3082 #else 3083 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3084 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3085 #endif 3086 3087 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3088 struct cache_fpl *_fpl = (fpl); \ 3089 vfs_smr_enter(); \ 3090 _fpl->in_smr = true; \ 3091 }) 3092 3093 #define cache_fpl_smr_enter(fpl) ({ \ 3094 struct cache_fpl *_fpl = (fpl); \ 3095 MPASS(_fpl->in_smr == false); \ 3096 vfs_smr_enter(); \ 3097 _fpl->in_smr = true; \ 3098 }) 3099 3100 #define cache_fpl_smr_exit(fpl) ({ \ 3101 struct cache_fpl *_fpl = (fpl); \ 3102 MPASS(_fpl->in_smr == true); \ 3103 vfs_smr_exit(); \ 3104 _fpl->in_smr = false; \ 3105 }) 3106 3107 static int 3108 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3109 { 3110 3111 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3112 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3113 ("%s: converting to abort from %d at %d, set at %d\n", 3114 __func__, fpl->status, line, fpl->line)); 3115 } 3116 fpl->status = CACHE_FPL_STATUS_ABORTED; 3117 fpl->line = line; 3118 return (CACHE_FPL_FAILED); 3119 } 3120 3121 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3122 3123 static int 3124 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3125 { 3126 3127 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3128 ("%s: setting to partial at %d, but already set to %d at %d\n", 3129 __func__, line, fpl->status, fpl->line)); 3130 cache_fpl_smr_assert_entered(fpl); 3131 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3132 fpl->line = line; 3133 return (CACHE_FPL_FAILED); 3134 } 3135 3136 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3137 3138 static int 3139 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3140 { 3141 3142 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3143 ("%s: setting to handled at %d, but already set to %d at %d\n", 3144 __func__, line, fpl->status, fpl->line)); 3145 cache_fpl_smr_assert_not_entered(fpl); 3146 MPASS(error != CACHE_FPL_FAILED); 3147 fpl->status = CACHE_FPL_STATUS_HANDLED; 3148 fpl->line = line; 3149 return (error); 
3150 } 3151 3152 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3153 3154 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3155 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ 3156 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3157 3158 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3159 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3160 3161 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3162 "supported and internal flags overlap"); 3163 3164 static bool 3165 cache_fpl_islastcn(struct nameidata *ndp) 3166 { 3167 3168 return (*ndp->ni_next == 0); 3169 } 3170 3171 static bool 3172 cache_fpl_isdotdot(struct componentname *cnp) 3173 { 3174 3175 if (cnp->cn_namelen == 2 && 3176 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3177 return (true); 3178 return (false); 3179 } 3180 3181 static bool 3182 cache_can_fplookup(struct cache_fpl *fpl) 3183 { 3184 struct nameidata *ndp; 3185 struct componentname *cnp; 3186 struct thread *td; 3187 3188 ndp = fpl->ndp; 3189 cnp = fpl->cnp; 3190 td = cnp->cn_thread; 3191 3192 if (!cache_fast_lookup) { 3193 cache_fpl_aborted(fpl); 3194 return (false); 3195 } 3196 #ifdef MAC 3197 if (mac_vnode_check_lookup_enabled()) { 3198 cache_fpl_aborted(fpl); 3199 return (false); 3200 } 3201 #endif 3202 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3203 cache_fpl_aborted(fpl); 3204 return (false); 3205 } 3206 if (ndp->ni_dirfd != AT_FDCWD) { 3207 cache_fpl_aborted(fpl); 3208 return (false); 3209 } 3210 if (IN_CAPABILITY_MODE(td)) { 3211 cache_fpl_aborted(fpl); 3212 return (false); 3213 } 3214 if (AUDITING_TD(td)) { 3215 cache_fpl_aborted(fpl); 3216 return (false); 3217 } 3218 if (ndp->ni_startdir != NULL) { 3219 cache_fpl_aborted(fpl); 3220 return (false); 3221 } 3222 return (true); 3223 } 3224 3225 static bool 3226 cache_fplookup_vnode_supported(struct vnode *vp) 3227 { 3228 3229 return (vp->v_type != VLNK); 3230 } 3231 3232 /* 3233 * Move a negative entry to the hot list. 3234 * 3235 * We have to take locks, but they may be contended and in the worst 3236 * case we may need to go off CPU. We don't want to spin within the 3237 * smr section and we can't block with it. Instead we are going to 3238 * look up the entry again. 3239 */ 3240 static int __noinline 3241 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3242 uint32_t hash) 3243 { 3244 struct componentname *cnp; 3245 struct namecache *ncp; 3246 struct neglist *neglist; 3247 struct negstate *negstate; 3248 struct vnode *dvp; 3249 u_char nc_flag; 3250 3251 cnp = fpl->cnp; 3252 dvp = fpl->dvp; 3253 3254 if (!vhold_smr(dvp)) 3255 return (cache_fpl_aborted(fpl)); 3256 3257 neglist = NCP2NEGLIST(oncp); 3258 cache_fpl_smr_exit(fpl); 3259 3260 mtx_lock(&ncneg_hot.nl_lock); 3261 mtx_lock(&neglist->nl_lock); 3262 /* 3263 * For hash iteration. 3264 */ 3265 cache_fpl_smr_enter(fpl); 3266 3267 /* 3268 * Avoid all surprises by only succeeding if we got the same entry and 3269 * bailing completely otherwise. 3270 * 3271 * In particular at this point there can be a new ncp which matches the 3272 * search but hashes to a different neglist. 3273 */ 3274 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3275 if (ncp == oncp) 3276 break; 3277 } 3278 3279 /* 3280 * No match to begin with. 3281 */ 3282 if (__predict_false(ncp == NULL)) { 3283 goto out_abort; 3284 } 3285 3286 /* 3287 * The newly found entry may be something different... 
3288 */ 3289 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3290 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 3291 goto out_abort; 3292 } 3293 3294 /* 3295 * ... and not even negative. 3296 */ 3297 nc_flag = atomic_load_char(&ncp->nc_flag); 3298 if ((nc_flag & NCF_NEGATIVE) == 0) { 3299 goto out_abort; 3300 } 3301 3302 if (__predict_false(!cache_ncp_canuse(ncp))) { 3303 goto out_abort; 3304 } 3305 3306 negstate = NCP2NEGSTATE(ncp); 3307 if ((negstate->neg_flag & NEG_HOT) == 0) { 3308 numhotneg++; 3309 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 3310 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 3311 negstate->neg_flag |= NEG_HOT; 3312 } 3313 3314 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 3315 counter_u64_add(numneghits, 1); 3316 cache_fpl_smr_exit(fpl); 3317 mtx_unlock(&neglist->nl_lock); 3318 mtx_unlock(&ncneg_hot.nl_lock); 3319 vdrop(dvp); 3320 return (cache_fpl_handled(fpl, ENOENT)); 3321 out_abort: 3322 cache_fpl_smr_exit(fpl); 3323 mtx_unlock(&neglist->nl_lock); 3324 mtx_unlock(&ncneg_hot.nl_lock); 3325 vdrop(dvp); 3326 return (cache_fpl_aborted(fpl)); 3327 } 3328 3329 /* 3330 * The target vnode is not supported, prepare for the slow path to take over. 3331 */ 3332 static int __noinline 3333 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3334 { 3335 struct nameidata *ndp; 3336 struct componentname *cnp; 3337 enum vgetstate dvs; 3338 struct vnode *dvp; 3339 struct pwd *pwd; 3340 seqc_t dvp_seqc; 3341 3342 ndp = fpl->ndp; 3343 cnp = fpl->cnp; 3344 dvp = fpl->dvp; 3345 dvp_seqc = fpl->dvp_seqc; 3346 3347 dvs = vget_prep_smr(dvp); 3348 if (__predict_false(dvs == VGET_NONE)) { 3349 cache_fpl_smr_exit(fpl); 3350 return (cache_fpl_aborted(fpl)); 3351 } 3352 3353 cache_fpl_smr_exit(fpl); 3354 3355 vget_finish_ref(dvp, dvs); 3356 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3357 vrele(dvp); 3358 return (cache_fpl_aborted(fpl)); 3359 } 3360 3361 pwd = pwd_hold(curthread); 3362 if (fpl->pwd != pwd) { 3363 vrele(dvp); 3364 pwd_drop(pwd); 3365 return (cache_fpl_aborted(fpl)); 3366 } 3367 3368 cache_fpl_restore(fpl, &fpl->snd); 3369 3370 ndp->ni_startdir = dvp; 3371 cnp->cn_flags |= MAKEENTRY; 3372 if (cache_fpl_islastcn(ndp)) 3373 cnp->cn_flags |= ISLASTCN; 3374 if (cache_fpl_isdotdot(cnp)) 3375 cnp->cn_flags |= ISDOTDOT; 3376 3377 return (0); 3378 } 3379 3380 static int 3381 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3382 { 3383 struct componentname *cnp; 3384 struct vnode *tvp; 3385 seqc_t tvp_seqc; 3386 int error, lkflags; 3387 3388 cnp = fpl->cnp; 3389 tvp = fpl->tvp; 3390 tvp_seqc = fpl->tvp_seqc; 3391 3392 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3393 lkflags = LK_SHARED; 3394 if ((cnp->cn_flags & LOCKSHARED) == 0) 3395 lkflags = LK_EXCLUSIVE; 3396 error = vget_finish(tvp, lkflags, tvs); 3397 if (__predict_false(error != 0)) { 3398 return (cache_fpl_aborted(fpl)); 3399 } 3400 } else { 3401 vget_finish_ref(tvp, tvs); 3402 } 3403 3404 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3405 if ((cnp->cn_flags & LOCKLEAF) != 0) 3406 vput(tvp); 3407 else 3408 vrele(tvp); 3409 return (cache_fpl_aborted(fpl)); 3410 } 3411 3412 return (cache_fpl_handled(fpl, 0)); 3413 } 3414 3415 /* 3416 * They want to possibly modify the state of the namecache. 3417 * 3418 * Don't try to match the API contract, just leave. 
3419 * TODO: this leaves scalability on the table 3420 */ 3421 static int 3422 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3423 { 3424 struct componentname *cnp; 3425 3426 cnp = fpl->cnp; 3427 MPASS(cnp->cn_nameiop != LOOKUP); 3428 return (cache_fpl_partial(fpl)); 3429 } 3430 3431 static int __noinline 3432 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3433 { 3434 struct componentname *cnp; 3435 enum vgetstate dvs, tvs; 3436 struct vnode *dvp, *tvp; 3437 seqc_t dvp_seqc, tvp_seqc; 3438 int error; 3439 3440 cnp = fpl->cnp; 3441 dvp = fpl->dvp; 3442 dvp_seqc = fpl->dvp_seqc; 3443 tvp = fpl->tvp; 3444 tvp_seqc = fpl->tvp_seqc; 3445 3446 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3447 3448 /* 3449 * This is less efficient than it can be for simplicity. 3450 */ 3451 dvs = vget_prep_smr(dvp); 3452 if (__predict_false(dvs == VGET_NONE)) { 3453 return (cache_fpl_aborted(fpl)); 3454 } 3455 tvs = vget_prep_smr(tvp); 3456 if (__predict_false(tvs == VGET_NONE)) { 3457 cache_fpl_smr_exit(fpl); 3458 vget_abort(dvp, dvs); 3459 return (cache_fpl_aborted(fpl)); 3460 } 3461 3462 cache_fpl_smr_exit(fpl); 3463 3464 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3465 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3466 if (__predict_false(error != 0)) { 3467 vget_abort(tvp, tvs); 3468 return (cache_fpl_aborted(fpl)); 3469 } 3470 } else { 3471 vget_finish_ref(dvp, dvs); 3472 } 3473 3474 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3475 vget_abort(tvp, tvs); 3476 if ((cnp->cn_flags & LOCKPARENT) != 0) 3477 vput(dvp); 3478 else 3479 vrele(dvp); 3480 return (cache_fpl_aborted(fpl)); 3481 } 3482 3483 error = cache_fplookup_final_child(fpl, tvs); 3484 if (__predict_false(error != 0)) { 3485 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3486 if ((cnp->cn_flags & LOCKPARENT) != 0) 3487 vput(dvp); 3488 else 3489 vrele(dvp); 3490 return (error); 3491 } 3492 3493 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3494 return (0); 3495 } 3496 3497 static int 3498 cache_fplookup_final(struct cache_fpl *fpl) 3499 { 3500 struct componentname *cnp; 3501 enum vgetstate tvs; 3502 struct vnode *dvp, *tvp; 3503 seqc_t dvp_seqc, tvp_seqc; 3504 3505 cnp = fpl->cnp; 3506 dvp = fpl->dvp; 3507 dvp_seqc = fpl->dvp_seqc; 3508 tvp = fpl->tvp; 3509 tvp_seqc = fpl->tvp_seqc; 3510 3511 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3512 3513 if (cnp->cn_nameiop != LOOKUP) { 3514 return (cache_fplookup_final_modifying(fpl)); 3515 } 3516 3517 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3518 return (cache_fplookup_final_withparent(fpl)); 3519 3520 tvs = vget_prep_smr(tvp); 3521 if (__predict_false(tvs == VGET_NONE)) { 3522 return (cache_fpl_partial(fpl)); 3523 } 3524 3525 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3526 cache_fpl_smr_exit(fpl); 3527 vget_abort(tvp, tvs); 3528 return (cache_fpl_aborted(fpl)); 3529 } 3530 3531 cache_fpl_smr_exit(fpl); 3532 return (cache_fplookup_final_child(fpl, tvs)); 3533 } 3534 3535 static int __noinline 3536 cache_fplookup_dot(struct cache_fpl *fpl) 3537 { 3538 struct vnode *dvp; 3539 3540 dvp = fpl->dvp; 3541 3542 fpl->tvp = dvp; 3543 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3544 if (seqc_in_modify(fpl->tvp_seqc)) { 3545 return (cache_fpl_aborted(fpl)); 3546 } 3547 3548 counter_u64_add(dothits, 1); 3549 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3550 3551 return (0); 3552 } 3553 3554 static int __noinline 3555 cache_fplookup_dotdot(struct cache_fpl *fpl) 3556 { 3557 struct nameidata *ndp; 3558 struct componentname *cnp; 3559 struct namecache *ncp; 3560 struct vnode 
*dvp; 3561 struct prison *pr; 3562 u_char nc_flag; 3563 3564 ndp = fpl->ndp; 3565 cnp = fpl->cnp; 3566 dvp = fpl->dvp; 3567 3568 /* 3569 * XXX this is racy the same way regular lookup is 3570 */ 3571 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3572 pr = pr->pr_parent) 3573 if (dvp == pr->pr_root) 3574 break; 3575 3576 if (dvp == ndp->ni_rootdir || 3577 dvp == ndp->ni_topdir || 3578 dvp == rootvnode || 3579 pr != NULL) { 3580 fpl->tvp = dvp; 3581 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3582 if (seqc_in_modify(fpl->tvp_seqc)) { 3583 return (cache_fpl_aborted(fpl)); 3584 } 3585 return (0); 3586 } 3587 3588 if ((dvp->v_vflag & VV_ROOT) != 0) { 3589 /* 3590 * TODO 3591 * The opposite of climb mount is needed here. 3592 */ 3593 return (cache_fpl_aborted(fpl)); 3594 } 3595 3596 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3597 if (ncp == NULL) { 3598 return (cache_fpl_aborted(fpl)); 3599 } 3600 3601 nc_flag = atomic_load_char(&ncp->nc_flag); 3602 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3603 if ((nc_flag & NCF_NEGATIVE) != 0) 3604 return (cache_fpl_aborted(fpl)); 3605 fpl->tvp = ncp->nc_vp; 3606 } else { 3607 fpl->tvp = ncp->nc_dvp; 3608 } 3609 3610 if (__predict_false(!cache_ncp_canuse(ncp))) { 3611 return (cache_fpl_aborted(fpl)); 3612 } 3613 3614 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 3615 if (seqc_in_modify(fpl->tvp_seqc)) { 3616 return (cache_fpl_partial(fpl)); 3617 } 3618 3619 counter_u64_add(dotdothits, 1); 3620 return (0); 3621 } 3622 3623 static int 3624 cache_fplookup_next(struct cache_fpl *fpl) 3625 { 3626 struct componentname *cnp; 3627 struct namecache *ncp; 3628 struct negstate *negstate; 3629 struct vnode *dvp, *tvp; 3630 u_char nc_flag; 3631 uint32_t hash; 3632 bool neg_hot; 3633 3634 cnp = fpl->cnp; 3635 dvp = fpl->dvp; 3636 3637 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 3638 return (cache_fplookup_dot(fpl)); 3639 } 3640 3641 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3642 3643 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3644 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3645 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 3646 break; 3647 } 3648 3649 /* 3650 * If there is no entry we have to punt to the slow path to perform 3651 * actual lookup. Should there be nothing with this name a negative 3652 * entry will be created. 3653 */ 3654 if (__predict_false(ncp == NULL)) { 3655 return (cache_fpl_partial(fpl)); 3656 } 3657 3658 tvp = atomic_load_ptr(&ncp->nc_vp); 3659 nc_flag = atomic_load_char(&ncp->nc_flag); 3660 if ((nc_flag & NCF_NEGATIVE) != 0) { 3661 /* 3662 * If they want to create an entry we need to replace this one. 
3663 */ 3664 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 3665 return (cache_fpl_partial(fpl)); 3666 } 3667 negstate = NCP2NEGSTATE(ncp); 3668 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0); 3669 if (__predict_false(!cache_ncp_canuse(ncp))) { 3670 return (cache_fpl_partial(fpl)); 3671 } 3672 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 3673 return (cache_fpl_partial(fpl)); 3674 } 3675 if (!neg_hot) { 3676 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 3677 } 3678 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 3679 ncp->nc_name); 3680 counter_u64_add(numneghits, 1); 3681 cache_fpl_smr_exit(fpl); 3682 return (cache_fpl_handled(fpl, ENOENT)); 3683 } 3684 3685 if (__predict_false(!cache_ncp_canuse(ncp))) { 3686 return (cache_fpl_partial(fpl)); 3687 } 3688 3689 fpl->tvp = tvp; 3690 fpl->tvp_seqc = vn_seqc_read_any(tvp); 3691 if (seqc_in_modify(fpl->tvp_seqc)) { 3692 return (cache_fpl_partial(fpl)); 3693 } 3694 3695 if (!cache_fplookup_vnode_supported(tvp)) { 3696 return (cache_fpl_partial(fpl)); 3697 } 3698 3699 counter_u64_add(numposhits, 1); 3700 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 3701 return (0); 3702 } 3703 3704 static bool 3705 cache_fplookup_mp_supported(struct mount *mp) 3706 { 3707 3708 if (mp == NULL) 3709 return (false); 3710 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 3711 return (false); 3712 return (true); 3713 } 3714 3715 /* 3716 * Walk up the mount stack (if any). 3717 * 3718 * Correctness is provided in the following ways: 3719 * - all vnodes are protected from freeing with SMR 3720 * - struct mount objects are type stable making them always safe to access 3721 * - stability of the particular mount is provided by busying it 3722 * - relationship between the vnode which is mounted on and the mount is 3723 * verified with the vnode sequence counter after busying 3724 * - association between root vnode of the mount and the mount is protected 3725 * by busy 3726 * 3727 * From that point on we can read the sequence counter of the root vnode 3728 * and get the next mount on the stack (if any) using the same protection. 3729 * 3730 * By the end of successful walk we are guaranteed the reached state was 3731 * indeed present at least at some point which matches the regular lookup. 
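 *
 * A condensed sketch of one step of the loop below (error handling and
 * the doomed-vnode checks omitted):
 *
 *	vfs_op_thread_enter_crit(mp);		// pin the mount
 *	if (!vn_seqc_consistent(vp, vp_seqc))	// vp<->mp link changed?
 *		bail to the slow path;
 *	vp = mp->mnt_rootvnode;			// root of this mount
 *	vp_seqc = vn_seqc_read_any(vp);
 *	mp = vp->v_mountedhere;			// next mount, if any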
3732 */ 3733 static int __noinline 3734 cache_fplookup_climb_mount(struct cache_fpl *fpl) 3735 { 3736 struct mount *mp, *prev_mp; 3737 struct vnode *vp; 3738 seqc_t vp_seqc; 3739 3740 vp = fpl->tvp; 3741 vp_seqc = fpl->tvp_seqc; 3742 3743 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 3744 mp = atomic_load_ptr(&vp->v_mountedhere); 3745 if (mp == NULL) 3746 return (0); 3747 3748 prev_mp = NULL; 3749 for (;;) { 3750 if (!vfs_op_thread_enter_crit(mp)) { 3751 if (prev_mp != NULL) 3752 vfs_op_thread_exit_crit(prev_mp); 3753 return (cache_fpl_partial(fpl)); 3754 } 3755 if (prev_mp != NULL) 3756 vfs_op_thread_exit_crit(prev_mp); 3757 if (!vn_seqc_consistent(vp, vp_seqc)) { 3758 vfs_op_thread_exit_crit(mp); 3759 return (cache_fpl_partial(fpl)); 3760 } 3761 if (!cache_fplookup_mp_supported(mp)) { 3762 vfs_op_thread_exit_crit(mp); 3763 return (cache_fpl_partial(fpl)); 3764 } 3765 vp = atomic_load_ptr(&mp->mnt_rootvnode); 3766 if (vp == NULL || VN_IS_DOOMED(vp)) { 3767 vfs_op_thread_exit_crit(mp); 3768 return (cache_fpl_partial(fpl)); 3769 } 3770 vp_seqc = vn_seqc_read_any(vp); 3771 if (seqc_in_modify(vp_seqc)) { 3772 vfs_op_thread_exit_crit(mp); 3773 return (cache_fpl_partial(fpl)); 3774 } 3775 prev_mp = mp; 3776 mp = atomic_load_ptr(&vp->v_mountedhere); 3777 if (mp == NULL) 3778 break; 3779 } 3780 3781 vfs_op_thread_exit_crit(prev_mp); 3782 fpl->tvp = vp; 3783 fpl->tvp_seqc = vp_seqc; 3784 return (0); 3785 } 3786 3787 static bool 3788 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 3789 { 3790 struct mount *mp; 3791 struct vnode *vp; 3792 3793 vp = fpl->tvp; 3794 3795 /* 3796 * Hack: while this is a union, the pointer tends to be NULL so save on 3797 * a branch. 3798 */ 3799 mp = atomic_load_ptr(&vp->v_mountedhere); 3800 if (mp == NULL) 3801 return (false); 3802 if (vp->v_type == VDIR) 3803 return (true); 3804 return (false); 3805 } 3806 3807 /* 3808 * Parse the path. 3809 * 3810 * The code is mostly copy-pasted from regular lookup, see lookup(). 3811 * The structure is maintained along with comments for easier maintenance. 3812 * Deduplicating the code will become feasible after fast path lookup 3813 * becomes more feature-complete. 3814 */ 3815 static int 3816 cache_fplookup_parse(struct cache_fpl *fpl) 3817 { 3818 struct nameidata *ndp; 3819 struct componentname *cnp; 3820 char *cp; 3821 3822 ndp = fpl->ndp; 3823 cnp = fpl->cnp; 3824 3825 /* 3826 * Search a new directory. 3827 * 3828 * The last component of the filename is left accessible via 3829 * cnp->cn_nameptr for callers that need the name. Callers needing 3830 * the name set the SAVENAME flag. When done, they assume 3831 * responsibility for freeing the pathname buffer. 3832 */ 3833 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 3834 continue; 3835 cnp->cn_namelen = cp - cnp->cn_nameptr; 3836 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 3837 cache_fpl_smr_exit(fpl); 3838 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 3839 } 3840 ndp->ni_pathlen -= cnp->cn_namelen; 3841 KASSERT(ndp->ni_pathlen <= PATH_MAX, 3842 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 3843 ndp->ni_next = cp; 3844 3845 /* 3846 * Replace multiple slashes by a single slash and trailing slashes 3847 * by a null. This must be done before VOP_LOOKUP() because some 3848 * fs's don't know about trailing slashes. Remember if there were 3849 * trailing slashes to handle symlinks, existing non-directories 3850 * and non-existing files that won't be directories specially later. 
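 * For example "foo//bar" is treated the same as "foo/bar".  A trailing
 * slash ("foo/") would require setting TRAILINGSLASH and editing the
 * buffer, which the fast path avoids; that case is punted to the
 * regular lookup (see the TODO below).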
3851 */ 3852 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 3853 cp++; 3854 ndp->ni_pathlen--; 3855 if (*cp == '\0') { 3856 /* 3857 * TODO 3858 * Regular lookup performs the following: 3859 * *ndp->ni_next = '\0'; 3860 * cnp->cn_flags |= TRAILINGSLASH; 3861 * 3862 * Which is problematic since it modifies data read 3863 * from userspace. Then if fast path lookup was to 3864 * abort we would have to either restore it or convey 3865 * the flag. Since this is a corner case just ignore 3866 * it for simplicity. 3867 */ 3868 return (cache_fpl_partial(fpl)); 3869 } 3870 } 3871 ndp->ni_next = cp; 3872 3873 /* 3874 * Check for degenerate name (e.g. / or "") 3875 * which is a way of talking about a directory, 3876 * e.g. like "/." or ".". 3877 * 3878 * TODO 3879 * Another corner case handled by the regular lookup 3880 */ 3881 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 3882 return (cache_fpl_partial(fpl)); 3883 } 3884 return (0); 3885 } 3886 3887 static void 3888 cache_fplookup_parse_advance(struct cache_fpl *fpl) 3889 { 3890 struct nameidata *ndp; 3891 struct componentname *cnp; 3892 3893 ndp = fpl->ndp; 3894 cnp = fpl->cnp; 3895 3896 cnp->cn_nameptr = ndp->ni_next; 3897 while (*cnp->cn_nameptr == '/') { 3898 cnp->cn_nameptr++; 3899 ndp->ni_pathlen--; 3900 } 3901 } 3902 3903 static int __noinline 3904 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 3905 { 3906 3907 switch (error) { 3908 case EAGAIN: 3909 /* 3910 * Can happen when racing against vgone. 3911 * */ 3912 case EOPNOTSUPP: 3913 cache_fpl_partial(fpl); 3914 break; 3915 default: 3916 /* 3917 * See the API contract for VOP_FPLOOKUP_VEXEC. 3918 */ 3919 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 3920 error = cache_fpl_aborted(fpl); 3921 } else { 3922 cache_fpl_smr_exit(fpl); 3923 cache_fpl_handled(fpl, error); 3924 } 3925 break; 3926 } 3927 return (error); 3928 } 3929 3930 static int 3931 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 3932 { 3933 struct nameidata *ndp; 3934 struct componentname *cnp; 3935 struct mount *mp; 3936 int error; 3937 3938 error = CACHE_FPL_FAILED; 3939 ndp = fpl->ndp; 3940 cnp = fpl->cnp; 3941 3942 cache_fpl_checkpoint(fpl, &fpl->snd); 3943 3944 fpl->dvp = dvp; 3945 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 3946 if (seqc_in_modify(fpl->dvp_seqc)) { 3947 cache_fpl_aborted(fpl); 3948 goto out; 3949 } 3950 mp = atomic_load_ptr(&fpl->dvp->v_mount); 3951 if (!cache_fplookup_mp_supported(mp)) { 3952 cache_fpl_aborted(fpl); 3953 goto out; 3954 } 3955 3956 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 3957 3958 for (;;) { 3959 error = cache_fplookup_parse(fpl); 3960 if (__predict_false(error != 0)) { 3961 break; 3962 } 3963 3964 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 3965 3966 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); 3967 if (__predict_false(error != 0)) { 3968 error = cache_fplookup_failed_vexec(fpl, error); 3969 break; 3970 } 3971 3972 if (__predict_false(cache_fpl_isdotdot(cnp))) { 3973 error = cache_fplookup_dotdot(fpl); 3974 if (__predict_false(error != 0)) { 3975 break; 3976 } 3977 } else { 3978 error = cache_fplookup_next(fpl); 3979 if (__predict_false(error != 0)) { 3980 break; 3981 } 3982 3983 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 3984 3985 if (cache_fplookup_need_climb_mount(fpl)) { 3986 error = cache_fplookup_climb_mount(fpl); 3987 if (__predict_false(error != 0)) { 3988 break; 3989 } 3990 } 3991 } 3992 3993 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 3994 3995 if (cache_fpl_islastcn(ndp)) { 3996 
error = cache_fplookup_final(fpl); 3997 break; 3998 } 3999 4000 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 4001 error = cache_fpl_aborted(fpl); 4002 break; 4003 } 4004 4005 fpl->dvp = fpl->tvp; 4006 fpl->dvp_seqc = fpl->tvp_seqc; 4007 4008 cache_fplookup_parse_advance(fpl); 4009 cache_fpl_checkpoint(fpl, &fpl->snd); 4010 } 4011 out: 4012 switch (fpl->status) { 4013 case CACHE_FPL_STATUS_UNSET: 4014 __assert_unreachable(); 4015 break; 4016 case CACHE_FPL_STATUS_PARTIAL: 4017 cache_fpl_smr_assert_entered(fpl); 4018 return (cache_fplookup_partial_setup(fpl)); 4019 case CACHE_FPL_STATUS_ABORTED: 4020 if (fpl->in_smr) 4021 cache_fpl_smr_exit(fpl); 4022 return (CACHE_FPL_FAILED); 4023 case CACHE_FPL_STATUS_HANDLED: 4024 MPASS(error != CACHE_FPL_FAILED); 4025 cache_fpl_smr_assert_not_entered(fpl); 4026 if (__predict_false(error != 0)) { 4027 ndp->ni_dvp = NULL; 4028 ndp->ni_vp = NULL; 4029 cache_fpl_cleanup_cnp(cnp); 4030 return (error); 4031 } 4032 ndp->ni_dvp = fpl->dvp; 4033 ndp->ni_vp = fpl->tvp; 4034 if (cnp->cn_flags & SAVENAME) 4035 cnp->cn_flags |= HASBUF; 4036 else 4037 cache_fpl_cleanup_cnp(cnp); 4038 return (error); 4039 } 4040 } 4041 4042 /* 4043 * Fast path lookup protected with SMR and sequence counters. 4044 * 4045 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 4046 * 4047 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria 4048 * outlined below. 4049 * 4050 * Traditional vnode lookup conceptually looks like this: 4051 * 4052 * vn_lock(current); 4053 * for (;;) { 4054 * next = find(); 4055 * vn_lock(next); 4056 * vn_unlock(current); 4057 * current = next; 4058 * if (last) 4059 * break; 4060 * } 4061 * return (current); 4062 * 4063 * Each jump to the next vnode is safe memory-wise and atomic with respect to 4064 * any modifications thanks to holding respective locks. 4065 * 4066 * The same guarantee can be provided with a combination of safe memory 4067 * reclamation and sequence counters instead. If all operations which affect 4068 * the relationship between the current vnode and the one we are looking for 4069 * also modify the counter, we can verify whether all the conditions held as 4070 * we made the jump. This includes things like permissions, mount points etc. 4071 * Counter modification is provided by enclosing relevant places in 4072 * vn_seqc_write_begin()/end() calls. 
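 *
 * For example, the writer side in cache_enter_dotdot_prep() above
 * brackets its ->v_cache_dd update as follows:
 *
 *	vn_seqc_write_begin(dvp);
 *	dvp->v_cache_dd = NULL;
 *	vn_seqc_write_end(dvp);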
4073 * 4074 * Thus this translates to: 4075 * 4076 * vfs_smr_enter(); 4077 * dvp_seqc = seqc_read_any(dvp); 4078 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode 4079 * abort(); 4080 * for (;;) { 4081 * tvp = find(); 4082 * tvp_seqc = seqc_read_any(tvp); 4083 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode 4084 * abort(); 4085 * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode 4086 * abort(); 4087 * dvp = tvp; // we know nothing of importance has changed 4088 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration 4089 * if (last) 4090 * break; 4091 * } 4092 * vget(); // secure the vnode 4093 * if (!seqc_consistent(tvp, tvp_seqc) // final check 4094 * abort(); 4095 * // at this point we know nothing has changed for any parent<->child pair 4096 * // as they were crossed during the lookup, meaning we matched the guarantee 4097 * // of the locked variant 4098 * return (tvp); 4099 * 4100 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: 4101 * - they are called while within vfs_smr protection which they must never exit 4102 * - EAGAIN can be returned to denote checking could not be performed, it is 4103 * always valid to return it 4104 * - if the sequence counter has not changed the result must be valid 4105 * - if the sequence counter has changed both false positives and false negatives 4106 * are permitted (since the result will be rejected later) 4107 * - for simple cases of unix permission checks vaccess_vexec_smr can be used 4108 * 4109 * Caveats to watch out for: 4110 * - vnodes are passed unlocked and unreferenced with nothing stopping 4111 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised 4112 * to use atomic_load_ptr to fetch it. 4113 * - the aforementioned object can also get freed, meaning absent other means it 4114 * should be protected with vfs_smr 4115 * - either safely checking permissions as they are modified or guaranteeing 4116 * their stability is left to the routine 4117 */ 4118 int 4119 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, 4120 struct pwd **pwdp) 4121 { 4122 struct cache_fpl fpl; 4123 struct pwd *pwd; 4124 struct vnode *dvp; 4125 struct componentname *cnp; 4126 struct nameidata_saved orig; 4127 int error; 4128 4129 MPASS(ndp->ni_lcf == 0); 4130 4131 fpl.status = CACHE_FPL_STATUS_UNSET; 4132 fpl.ndp = ndp; 4133 fpl.cnp = &ndp->ni_cnd; 4134 MPASS(curthread == fpl.cnp->cn_thread); 4135 4136 if ((fpl.cnp->cn_flags & SAVESTART) != 0) 4137 MPASS(fpl.cnp->cn_nameiop != LOOKUP); 4138 4139 if (!cache_can_fplookup(&fpl)) { 4140 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 4141 *status = fpl.status; 4142 return (EOPNOTSUPP); 4143 } 4144 4145 cache_fpl_checkpoint(&fpl, &orig); 4146 4147 cache_fpl_smr_enter_initial(&fpl); 4148 pwd = pwd_get_smr(); 4149 fpl.pwd = pwd; 4150 ndp->ni_rootdir = pwd->pwd_rdir; 4151 ndp->ni_topdir = pwd->pwd_jdir; 4152 4153 cnp = fpl.cnp; 4154 cnp->cn_nameptr = cnp->cn_pnbuf; 4155 if (cnp->cn_pnbuf[0] == '/') { 4156 cache_fpl_handle_root(ndp, &dvp); 4157 } else { 4158 MPASS(ndp->ni_dirfd == AT_FDCWD); 4159 dvp = pwd->pwd_cdir; 4160 } 4161 4162 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); 4163 4164 error = cache_fplookup_impl(dvp, &fpl); 4165 cache_fpl_smr_assert_not_entered(&fpl); 4166 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 4167 4168 *status = fpl.status; 4169 switch (fpl.status) { 4170 case CACHE_FPL_STATUS_UNSET: 4171 
__assert_unreachable(); 4172 break; 4173 case CACHE_FPL_STATUS_HANDLED: 4174 SDT_PROBE3(vfs, namei, lookup, return, error, 4175 (error == 0 ? ndp->ni_vp : NULL), true); 4176 break; 4177 case CACHE_FPL_STATUS_PARTIAL: 4178 *pwdp = fpl.pwd; 4179 /* 4180 * Status restored by cache_fplookup_partial_setup. 4181 */ 4182 break; 4183 case CACHE_FPL_STATUS_ABORTED: 4184 cache_fpl_restore(&fpl, &orig); 4185 break; 4186 } 4187 return (error); 4188 } 4189
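
/*
 * Sketch of the expected calling convention (namei() is the intended
 * consumer); this is illustrative only, see the status handling above:
 *
 *	error = cache_fplookup(ndp, &status, &pwd);
 *	switch (status) {
 *	case CACHE_FPL_STATUS_HANDLED:
 *		// the lookup was fully serviced, error is the final result
 *		break;
 *	case CACHE_FPL_STATUS_PARTIAL:
 *		// a prefix was resolved; continue the regular lookup from
 *		// ndp->ni_startdir with the pwd returned via *pwdp
 *		break;
 *	case CACHE_FPL_STATUS_ABORTED:
 *		// nothing usable was established (or EOPNOTSUPP was
 *		// returned); fall back to the regular lookup from scratch
 *		break;
 *	}
 */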