/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
		struct negstate nu_neg;	/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct	timespec nc_time;	/* timespec provided by fs */
	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned.  Since others
 * may be in the same spot suffer a little bit and enforce the
 * alignment for everyone.  Note this is a nop for 64-bit platforms.
 */
#define	CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
#define	CACHE_PATH_CUTOFF	39

#define	CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define	CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define	CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
#define	CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

#define	nc_vp		n_un.nu_vp
#define	nc_neg		n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define	NEG_HOT		0x01

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
static bool
cache_ncp_canuse(struct namecache *ncp)
{

	atomic_thread_fence_acq();
	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
}
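
/*
 * The two helpers above are the validation side of the lockless (SMR)
 * protocol: construction clears NCF_WIP and teardown sets NCF_INVALID, each
 * next to a release fence, while lockless readers first read what they need
 * from an entry and only then confirm it with cache_ncp_canuse().  See
 * cache_enter_time() and cache_zap_locked() for the producer side.
 */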

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	rwlock	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks.  The locking
 * order is lower address first.  Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be write-locked to prevent other threads from seeing the entry.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case, this could
 * deadlock.  As such, this is resolved by trylocking and if that fails
 * unlocking the first node, locking everything in order and revalidating the
 * state.
 */
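
/*
 * Typical lifecycle, assuming a filesystem which opts into the cache: a
 * directory scan publishes its result (positive or negative) with
 * cache_enter(), later lookups are served by cache_lookup(), and entries
 * disappear either explicitly (cache_purge() and friends) or when they are
 * zapped to be replaced or to make room.
 */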

VFS_SMR_DECLARE;

/*
 * Structures associated with name caching.
 */
#define	NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncpurgeminvnodes;
SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
    "Number of vnodes below which purgevfs ignores the request");
static u_int __read_mostly	ncsize;	/* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static struct mtx __exclusive_cache_line	ncneg_shrink_lock;

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;
static struct neglist ncneg_hot;
static u_long numhotneg;

#define	ncneghash	3
#define	numneglists	(ncneghash + 1)
static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly	ncbuckethash;
static struct rwlock_padalign __read_mostly	*bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly	ncvnodehash;
static struct mtx __read_mostly	*vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}
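
/*
 * Vnode locks and bucket locks are striped: the lock protecting a given
 * vnode or hash chain is selected by hashing the vnode address (shifted
 * right to skip low bits, which tend to be identical due to allocator
 * alignment) or the name hash into a fixed-size array, sized from the CPU
 * count in nchinit() below.
 */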

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

static struct namecache *
cache_alloc(int len, int ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	if (ncp == NULL)
		return;
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL && ticksp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	if (tsp != NULL)
		*tsp = ncp_ts->nc_time;
	if (ticksp != NULL)
		*ticksp = ncp_ts->nc_ticks;
}

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");
#define	STATNODE_ULONG(name, descr)					\
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define	STATNODE_COUNTER(name, descr)					\
	static COUNTER_U64_DEFINE_EARLY(name);				\
	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
	    descr);
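/*
 * As an example, STATNODE_ULONG(numneg, "Number of negative cache entries")
 * below expands to:
 *
 *	SYSCTL_ULONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
 *	    "Number of negative cache entries");
 *
 * i.e. each stat becomes a read-only sysctl under vfs.cache; the counter
 * variant additionally defines the per-CPU counter(9) object itself.
 */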
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
STATNODE_COUNTER(nummiss, "Number of cache misses");
STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
/* These count for vn_getcwd(), too. */
STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
    "Number of successful removals after relocking");
static long zap_and_exit_bucket_fail;
STATNODE_ULONG(zap_and_exit_bucket_fail,
    "Number of times zap_and_exit failed to lock");
static long zap_and_exit_bucket_fail2;
STATNODE_ULONG(zap_and_exit_bucket_fail2,
    "Number of times zap_and_exit failed to lock");
static long cache_lock_vnodes_cel_3_failures;
STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");
STATNODE_ULONG(numhotneg, "Number of hot negative entries");
STATNODE_COUNTER(numneg_evicted,
    "Number of negative entries evicted when adding a new entry");
STATNODE_COUNTER(shrinking_skipped,
    "Number of times shrinking was already in progress");

static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
    char **freebuf, size_t *buflen);
static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, size_t *buflen);
static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

static int cache_yield;
SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
    "Number of times cache called yield");

static void __noinline
cache_maybe_yield(void)
{

	if (should_yield()) {
		cache_yield++;
		kern_yield(PRI_USER);
	}
}

static inline void
cache_assert_vlp_locked(struct mtx *vlp)
{

	if (vlp != NULL)
		mtx_assert(vlp, MA_OWNED);
}

static inline void
cache_assert_vnode_locked(struct vnode *vp)
{
	struct mtx *vlp;

	vlp = VP2VNODELOCK(vp);
	cache_assert_vlp_locked(vlp);
}

/*
 * TODO: With the value stored we can do better than computing the hash based
 * on the address and the choice of FNV should also be revisited.
 */
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}
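
/*
 * The effective key is (dvp, name): cache_prehash() seeds v_nchash from the
 * vnode pointer when the vnode is initialized, and cache_get_hash()
 * continues the same FNV-32 hash over the name using that seed, matching
 * the "indexed by hash value obtained from (dvp, name)" description above.
 */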

static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct rwlock *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp, int mode)
{
	struct rwlock *blp;

	blp = NCP2BUCKETLOCK(ncp);
	rw_assert(blp, mode);
}
#else
#define	cache_assert_bucket_locked(x, y) do { } while (0)
#endif

#define	cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wlock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wunlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 685 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 686 "hash table stats"); 687 688 static int 689 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 690 { 691 struct nchashhead *ncpp; 692 struct namecache *ncp; 693 int i, error, n_nchash, *cntbuf; 694 695 retry: 696 n_nchash = nchash + 1; /* nchash is max index, not count */ 697 if (req->oldptr == NULL) 698 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 699 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 700 cache_lock_all_buckets(); 701 if (n_nchash != nchash + 1) { 702 cache_unlock_all_buckets(); 703 free(cntbuf, M_TEMP); 704 goto retry; 705 } 706 /* Scan hash tables counting entries */ 707 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 708 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) 709 cntbuf[i]++; 710 cache_unlock_all_buckets(); 711 for (error = 0, i = 0; i < n_nchash; i++) 712 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 713 break; 714 free(cntbuf, M_TEMP); 715 return (error); 716 } 717 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 718 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 719 "nchash chain lengths"); 720 721 static int 722 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 723 { 724 int error; 725 struct nchashhead *ncpp; 726 struct namecache *ncp; 727 int n_nchash; 728 int count, maxlength, used, pct; 729 730 if (!req->oldptr) 731 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 732 733 cache_lock_all_buckets(); 734 n_nchash = nchash + 1; /* nchash is max index, not count */ 735 used = 0; 736 maxlength = 0; 737 738 /* Scan hash tables for applicable entries */ 739 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 740 count = 0; 741 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { 742 count++; 743 } 744 if (count) 745 used++; 746 if (maxlength < count) 747 maxlength = count; 748 } 749 n_nchash = nchash + 1; 750 cache_unlock_all_buckets(); 751 pct = (used * 100) / (n_nchash / 100); 752 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 753 if (error) 754 return (error); 755 error = SYSCTL_OUT(req, &used, sizeof(used)); 756 if (error) 757 return (error); 758 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 759 if (error) 760 return (error); 761 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 762 if (error) 763 return (error); 764 return (0); 765 } 766 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 767 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 768 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 769 #endif 770 771 /* 772 * Negative entries management 773 * 774 * A variation of LRU scheme is used. New entries are hashed into one of 775 * numneglists cold lists. Entries get promoted to the hot list on first hit. 776 * 777 * The shrinker will demote hot list head and evict from the cold list in a 778 * round-robin manner. 
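 *
 * There are numneglists (4) cold lists selected by NCP2NEGLIST() plus the
 * single ncneg_hot list.  cache_negative_hit() performs the promotion,
 * while cache_negative_zap_one() demotes the head of the hot list back to
 * its cold list and evicts one cold entry, serialized by ncneg_shrink_lock
 * so that only one thread shrinks at a time.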
779 */ 780 static void 781 cache_negative_init(struct namecache *ncp) 782 { 783 struct negstate *negstate; 784 785 ncp->nc_flag |= NCF_NEGATIVE; 786 negstate = NCP2NEGSTATE(ncp); 787 negstate->neg_flag = 0; 788 } 789 790 static void 791 cache_negative_hit(struct namecache *ncp) 792 { 793 struct neglist *neglist; 794 struct negstate *negstate; 795 796 negstate = NCP2NEGSTATE(ncp); 797 if ((negstate->neg_flag & NEG_HOT) != 0) 798 return; 799 neglist = NCP2NEGLIST(ncp); 800 mtx_lock(&ncneg_hot.nl_lock); 801 mtx_lock(&neglist->nl_lock); 802 if ((negstate->neg_flag & NEG_HOT) == 0) { 803 numhotneg++; 804 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 805 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 806 negstate->neg_flag |= NEG_HOT; 807 } 808 mtx_unlock(&neglist->nl_lock); 809 mtx_unlock(&ncneg_hot.nl_lock); 810 } 811 812 static void 813 cache_negative_insert(struct namecache *ncp) 814 { 815 struct neglist *neglist; 816 817 MPASS(ncp->nc_flag & NCF_NEGATIVE); 818 cache_assert_bucket_locked(ncp, RA_WLOCKED); 819 neglist = NCP2NEGLIST(ncp); 820 mtx_lock(&neglist->nl_lock); 821 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 822 mtx_unlock(&neglist->nl_lock); 823 atomic_add_rel_long(&numneg, 1); 824 } 825 826 static void 827 cache_negative_remove(struct namecache *ncp) 828 { 829 struct neglist *neglist; 830 struct negstate *negstate; 831 bool hot_locked = false; 832 bool list_locked = false; 833 834 cache_assert_bucket_locked(ncp, RA_WLOCKED); 835 neglist = NCP2NEGLIST(ncp); 836 negstate = NCP2NEGSTATE(ncp); 837 if ((negstate->neg_flag & NEG_HOT) != 0) { 838 hot_locked = true; 839 mtx_lock(&ncneg_hot.nl_lock); 840 if ((negstate->neg_flag & NEG_HOT) == 0) { 841 list_locked = true; 842 mtx_lock(&neglist->nl_lock); 843 } 844 } else { 845 list_locked = true; 846 mtx_lock(&neglist->nl_lock); 847 /* 848 * We may be racing against promotion in lockless lookup. 
849 */ 850 if ((negstate->neg_flag & NEG_HOT) != 0) { 851 mtx_unlock(&neglist->nl_lock); 852 hot_locked = true; 853 mtx_lock(&ncneg_hot.nl_lock); 854 mtx_lock(&neglist->nl_lock); 855 } 856 } 857 if ((negstate->neg_flag & NEG_HOT) != 0) { 858 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); 859 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 860 numhotneg--; 861 } else { 862 mtx_assert(&neglist->nl_lock, MA_OWNED); 863 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 864 } 865 if (list_locked) 866 mtx_unlock(&neglist->nl_lock); 867 if (hot_locked) 868 mtx_unlock(&ncneg_hot.nl_lock); 869 atomic_subtract_rel_long(&numneg, 1); 870 } 871 872 static void 873 cache_negative_shrink_select(struct namecache **ncpp, 874 struct neglist **neglistpp) 875 { 876 struct neglist *neglist; 877 struct namecache *ncp; 878 static u_int cycle; 879 u_int i; 880 881 *ncpp = ncp = NULL; 882 883 for (i = 0; i < numneglists; i++) { 884 neglist = &neglists[(cycle + i) % numneglists]; 885 if (TAILQ_FIRST(&neglist->nl_list) == NULL) 886 continue; 887 mtx_lock(&neglist->nl_lock); 888 ncp = TAILQ_FIRST(&neglist->nl_list); 889 if (ncp != NULL) 890 break; 891 mtx_unlock(&neglist->nl_lock); 892 } 893 894 *neglistpp = neglist; 895 *ncpp = ncp; 896 cycle++; 897 } 898 899 static void 900 cache_negative_zap_one(void) 901 { 902 struct namecache *ncp, *ncp2; 903 struct neglist *neglist; 904 struct negstate *negstate; 905 struct mtx *dvlp; 906 struct rwlock *blp; 907 908 if (mtx_owner(&ncneg_shrink_lock) != NULL || 909 !mtx_trylock(&ncneg_shrink_lock)) { 910 counter_u64_add(shrinking_skipped, 1); 911 return; 912 } 913 914 mtx_lock(&ncneg_hot.nl_lock); 915 ncp = TAILQ_FIRST(&ncneg_hot.nl_list); 916 if (ncp != NULL) { 917 neglist = NCP2NEGLIST(ncp); 918 negstate = NCP2NEGSTATE(ncp); 919 mtx_lock(&neglist->nl_lock); 920 MPASS((negstate->neg_flag & NEG_HOT) != 0); 921 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 922 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 923 negstate->neg_flag &= ~NEG_HOT; 924 numhotneg--; 925 mtx_unlock(&neglist->nl_lock); 926 } 927 mtx_unlock(&ncneg_hot.nl_lock); 928 929 cache_negative_shrink_select(&ncp, &neglist); 930 931 mtx_unlock(&ncneg_shrink_lock); 932 if (ncp == NULL) 933 return; 934 935 MPASS(ncp->nc_flag & NCF_NEGATIVE); 936 dvlp = VP2VNODELOCK(ncp->nc_dvp); 937 blp = NCP2BUCKETLOCK(ncp); 938 mtx_unlock(&neglist->nl_lock); 939 mtx_lock(dvlp); 940 rw_wlock(blp); 941 /* 942 * Enter SMR to safely check the negative list. 943 * Even if the found pointer matches, the entry may now be reallocated 944 * and used by a different vnode. 945 */ 946 vfs_smr_enter(); 947 ncp2 = TAILQ_FIRST(&neglist->nl_list); 948 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || 949 blp != NCP2BUCKETLOCK(ncp2)) { 950 vfs_smr_exit(); 951 ncp = NULL; 952 } else { 953 vfs_smr_exit(); 954 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp, 955 ncp->nc_name); 956 cache_zap_locked(ncp); 957 counter_u64_add(numneg_evicted, 1); 958 } 959 rw_wunlock(blp); 960 mtx_unlock(dvlp); 961 cache_free(ncp); 962 } 963 964 /* 965 * cache_zap_locked(): 966 * 967 * Removes a namecache entry from cache, whether it contains an actual 968 * pointer to a vnode or if it is just a negative cache entry. 
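 *
 * The caller must hold the vnode lock for nc_dvp (and for nc_vp if the
 * entry is positive) as well as the relevant bucket lock write-locked, as
 * asserted at the top of the function.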
969 */ 970 static void 971 cache_zap_locked(struct namecache *ncp) 972 { 973 struct nchashhead *ncpp; 974 975 if (!(ncp->nc_flag & NCF_NEGATIVE)) 976 cache_assert_vnode_locked(ncp->nc_vp); 977 cache_assert_vnode_locked(ncp->nc_dvp); 978 cache_assert_bucket_locked(ncp, RA_WLOCKED); 979 980 CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, 981 (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp); 982 983 cache_ncp_invalidate(ncp); 984 985 ncpp = NCP2BUCKET(ncp); 986 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); 987 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 988 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 989 ncp->nc_name, ncp->nc_vp); 990 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 991 if (ncp == ncp->nc_vp->v_cache_dd) { 992 vn_seqc_write_begin_unheld(ncp->nc_vp); 993 ncp->nc_vp->v_cache_dd = NULL; 994 vn_seqc_write_end(ncp->nc_vp); 995 } 996 } else { 997 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 998 ncp->nc_name); 999 cache_negative_remove(ncp); 1000 } 1001 if (ncp->nc_flag & NCF_ISDOTDOT) { 1002 if (ncp == ncp->nc_dvp->v_cache_dd) { 1003 vn_seqc_write_begin_unheld(ncp->nc_dvp); 1004 ncp->nc_dvp->v_cache_dd = NULL; 1005 vn_seqc_write_end(ncp->nc_dvp); 1006 } 1007 } else { 1008 LIST_REMOVE(ncp, nc_src); 1009 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 1010 ncp->nc_flag |= NCF_DVDROP; 1011 counter_u64_add(numcachehv, -1); 1012 } 1013 } 1014 atomic_subtract_rel_long(&numcache, 1); 1015 } 1016 1017 static void 1018 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 1019 { 1020 struct rwlock *blp; 1021 1022 MPASS(ncp->nc_dvp == vp); 1023 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1024 cache_assert_vnode_locked(vp); 1025 1026 blp = NCP2BUCKETLOCK(ncp); 1027 rw_wlock(blp); 1028 cache_zap_locked(ncp); 1029 rw_wunlock(blp); 1030 } 1031 1032 static bool 1033 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 1034 struct mtx **vlpp) 1035 { 1036 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1037 struct rwlock *blp; 1038 1039 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1040 cache_assert_vnode_locked(vp); 1041 1042 if (ncp->nc_flag & NCF_NEGATIVE) { 1043 if (*vlpp != NULL) { 1044 mtx_unlock(*vlpp); 1045 *vlpp = NULL; 1046 } 1047 cache_zap_negative_locked_vnode_kl(ncp, vp); 1048 return (true); 1049 } 1050 1051 pvlp = VP2VNODELOCK(vp); 1052 blp = NCP2BUCKETLOCK(ncp); 1053 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1054 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1055 1056 if (*vlpp == vlp1 || *vlpp == vlp2) { 1057 to_unlock = *vlpp; 1058 *vlpp = NULL; 1059 } else { 1060 if (*vlpp != NULL) { 1061 mtx_unlock(*vlpp); 1062 *vlpp = NULL; 1063 } 1064 cache_sort_vnodes(&vlp1, &vlp2); 1065 if (vlp1 == pvlp) { 1066 mtx_lock(vlp2); 1067 to_unlock = vlp2; 1068 } else { 1069 if (!mtx_trylock(vlp1)) 1070 goto out_relock; 1071 to_unlock = vlp1; 1072 } 1073 } 1074 rw_wlock(blp); 1075 cache_zap_locked(ncp); 1076 rw_wunlock(blp); 1077 if (to_unlock != NULL) 1078 mtx_unlock(to_unlock); 1079 return (true); 1080 1081 out_relock: 1082 mtx_unlock(vlp2); 1083 mtx_lock(vlp1); 1084 mtx_lock(vlp2); 1085 MPASS(*vlpp == NULL); 1086 *vlpp = vlp1; 1087 return (false); 1088 } 1089 1090 static int __noinline 1091 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) 1092 { 1093 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1094 struct rwlock *blp; 1095 int error = 0; 1096 1097 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1098 cache_assert_vnode_locked(vp); 1099 1100 pvlp = VP2VNODELOCK(vp); 1101 if (ncp->nc_flag & NCF_NEGATIVE) { 1102 cache_zap_negative_locked_vnode_kl(ncp, vp); 1103 goto 
out; 1104 } 1105 1106 blp = NCP2BUCKETLOCK(ncp); 1107 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1108 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1109 cache_sort_vnodes(&vlp1, &vlp2); 1110 if (vlp1 == pvlp) { 1111 mtx_lock(vlp2); 1112 to_unlock = vlp2; 1113 } else { 1114 if (!mtx_trylock(vlp1)) { 1115 error = EAGAIN; 1116 goto out; 1117 } 1118 to_unlock = vlp1; 1119 } 1120 rw_wlock(blp); 1121 cache_zap_locked(ncp); 1122 rw_wunlock(blp); 1123 mtx_unlock(to_unlock); 1124 out: 1125 mtx_unlock(pvlp); 1126 return (error); 1127 } 1128 1129 /* 1130 * If trylocking failed we can get here. We know enough to take all needed locks 1131 * in the right order and re-lookup the entry. 1132 */ 1133 static int 1134 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1135 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1136 struct rwlock *blp) 1137 { 1138 struct namecache *rncp; 1139 1140 cache_assert_bucket_locked(ncp, RA_UNLOCKED); 1141 1142 cache_sort_vnodes(&dvlp, &vlp); 1143 cache_lock_vnodes(dvlp, vlp); 1144 rw_wlock(blp); 1145 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1146 if (rncp == ncp && rncp->nc_dvp == dvp && 1147 rncp->nc_nlen == cnp->cn_namelen && 1148 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1149 break; 1150 } 1151 if (rncp != NULL) { 1152 cache_zap_locked(rncp); 1153 rw_wunlock(blp); 1154 cache_unlock_vnodes(dvlp, vlp); 1155 counter_u64_add(zap_and_exit_bucket_relock_success, 1); 1156 return (0); 1157 } 1158 1159 rw_wunlock(blp); 1160 cache_unlock_vnodes(dvlp, vlp); 1161 return (EAGAIN); 1162 } 1163 1164 static int __noinline 1165 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1166 uint32_t hash, struct rwlock *blp) 1167 { 1168 struct mtx *dvlp, *vlp; 1169 struct vnode *dvp; 1170 1171 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1172 1173 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1174 vlp = NULL; 1175 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1176 vlp = VP2VNODELOCK(ncp->nc_vp); 1177 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1178 cache_zap_locked(ncp); 1179 rw_wunlock(blp); 1180 cache_unlock_vnodes(dvlp, vlp); 1181 return (0); 1182 } 1183 1184 dvp = ncp->nc_dvp; 1185 rw_wunlock(blp); 1186 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1187 } 1188 1189 static int __noinline 1190 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1191 uint32_t hash, struct rwlock *blp) 1192 { 1193 struct mtx *dvlp, *vlp; 1194 struct vnode *dvp; 1195 1196 cache_assert_bucket_locked(ncp, RA_RLOCKED); 1197 1198 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1199 vlp = NULL; 1200 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1201 vlp = VP2VNODELOCK(ncp->nc_vp); 1202 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1203 rw_runlock(blp); 1204 rw_wlock(blp); 1205 cache_zap_locked(ncp); 1206 rw_wunlock(blp); 1207 cache_unlock_vnodes(dvlp, vlp); 1208 return (0); 1209 } 1210 1211 dvp = ncp->nc_dvp; 1212 rw_runlock(blp); 1213 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1214 } 1215 1216 static int 1217 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, 1218 struct mtx **vlpp1, struct mtx **vlpp2) 1219 { 1220 struct mtx *dvlp, *vlp; 1221 1222 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1223 1224 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1225 vlp = NULL; 1226 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1227 vlp = VP2VNODELOCK(ncp->nc_vp); 1228 cache_sort_vnodes(&dvlp, &vlp); 1229 1230 if (*vlpp1 == dvlp && *vlpp2 == vlp) { 1231 cache_zap_locked(ncp); 1232 cache_unlock_vnodes(dvlp, vlp); 1233 *vlpp1 = NULL; 
1234 *vlpp2 = NULL; 1235 return (0); 1236 } 1237 1238 if (*vlpp1 != NULL) 1239 mtx_unlock(*vlpp1); 1240 if (*vlpp2 != NULL) 1241 mtx_unlock(*vlpp2); 1242 *vlpp1 = NULL; 1243 *vlpp2 = NULL; 1244 1245 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1246 cache_zap_locked(ncp); 1247 cache_unlock_vnodes(dvlp, vlp); 1248 return (0); 1249 } 1250 1251 rw_wunlock(blp); 1252 *vlpp1 = dvlp; 1253 *vlpp2 = vlp; 1254 if (*vlpp1 != NULL) 1255 mtx_lock(*vlpp1); 1256 mtx_lock(*vlpp2); 1257 rw_wlock(blp); 1258 return (EAGAIN); 1259 } 1260 1261 static void 1262 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) 1263 { 1264 1265 if (blp != NULL) { 1266 rw_runlock(blp); 1267 } else { 1268 mtx_unlock(vlp); 1269 } 1270 } 1271 1272 static int __noinline 1273 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1274 struct timespec *tsp, int *ticksp) 1275 { 1276 int ltype; 1277 1278 *vpp = dvp; 1279 CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", 1280 dvp, cnp->cn_nameptr); 1281 counter_u64_add(dothits, 1); 1282 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1283 if (tsp != NULL) 1284 timespecclear(tsp); 1285 if (ticksp != NULL) 1286 *ticksp = ticks; 1287 vrefact(*vpp); 1288 /* 1289 * When we lookup "." we still can be asked to lock it 1290 * differently... 1291 */ 1292 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1293 if (ltype != VOP_ISLOCKED(*vpp)) { 1294 if (ltype == LK_EXCLUSIVE) { 1295 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1296 if (VN_IS_DOOMED((*vpp))) { 1297 /* forced unmount */ 1298 vrele(*vpp); 1299 *vpp = NULL; 1300 return (ENOENT); 1301 } 1302 } else 1303 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1304 } 1305 return (-1); 1306 } 1307 1308 static __noinline int 1309 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp, 1310 struct componentname *cnp, struct timespec *tsp, int *ticksp) 1311 { 1312 struct namecache *ncp; 1313 struct rwlock *blp; 1314 struct mtx *dvlp, *dvlp2; 1315 uint32_t hash; 1316 int error; 1317 1318 if (cnp->cn_namelen == 2 && 1319 cnp->cn_nameptr[0] == '.' 
&& cnp->cn_nameptr[1] == '.') { 1320 counter_u64_add(dotdothits, 1); 1321 dvlp = VP2VNODELOCK(dvp); 1322 dvlp2 = NULL; 1323 mtx_lock(dvlp); 1324 retry_dotdot: 1325 ncp = dvp->v_cache_dd; 1326 if (ncp == NULL) { 1327 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1328 "..", NULL); 1329 mtx_unlock(dvlp); 1330 if (dvlp2 != NULL) 1331 mtx_unlock(dvlp2); 1332 return (0); 1333 } 1334 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1335 if (ncp->nc_dvp != dvp) 1336 panic("dvp %p v_cache_dd %p\n", dvp, ncp); 1337 if (!cache_zap_locked_vnode_kl2(ncp, 1338 dvp, &dvlp2)) 1339 goto retry_dotdot; 1340 MPASS(dvp->v_cache_dd == NULL); 1341 mtx_unlock(dvlp); 1342 if (dvlp2 != NULL) 1343 mtx_unlock(dvlp2); 1344 cache_free(ncp); 1345 } else { 1346 vn_seqc_write_begin(dvp); 1347 dvp->v_cache_dd = NULL; 1348 vn_seqc_write_end(dvp); 1349 mtx_unlock(dvlp); 1350 if (dvlp2 != NULL) 1351 mtx_unlock(dvlp2); 1352 } 1353 return (0); 1354 } 1355 1356 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1357 blp = HASH2BUCKETLOCK(hash); 1358 retry: 1359 if (CK_SLIST_EMPTY(NCHHASH(hash))) 1360 goto out_no_entry; 1361 1362 rw_wlock(blp); 1363 1364 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1365 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1366 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1367 break; 1368 } 1369 1370 /* We failed to find an entry */ 1371 if (ncp == NULL) { 1372 rw_wunlock(blp); 1373 goto out_no_entry; 1374 } 1375 1376 error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp); 1377 if (__predict_false(error != 0)) { 1378 zap_and_exit_bucket_fail++; 1379 cache_maybe_yield(); 1380 goto retry; 1381 } 1382 counter_u64_add(numposzaps, 1); 1383 cache_free(ncp); 1384 return (0); 1385 out_no_entry: 1386 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); 1387 counter_u64_add(nummisszap, 1); 1388 return (0); 1389 } 1390 1391 /** 1392 * Lookup a name in the name cache 1393 * 1394 * # Arguments 1395 * 1396 * - dvp: Parent directory in which to search. 1397 * - vpp: Return argument. Will contain desired vnode on cache hit. 1398 * - cnp: Parameters of the name search. The most interesting bits of 1399 * the cn_flags field have the following meanings: 1400 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1401 * it up. 1402 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1403 * - tsp: Return storage for cache timestamp. On a successful (positive 1404 * or negative) lookup, tsp will be filled with any timespec that 1405 * was stored when this cache entry was created. However, it will 1406 * be clear for "." entries. 1407 * - ticks: Return storage for alternate cache timestamp. On a successful 1408 * (positive or negative) lookup, it will contain the ticks value 1409 * that was current when the cache entry was created, unless cnp 1410 * was ".". 1411 * 1412 * # Returns 1413 * 1414 * - -1: A positive cache hit. vpp will contain the desired vnode. 1415 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1416 * to a forced unmount. vpp will not be modified. If the entry 1417 * is a whiteout, then the ISWHITEOUT flag will be set in 1418 * cnp->cn_flags. 1419 * - 0: A cache miss. vpp will not be modified. 1420 * 1421 * # Locking 1422 * 1423 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1424 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1425 * lock is not recursively acquired. 
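 *
 * # Usage
 *
 * Most filesystems do not call this directly; they install vfs_cache_lookup()
 * as their vop_lookup, which roughly does the following (illustrative only):
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	if (error == 0)
 *		error = VOP_CACHEDLOOKUP(dvp, vpp, cnp);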
1426 */ 1427 int 1428 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1429 struct timespec *tsp, int *ticksp) 1430 { 1431 struct namecache_ts *ncp_ts; 1432 struct namecache *ncp; 1433 struct negstate *negstate; 1434 struct rwlock *blp; 1435 struct mtx *dvlp; 1436 uint32_t hash; 1437 enum vgetstate vs; 1438 int error, ltype; 1439 bool try_smr, doing_smr, whiteout; 1440 1441 #ifdef DEBUG_CACHE 1442 if (__predict_false(!doingcache)) { 1443 cnp->cn_flags &= ~MAKEENTRY; 1444 return (0); 1445 } 1446 #endif 1447 1448 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) 1449 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1450 1451 if ((cnp->cn_flags & MAKEENTRY) == 0) 1452 return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp)); 1453 1454 try_smr = true; 1455 if (cnp->cn_nameiop == CREATE) 1456 try_smr = false; 1457 retry: 1458 doing_smr = false; 1459 blp = NULL; 1460 dvlp = NULL; 1461 error = 0; 1462 if (cnp->cn_namelen == 2 && 1463 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1464 counter_u64_add(dotdothits, 1); 1465 dvlp = VP2VNODELOCK(dvp); 1466 mtx_lock(dvlp); 1467 ncp = dvp->v_cache_dd; 1468 if (ncp == NULL) { 1469 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1470 "..", NULL); 1471 mtx_unlock(dvlp); 1472 return (0); 1473 } 1474 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1475 if (ncp->nc_flag & NCF_NEGATIVE) 1476 *vpp = NULL; 1477 else 1478 *vpp = ncp->nc_vp; 1479 } else 1480 *vpp = ncp->nc_dvp; 1481 /* Return failure if negative entry was found. */ 1482 if (*vpp == NULL) 1483 goto negative_success; 1484 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", 1485 dvp, cnp->cn_nameptr, *vpp); 1486 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", 1487 *vpp); 1488 cache_out_ts(ncp, tsp, ticksp); 1489 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1490 NCF_DTS && tsp != NULL) { 1491 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1492 *tsp = ncp_ts->nc_dotdottime; 1493 } 1494 goto success; 1495 } 1496 1497 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1498 retry_hashed: 1499 if (try_smr) { 1500 vfs_smr_enter(); 1501 doing_smr = true; 1502 try_smr = false; 1503 } else { 1504 blp = HASH2BUCKETLOCK(hash); 1505 rw_rlock(blp); 1506 } 1507 1508 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1509 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1510 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1511 break; 1512 } 1513 1514 /* We failed to find an entry */ 1515 if (__predict_false(ncp == NULL)) { 1516 if (doing_smr) 1517 vfs_smr_exit(); 1518 else 1519 rw_runlock(blp); 1520 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1521 NULL); 1522 counter_u64_add(nummiss, 1); 1523 return (0); 1524 } 1525 1526 if (ncp->nc_flag & NCF_NEGATIVE) 1527 goto negative_success; 1528 1529 /* We found a "positive" match, return the vnode */ 1530 counter_u64_add(numposhits, 1); 1531 *vpp = ncp->nc_vp; 1532 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", 1533 dvp, cnp->cn_nameptr, *vpp, ncp); 1534 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, 1535 *vpp); 1536 cache_out_ts(ncp, tsp, ticksp); 1537 success: 1538 /* 1539 * On success we return a locked and ref'd vnode as per the lookup 1540 * protocol. 
1541 */ 1542 MPASS(dvp != *vpp); 1543 ltype = 0; /* silence gcc warning */ 1544 if (cnp->cn_flags & ISDOTDOT) { 1545 ltype = VOP_ISLOCKED(dvp); 1546 VOP_UNLOCK(dvp); 1547 } 1548 if (doing_smr) { 1549 if (!cache_ncp_canuse(ncp)) { 1550 vfs_smr_exit(); 1551 *vpp = NULL; 1552 goto retry; 1553 } 1554 vs = vget_prep_smr(*vpp); 1555 vfs_smr_exit(); 1556 if (__predict_false(vs == VGET_NONE)) { 1557 *vpp = NULL; 1558 goto retry; 1559 } 1560 } else { 1561 vs = vget_prep(*vpp); 1562 cache_lookup_unlock(blp, dvlp); 1563 } 1564 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1565 if (cnp->cn_flags & ISDOTDOT) { 1566 vn_lock(dvp, ltype | LK_RETRY); 1567 if (VN_IS_DOOMED(dvp)) { 1568 if (error == 0) 1569 vput(*vpp); 1570 *vpp = NULL; 1571 return (ENOENT); 1572 } 1573 } 1574 if (error) { 1575 *vpp = NULL; 1576 goto retry; 1577 } 1578 if ((cnp->cn_flags & ISLASTCN) && 1579 (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { 1580 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); 1581 } 1582 return (-1); 1583 1584 negative_success: 1585 /* We found a negative match, and want to create it, so purge */ 1586 if (cnp->cn_nameiop == CREATE) { 1587 MPASS(!doing_smr); 1588 counter_u64_add(numnegzaps, 1); 1589 goto zap_and_exit; 1590 } 1591 1592 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 1593 cache_out_ts(ncp, tsp, ticksp); 1594 counter_u64_add(numneghits, 1); 1595 whiteout = (ncp->nc_flag & NCF_WHITE); 1596 1597 if (doing_smr) { 1598 /* 1599 * We need to take locks to promote an entry. 1600 */ 1601 negstate = NCP2NEGSTATE(ncp); 1602 if ((negstate->neg_flag & NEG_HOT) == 0 || 1603 !cache_ncp_canuse(ncp)) { 1604 vfs_smr_exit(); 1605 doing_smr = false; 1606 goto retry_hashed; 1607 } 1608 vfs_smr_exit(); 1609 } else { 1610 cache_negative_hit(ncp); 1611 cache_lookup_unlock(blp, dvlp); 1612 } 1613 if (whiteout) 1614 cnp->cn_flags |= ISWHITEOUT; 1615 return (ENOENT); 1616 1617 zap_and_exit: 1618 MPASS(!doing_smr); 1619 if (blp != NULL) 1620 error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp); 1621 else 1622 error = cache_zap_locked_vnode(ncp, dvp); 1623 if (__predict_false(error != 0)) { 1624 zap_and_exit_bucket_fail2++; 1625 cache_maybe_yield(); 1626 goto retry; 1627 } 1628 cache_free(ncp); 1629 return (0); 1630 } 1631 1632 struct celockstate { 1633 struct mtx *vlp[3]; 1634 struct rwlock *blp[2]; 1635 }; 1636 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1637 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1638 1639 static inline void 1640 cache_celockstate_init(struct celockstate *cel) 1641 { 1642 1643 bzero(cel, sizeof(*cel)); 1644 } 1645 1646 static void 1647 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1648 struct vnode *dvp) 1649 { 1650 struct mtx *vlp1, *vlp2; 1651 1652 MPASS(cel->vlp[0] == NULL); 1653 MPASS(cel->vlp[1] == NULL); 1654 MPASS(cel->vlp[2] == NULL); 1655 1656 MPASS(vp != NULL || dvp != NULL); 1657 1658 vlp1 = VP2VNODELOCK(vp); 1659 vlp2 = VP2VNODELOCK(dvp); 1660 cache_sort_vnodes(&vlp1, &vlp2); 1661 1662 if (vlp1 != NULL) { 1663 mtx_lock(vlp1); 1664 cel->vlp[0] = vlp1; 1665 } 1666 mtx_lock(vlp2); 1667 cel->vlp[1] = vlp2; 1668 } 1669 1670 static void 1671 cache_unlock_vnodes_cel(struct celockstate *cel) 1672 { 1673 1674 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1675 1676 if (cel->vlp[0] != NULL) 1677 mtx_unlock(cel->vlp[0]); 1678 if (cel->vlp[1] != NULL) 1679 mtx_unlock(cel->vlp[1]); 1680 if (cel->vlp[2] != NULL) 1681 mtx_unlock(cel->vlp[2]); 1682 } 1683 1684 static bool 1685 cache_lock_vnodes_cel_3(struct celockstate *cel, struct 
vnode *vp) 1686 { 1687 struct mtx *vlp; 1688 bool ret; 1689 1690 cache_assert_vlp_locked(cel->vlp[0]); 1691 cache_assert_vlp_locked(cel->vlp[1]); 1692 MPASS(cel->vlp[2] == NULL); 1693 1694 MPASS(vp != NULL); 1695 vlp = VP2VNODELOCK(vp); 1696 1697 ret = true; 1698 if (vlp >= cel->vlp[1]) { 1699 mtx_lock(vlp); 1700 } else { 1701 if (mtx_trylock(vlp)) 1702 goto out; 1703 cache_lock_vnodes_cel_3_failures++; 1704 cache_unlock_vnodes_cel(cel); 1705 if (vlp < cel->vlp[0]) { 1706 mtx_lock(vlp); 1707 mtx_lock(cel->vlp[0]); 1708 mtx_lock(cel->vlp[1]); 1709 } else { 1710 if (cel->vlp[0] != NULL) 1711 mtx_lock(cel->vlp[0]); 1712 mtx_lock(vlp); 1713 mtx_lock(cel->vlp[1]); 1714 } 1715 ret = false; 1716 } 1717 out: 1718 cel->vlp[2] = vlp; 1719 return (ret); 1720 } 1721 1722 static void 1723 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, 1724 struct rwlock *blp2) 1725 { 1726 1727 MPASS(cel->blp[0] == NULL); 1728 MPASS(cel->blp[1] == NULL); 1729 1730 cache_sort_vnodes(&blp1, &blp2); 1731 1732 if (blp1 != NULL) { 1733 rw_wlock(blp1); 1734 cel->blp[0] = blp1; 1735 } 1736 rw_wlock(blp2); 1737 cel->blp[1] = blp2; 1738 } 1739 1740 static void 1741 cache_unlock_buckets_cel(struct celockstate *cel) 1742 { 1743 1744 if (cel->blp[0] != NULL) 1745 rw_wunlock(cel->blp[0]); 1746 rw_wunlock(cel->blp[1]); 1747 } 1748 1749 /* 1750 * Lock part of the cache affected by the insertion. 1751 * 1752 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1753 * However, insertion can result in removal of an old entry. In this 1754 * case we have an additional vnode and bucketlock pair to lock. If the 1755 * entry is negative, ncelock is locked instead of the vnode. 1756 * 1757 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1758 * preserving the locking order (smaller address first). 1759 */ 1760 static void 1761 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1762 uint32_t hash) 1763 { 1764 struct namecache *ncp; 1765 struct rwlock *blps[2]; 1766 1767 blps[0] = HASH2BUCKETLOCK(hash); 1768 for (;;) { 1769 blps[1] = NULL; 1770 cache_lock_vnodes_cel(cel, dvp, vp); 1771 if (vp == NULL || vp->v_type != VDIR) 1772 break; 1773 ncp = vp->v_cache_dd; 1774 if (ncp == NULL) 1775 break; 1776 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1777 break; 1778 MPASS(ncp->nc_dvp == vp); 1779 blps[1] = NCP2BUCKETLOCK(ncp); 1780 if (ncp->nc_flag & NCF_NEGATIVE) 1781 break; 1782 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1783 break; 1784 /* 1785 * All vnodes got re-locked. Re-validate the state and if 1786 * nothing changed we are done. Otherwise restart. 
1787 */ 1788 if (ncp == vp->v_cache_dd && 1789 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1790 blps[1] == NCP2BUCKETLOCK(ncp) && 1791 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1792 break; 1793 cache_unlock_vnodes_cel(cel); 1794 cel->vlp[0] = NULL; 1795 cel->vlp[1] = NULL; 1796 cel->vlp[2] = NULL; 1797 } 1798 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1799 } 1800 1801 static void 1802 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1803 uint32_t hash) 1804 { 1805 struct namecache *ncp; 1806 struct rwlock *blps[2]; 1807 1808 blps[0] = HASH2BUCKETLOCK(hash); 1809 for (;;) { 1810 blps[1] = NULL; 1811 cache_lock_vnodes_cel(cel, dvp, vp); 1812 ncp = dvp->v_cache_dd; 1813 if (ncp == NULL) 1814 break; 1815 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1816 break; 1817 MPASS(ncp->nc_dvp == dvp); 1818 blps[1] = NCP2BUCKETLOCK(ncp); 1819 if (ncp->nc_flag & NCF_NEGATIVE) 1820 break; 1821 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1822 break; 1823 if (ncp == dvp->v_cache_dd && 1824 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1825 blps[1] == NCP2BUCKETLOCK(ncp) && 1826 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1827 break; 1828 cache_unlock_vnodes_cel(cel); 1829 cel->vlp[0] = NULL; 1830 cel->vlp[1] = NULL; 1831 cel->vlp[2] = NULL; 1832 } 1833 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1834 } 1835 1836 static void 1837 cache_enter_unlock(struct celockstate *cel) 1838 { 1839 1840 cache_unlock_buckets_cel(cel); 1841 cache_unlock_vnodes_cel(cel); 1842 } 1843 1844 static void __noinline 1845 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 1846 struct componentname *cnp) 1847 { 1848 struct celockstate cel; 1849 struct namecache *ncp; 1850 uint32_t hash; 1851 int len; 1852 1853 if (dvp->v_cache_dd == NULL) 1854 return; 1855 len = cnp->cn_namelen; 1856 cache_celockstate_init(&cel); 1857 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1858 cache_enter_lock_dd(&cel, dvp, vp, hash); 1859 vn_seqc_write_begin(dvp); 1860 ncp = dvp->v_cache_dd; 1861 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 1862 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 1863 cache_zap_locked(ncp); 1864 } else { 1865 ncp = NULL; 1866 } 1867 dvp->v_cache_dd = NULL; 1868 vn_seqc_write_end(dvp); 1869 cache_enter_unlock(&cel); 1870 cache_free(ncp); 1871 } 1872 1873 /* 1874 * Add an entry to the cache. 1875 */ 1876 void 1877 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 1878 struct timespec *tsp, struct timespec *dtsp) 1879 { 1880 struct celockstate cel; 1881 struct namecache *ncp, *n2, *ndd; 1882 struct namecache_ts *ncp_ts, *n2_ts; 1883 struct nchashhead *ncpp; 1884 uint32_t hash; 1885 int flag; 1886 int len; 1887 u_long lnumcache; 1888 1889 CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); 1890 VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp, 1891 ("cache_enter: Adding a doomed vnode")); 1892 VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp, 1893 ("cache_enter: Doomed vnode used as src")); 1894 1895 #ifdef DEBUG_CACHE 1896 if (__predict_false(!doingcache)) 1897 return; 1898 #endif 1899 1900 flag = 0; 1901 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1902 if (cnp->cn_namelen == 1) 1903 return; 1904 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 1905 cache_enter_dotdot_prep(dvp, vp, cnp); 1906 flag = NCF_ISDOTDOT; 1907 } 1908 } 1909 1910 /* 1911 * Avoid blowout in namecache entries. 
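 * The count is bumped up front and rolled back if it would exceed ncsize,
 * in which case the new entry is simply not cached (counted in numdrops).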
1912 */ 1913 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1914 if (__predict_false(lnumcache >= ncsize)) { 1915 atomic_add_long(&numcache, -1); 1916 counter_u64_add(numdrops, 1); 1917 return; 1918 } 1919 1920 cache_celockstate_init(&cel); 1921 ndd = NULL; 1922 ncp_ts = NULL; 1923 1924 /* 1925 * Calculate the hash key and setup as much of the new 1926 * namecache entry as possible before acquiring the lock. 1927 */ 1928 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1929 ncp->nc_flag = flag | NCF_WIP; 1930 ncp->nc_vp = vp; 1931 if (vp == NULL) 1932 cache_negative_init(ncp); 1933 ncp->nc_dvp = dvp; 1934 if (tsp != NULL) { 1935 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1936 ncp_ts->nc_time = *tsp; 1937 ncp_ts->nc_ticks = ticks; 1938 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1939 if (dtsp != NULL) { 1940 ncp_ts->nc_dotdottime = *dtsp; 1941 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1942 } 1943 } 1944 len = ncp->nc_nlen = cnp->cn_namelen; 1945 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1946 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 1947 ncp->nc_name[len] = '\0'; 1948 cache_enter_lock(&cel, dvp, vp, hash); 1949 1950 /* 1951 * See if this vnode or negative entry is already in the cache 1952 * with this name. This can happen with concurrent lookups of 1953 * the same path name. 1954 */ 1955 ncpp = NCHHASH(hash); 1956 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 1957 if (n2->nc_dvp == dvp && 1958 n2->nc_nlen == cnp->cn_namelen && 1959 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1960 if (tsp != NULL) { 1961 KASSERT((n2->nc_flag & NCF_TS) != 0, 1962 ("no NCF_TS")); 1963 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1964 n2_ts->nc_time = ncp_ts->nc_time; 1965 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1966 if (dtsp != NULL) { 1967 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1968 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1969 } 1970 } 1971 goto out_unlock_free; 1972 } 1973 } 1974 1975 if (flag == NCF_ISDOTDOT) { 1976 /* 1977 * See if we are trying to add .. entry, but some other lookup 1978 * has populated v_cache_dd pointer already. 1979 */ 1980 if (dvp->v_cache_dd != NULL) 1981 goto out_unlock_free; 1982 KASSERT(vp == NULL || vp->v_type == VDIR, 1983 ("wrong vnode type %p", vp)); 1984 vn_seqc_write_begin(dvp); 1985 dvp->v_cache_dd = ncp; 1986 vn_seqc_write_end(dvp); 1987 } 1988 1989 if (vp != NULL) { 1990 if (vp->v_type == VDIR) { 1991 if (flag != NCF_ISDOTDOT) { 1992 /* 1993 * For this case, the cache entry maps both the 1994 * directory name in it and the name ".." for the 1995 * directory's parent. 1996 */ 1997 vn_seqc_write_begin(vp); 1998 if ((ndd = vp->v_cache_dd) != NULL) { 1999 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2000 cache_zap_locked(ndd); 2001 else 2002 ndd = NULL; 2003 } 2004 vp->v_cache_dd = ncp; 2005 vn_seqc_write_end(vp); 2006 } 2007 } else { 2008 if (vp->v_cache_dd != NULL) { 2009 vn_seqc_write_begin(vp); 2010 vp->v_cache_dd = NULL; 2011 vn_seqc_write_end(vp); 2012 } 2013 } 2014 } 2015 2016 if (flag != NCF_ISDOTDOT) { 2017 if (LIST_EMPTY(&dvp->v_cache_src)) { 2018 vhold(dvp); 2019 counter_u64_add(numcachehv, 1); 2020 } 2021 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2022 } 2023 2024 /* 2025 * If the entry is "negative", we place it into the 2026 * "negative" cache queue, otherwise, we place it into the 2027 * destination vnode's cache entries queue. 
2028 */ 2029 if (vp != NULL) { 2030 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2031 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2032 vp); 2033 } else { 2034 if (cnp->cn_flags & ISWHITEOUT) 2035 ncp->nc_flag |= NCF_WHITE; 2036 cache_negative_insert(ncp); 2037 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2038 ncp->nc_name); 2039 } 2040 2041 /* 2042 * Insert the new namecache entry into the appropriate chain 2043 * within the cache entries table. 2044 */ 2045 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2046 2047 atomic_thread_fence_rel(); 2048 /* 2049 * Mark the entry as fully constructed. 2050 * It is immutable past this point until its removal. 2051 */ 2052 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2053 2054 cache_enter_unlock(&cel); 2055 if (numneg * ncnegfactor > lnumcache) 2056 cache_negative_zap_one(); 2057 cache_free(ndd); 2058 return; 2059 out_unlock_free: 2060 cache_enter_unlock(&cel); 2061 atomic_add_long(&numcache, -1); 2062 cache_free(ncp); 2063 return; 2064 } 2065 2066 static u_int 2067 cache_roundup_2(u_int val) 2068 { 2069 u_int res; 2070 2071 for (res = 1; res <= val; res <<= 1) 2072 continue; 2073 2074 return (res); 2075 } 2076 2077 static struct nchashhead * 2078 nchinittbl(u_long elements, u_long *hashmask) 2079 { 2080 struct nchashhead *hashtbl; 2081 u_long hashsize, i; 2082 2083 hashsize = cache_roundup_2(elements) / 2; 2084 2085 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2086 for (i = 0; i < hashsize; i++) 2087 CK_SLIST_INIT(&hashtbl[i]); 2088 *hashmask = hashsize - 1; 2089 return (hashtbl); 2090 } 2091 2092 static void 2093 ncfreetbl(struct nchashhead *hashtbl) 2094 { 2095 2096 free(hashtbl, M_VFSCACHE); 2097 } 2098 2099 /* 2100 * Name cache initialization, from vfs_init() when we are booting 2101 */ 2102 static void 2103 nchinit(void *dummy __unused) 2104 { 2105 u_int i; 2106 2107 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2108 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2109 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2110 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2111 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2112 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2113 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2114 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2115 2116 VFS_SMR_ZONE_SET(cache_zone_small); 2117 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2118 VFS_SMR_ZONE_SET(cache_zone_large); 2119 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2120 2121 ncsize = desiredvnodes * ncsizefactor; 2122 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2123 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2124 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2125 ncbuckethash = 7; 2126 if (ncbuckethash > nchash) 2127 ncbuckethash = nchash; 2128 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2129 M_WAITOK | M_ZERO); 2130 for (i = 0; i < numbucketlocks; i++) 2131 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 2132 ncvnodehash = ncbuckethash; 2133 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2134 M_WAITOK | M_ZERO); 2135 for (i = 0; i < numvnodelocks; i++) 2136 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2137 ncpurgeminvnodes = numbucketlocks * 2; 2138 2139 neglists = malloc(sizeof(*neglists) * 
numneglists, M_VFSCACHE, 2140 M_WAITOK | M_ZERO); 2141 for (i = 0; i < numneglists; i++) { 2142 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2143 TAILQ_INIT(&neglists[i].nl_list); 2144 } 2145 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 2146 TAILQ_INIT(&ncneg_hot.nl_list); 2147 2148 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2149 } 2150 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2151 2152 void 2153 cache_vnode_init(struct vnode *vp) 2154 { 2155 2156 LIST_INIT(&vp->v_cache_src); 2157 TAILQ_INIT(&vp->v_cache_dst); 2158 vp->v_cache_dd = NULL; 2159 cache_prehash(vp); 2160 } 2161 2162 void 2163 cache_changesize(u_long newmaxvnodes) 2164 { 2165 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2166 u_long new_nchash, old_nchash; 2167 struct namecache *ncp; 2168 uint32_t hash; 2169 u_long newncsize; 2170 int i; 2171 2172 newncsize = newmaxvnodes * ncsizefactor; 2173 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2174 if (newmaxvnodes < numbucketlocks) 2175 newmaxvnodes = numbucketlocks; 2176 2177 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2178 /* If same hash table size, nothing to do */ 2179 if (nchash == new_nchash) { 2180 ncfreetbl(new_nchashtbl); 2181 return; 2182 } 2183 /* 2184 * Move everything from the old hash table to the new table. 2185 * None of the namecache entries in the table can be removed 2186 * because to do so, they have to be removed from the hash table. 2187 */ 2188 cache_lock_all_vnodes(); 2189 cache_lock_all_buckets(); 2190 old_nchashtbl = nchashtbl; 2191 old_nchash = nchash; 2192 nchashtbl = new_nchashtbl; 2193 nchash = new_nchash; 2194 for (i = 0; i <= old_nchash; i++) { 2195 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2196 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2197 ncp->nc_dvp); 2198 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2199 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2200 } 2201 } 2202 ncsize = newncsize; 2203 cache_unlock_all_buckets(); 2204 cache_unlock_all_vnodes(); 2205 ncfreetbl(old_nchashtbl); 2206 } 2207 2208 /* 2209 * Invalidate all entries from and to a particular vnode. 
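 * Entries are unhooked while the relevant vnode locks are held and batched
 * on a local list; the cache_free() calls are only issued once all locks
 * have been dropped.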
2210 */ 2211 static void 2212 cache_purge_impl(struct vnode *vp) 2213 { 2214 TAILQ_HEAD(, namecache) ncps; 2215 struct namecache *ncp, *nnp; 2216 struct mtx *vlp, *vlp2; 2217 2218 TAILQ_INIT(&ncps); 2219 vlp = VP2VNODELOCK(vp); 2220 vlp2 = NULL; 2221 mtx_assert(vlp, MA_OWNED); 2222 retry: 2223 while (!LIST_EMPTY(&vp->v_cache_src)) { 2224 ncp = LIST_FIRST(&vp->v_cache_src); 2225 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2226 goto retry; 2227 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2228 } 2229 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2230 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2231 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2232 goto retry; 2233 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2234 } 2235 ncp = vp->v_cache_dd; 2236 if (ncp != NULL) { 2237 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2238 ("lost dotdot link")); 2239 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2240 goto retry; 2241 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2242 } 2243 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2244 mtx_unlock(vlp); 2245 if (vlp2 != NULL) 2246 mtx_unlock(vlp2); 2247 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2248 cache_free(ncp); 2249 } 2250 } 2251 2252 void 2253 cache_purge(struct vnode *vp) 2254 { 2255 struct mtx *vlp; 2256 2257 SDT_PROBE1(vfs, namecache, purge, done, vp); 2258 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2259 vp->v_cache_dd == NULL) 2260 return; 2261 vlp = VP2VNODELOCK(vp); 2262 mtx_lock(vlp); 2263 cache_purge_impl(vp); 2264 } 2265 2266 /* 2267 * Only to be used by vgone. 2268 */ 2269 void 2270 cache_purge_vgone(struct vnode *vp) 2271 { 2272 struct mtx *vlp; 2273 2274 VNPASS(VN_IS_DOOMED(vp), vp); 2275 vlp = VP2VNODELOCK(vp); 2276 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2277 vp->v_cache_dd == NULL)) { 2278 mtx_lock(vlp); 2279 cache_purge_impl(vp); 2280 mtx_assert(vlp, MA_NOTOWNED); 2281 return; 2282 } 2283 2284 /* 2285 * All the NULL pointer state we found above may be transient. 2286 * Serialize against a possible thread doing cache_purge. 2287 */ 2288 mtx_wait_unlocked(vlp); 2289 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2290 vp->v_cache_dd == NULL)) { 2291 mtx_lock(vlp); 2292 cache_purge_impl(vp); 2293 mtx_assert(vlp, MA_NOTOWNED); 2294 return; 2295 } 2296 return; 2297 } 2298 2299 /* 2300 * Invalidate all negative entries for a particular directory vnode. 2301 */ 2302 void 2303 cache_purge_negative(struct vnode *vp) 2304 { 2305 TAILQ_HEAD(, namecache) ncps; 2306 struct namecache *ncp, *nnp; 2307 struct mtx *vlp; 2308 2309 CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); 2310 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2311 if (LIST_EMPTY(&vp->v_cache_src)) 2312 return; 2313 TAILQ_INIT(&ncps); 2314 vlp = VP2VNODELOCK(vp); 2315 mtx_lock(vlp); 2316 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2317 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2318 continue; 2319 cache_zap_negative_locked_vnode_kl(ncp, vp); 2320 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2321 } 2322 mtx_unlock(vlp); 2323 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2324 cache_free(ncp); 2325 } 2326 } 2327 2328 /* 2329 * Flush all entries referencing a particular filesystem. 
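 * Unless the force flag is set, mounts with no more than ncpurgeminvnodes
 * vnodes are skipped.  Otherwise every hash bucket is scanned under its
 * bucket lock and matching entries are zapped and batched for freeing.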
2330 */ 2331 void 2332 cache_purgevfs(struct mount *mp, bool force) 2333 { 2334 TAILQ_HEAD(, namecache) ncps; 2335 struct mtx *vlp1, *vlp2; 2336 struct rwlock *blp; 2337 struct nchashhead *bucket; 2338 struct namecache *ncp, *nnp; 2339 u_long i, j, n_nchash; 2340 int error; 2341 2342 /* Scan hash tables for applicable entries */ 2343 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2344 if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) 2345 return; 2346 TAILQ_INIT(&ncps); 2347 n_nchash = nchash + 1; 2348 vlp1 = vlp2 = NULL; 2349 for (i = 0; i < numbucketlocks; i++) { 2350 blp = (struct rwlock *)&bucketlocks[i]; 2351 rw_wlock(blp); 2352 for (j = i; j < n_nchash; j += numbucketlocks) { 2353 retry: 2354 bucket = &nchashtbl[j]; 2355 CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { 2356 cache_assert_bucket_locked(ncp, RA_WLOCKED); 2357 if (ncp->nc_dvp->v_mount != mp) 2358 continue; 2359 error = cache_zap_wlocked_bucket_kl(ncp, blp, 2360 &vlp1, &vlp2); 2361 if (error != 0) 2362 goto retry; 2363 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); 2364 } 2365 } 2366 rw_wunlock(blp); 2367 if (vlp1 == NULL && vlp2 == NULL) 2368 cache_maybe_yield(); 2369 } 2370 if (vlp1 != NULL) 2371 mtx_unlock(vlp1); 2372 if (vlp2 != NULL) 2373 mtx_unlock(vlp2); 2374 2375 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2376 cache_free(ncp); 2377 } 2378 } 2379 2380 /* 2381 * Perform canonical checks and cache lookup and pass on to filesystem 2382 * through the vop_cachedlookup only if needed. 2383 */ 2384 2385 int 2386 vfs_cache_lookup(struct vop_lookup_args *ap) 2387 { 2388 struct vnode *dvp; 2389 int error; 2390 struct vnode **vpp = ap->a_vpp; 2391 struct componentname *cnp = ap->a_cnp; 2392 int flags = cnp->cn_flags; 2393 2394 *vpp = NULL; 2395 dvp = ap->a_dvp; 2396 2397 if (dvp->v_type != VDIR) 2398 return (ENOTDIR); 2399 2400 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2401 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2402 return (EROFS); 2403 2404 error = vn_dir_check_exec(dvp, cnp); 2405 if (error != 0) 2406 return (error); 2407 2408 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2409 if (error == 0) 2410 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2411 if (error == -1) 2412 return (0); 2413 return (error); 2414 } 2415 2416 /* Implementation of the getcwd syscall. 
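 * The user-supplied buffer length is clamped to MAXPATHLEN; the path is
 * resolved by vn_getcwd() into a temporary buffer and copied out on success.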
*/ 2417 int 2418 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2419 { 2420 char *buf, *retbuf; 2421 size_t buflen; 2422 int error; 2423 2424 buflen = uap->buflen; 2425 if (__predict_false(buflen < 2)) 2426 return (EINVAL); 2427 if (buflen > MAXPATHLEN) 2428 buflen = MAXPATHLEN; 2429 2430 buf = malloc(buflen, M_TEMP, M_WAITOK); 2431 error = vn_getcwd(td, buf, &retbuf, &buflen); 2432 if (error == 0) 2433 error = copyout(retbuf, uap->buf, buflen); 2434 free(buf, M_TEMP); 2435 return (error); 2436 } 2437 2438 int 2439 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen) 2440 { 2441 struct pwd *pwd; 2442 int error; 2443 2444 pwd = pwd_hold(td); 2445 error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen); 2446 pwd_drop(pwd); 2447 2448 #ifdef KTRACE 2449 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2450 ktrnamei(*retbuf); 2451 #endif 2452 return (error); 2453 } 2454 2455 static int 2456 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2457 size_t size, int flags, enum uio_seg pathseg) 2458 { 2459 struct nameidata nd; 2460 char *retbuf, *freebuf; 2461 int error; 2462 2463 if (flags != 0) 2464 return (EINVAL); 2465 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2466 pathseg, path, fd, &cap_fstat_rights, td); 2467 if ((error = namei(&nd)) != 0) 2468 return (error); 2469 error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size); 2470 if (error == 0) { 2471 error = copyout(retbuf, buf, size); 2472 free(freebuf, M_TEMP); 2473 } 2474 NDFREE(&nd, 0); 2475 return (error); 2476 } 2477 2478 int 2479 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2480 { 2481 2482 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2483 uap->flags, UIO_USERSPACE)); 2484 } 2485 2486 /* 2487 * Retrieve the full filesystem path that correspond to a vnode from the name 2488 * cache (if available) 2489 */ 2490 int 2491 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) 2492 { 2493 struct pwd *pwd; 2494 char *buf; 2495 size_t buflen; 2496 int error; 2497 2498 if (__predict_false(vn == NULL)) 2499 return (EINVAL); 2500 2501 buflen = MAXPATHLEN; 2502 buf = malloc(buflen, M_TEMP, M_WAITOK); 2503 pwd = pwd_hold(td); 2504 error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen); 2505 pwd_drop(pwd); 2506 2507 if (!error) 2508 *freebuf = buf; 2509 else 2510 free(buf, M_TEMP); 2511 return (error); 2512 } 2513 2514 /* 2515 * This function is similar to vn_fullpath, but it attempts to lookup the 2516 * pathname relative to the global root mount point. This is required for the 2517 * auditing sub-system, as audited pathnames must be absolute, relative to the 2518 * global root mount point. 
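 * Unlike vn_fullpath(), the walk terminates at rootvnode rather than at the
 * calling process' root directory.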
2519 */ 2520 int 2521 vn_fullpath_global(struct thread *td, struct vnode *vn, 2522 char **retbuf, char **freebuf) 2523 { 2524 char *buf; 2525 size_t buflen; 2526 int error; 2527 2528 if (__predict_false(vn == NULL)) 2529 return (EINVAL); 2530 buflen = MAXPATHLEN; 2531 buf = malloc(buflen, M_TEMP, M_WAITOK); 2532 error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen); 2533 if (!error) 2534 *freebuf = buf; 2535 else 2536 free(buf, M_TEMP); 2537 return (error); 2538 } 2539 2540 int 2541 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2542 { 2543 struct vnode *dvp; 2544 struct namecache *ncp; 2545 struct mtx *vlp; 2546 int error; 2547 2548 vlp = VP2VNODELOCK(*vp); 2549 mtx_lock(vlp); 2550 TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { 2551 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2552 break; 2553 } 2554 if (ncp != NULL) { 2555 if (*buflen < ncp->nc_nlen) { 2556 mtx_unlock(vlp); 2557 vrele(*vp); 2558 counter_u64_add(numfullpathfail4, 1); 2559 error = ENOMEM; 2560 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2561 vp, NULL); 2562 return (error); 2563 } 2564 *buflen -= ncp->nc_nlen; 2565 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2566 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2567 ncp->nc_name, vp); 2568 dvp = *vp; 2569 *vp = ncp->nc_dvp; 2570 vref(*vp); 2571 mtx_unlock(vlp); 2572 vrele(dvp); 2573 return (0); 2574 } 2575 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2576 2577 mtx_unlock(vlp); 2578 vn_lock(*vp, LK_SHARED | LK_RETRY); 2579 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2580 vput(*vp); 2581 if (error) { 2582 counter_u64_add(numfullpathfail2, 1); 2583 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2584 return (error); 2585 } 2586 2587 *vp = dvp; 2588 if (VN_IS_DOOMED(dvp)) { 2589 /* forced unmount */ 2590 vrele(dvp); 2591 error = ENOENT; 2592 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2593 return (error); 2594 } 2595 /* 2596 * *vp has its use count incremented still. 2597 */ 2598 2599 return (0); 2600 } 2601 2602 /* 2603 * Resolve a directory to a pathname. 2604 * 2605 * The name of the directory can always be found in the namecache or fetched 2606 * from the filesystem. There is also guaranteed to be only one parent, meaning 2607 * we can just follow vnodes up until we find the root. 2608 * 2609 * The vnode must be referenced. 2610 */ 2611 static int 2612 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 2613 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend) 2614 { 2615 #ifdef KDTRACE_HOOKS 2616 struct vnode *startvp = vp; 2617 #endif 2618 struct vnode *vp1; 2619 size_t buflen; 2620 int error; 2621 2622 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2623 VNPASS(vp->v_usecount > 0, vp); 2624 2625 buflen = *len; 2626 2627 if (!slash_prefixed) { 2628 MPASS(*len >= 2); 2629 buflen--; 2630 buf[buflen] = '\0'; 2631 } 2632 2633 error = 0; 2634 2635 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2636 counter_u64_add(numfullpathcalls, 1); 2637 while (vp != rdir && vp != rootvnode) { 2638 /* 2639 * The vp vnode must be already fully constructed, 2640 * since it is either found in namecache or obtained 2641 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2642 * without obtaining the vnode lock. 2643 */ 2644 if ((vp->v_vflag & VV_ROOT) != 0) { 2645 vn_lock(vp, LK_RETRY | LK_SHARED); 2646 2647 /* 2648 * With the vnode locked, check for races with 2649 * unmount, forced or not. 
Note that we 2650 * already verified that vp is not equal to 2651 * the root vnode, which means that 2652 * mnt_vnodecovered can be NULL only for the 2653 * case of unmount. 2654 */ 2655 if (VN_IS_DOOMED(vp) || 2656 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2657 vp1->v_mountedhere != vp->v_mount) { 2658 vput(vp); 2659 error = ENOENT; 2660 SDT_PROBE3(vfs, namecache, fullpath, return, 2661 error, vp, NULL); 2662 break; 2663 } 2664 2665 vref(vp1); 2666 vput(vp); 2667 vp = vp1; 2668 continue; 2669 } 2670 if (vp->v_type != VDIR) { 2671 vrele(vp); 2672 counter_u64_add(numfullpathfail1, 1); 2673 error = ENOTDIR; 2674 SDT_PROBE3(vfs, namecache, fullpath, return, 2675 error, vp, NULL); 2676 break; 2677 } 2678 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); 2679 if (error) 2680 break; 2681 if (buflen == 0) { 2682 vrele(vp); 2683 error = ENOMEM; 2684 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2685 startvp, NULL); 2686 break; 2687 } 2688 buf[--buflen] = '/'; 2689 slash_prefixed = true; 2690 } 2691 if (error) 2692 return (error); 2693 if (!slash_prefixed) { 2694 if (buflen == 0) { 2695 vrele(vp); 2696 counter_u64_add(numfullpathfail4, 1); 2697 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2698 startvp, NULL); 2699 return (ENOMEM); 2700 } 2701 buf[--buflen] = '/'; 2702 } 2703 counter_u64_add(numfullpathfound, 1); 2704 vrele(vp); 2705 2706 *retbuf = buf + buflen; 2707 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2708 *len -= buflen; 2709 *len += addend; 2710 return (0); 2711 } 2712 2713 /* 2714 * Resolve an arbitrary vnode to a pathname. 2715 * 2716 * Note 2 caveats: 2717 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2718 * resolve to a different path than the one used to find it 2719 * - namecache is not mandatory, meaning names are not guaranteed to be added 2720 * (in which case resolving fails) 2721 */ 2722 static int 2723 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 2724 char *buf, char **retbuf, size_t *buflen) 2725 { 2726 size_t orig_buflen; 2727 bool slash_prefixed; 2728 int error; 2729 2730 if (*buflen < 2) 2731 return (EINVAL); 2732 2733 orig_buflen = *buflen; 2734 2735 vref(vp); 2736 slash_prefixed = false; 2737 if (vp->v_type != VDIR) { 2738 *buflen -= 1; 2739 buf[*buflen] = '\0'; 2740 error = vn_vptocnp(&vp, td->td_ucred, buf, buflen); 2741 if (error) 2742 return (error); 2743 if (*buflen == 0) { 2744 vrele(vp); 2745 return (ENOMEM); 2746 } 2747 *buflen -= 1; 2748 buf[*buflen] = '/'; 2749 slash_prefixed = true; 2750 } 2751 2752 return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed, 2753 orig_buflen - *buflen)); 2754 } 2755 2756 /* 2757 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2758 * 2759 * Since the namecache does not track hardlinks, the caller is expected to first 2760 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
2761 * 2762 * Then we have 2 cases: 2763 * - if the found vnode is a directory, the path can be constructed just by 2764 * fullowing names up the chain 2765 * - otherwise we populate the buffer with the saved name and start resolving 2766 * from the parent 2767 */ 2768 static int 2769 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 2770 char **freebuf, size_t *buflen) 2771 { 2772 char *buf, *tmpbuf; 2773 struct pwd *pwd; 2774 struct componentname *cnp; 2775 struct vnode *vp; 2776 size_t addend; 2777 int error; 2778 bool slash_prefixed; 2779 2780 if (*buflen < 2) 2781 return (EINVAL); 2782 if (*buflen > MAXPATHLEN) 2783 *buflen = MAXPATHLEN; 2784 2785 slash_prefixed = false; 2786 2787 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2788 pwd = pwd_hold(td); 2789 2790 addend = 0; 2791 vp = ndp->ni_vp; 2792 if (vp->v_type != VDIR) { 2793 cnp = &ndp->ni_cnd; 2794 addend = cnp->cn_namelen + 2; 2795 if (*buflen < addend) { 2796 error = ENOMEM; 2797 goto out_bad; 2798 } 2799 *buflen -= addend; 2800 tmpbuf = buf + *buflen; 2801 tmpbuf[0] = '/'; 2802 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 2803 tmpbuf[addend - 1] = '\0'; 2804 slash_prefixed = true; 2805 vp = ndp->ni_dvp; 2806 } 2807 2808 vref(vp); 2809 error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen, 2810 slash_prefixed, addend); 2811 if (error != 0) 2812 goto out_bad; 2813 2814 pwd_drop(pwd); 2815 *freebuf = buf; 2816 2817 return (0); 2818 out_bad: 2819 pwd_drop(pwd); 2820 free(buf, M_TEMP); 2821 return (error); 2822 } 2823 2824 struct vnode * 2825 vn_dir_dd_ino(struct vnode *vp) 2826 { 2827 struct namecache *ncp; 2828 struct vnode *ddvp; 2829 struct mtx *vlp; 2830 enum vgetstate vs; 2831 2832 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 2833 vlp = VP2VNODELOCK(vp); 2834 mtx_lock(vlp); 2835 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 2836 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 2837 continue; 2838 ddvp = ncp->nc_dvp; 2839 vs = vget_prep(ddvp); 2840 mtx_unlock(vlp); 2841 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 2842 return (NULL); 2843 return (ddvp); 2844 } 2845 mtx_unlock(vlp); 2846 return (NULL); 2847 } 2848 2849 int 2850 vn_commname(struct vnode *vp, char *buf, u_int buflen) 2851 { 2852 struct namecache *ncp; 2853 struct mtx *vlp; 2854 int l; 2855 2856 vlp = VP2VNODELOCK(vp); 2857 mtx_lock(vlp); 2858 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 2859 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2860 break; 2861 if (ncp == NULL) { 2862 mtx_unlock(vlp); 2863 return (ENOENT); 2864 } 2865 l = min(ncp->nc_nlen, buflen - 1); 2866 memcpy(buf, ncp->nc_name, l); 2867 mtx_unlock(vlp); 2868 buf[l] = '\0'; 2869 return (0); 2870 } 2871 2872 /* 2873 * This function updates path string to vnode's full global path 2874 * and checks the size of the new path string against the pathlen argument. 2875 * 2876 * Requires a locked, referenced vnode. 2877 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 2878 * 2879 * If vp is a directory, the call to vn_fullpath_global() always succeeds 2880 * because it falls back to the ".." lookup if the namecache lookup fails. 2881 */ 2882 int 2883 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 2884 u_int pathlen) 2885 { 2886 struct nameidata nd; 2887 struct vnode *vp1; 2888 char *rpath, *fbuf; 2889 int error; 2890 2891 ASSERT_VOP_ELOCKED(vp, __func__); 2892 2893 /* Construct global filesystem path from vp. 
*/ 2894 VOP_UNLOCK(vp); 2895 error = vn_fullpath_global(td, vp, &rpath, &fbuf); 2896 2897 if (error != 0) { 2898 vrele(vp); 2899 return (error); 2900 } 2901 2902 if (strlen(rpath) >= pathlen) { 2903 vrele(vp); 2904 error = ENAMETOOLONG; 2905 goto out; 2906 } 2907 2908 /* 2909 * Re-lookup the vnode by path to detect a possible rename. 2910 * As a side effect, the vnode is relocked. 2911 * If vnode was renamed, return ENOENT. 2912 */ 2913 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 2914 UIO_SYSSPACE, path, td); 2915 error = namei(&nd); 2916 if (error != 0) { 2917 vrele(vp); 2918 goto out; 2919 } 2920 NDFREE(&nd, NDF_ONLY_PNBUF); 2921 vp1 = nd.ni_vp; 2922 vrele(vp); 2923 if (vp1 == vp) 2924 strcpy(path, rpath); 2925 else { 2926 vput(vp1); 2927 error = ENOENT; 2928 } 2929 2930 out: 2931 free(fbuf, M_TEMP); 2932 return (error); 2933 } 2934 2935 #ifdef DDB 2936 static void 2937 db_print_vpath(struct vnode *vp) 2938 { 2939 2940 while (vp != NULL) { 2941 db_printf("%p: ", vp); 2942 if (vp == rootvnode) { 2943 db_printf("/"); 2944 vp = NULL; 2945 } else { 2946 if (vp->v_vflag & VV_ROOT) { 2947 db_printf("<mount point>"); 2948 vp = vp->v_mount->mnt_vnodecovered; 2949 } else { 2950 struct namecache *ncp; 2951 char *ncn; 2952 int i; 2953 2954 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2955 if (ncp != NULL) { 2956 ncn = ncp->nc_name; 2957 for (i = 0; i < ncp->nc_nlen; i++) 2958 db_printf("%c", *ncn++); 2959 vp = ncp->nc_dvp; 2960 } else { 2961 vp = NULL; 2962 } 2963 } 2964 } 2965 db_printf("\n"); 2966 } 2967 2968 return; 2969 } 2970 2971 DB_SHOW_COMMAND(vpath, db_show_vpath) 2972 { 2973 struct vnode *vp; 2974 2975 if (!have_addr) { 2976 db_printf("usage: show vpath <struct vnode *>\n"); 2977 return; 2978 } 2979 2980 vp = (struct vnode *)addr; 2981 db_print_vpath(vp); 2982 } 2983 2984 #endif 2985 2986 extern uma_zone_t namei_zone; 2987 2988 static bool __read_frequently cache_fast_lookup = true; 2989 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 2990 &cache_fast_lookup, 0, ""); 2991 2992 #define CACHE_FPL_FAILED -2020 2993 2994 static void 2995 cache_fpl_cleanup_cnp(struct componentname *cnp) 2996 { 2997 2998 uma_zfree(namei_zone, cnp->cn_pnbuf); 2999 #ifdef DIAGNOSTIC 3000 cnp->cn_pnbuf = NULL; 3001 cnp->cn_nameptr = NULL; 3002 #endif 3003 } 3004 3005 static void 3006 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3007 { 3008 struct componentname *cnp; 3009 3010 cnp = &ndp->ni_cnd; 3011 while (*(cnp->cn_nameptr) == '/') { 3012 cnp->cn_nameptr++; 3013 ndp->ni_pathlen--; 3014 } 3015 3016 *dpp = ndp->ni_rootdir; 3017 } 3018 3019 /* 3020 * Components of nameidata (or objects it can point to) which may 3021 * need restoring in case fast path lookup fails. 
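 * A snapshot is taken with cache_fpl_checkpoint() before each component is
 * parsed and put back with cache_fpl_restore() when punting to the regular
 * lookup, so that the latter starts from unmodified state.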
3022 */ 3023 struct nameidata_saved { 3024 long cn_namelen; 3025 char *cn_nameptr; 3026 size_t ni_pathlen; 3027 int cn_flags; 3028 }; 3029 3030 struct cache_fpl { 3031 struct nameidata *ndp; 3032 struct componentname *cnp; 3033 struct pwd *pwd; 3034 struct vnode *dvp; 3035 struct vnode *tvp; 3036 seqc_t dvp_seqc; 3037 seqc_t tvp_seqc; 3038 struct nameidata_saved snd; 3039 int line; 3040 enum cache_fpl_status status:8; 3041 bool in_smr; 3042 }; 3043 3044 static void 3045 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3046 { 3047 3048 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3049 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3050 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3051 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3052 } 3053 3054 static void 3055 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3056 { 3057 3058 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3059 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3060 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3061 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3062 } 3063 3064 #ifdef INVARIANTS 3065 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3066 struct cache_fpl *_fpl = (fpl); \ 3067 MPASS(_fpl->in_smr == true); \ 3068 VFS_SMR_ASSERT_ENTERED(); \ 3069 }) 3070 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3071 struct cache_fpl *_fpl = (fpl); \ 3072 MPASS(_fpl->in_smr == false); \ 3073 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3074 }) 3075 #else 3076 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3077 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3078 #endif 3079 3080 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3081 struct cache_fpl *_fpl = (fpl); \ 3082 vfs_smr_enter(); \ 3083 _fpl->in_smr = true; \ 3084 }) 3085 3086 #define cache_fpl_smr_enter(fpl) ({ \ 3087 struct cache_fpl *_fpl = (fpl); \ 3088 MPASS(_fpl->in_smr == false); \ 3089 vfs_smr_enter(); \ 3090 _fpl->in_smr = true; \ 3091 }) 3092 3093 #define cache_fpl_smr_exit(fpl) ({ \ 3094 struct cache_fpl *_fpl = (fpl); \ 3095 MPASS(_fpl->in_smr == true); \ 3096 vfs_smr_exit(); \ 3097 _fpl->in_smr = false; \ 3098 }) 3099 3100 static int 3101 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3102 { 3103 3104 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3105 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3106 ("%s: converting to abort from %d at %d, set at %d\n", 3107 __func__, fpl->status, line, fpl->line)); 3108 } 3109 fpl->status = CACHE_FPL_STATUS_ABORTED; 3110 fpl->line = line; 3111 return (CACHE_FPL_FAILED); 3112 } 3113 3114 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3115 3116 static int 3117 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3118 { 3119 3120 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3121 ("%s: setting to partial at %d, but already set to %d at %d\n", 3122 __func__, line, fpl->status, fpl->line)); 3123 cache_fpl_smr_assert_entered(fpl); 3124 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3125 fpl->line = line; 3126 return (CACHE_FPL_FAILED); 3127 } 3128 3129 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3130 3131 static int 3132 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3133 { 3134 3135 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3136 ("%s: setting to handled at %d, but already set to %d at %d\n", 3137 __func__, line, fpl->status, fpl->line)); 3138 cache_fpl_smr_assert_not_entered(fpl); 3139 MPASS(error != CACHE_FPL_FAILED); 3140 fpl->status = CACHE_FPL_STATUS_HANDLED; 3141 fpl->line = line; 3142 return (error); 
3143 } 3144 3145 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3146 3147 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3148 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ 3149 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2) 3150 3151 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3152 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3153 3154 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3155 "supported and internal flags overlap"); 3156 3157 static bool 3158 cache_fpl_islastcn(struct nameidata *ndp) 3159 { 3160 3161 return (*ndp->ni_next == 0); 3162 } 3163 3164 static bool 3165 cache_fpl_isdotdot(struct componentname *cnp) 3166 { 3167 3168 if (cnp->cn_namelen == 2 && 3169 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3170 return (true); 3171 return (false); 3172 } 3173 3174 static bool 3175 cache_can_fplookup(struct cache_fpl *fpl) 3176 { 3177 struct nameidata *ndp; 3178 struct componentname *cnp; 3179 struct thread *td; 3180 3181 ndp = fpl->ndp; 3182 cnp = fpl->cnp; 3183 td = cnp->cn_thread; 3184 3185 if (!cache_fast_lookup) { 3186 cache_fpl_aborted(fpl); 3187 return (false); 3188 } 3189 #ifdef MAC 3190 if (mac_vnode_check_lookup_enabled()) { 3191 cache_fpl_aborted(fpl); 3192 return (false); 3193 } 3194 #endif 3195 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3196 cache_fpl_aborted(fpl); 3197 return (false); 3198 } 3199 if (ndp->ni_dirfd != AT_FDCWD) { 3200 cache_fpl_aborted(fpl); 3201 return (false); 3202 } 3203 if (IN_CAPABILITY_MODE(td)) { 3204 cache_fpl_aborted(fpl); 3205 return (false); 3206 } 3207 if (AUDITING_TD(td)) { 3208 cache_fpl_aborted(fpl); 3209 return (false); 3210 } 3211 if (ndp->ni_startdir != NULL) { 3212 cache_fpl_aborted(fpl); 3213 return (false); 3214 } 3215 return (true); 3216 } 3217 3218 static bool 3219 cache_fplookup_vnode_supported(struct vnode *vp) 3220 { 3221 3222 return (vp->v_type != VLNK); 3223 } 3224 3225 /* 3226 * Move a negative entry to the hot list. 3227 * 3228 * We have to take locks, but they may be contended and in the worst 3229 * case we may need to go off CPU. We don't want to spin within the 3230 * smr section and we can't block with it. Instead we are going to 3231 * look up the entry again. 3232 */ 3233 static int __noinline 3234 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3235 uint32_t hash) 3236 { 3237 struct componentname *cnp; 3238 struct namecache *ncp; 3239 struct neglist *neglist; 3240 struct negstate *negstate; 3241 struct vnode *dvp; 3242 u_char nc_flag; 3243 3244 cnp = fpl->cnp; 3245 dvp = fpl->dvp; 3246 3247 if (!vhold_smr(dvp)) 3248 return (cache_fpl_aborted(fpl)); 3249 3250 neglist = NCP2NEGLIST(oncp); 3251 cache_fpl_smr_exit(fpl); 3252 3253 mtx_lock(&ncneg_hot.nl_lock); 3254 mtx_lock(&neglist->nl_lock); 3255 /* 3256 * For hash iteration. 3257 */ 3258 cache_fpl_smr_enter(fpl); 3259 3260 /* 3261 * Avoid all surprises by only succeeding if we got the same entry and 3262 * bailing completely otherwise. 3263 * 3264 * In particular at this point there can be a new ncp which matches the 3265 * search but hashes to a different neglist. 3266 */ 3267 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3268 if (ncp == oncp) 3269 break; 3270 } 3271 3272 /* 3273 * No match to begin with. 3274 */ 3275 if (__predict_false(ncp == NULL)) { 3276 goto out_abort; 3277 } 3278 3279 /* 3280 * The newly found entry may be something different... 
3281 */ 3282 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3283 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 3284 goto out_abort; 3285 } 3286 3287 /* 3288 * ... and not even negative. 3289 */ 3290 nc_flag = atomic_load_char(&ncp->nc_flag); 3291 if ((nc_flag & NCF_NEGATIVE) == 0) { 3292 goto out_abort; 3293 } 3294 3295 if (__predict_false(!cache_ncp_canuse(ncp))) { 3296 goto out_abort; 3297 } 3298 3299 negstate = NCP2NEGSTATE(ncp); 3300 if ((negstate->neg_flag & NEG_HOT) == 0) { 3301 numhotneg++; 3302 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 3303 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 3304 negstate->neg_flag |= NEG_HOT; 3305 } 3306 3307 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 3308 counter_u64_add(numneghits, 1); 3309 cache_fpl_smr_exit(fpl); 3310 mtx_unlock(&neglist->nl_lock); 3311 mtx_unlock(&ncneg_hot.nl_lock); 3312 vdrop(dvp); 3313 return (cache_fpl_handled(fpl, ENOENT)); 3314 out_abort: 3315 cache_fpl_smr_exit(fpl); 3316 mtx_unlock(&neglist->nl_lock); 3317 mtx_unlock(&ncneg_hot.nl_lock); 3318 vdrop(dvp); 3319 return (cache_fpl_aborted(fpl)); 3320 } 3321 3322 /* 3323 * The target vnode is not supported, prepare for the slow path to take over. 3324 */ 3325 static int __noinline 3326 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3327 { 3328 struct nameidata *ndp; 3329 struct componentname *cnp; 3330 enum vgetstate dvs; 3331 struct vnode *dvp; 3332 struct pwd *pwd; 3333 seqc_t dvp_seqc; 3334 3335 ndp = fpl->ndp; 3336 cnp = fpl->cnp; 3337 dvp = fpl->dvp; 3338 dvp_seqc = fpl->dvp_seqc; 3339 3340 dvs = vget_prep_smr(dvp); 3341 if (__predict_false(dvs == VGET_NONE)) { 3342 cache_fpl_smr_exit(fpl); 3343 return (cache_fpl_aborted(fpl)); 3344 } 3345 3346 cache_fpl_smr_exit(fpl); 3347 3348 vget_finish_ref(dvp, dvs); 3349 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3350 vrele(dvp); 3351 return (cache_fpl_aborted(fpl)); 3352 } 3353 3354 pwd = pwd_hold(curthread); 3355 if (fpl->pwd != pwd) { 3356 vrele(dvp); 3357 pwd_drop(pwd); 3358 return (cache_fpl_aborted(fpl)); 3359 } 3360 3361 cache_fpl_restore(fpl, &fpl->snd); 3362 3363 ndp->ni_startdir = dvp; 3364 cnp->cn_flags |= MAKEENTRY; 3365 if (cache_fpl_islastcn(ndp)) 3366 cnp->cn_flags |= ISLASTCN; 3367 if (cache_fpl_isdotdot(cnp)) 3368 cnp->cn_flags |= ISDOTDOT; 3369 3370 return (0); 3371 } 3372 3373 static int 3374 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3375 { 3376 struct componentname *cnp; 3377 struct vnode *tvp; 3378 seqc_t tvp_seqc; 3379 int error, lkflags; 3380 3381 cnp = fpl->cnp; 3382 tvp = fpl->tvp; 3383 tvp_seqc = fpl->tvp_seqc; 3384 3385 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3386 lkflags = LK_SHARED; 3387 if ((cnp->cn_flags & LOCKSHARED) == 0) 3388 lkflags = LK_EXCLUSIVE; 3389 error = vget_finish(tvp, lkflags, tvs); 3390 if (__predict_false(error != 0)) { 3391 return (cache_fpl_aborted(fpl)); 3392 } 3393 } else { 3394 vget_finish_ref(tvp, tvs); 3395 } 3396 3397 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3398 if ((cnp->cn_flags & LOCKLEAF) != 0) 3399 vput(tvp); 3400 else 3401 vrele(tvp); 3402 return (cache_fpl_aborted(fpl)); 3403 } 3404 3405 return (cache_fpl_handled(fpl, 0)); 3406 } 3407 3408 /* 3409 * They want to possibly modify the state of the namecache. 3410 * 3411 * Don't try to match the API contract, just leave. 
3412 * TODO: this leaves scalability on the table 3413 */ 3414 static int 3415 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3416 { 3417 struct componentname *cnp; 3418 3419 cnp = fpl->cnp; 3420 MPASS(cnp->cn_nameiop != LOOKUP); 3421 return (cache_fpl_partial(fpl)); 3422 } 3423 3424 static int __noinline 3425 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3426 { 3427 struct componentname *cnp; 3428 enum vgetstate dvs, tvs; 3429 struct vnode *dvp, *tvp; 3430 seqc_t dvp_seqc, tvp_seqc; 3431 int error; 3432 3433 cnp = fpl->cnp; 3434 dvp = fpl->dvp; 3435 dvp_seqc = fpl->dvp_seqc; 3436 tvp = fpl->tvp; 3437 tvp_seqc = fpl->tvp_seqc; 3438 3439 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3440 3441 /* 3442 * This is less efficient than it can be for simplicity. 3443 */ 3444 dvs = vget_prep_smr(dvp); 3445 if (__predict_false(dvs == VGET_NONE)) { 3446 return (cache_fpl_aborted(fpl)); 3447 } 3448 tvs = vget_prep_smr(tvp); 3449 if (__predict_false(tvs == VGET_NONE)) { 3450 cache_fpl_smr_exit(fpl); 3451 vget_abort(dvp, dvs); 3452 return (cache_fpl_aborted(fpl)); 3453 } 3454 3455 cache_fpl_smr_exit(fpl); 3456 3457 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3458 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3459 if (__predict_false(error != 0)) { 3460 vget_abort(tvp, tvs); 3461 return (cache_fpl_aborted(fpl)); 3462 } 3463 } else { 3464 vget_finish_ref(dvp, dvs); 3465 } 3466 3467 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3468 vget_abort(tvp, tvs); 3469 if ((cnp->cn_flags & LOCKPARENT) != 0) 3470 vput(dvp); 3471 else 3472 vrele(dvp); 3473 return (cache_fpl_aborted(fpl)); 3474 } 3475 3476 error = cache_fplookup_final_child(fpl, tvs); 3477 if (__predict_false(error != 0)) { 3478 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3479 if ((cnp->cn_flags & LOCKPARENT) != 0) 3480 vput(dvp); 3481 else 3482 vrele(dvp); 3483 return (error); 3484 } 3485 3486 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3487 return (0); 3488 } 3489 3490 static int 3491 cache_fplookup_final(struct cache_fpl *fpl) 3492 { 3493 struct componentname *cnp; 3494 enum vgetstate tvs; 3495 struct vnode *dvp, *tvp; 3496 seqc_t dvp_seqc, tvp_seqc; 3497 3498 cnp = fpl->cnp; 3499 dvp = fpl->dvp; 3500 dvp_seqc = fpl->dvp_seqc; 3501 tvp = fpl->tvp; 3502 tvp_seqc = fpl->tvp_seqc; 3503 3504 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3505 3506 if (cnp->cn_nameiop != LOOKUP) { 3507 return (cache_fplookup_final_modifying(fpl)); 3508 } 3509 3510 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3511 return (cache_fplookup_final_withparent(fpl)); 3512 3513 tvs = vget_prep_smr(tvp); 3514 if (__predict_false(tvs == VGET_NONE)) { 3515 return (cache_fpl_partial(fpl)); 3516 } 3517 3518 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3519 cache_fpl_smr_exit(fpl); 3520 vget_abort(tvp, tvs); 3521 return (cache_fpl_aborted(fpl)); 3522 } 3523 3524 cache_fpl_smr_exit(fpl); 3525 return (cache_fplookup_final_child(fpl, tvs)); 3526 } 3527 3528 static int __noinline 3529 cache_fplookup_dot(struct cache_fpl *fpl) 3530 { 3531 struct vnode *dvp; 3532 3533 dvp = fpl->dvp; 3534 3535 fpl->tvp = dvp; 3536 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3537 if (seqc_in_modify(fpl->tvp_seqc)) { 3538 return (cache_fpl_aborted(fpl)); 3539 } 3540 3541 counter_u64_add(dothits, 1); 3542 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3543 3544 return (0); 3545 } 3546 3547 static int __noinline 3548 cache_fplookup_dotdot(struct cache_fpl *fpl) 3549 { 3550 struct nameidata *ndp; 3551 struct componentname *cnp; 3552 struct namecache *ncp; 3553 struct vnode 
*dvp; 3554 struct prison *pr; 3555 u_char nc_flag; 3556 3557 ndp = fpl->ndp; 3558 cnp = fpl->cnp; 3559 dvp = fpl->dvp; 3560 3561 /* 3562 * XXX this is racy the same way regular lookup is 3563 */ 3564 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3565 pr = pr->pr_parent) 3566 if (dvp == pr->pr_root) 3567 break; 3568 3569 if (dvp == ndp->ni_rootdir || 3570 dvp == ndp->ni_topdir || 3571 dvp == rootvnode || 3572 pr != NULL) { 3573 fpl->tvp = dvp; 3574 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3575 if (seqc_in_modify(fpl->tvp_seqc)) { 3576 return (cache_fpl_aborted(fpl)); 3577 } 3578 return (0); 3579 } 3580 3581 if ((dvp->v_vflag & VV_ROOT) != 0) { 3582 /* 3583 * TODO 3584 * The opposite of climb mount is needed here. 3585 */ 3586 return (cache_fpl_aborted(fpl)); 3587 } 3588 3589 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3590 if (ncp == NULL) { 3591 return (cache_fpl_aborted(fpl)); 3592 } 3593 3594 nc_flag = atomic_load_char(&ncp->nc_flag); 3595 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3596 if ((nc_flag & NCF_NEGATIVE) != 0) 3597 return (cache_fpl_aborted(fpl)); 3598 fpl->tvp = ncp->nc_vp; 3599 } else { 3600 fpl->tvp = ncp->nc_dvp; 3601 } 3602 3603 if (__predict_false(!cache_ncp_canuse(ncp))) { 3604 return (cache_fpl_aborted(fpl)); 3605 } 3606 3607 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 3608 if (seqc_in_modify(fpl->tvp_seqc)) { 3609 return (cache_fpl_partial(fpl)); 3610 } 3611 3612 counter_u64_add(dotdothits, 1); 3613 return (0); 3614 } 3615 3616 static int 3617 cache_fplookup_next(struct cache_fpl *fpl) 3618 { 3619 struct componentname *cnp; 3620 struct namecache *ncp; 3621 struct negstate *negstate; 3622 struct vnode *dvp, *tvp; 3623 u_char nc_flag; 3624 uint32_t hash; 3625 bool neg_hot; 3626 3627 cnp = fpl->cnp; 3628 dvp = fpl->dvp; 3629 3630 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 3631 return (cache_fplookup_dot(fpl)); 3632 } 3633 3634 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3635 3636 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3637 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3638 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 3639 break; 3640 } 3641 3642 /* 3643 * If there is no entry we have to punt to the slow path to perform 3644 * actual lookup. Should there be nothing with this name a negative 3645 * entry will be created. 3646 */ 3647 if (__predict_false(ncp == NULL)) { 3648 return (cache_fpl_partial(fpl)); 3649 } 3650 3651 tvp = atomic_load_ptr(&ncp->nc_vp); 3652 nc_flag = atomic_load_char(&ncp->nc_flag); 3653 if ((nc_flag & NCF_NEGATIVE) != 0) { 3654 /* 3655 * If they want to create an entry we need to replace this one. 
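 * Doing that requires taking locks which the fast path avoids, so fall
 * back to the regular lookup instead.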
3656 */ 3657 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 3658 return (cache_fpl_partial(fpl)); 3659 } 3660 negstate = NCP2NEGSTATE(ncp); 3661 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0); 3662 if (__predict_false(!cache_ncp_canuse(ncp))) { 3663 return (cache_fpl_partial(fpl)); 3664 } 3665 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 3666 return (cache_fpl_partial(fpl)); 3667 } 3668 if (!neg_hot) { 3669 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 3670 } 3671 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 3672 ncp->nc_name); 3673 counter_u64_add(numneghits, 1); 3674 cache_fpl_smr_exit(fpl); 3675 return (cache_fpl_handled(fpl, ENOENT)); 3676 } 3677 3678 if (__predict_false(!cache_ncp_canuse(ncp))) { 3679 return (cache_fpl_partial(fpl)); 3680 } 3681 3682 fpl->tvp = tvp; 3683 fpl->tvp_seqc = vn_seqc_read_any(tvp); 3684 if (seqc_in_modify(fpl->tvp_seqc)) { 3685 return (cache_fpl_partial(fpl)); 3686 } 3687 3688 if (!cache_fplookup_vnode_supported(tvp)) { 3689 return (cache_fpl_partial(fpl)); 3690 } 3691 3692 counter_u64_add(numposhits, 1); 3693 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 3694 return (0); 3695 } 3696 3697 static bool 3698 cache_fplookup_mp_supported(struct mount *mp) 3699 { 3700 3701 if (mp == NULL) 3702 return (false); 3703 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 3704 return (false); 3705 return (true); 3706 } 3707 3708 /* 3709 * Walk up the mount stack (if any). 3710 * 3711 * Correctness is provided in the following ways: 3712 * - all vnodes are protected from freeing with SMR 3713 * - struct mount objects are type stable making them always safe to access 3714 * - stability of the particular mount is provided by busying it 3715 * - relationship between the vnode which is mounted on and the mount is 3716 * verified with the vnode sequence counter after busying 3717 * - association between root vnode of the mount and the mount is protected 3718 * by busy 3719 * 3720 * From that point on we can read the sequence counter of the root vnode 3721 * and get the next mount on the stack (if any) using the same protection. 3722 * 3723 * By the end of successful walk we are guaranteed the reached state was 3724 * indeed present at least at some point which matches the regular lookup. 
3725 */ 3726 static int __noinline 3727 cache_fplookup_climb_mount(struct cache_fpl *fpl) 3728 { 3729 struct mount *mp, *prev_mp; 3730 struct vnode *vp; 3731 seqc_t vp_seqc; 3732 3733 vp = fpl->tvp; 3734 vp_seqc = fpl->tvp_seqc; 3735 3736 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 3737 mp = atomic_load_ptr(&vp->v_mountedhere); 3738 if (mp == NULL) 3739 return (0); 3740 3741 prev_mp = NULL; 3742 for (;;) { 3743 if (!vfs_op_thread_enter_crit(mp)) { 3744 if (prev_mp != NULL) 3745 vfs_op_thread_exit_crit(prev_mp); 3746 return (cache_fpl_partial(fpl)); 3747 } 3748 if (prev_mp != NULL) 3749 vfs_op_thread_exit_crit(prev_mp); 3750 if (!vn_seqc_consistent(vp, vp_seqc)) { 3751 vfs_op_thread_exit_crit(mp); 3752 return (cache_fpl_partial(fpl)); 3753 } 3754 if (!cache_fplookup_mp_supported(mp)) { 3755 vfs_op_thread_exit_crit(mp); 3756 return (cache_fpl_partial(fpl)); 3757 } 3758 vp = atomic_load_ptr(&mp->mnt_rootvnode); 3759 if (vp == NULL || VN_IS_DOOMED(vp)) { 3760 vfs_op_thread_exit_crit(mp); 3761 return (cache_fpl_partial(fpl)); 3762 } 3763 vp_seqc = vn_seqc_read_any(vp); 3764 if (seqc_in_modify(vp_seqc)) { 3765 vfs_op_thread_exit_crit(mp); 3766 return (cache_fpl_partial(fpl)); 3767 } 3768 prev_mp = mp; 3769 mp = atomic_load_ptr(&vp->v_mountedhere); 3770 if (mp == NULL) 3771 break; 3772 } 3773 3774 vfs_op_thread_exit_crit(prev_mp); 3775 fpl->tvp = vp; 3776 fpl->tvp_seqc = vp_seqc; 3777 return (0); 3778 } 3779 3780 static bool 3781 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 3782 { 3783 struct mount *mp; 3784 struct vnode *vp; 3785 3786 vp = fpl->tvp; 3787 3788 /* 3789 * Hack: while this is a union, the pointer tends to be NULL so save on 3790 * a branch. 3791 */ 3792 mp = atomic_load_ptr(&vp->v_mountedhere); 3793 if (mp == NULL) 3794 return (false); 3795 if (vp->v_type == VDIR) 3796 return (true); 3797 return (false); 3798 } 3799 3800 /* 3801 * Parse the path. 3802 * 3803 * The code is mostly copy-pasted from regular lookup, see lookup(). 3804 * The structure is maintained along with comments for easier maintenance. 3805 * Deduplicating the code will become feasible after fast path lookup 3806 * becomes more feature-complete. 3807 */ 3808 static int 3809 cache_fplookup_parse(struct cache_fpl *fpl) 3810 { 3811 struct nameidata *ndp; 3812 struct componentname *cnp; 3813 char *cp; 3814 3815 ndp = fpl->ndp; 3816 cnp = fpl->cnp; 3817 3818 /* 3819 * Search a new directory. 3820 * 3821 * The last component of the filename is left accessible via 3822 * cnp->cn_nameptr for callers that need the name. Callers needing 3823 * the name set the SAVENAME flag. When done, they assume 3824 * responsibility for freeing the pathname buffer. 3825 */ 3826 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 3827 continue; 3828 cnp->cn_namelen = cp - cnp->cn_nameptr; 3829 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 3830 cache_fpl_smr_exit(fpl); 3831 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 3832 } 3833 ndp->ni_pathlen -= cnp->cn_namelen; 3834 KASSERT(ndp->ni_pathlen <= PATH_MAX, 3835 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 3836 ndp->ni_next = cp; 3837 3838 /* 3839 * Replace multiple slashes by a single slash and trailing slashes 3840 * by a null. This must be done before VOP_LOOKUP() because some 3841 * fs's don't know about trailing slashes. Remember if there were 3842 * trailing slashes to handle symlinks, existing non-directories 3843 * and non-existing files that won't be directories specially later. 
3844 */ 3845 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 3846 cp++; 3847 ndp->ni_pathlen--; 3848 if (*cp == '\0') { 3849 /* 3850 * TODO 3851 * Regular lookup performs the following: 3852 * *ndp->ni_next = '\0'; 3853 * cnp->cn_flags |= TRAILINGSLASH; 3854 * 3855 * Which is problematic since it modifies data read 3856 * from userspace. Then if fast path lookup was to 3857 * abort we would have to either restore it or convey 3858 * the flag. Since this is a corner case just ignore 3859 * it for simplicity. 3860 */ 3861 return (cache_fpl_partial(fpl)); 3862 } 3863 } 3864 ndp->ni_next = cp; 3865 3866 /* 3867 * Check for degenerate name (e.g. / or "") 3868 * which is a way of talking about a directory, 3869 * e.g. like "/." or ".". 3870 * 3871 * TODO 3872 * Another corner case handled by the regular lookup 3873 */ 3874 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 3875 return (cache_fpl_partial(fpl)); 3876 } 3877 return (0); 3878 } 3879 3880 static void 3881 cache_fplookup_parse_advance(struct cache_fpl *fpl) 3882 { 3883 struct nameidata *ndp; 3884 struct componentname *cnp; 3885 3886 ndp = fpl->ndp; 3887 cnp = fpl->cnp; 3888 3889 cnp->cn_nameptr = ndp->ni_next; 3890 while (*cnp->cn_nameptr == '/') { 3891 cnp->cn_nameptr++; 3892 ndp->ni_pathlen--; 3893 } 3894 } 3895 3896 static int __noinline 3897 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 3898 { 3899 3900 switch (error) { 3901 case EAGAIN: 3902 /* 3903 * Can happen when racing against vgone. 3904 * */ 3905 case EOPNOTSUPP: 3906 cache_fpl_partial(fpl); 3907 break; 3908 default: 3909 /* 3910 * See the API contract for VOP_FPLOOKUP_VEXEC. 3911 */ 3912 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 3913 error = cache_fpl_aborted(fpl); 3914 } else { 3915 cache_fpl_smr_exit(fpl); 3916 cache_fpl_handled(fpl, error); 3917 } 3918 break; 3919 } 3920 return (error); 3921 } 3922 3923 static int 3924 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 3925 { 3926 struct nameidata *ndp; 3927 struct componentname *cnp; 3928 struct mount *mp; 3929 int error; 3930 3931 error = CACHE_FPL_FAILED; 3932 ndp = fpl->ndp; 3933 cnp = fpl->cnp; 3934 3935 cache_fpl_checkpoint(fpl, &fpl->snd); 3936 3937 fpl->dvp = dvp; 3938 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 3939 if (seqc_in_modify(fpl->dvp_seqc)) { 3940 cache_fpl_aborted(fpl); 3941 goto out; 3942 } 3943 mp = atomic_load_ptr(&fpl->dvp->v_mount); 3944 if (!cache_fplookup_mp_supported(mp)) { 3945 cache_fpl_aborted(fpl); 3946 goto out; 3947 } 3948 3949 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 3950 3951 for (;;) { 3952 error = cache_fplookup_parse(fpl); 3953 if (__predict_false(error != 0)) { 3954 break; 3955 } 3956 3957 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 3958 3959 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); 3960 if (__predict_false(error != 0)) { 3961 error = cache_fplookup_failed_vexec(fpl, error); 3962 break; 3963 } 3964 3965 if (__predict_false(cache_fpl_isdotdot(cnp))) { 3966 error = cache_fplookup_dotdot(fpl); 3967 if (__predict_false(error != 0)) { 3968 break; 3969 } 3970 } else { 3971 error = cache_fplookup_next(fpl); 3972 if (__predict_false(error != 0)) { 3973 break; 3974 } 3975 3976 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 3977 3978 if (cache_fplookup_need_climb_mount(fpl)) { 3979 error = cache_fplookup_climb_mount(fpl); 3980 if (__predict_false(error != 0)) { 3981 break; 3982 } 3983 } 3984 } 3985 3986 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 3987 3988 if (cache_fpl_islastcn(ndp)) { 3989 
error = cache_fplookup_final(fpl); 3990 break; 3991 } 3992 3993 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 3994 error = cache_fpl_aborted(fpl); 3995 break; 3996 } 3997 3998 fpl->dvp = fpl->tvp; 3999 fpl->dvp_seqc = fpl->tvp_seqc; 4000 4001 cache_fplookup_parse_advance(fpl); 4002 cache_fpl_checkpoint(fpl, &fpl->snd); 4003 } 4004 out: 4005 switch (fpl->status) { 4006 case CACHE_FPL_STATUS_UNSET: 4007 __assert_unreachable(); 4008 break; 4009 case CACHE_FPL_STATUS_PARTIAL: 4010 cache_fpl_smr_assert_entered(fpl); 4011 return (cache_fplookup_partial_setup(fpl)); 4012 case CACHE_FPL_STATUS_ABORTED: 4013 if (fpl->in_smr) 4014 cache_fpl_smr_exit(fpl); 4015 return (CACHE_FPL_FAILED); 4016 case CACHE_FPL_STATUS_HANDLED: 4017 MPASS(error != CACHE_FPL_FAILED); 4018 cache_fpl_smr_assert_not_entered(fpl); 4019 if (__predict_false(error != 0)) { 4020 ndp->ni_dvp = NULL; 4021 ndp->ni_vp = NULL; 4022 cache_fpl_cleanup_cnp(cnp); 4023 return (error); 4024 } 4025 ndp->ni_dvp = fpl->dvp; 4026 ndp->ni_vp = fpl->tvp; 4027 if (cnp->cn_flags & SAVENAME) 4028 cnp->cn_flags |= HASBUF; 4029 else 4030 cache_fpl_cleanup_cnp(cnp); 4031 return (error); 4032 } 4033 } 4034 4035 /* 4036 * Fast path lookup protected with SMR and sequence counters. 4037 * 4038 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 4039 * 4040 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria 4041 * outlined below. 4042 * 4043 * Traditional vnode lookup conceptually looks like this: 4044 * 4045 * vn_lock(current); 4046 * for (;;) { 4047 * next = find(); 4048 * vn_lock(next); 4049 * vn_unlock(current); 4050 * current = next; 4051 * if (last) 4052 * break; 4053 * } 4054 * return (current); 4055 * 4056 * Each jump to the next vnode is safe memory-wise and atomic with respect to 4057 * any modifications thanks to holding respective locks. 4058 * 4059 * The same guarantee can be provided with a combination of safe memory 4060 * reclamation and sequence counters instead. If all operations which affect 4061 * the relationship between the current vnode and the one we are looking for 4062 * also modify the counter, we can verify whether all the conditions held as 4063 * we made the jump. This includes things like permissions, mount points etc. 4064 * Counter modification is provided by enclosing relevant places in 4065 * vn_seqc_write_begin()/end() calls. 
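 * For example, the writer side transitioning the v_cache_dd pointer (see
 * cache_enter_time() above) looks like:
 *
 *	vn_seqc_write_begin(dvp);
 *	dvp->v_cache_dd = ncp;
 *	vn_seqc_write_end(dvp);
 *
 * which any racing fast path lookup will observe and bail on.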
4066 * 4067 * Thus this translates to: 4068 * 4069 * vfs_smr_enter(); 4070 * dvp_seqc = seqc_read_any(dvp); 4071 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode 4072 * abort(); 4073 * for (;;) { 4074 * tvp = find(); 4075 * tvp_seqc = seqc_read_any(tvp); 4076 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode 4077 * abort(); 4078 * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode 4079 * abort(); 4080 * dvp = tvp; // we know nothing of importance has changed 4081 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration 4082 * if (last) 4083 * break; 4084 * } 4085 * vget(); // secure the vnode 4086 * if (!seqc_consistent(tvp, tvp_seqc) // final check 4087 * abort(); 4088 * // at this point we know nothing has changed for any parent<->child pair 4089 * // as they were crossed during the lookup, meaning we matched the guarantee 4090 * // of the locked variant 4091 * return (tvp); 4092 * 4093 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: 4094 * - they are called while within vfs_smr protection which they must never exit 4095 * - EAGAIN can be returned to denote checking could not be performed, it is 4096 * always valid to return it 4097 * - if the sequence counter has not changed the result must be valid 4098 * - if the sequence counter has changed both false positives and false negatives 4099 * are permitted (since the result will be rejected later) 4100 * - for simple cases of unix permission checks vaccess_vexec_smr can be used 4101 * 4102 * Caveats to watch out for: 4103 * - vnodes are passed unlocked and unreferenced with nothing stopping 4104 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised 4105 * to use atomic_load_ptr to fetch it. 4106 * - the aforementioned object can also get freed, meaning absent other means it 4107 * should be protected with vfs_smr 4108 * - either safely checking permissions as they are modified or guaranteeing 4109 * their stability is left to the routine 4110 */ 4111 int 4112 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, 4113 struct pwd **pwdp) 4114 { 4115 struct cache_fpl fpl; 4116 struct pwd *pwd; 4117 struct vnode *dvp; 4118 struct componentname *cnp; 4119 struct nameidata_saved orig; 4120 int error; 4121 4122 MPASS(ndp->ni_lcf == 0); 4123 4124 fpl.status = CACHE_FPL_STATUS_UNSET; 4125 fpl.ndp = ndp; 4126 fpl.cnp = &ndp->ni_cnd; 4127 MPASS(curthread == fpl.cnp->cn_thread); 4128 4129 if ((fpl.cnp->cn_flags & SAVESTART) != 0) 4130 MPASS(fpl.cnp->cn_nameiop != LOOKUP); 4131 4132 if (!cache_can_fplookup(&fpl)) { 4133 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 4134 *status = fpl.status; 4135 return (EOPNOTSUPP); 4136 } 4137 4138 cache_fpl_checkpoint(&fpl, &orig); 4139 4140 cache_fpl_smr_enter_initial(&fpl); 4141 pwd = pwd_get_smr(); 4142 fpl.pwd = pwd; 4143 ndp->ni_rootdir = pwd->pwd_rdir; 4144 ndp->ni_topdir = pwd->pwd_jdir; 4145 4146 cnp = fpl.cnp; 4147 cnp->cn_nameptr = cnp->cn_pnbuf; 4148 if (cnp->cn_pnbuf[0] == '/') { 4149 cache_fpl_handle_root(ndp, &dvp); 4150 } else { 4151 MPASS(ndp->ni_dirfd == AT_FDCWD); 4152 dvp = pwd->pwd_cdir; 4153 } 4154 4155 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); 4156 4157 error = cache_fplookup_impl(dvp, &fpl); 4158 cache_fpl_smr_assert_not_entered(&fpl); 4159 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 4160 4161 *status = fpl.status; 4162 switch (fpl.status) { 4163 case CACHE_FPL_STATUS_UNSET: 4164 
__assert_unreachable(); 4165 break; 4166 case CACHE_FPL_STATUS_HANDLED: 4167 SDT_PROBE3(vfs, namei, lookup, return, error, 4168 (error == 0 ? ndp->ni_vp : NULL), true); 4169 break; 4170 case CACHE_FPL_STATUS_PARTIAL: 4171 *pwdp = fpl.pwd; 4172 /* 4173 * Status restored by cache_fplookup_partial_setup. 4174 */ 4175 break; 4176 case CACHE_FPL_STATUS_ABORTED: 4177 cache_fpl_restore(&fpl, &orig); 4178 break; 4179 } 4180 return (error); 4181 } 4182
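/*
 * Example: a minimal sketch (not part of this file; the xxxfs names are
 * hypothetical) of how a filesystem opts into the fast path lookup described
 * above.  It implements VOP_FPLOOKUP_VEXEC in terms of vaccess_vexec_smr()
 * and sets MNTK_FPLOOKUP once the criteria are met:
 *
 *	static int
 *	xxxfs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct vnode *vp = v->a_vp;
 *		struct xxxfs_node *np;
 *
 *		np = atomic_load_ptr(&vp->v_data);
 *		if (__predict_false(np == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(np->n_mode, np->n_uid, np->n_gid,
 *		    v->a_cred));
 *	}
 *
 * paired with setting the flag in the mount routine:
 *
 *	MNT_ILOCK(mp);
 *	mp->mnt_kern_flag |= MNTK_FPLOOKUP;
 *	MNT_IUNLOCK(mp);
 */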