/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <sys/capsicum.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
		struct negstate nu_neg;/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct	timespec nc_time;	/* timespec provided by fs */
	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned. Since others
 * may be in the same spot suffer a little bit and enforce the
 * alignment for everyone. Note this is a nop for 64-bit platforms.
 */
#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
#define	CACHE_PATH_CUTOFF	39

#define CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
#define CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

#define	nc_vp		n_un.nu_vp
#define	nc_neg		n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	0x01
#define NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define NEG_HOT		0x01

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}
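/*
 * Illustrative sketch (not compiled): the intended pattern for lockless
 * (SMR) consumers is to copy out whatever they need from an entry first
 * and only then validate it, e.g.:
 *
 *	vfs_smr_enter();
 *	ncp = <hash chain lookup>;
 *	vp = ncp->nc_vp;
 *	if (!cache_ncp_canuse(ncp)) {
 *		vfs_smr_exit();
 *		<fall back to the locked path and retry>
 *	}
 *
 * cache_lookup() below follows this pattern; the release fence in
 * cache_ncp_invalidate() above pairs with the acquire fence in
 * cache_ncp_canuse() below.
 */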
/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
static bool
cache_ncp_canuse(struct namecache *ncp)
{

	atomic_thread_fence_acq();
	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
}

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	rwlock	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks. The locking
 * order is lower address first. Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be write-locked to prevent other threads from seeing the entry.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock. However, in the opposite case, this could
 * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 * the first node, locking everything in order and revalidating the state.
 */

VFS_SMR_DECLARE;
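/*
 * Illustrative sketch (not compiled) of the ordering resolution described
 * above, assuming vlp_held is already owned and vlp_other is the second
 * lock needed (both names are made up for the example):
 *
 *	if (vlp_other > vlp_held) {
 *		mtx_lock(vlp_other);			// order is respected
 *	} else if (!mtx_trylock(vlp_other)) {
 *		mtx_unlock(vlp_held);			// back off ...
 *		cache_lock_vnodes(vlp_other, vlp_held);	// ... relock in order
 *		<revalidate that the entry is still what we expect>
 *	}
 *
 * cache_zap_locked_vnode() and cache_zap_unlocked_bucket() below implement
 * this pattern for real.
 */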
/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncpurgeminvnodes;
SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
    "Number of vnodes below which purgevfs ignores the request");
static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static struct mtx __exclusive_cache_line	ncneg_shrink_lock;

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;
static struct neglist ncneg_hot;
static u_long numhotneg;

#define	ncneghash	3
#define	numneglists	(ncneghash + 1)
static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct rwlock_padalign __read_mostly  *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}
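/*
 * Illustrative note: both lock arrays are power-of-2 sized, so the masks
 * above pick a lock with plain arithmetic.  For example, with
 * ncbuckethash == 31 (a made-up value; see nchinit() for real sizing) a
 * name hash of 0x5f21 maps to bucketlocks[0x5f21 & 31], i.e. bucketlocks[1],
 * and a vnode pointer is mapped via vnodelocks[((uintptr_t)vp >> 8) & 31].
 * The shift discards the low bits, which carry little entropy because of
 * allocator alignment.
 */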
/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

static struct namecache *
cache_alloc(int len, int ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	if (ncp == NULL)
		return;
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL && ticksp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	if (tsp != NULL)
		*tsp = ncp_ts->nc_time;
	if (ticksp != NULL)
		*ticksp = ncp_ts->nc_ticks;
}

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");
#define STATNODE_ULONG(name, descr)					\
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define STATNODE_COUNTER(name, descr)					\
	static COUNTER_U64_DEFINE_EARLY(name);				\
	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
	    descr);
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
STATNODE_COUNTER(nummiss, "Number of cache misses");
STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
/* These count for vn_getcwd(), too. */
STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
    "Number of successful removals after relocking");
static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
    "Number of times zap_and_exit failed to lock");
static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
    "Number of times zap_and_exit failed to lock");
static long cache_lock_vnodes_cel_3_failures;
STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");
STATNODE_ULONG(numhotneg, "Number of hot negative entries");
STATNODE_COUNTER(numneg_evicted,
    "Number of negative entries evicted when adding a new entry");
STATNODE_COUNTER(shrinking_skipped,
    "Number of times shrinking was already in progress");

static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
    char **freebuf, size_t *buflen);
static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen);
static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *len, bool slash_prefixed, size_t addend);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

static int cache_yield;
SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
    "Number of times cache called yield");

static void __noinline
cache_maybe_yield(void)
{

	if (should_yield()) {
		cache_yield++;
		kern_yield(PRI_USER);
	}
}

static inline void
cache_assert_vlp_locked(struct mtx *vlp)
{

	if (vlp != NULL)
		mtx_assert(vlp, MA_OWNED);
}

static inline void
cache_assert_vnode_locked(struct vnode *vp)
{
	struct mtx *vlp;

	vlp = VP2VNODELOCK(vp);
	cache_assert_vlp_locked(vlp);
}

/*
 * TODO: With the value stored we can do better than computing the hash based
 * on the address. The choice of FNV should also be revisited.
 */
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}

static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct rwlock *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp, int mode)
{
	struct rwlock *blp;

	blp = NCP2BUCKETLOCK(ncp);
	rw_assert(blp, mode);
}
#else
#define cache_assert_bucket_locked(x, y) do { } while (0)
#endif

#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wlock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wunlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * A variation of LRU scheme is used. New entries are hashed into one of
 * numneglists cold lists. Entries get promoted to the hot list on first hit.
 *
 * The shrinker will demote hot list head and evict from the cold list in a
 * round-robin manner.
 */
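/*
 * Illustrative sketch (not compiled) of an entry's life cycle under this
 * scheme, races aside:
 *
 *	cache_enter_time(dvp, NULL, cnp, ...)	// insert at a cold list tail
 *	cache_lookup(dvp, &vp, cnp, ...)	// hit: promote to the hot list
 *	cache_negative_zap_one()		// under pressure: demote one hot
 *						// entry, evict one cold entry
 *
 * Promotion is done by cache_negative_hit(), demotion and eviction by
 * cache_negative_zap_one() below.
 */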
static void
cache_negative_init(struct namecache *ncp)
{
	struct negstate *negstate;

	ncp->nc_flag |= NCF_NEGATIVE;
	negstate = NCP2NEGSTATE(ncp);
	negstate->neg_flag = 0;
}

static void
cache_negative_hit(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;

	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0)
		return;
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&ncneg_hot.nl_lock);
	mtx_lock(&neglist->nl_lock);
	if ((negstate->neg_flag & NEG_HOT) == 0) {
		numhotneg++;
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
		negstate->neg_flag |= NEG_HOT;
	}
	mtx_unlock(&neglist->nl_lock);
	mtx_unlock(&ncneg_hot.nl_lock);
}

static void
cache_negative_insert(struct namecache *ncp)
{
	struct neglist *neglist;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&neglist->nl_lock);
	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
	mtx_unlock(&neglist->nl_lock);
	atomic_add_rel_long(&numneg, 1);
}

static void
cache_negative_remove(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;
	bool hot_locked = false;
	bool list_locked = false;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	neglist = NCP2NEGLIST(ncp);
	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		hot_locked = true;
		mtx_lock(&ncneg_hot.nl_lock);
		if ((negstate->neg_flag & NEG_HOT) == 0) {
			list_locked = true;
			mtx_lock(&neglist->nl_lock);
		}
	} else {
		list_locked = true;
		mtx_lock(&neglist->nl_lock);
		/*
		 * We may be racing against promotion in lockless lookup.
		 */
		if ((negstate->neg_flag & NEG_HOT) != 0) {
			mtx_unlock(&neglist->nl_lock);
			hot_locked = true;
			mtx_lock(&ncneg_hot.nl_lock);
			mtx_lock(&neglist->nl_lock);
		}
	}
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		numhotneg--;
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
	}
	if (list_locked)
		mtx_unlock(&neglist->nl_lock);
	if (hot_locked)
		mtx_unlock(&ncneg_hot.nl_lock);
	atomic_subtract_rel_long(&numneg, 1);
}

static void
cache_negative_shrink_select(struct namecache **ncpp,
    struct neglist **neglistpp)
{
	struct neglist *neglist;
	struct namecache *ncp;
	static u_int cycle;
	u_int i;

	*ncpp = ncp = NULL;

	for (i = 0; i < numneglists; i++) {
		neglist = &neglists[(cycle + i) % numneglists];
		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
			continue;
		mtx_lock(&neglist->nl_lock);
		ncp = TAILQ_FIRST(&neglist->nl_list);
		if (ncp != NULL)
			break;
		mtx_unlock(&neglist->nl_lock);
	}

	*neglistpp = neglist;
	*ncpp = ncp;
	cycle++;
}

static void
cache_negative_zap_one(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *neglist;
	struct negstate *negstate;
	struct mtx *dvlp;
	struct rwlock *blp;

	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
	    !mtx_trylock(&ncneg_shrink_lock)) {
		counter_u64_add(shrinking_skipped, 1);
		return;
	}

	mtx_lock(&ncneg_hot.nl_lock);
	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
	if (ncp != NULL) {
		neglist = NCP2NEGLIST(ncp);
		negstate = NCP2NEGSTATE(ncp);
		mtx_lock(&neglist->nl_lock);
		MPASS((negstate->neg_flag & NEG_HOT) != 0);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
		negstate->neg_flag &= ~NEG_HOT;
		numhotneg--;
		mtx_unlock(&neglist->nl_lock);
	}
	mtx_unlock(&ncneg_hot.nl_lock);

	cache_negative_shrink_select(&ncp, &neglist);

	mtx_unlock(&ncneg_shrink_lock);
	if (ncp == NULL)
		return;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	blp = NCP2BUCKETLOCK(ncp);
	mtx_unlock(&neglist->nl_lock);
	mtx_lock(dvlp);
	rw_wlock(blp);
	/*
	 * Enter SMR to safely check the negative list.
	 * Even if the found pointer matches, the entry may now be reallocated
	 * and used by a different vnode.
	 */
	vfs_smr_enter();
	ncp2 = TAILQ_FIRST(&neglist->nl_list);
	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
	    blp != NCP2BUCKETLOCK(ncp2)) {
		vfs_smr_exit();
		ncp = NULL;
	} else {
		vfs_smr_exit();
		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(numneg_evicted, 1);
	}
	rw_wunlock(blp);
	mtx_unlock(dvlp);
	cache_free(ncp);
}

/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_vp);
			ncp->nc_vp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_vp);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_negative_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_dvp);
			ncp->nc_dvp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_dvp);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			counter_u64_add(numcachehv, -1);
		}
	}
	atomic_subtract_rel_long(&numcache, 1);
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct rwlock *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	rw_wlock(blp);
	cache_zap_locked(ncp);
	rw_wunlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	rw_wlock(blp);
	cache_zap_locked(ncp);
	rw_wunlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

static int __noinline
cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;
	int error = 0;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	pvlp = VP2VNODELOCK(vp);
	if (ncp->nc_flag & NCF_NEGATIVE) {
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		goto out;
	}

	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);
	cache_sort_vnodes(&vlp1, &vlp2);
	if (vlp1 == pvlp) {
		mtx_lock(vlp2);
		to_unlock = vlp2;
	} else {
		if (!mtx_trylock(vlp1)) {
			error = EAGAIN;
			goto out;
		}
		to_unlock = vlp1;
	}
	rw_wlock(blp);
	cache_zap_locked(ncp);
	rw_wunlock(blp);
	mtx_unlock(to_unlock);
out:
	mtx_unlock(pvlp);
	return (error);
}

/*
 * If trylocking failed we can get here. We know enough to take all needed locks
 * in the right order and re-lookup the entry.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct rwlock *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_locked(ncp, RA_UNLOCKED);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	rw_wlock(blp);
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
		return (0);
	}

	rw_wunlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct rwlock *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	rw_wunlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static int __noinline
cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct rwlock *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp, RA_RLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		rw_runlock(blp);
		rw_wlock(blp);
		cache_zap_locked(ncp);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	rw_runlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static int
cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
    struct mtx **vlpp1, struct mtx **vlpp2)
{
	struct mtx *dvlp, *vlp;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	cache_sort_vnodes(&dvlp, &vlp);

	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
		cache_zap_locked(ncp);
		cache_unlock_vnodes(dvlp, vlp);
		*vlpp1 = NULL;
		*vlpp2 = NULL;
		return (0);
	}

	if (*vlpp1 != NULL)
		mtx_unlock(*vlpp1);
	if (*vlpp2 != NULL)
		mtx_unlock(*vlpp2);
	*vlpp1 = NULL;
	*vlpp2 = NULL;

	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	rw_wunlock(blp);
	*vlpp1 = dvlp;
	*vlpp2 = vlp;
	if (*vlpp1 != NULL)
		mtx_lock(*vlpp1);
	mtx_lock(*vlpp2);
	rw_wlock(blp);
	return (EAGAIN);
}

static void
cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
{

	if (blp != NULL) {
		rw_runlock(blp);
	} else {
		mtx_unlock(vlp);
	}
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
	    dvp, cnp->cn_nameptr);
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct rwlock *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_dvp != dvp)
				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
			if (!cache_zap_locked_vnode_kl2(ncp,
			    dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			vn_seqc_write_begin(dvp);
			dvp->v_cache_dd = NULL;
			vn_seqc_write_end(dvp);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
		return (1);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	rw_wlock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		rw_wunlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail++;
		cache_maybe_yield();
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	cache_free(ncp);
	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
	return (1);
out_no_entry:
	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
	counter_u64_add(nummisszap, 1);
	return (0);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it will
 *		be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  vpp will not be modified.  If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
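/*
 * Illustrative sketch (not compiled), assuming a filesystem lookup routine
 * with the usual dvp/vpp/cnp arguments:
 *
 *	switch (cache_lookup(dvp, vpp, cnp, NULL, NULL)) {
 *	case -1:		// positive hit, *vpp is locked and ref'd
 *		return (0);
 *	case ENOENT:		// cached negative entry
 *		return (ENOENT);
 *	default:		// miss, fall back to scanning the directory
 *		break;
 *	}
 *
 * Real consumers are vfs_cache_lookup() and the VOP_LOOKUP implementations
 * of the individual filesystems.
 */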
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct negstate *negstate;
	struct rwlock *blp;
	struct mtx *dvlp;
	uint32_t hash;
	enum vgetstate vs;
	int error, ltype;
	bool try_smr, doing_smr, whiteout;

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	try_smr = true;
	if (cnp->cn_nameiop == CREATE)
		try_smr = false;
retry:
	doing_smr = false;
	blp = NULL;
	dvlp = NULL;
	error = 0;
	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		counter_u64_add(dotdothits, 1);
		dvlp = VP2VNODELOCK(dvp);
		mtx_lock(dvlp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
			    "..", NULL);
			mtx_unlock(dvlp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_flag & NCF_NEGATIVE)
				*vpp = NULL;
			else
				*vpp = ncp->nc_vp;
		} else
			*vpp = ncp->nc_dvp;
		/* Return failure if negative entry was found. */
		if (*vpp == NULL)
			goto negative_success;
		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
		    dvp, cnp->cn_nameptr, *vpp);
		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
		    *vpp);
		cache_out_ts(ncp, tsp, ticksp);
		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
		    NCF_DTS && tsp != NULL) {
			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
			*tsp = ncp_ts->nc_dotdottime;
		}
		goto success;
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
retry_hashed:
	if (try_smr) {
		vfs_smr_enter();
		doing_smr = true;
		try_smr = false;
	} else {
		blp = HASH2BUCKETLOCK(hash);
		rw_rlock(blp);
	}

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (__predict_false(ncp == NULL)) {
		if (doing_smr)
			vfs_smr_exit();
		else
			rw_runlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	/* We found a "positive" match, return the vnode */
	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
	    dvp, cnp->cn_nameptr, *vpp, ncp);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
	    *vpp);
	cache_out_ts(ncp, tsp, ticksp);
success:
	/*
	 * On success we return a locked and ref'd vnode as per the lookup
	 * protocol.
	 */
	MPASS(dvp != *vpp);
	ltype = 0;	/* silence gcc warning */
	if (cnp->cn_flags & ISDOTDOT) {
		ltype = VOP_ISLOCKED(dvp);
		VOP_UNLOCK(dvp);
	}
	if (doing_smr) {
		if (!cache_ncp_canuse(ncp)) {
			vfs_smr_exit();
			*vpp = NULL;
			goto retry;
		}
		vs = vget_prep_smr(*vpp);
		vfs_smr_exit();
		if (__predict_false(vs == VGET_NONE)) {
			*vpp = NULL;
			goto retry;
		}
	} else {
		vs = vget_prep(*vpp);
		cache_lookup_unlock(blp, dvlp);
	}
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (cnp->cn_flags & ISDOTDOT) {
		vn_lock(dvp, ltype | LK_RETRY);
		if (VN_IS_DOOMED(dvp)) {
			if (error == 0)
				vput(*vpp);
			*vpp = NULL;
			return (ENOENT);
		}
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
	}
	return (-1);

negative_success:
	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		MPASS(!doing_smr);
		counter_u64_add(numnegzaps, 1);
		goto zap_and_exit;
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);

	if (doing_smr) {
		/*
		 * We need to take locks to promote an entry.
		 */
		negstate = NCP2NEGSTATE(ncp);
		if ((negstate->neg_flag & NEG_HOT) == 0 ||
		    !cache_ncp_canuse(ncp)) {
			vfs_smr_exit();
			doing_smr = false;
			goto retry_hashed;
		}
		vfs_smr_exit();
	} else {
		cache_negative_hit(ncp);
		cache_lookup_unlock(blp, dvlp);
	}
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);

zap_and_exit:
	MPASS(!doing_smr);
	if (blp != NULL)
		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
	else
		error = cache_zap_locked_vnode(ncp, dvp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail2++;
		cache_maybe_yield();
		goto retry;
	}
	cache_free(ncp);
	return (0);
}

struct celockstate {
	struct	mtx *vlp[3];
	struct	rwlock *blp[2];
};
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));

static inline void
cache_celockstate_init(struct celockstate *cel)
{

	bzero(cel, sizeof(*cel));
}

static void
cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
    struct vnode *dvp)
{
	struct mtx *vlp1, *vlp2;

	MPASS(cel->vlp[0] == NULL);
	MPASS(cel->vlp[1] == NULL);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL || dvp != NULL);

	vlp1 = VP2VNODELOCK(vp);
	vlp2 = VP2VNODELOCK(dvp);
	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		mtx_lock(vlp1);
		cel->vlp[0] = vlp1;
	}
	mtx_lock(vlp2);
	cel->vlp[1] = vlp2;
}

static void
cache_unlock_vnodes_cel(struct celockstate *cel)
{

	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);

	if (cel->vlp[0] != NULL)
		mtx_unlock(cel->vlp[0]);
	if (cel->vlp[1] != NULL)
		mtx_unlock(cel->vlp[1]);
	if (cel->vlp[2] != NULL)
		mtx_unlock(cel->vlp[2]);
}

static bool
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
{
	struct mtx *vlp;
	bool ret;

	cache_assert_vlp_locked(cel->vlp[0]);
	cache_assert_vlp_locked(cel->vlp[1]);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL);
	vlp = VP2VNODELOCK(vp);

	ret = true;
	if (vlp >= cel->vlp[1]) {
		mtx_lock(vlp);
	} else {
		if (mtx_trylock(vlp))
			goto out;
		cache_lock_vnodes_cel_3_failures++;
		cache_unlock_vnodes_cel(cel);
		if (vlp < cel->vlp[0]) {
			mtx_lock(vlp);
			mtx_lock(cel->vlp[0]);
			mtx_lock(cel->vlp[1]);
		} else {
			if (cel->vlp[0] != NULL)
				mtx_lock(cel->vlp[0]);
			mtx_lock(vlp);
			mtx_lock(cel->vlp[1]);
		}
		ret = false;
	}
out:
	cel->vlp[2] = vlp;
	return (ret);
}

static void
cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
    struct rwlock *blp2)
{

	MPASS(cel->blp[0] == NULL);
	MPASS(cel->blp[1] == NULL);

	cache_sort_vnodes(&blp1, &blp2);

	if (blp1 != NULL) {
		rw_wlock(blp1);
		cel->blp[0] = blp1;
	}
	rw_wlock(blp2);
	cel->blp[1] = blp2;
}

static void
cache_unlock_buckets_cel(struct celockstate *cel)
{

	if (cel->blp[0] != NULL)
		rw_wunlock(cel->blp[0]);
	rw_wunlock(cel->blp[1]);
}

/*
 * Lock part of the cache affected by the insertion.
 *
 * This means vnodelocks for dvp, vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry. In this
 * case we have an additional vnode and bucketlock pair to lock. If the
 * entry is negative, ncelock is locked instead of the vnode.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 * preserving the locking order (smaller address first).
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct rwlock *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		if (vp == NULL || vp->v_type != VDIR)
			break;
		ncp = vp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == vp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		/*
		 * All vnodes got re-locked. Re-validate the state and if
		 * nothing changed we are done. Otherwise restart.
		 */
		if (ncp == vp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct rwlock *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == dvp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		if (ncp == dvp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}

static void __noinline
cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp)
{
	struct celockstate cel;
	struct namecache *ncp;
	uint32_t hash;
	int len;

	if (dvp->v_cache_dd == NULL)
		return;
	len = cnp->cn_namelen;
	cache_celockstate_init(&cel);
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	cache_enter_lock_dd(&cel, dvp, vp, hash);
	vn_seqc_write_begin(dvp);
	ncp = dvp->v_cache_dd;
	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
		cache_zap_locked(ncp);
	} else {
		ncp = NULL;
	}
	dvp->v_cache_dd = NULL;
	vn_seqc_write_end(dvp);
	cache_enter_unlock(&cel);
	cache_free(ncp);
}
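/*
 * Illustrative sketch (not compiled): filesystems normally add entries via
 * the cache_enter() wrapper (cache_enter_time() with NULL timestamps) once
 * a directory scan has resolved a component, e.g.:
 *
 *	error = <scan the directory for cnp->cn_nameptr>;
 *	if (error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
 *		cache_enter(dvp, *vpp, cnp);	// positive entry
 *	else if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0 &&
 *	    cnp->cn_nameiop != CREATE)
 *		cache_enter(dvp, NULL, cnp);	// negative entry
 *
 * The exact conditions vary per filesystem; this only shows the positive
 * vs. negative split handled by cache_enter_time() below.
 */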
/*
 * Add an entry to the cache.
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
	struct celockstate cel;
	struct namecache *ncp, *n2, *ndd;
	struct namecache_ts *ncp_ts, *n2_ts;
	struct nchashhead *ncpp;
	uint32_t hash;
	int flag;
	int len;
	u_long lnumcache;

	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
	VNPASS(!VN_IS_DOOMED(dvp), dvp);
	VNPASS(dvp->v_type != VNON, dvp);
	if (vp != NULL) {
		VNPASS(!VN_IS_DOOMED(vp), vp);
		VNPASS(vp->v_type != VNON, vp);
	}

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache))
		return;
#endif

	flag = 0;
	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return;
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			cache_enter_dotdot_prep(dvp, vp, cnp);
			flag = NCF_ISDOTDOT;
		}
	}

	/*
	 * Avoid blowout in namecache entries.
	 */
	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
	if (__predict_false(lnumcache >= ncsize)) {
		atomic_add_long(&numcache, -1);
		counter_u64_add(numdrops, 1);
		return;
	}

	cache_celockstate_init(&cel);
	ndd = NULL;
	ncp_ts = NULL;

	/*
	 * Calculate the hash key and setup as much of the new
	 * namecache entry as possible before acquiring the lock.
	 */
	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
	ncp->nc_flag = flag | NCF_WIP;
	ncp->nc_vp = vp;
	if (vp == NULL)
		cache_negative_init(ncp);
	ncp->nc_dvp = dvp;
	if (tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		ncp_ts->nc_time = *tsp;
		ncp_ts->nc_ticks = ticks;
		ncp_ts->nc_nc.nc_flag |= NCF_TS;
		if (dtsp != NULL) {
			ncp_ts->nc_dotdottime = *dtsp;
			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
		}
	}
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	memcpy(ncp->nc_name, cnp->cn_nameptr, len);
	ncp->nc_name[len] = '\0';
	cache_enter_lock(&cel, dvp, vp, hash);

	/*
	 * See if this vnode or negative entry is already in the cache
	 * with this name.  This can happen with concurrent lookups of
	 * the same path name.
	 */
	ncpp = NCHHASH(hash);
	CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
		if (n2->nc_dvp == dvp &&
		    n2->nc_nlen == cnp->cn_namelen &&
		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
			MPASS(cache_ncp_canuse(n2));
			if ((n2->nc_flag & NCF_NEGATIVE) != 0)
				KASSERT(vp == NULL,
				    ("%s: found entry pointing to a different vnode (%p != %p)",
				    __func__, NULL, vp));
			else
				KASSERT(n2->nc_vp == vp,
				    ("%s: found entry pointing to a different vnode (%p != %p)",
				    __func__, n2->nc_vp, vp));
			if (tsp != NULL) {
				KASSERT((n2->nc_flag & NCF_TS) != 0,
				    ("no NCF_TS"));
				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
				n2_ts->nc_time = ncp_ts->nc_time;
				n2_ts->nc_ticks = ncp_ts->nc_ticks;
				if (dtsp != NULL) {
					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
					n2_ts->nc_nc.nc_flag |= NCF_DTS;
				}
			}
			goto out_unlock_free;
		}
	}

	if (flag == NCF_ISDOTDOT) {
		/*
		 * See if we are trying to add .. entry, but some other lookup
		 * has populated v_cache_dd pointer already.
		 */
		if (dvp->v_cache_dd != NULL)
			goto out_unlock_free;
		KASSERT(vp == NULL || vp->v_type == VDIR,
		    ("wrong vnode type %p", vp));
		vn_seqc_write_begin(dvp);
		dvp->v_cache_dd = ncp;
		vn_seqc_write_end(dvp);
	}

	if (vp != NULL) {
		if (flag != NCF_ISDOTDOT) {
			/*
			 * For this case, the cache entry maps both the
			 * directory name in it and the name ".." for the
			 * directory's parent.
2011 */ 2012 vn_seqc_write_begin(vp); 2013 if ((ndd = vp->v_cache_dd) != NULL) { 2014 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2015 cache_zap_locked(ndd); 2016 else 2017 ndd = NULL; 2018 } 2019 vp->v_cache_dd = ncp; 2020 vn_seqc_write_end(vp); 2021 } else if (vp->v_type != VDIR) { 2022 if (vp->v_cache_dd != NULL) { 2023 vn_seqc_write_begin(vp); 2024 vp->v_cache_dd = NULL; 2025 vn_seqc_write_end(vp); 2026 } 2027 } 2028 } 2029 2030 if (flag != NCF_ISDOTDOT) { 2031 if (LIST_EMPTY(&dvp->v_cache_src)) { 2032 vhold(dvp); 2033 counter_u64_add(numcachehv, 1); 2034 } 2035 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2036 } 2037 2038 /* 2039 * If the entry is "negative", we place it into the 2040 * "negative" cache queue, otherwise, we place it into the 2041 * destination vnode's cache entries queue. 2042 */ 2043 if (vp != NULL) { 2044 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2045 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2046 vp); 2047 } else { 2048 if (cnp->cn_flags & ISWHITEOUT) 2049 ncp->nc_flag |= NCF_WHITE; 2050 cache_negative_insert(ncp); 2051 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2052 ncp->nc_name); 2053 } 2054 2055 /* 2056 * Insert the new namecache entry into the appropriate chain 2057 * within the cache entries table. 2058 */ 2059 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2060 2061 atomic_thread_fence_rel(); 2062 /* 2063 * Mark the entry as fully constructed. 2064 * It is immutable past this point until its removal. 2065 */ 2066 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2067 2068 cache_enter_unlock(&cel); 2069 if (numneg * ncnegfactor > lnumcache) 2070 cache_negative_zap_one(); 2071 cache_free(ndd); 2072 return; 2073 out_unlock_free: 2074 cache_enter_unlock(&cel); 2075 atomic_add_long(&numcache, -1); 2076 cache_free(ncp); 2077 return; 2078 } 2079 2080 static u_int 2081 cache_roundup_2(u_int val) 2082 { 2083 u_int res; 2084 2085 for (res = 1; res <= val; res <<= 1) 2086 continue; 2087 2088 return (res); 2089 } 2090 2091 static struct nchashhead * 2092 nchinittbl(u_long elements, u_long *hashmask) 2093 { 2094 struct nchashhead *hashtbl; 2095 u_long hashsize, i; 2096 2097 hashsize = cache_roundup_2(elements) / 2; 2098 2099 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2100 for (i = 0; i < hashsize; i++) 2101 CK_SLIST_INIT(&hashtbl[i]); 2102 *hashmask = hashsize - 1; 2103 return (hashtbl); 2104 } 2105 2106 static void 2107 ncfreetbl(struct nchashhead *hashtbl) 2108 { 2109 2110 free(hashtbl, M_VFSCACHE); 2111 } 2112 2113 /* 2114 * Name cache initialization, from vfs_init() when we are booting 2115 */ 2116 static void 2117 nchinit(void *dummy __unused) 2118 { 2119 u_int i; 2120 2121 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2122 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2123 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2124 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2125 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2126 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2127 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2128 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2129 2130 VFS_SMR_ZONE_SET(cache_zone_small); 2131 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2132 VFS_SMR_ZONE_SET(cache_zone_large); 2133 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2134 2135 ncsize = desiredvnodes * ncsizefactor; 2136 nchashtbl = 
nchinittbl(desiredvnodes * 2, &nchash); 2137 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2138 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2139 ncbuckethash = 7; 2140 if (ncbuckethash > nchash) 2141 ncbuckethash = nchash; 2142 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2143 M_WAITOK | M_ZERO); 2144 for (i = 0; i < numbucketlocks; i++) 2145 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 2146 ncvnodehash = ncbuckethash; 2147 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2148 M_WAITOK | M_ZERO); 2149 for (i = 0; i < numvnodelocks; i++) 2150 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2151 ncpurgeminvnodes = numbucketlocks * 2; 2152 2153 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 2154 M_WAITOK | M_ZERO); 2155 for (i = 0; i < numneglists; i++) { 2156 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2157 TAILQ_INIT(&neglists[i].nl_list); 2158 } 2159 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 2160 TAILQ_INIT(&ncneg_hot.nl_list); 2161 2162 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2163 } 2164 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2165 2166 void 2167 cache_vnode_init(struct vnode *vp) 2168 { 2169 2170 LIST_INIT(&vp->v_cache_src); 2171 TAILQ_INIT(&vp->v_cache_dst); 2172 vp->v_cache_dd = NULL; 2173 cache_prehash(vp); 2174 } 2175 2176 void 2177 cache_changesize(u_long newmaxvnodes) 2178 { 2179 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2180 u_long new_nchash, old_nchash; 2181 struct namecache *ncp; 2182 uint32_t hash; 2183 u_long newncsize; 2184 int i; 2185 2186 newncsize = newmaxvnodes * ncsizefactor; 2187 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2188 if (newmaxvnodes < numbucketlocks) 2189 newmaxvnodes = numbucketlocks; 2190 2191 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2192 /* If same hash table size, nothing to do */ 2193 if (nchash == new_nchash) { 2194 ncfreetbl(new_nchashtbl); 2195 return; 2196 } 2197 /* 2198 * Move everything from the old hash table to the new table. 2199 * None of the namecache entries in the table can be removed 2200 * because to do so, they have to be removed from the hash table. 2201 */ 2202 cache_lock_all_vnodes(); 2203 cache_lock_all_buckets(); 2204 old_nchashtbl = nchashtbl; 2205 old_nchash = nchash; 2206 nchashtbl = new_nchashtbl; 2207 nchash = new_nchash; 2208 for (i = 0; i <= old_nchash; i++) { 2209 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2210 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2211 ncp->nc_dvp); 2212 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2213 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2214 } 2215 } 2216 ncsize = newncsize; 2217 cache_unlock_all_buckets(); 2218 cache_unlock_all_vnodes(); 2219 ncfreetbl(old_nchashtbl); 2220 } 2221 2222 /* 2223 * Invalidate all entries from and to a particular vnode. 
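 * This covers entries where the vnode is the directory (v_cache_src),
 * entries where it is the target (v_cache_dst) and its cached ".." entry
 * (v_cache_dd).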
2224 */ 2225 static void 2226 cache_purge_impl(struct vnode *vp) 2227 { 2228 TAILQ_HEAD(, namecache) ncps; 2229 struct namecache *ncp, *nnp; 2230 struct mtx *vlp, *vlp2; 2231 2232 TAILQ_INIT(&ncps); 2233 vlp = VP2VNODELOCK(vp); 2234 vlp2 = NULL; 2235 mtx_assert(vlp, MA_OWNED); 2236 retry: 2237 while (!LIST_EMPTY(&vp->v_cache_src)) { 2238 ncp = LIST_FIRST(&vp->v_cache_src); 2239 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2240 goto retry; 2241 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2242 } 2243 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2244 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2245 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2246 goto retry; 2247 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2248 } 2249 ncp = vp->v_cache_dd; 2250 if (ncp != NULL) { 2251 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2252 ("lost dotdot link")); 2253 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2254 goto retry; 2255 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2256 } 2257 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2258 mtx_unlock(vlp); 2259 if (vlp2 != NULL) 2260 mtx_unlock(vlp2); 2261 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2262 cache_free(ncp); 2263 } 2264 } 2265 2266 void 2267 cache_purge(struct vnode *vp) 2268 { 2269 struct mtx *vlp; 2270 2271 SDT_PROBE1(vfs, namecache, purge, done, vp); 2272 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2273 vp->v_cache_dd == NULL) 2274 return; 2275 vlp = VP2VNODELOCK(vp); 2276 mtx_lock(vlp); 2277 cache_purge_impl(vp); 2278 } 2279 2280 /* 2281 * Only to be used by vgone. 2282 */ 2283 void 2284 cache_purge_vgone(struct vnode *vp) 2285 { 2286 struct mtx *vlp; 2287 2288 VNPASS(VN_IS_DOOMED(vp), vp); 2289 vlp = VP2VNODELOCK(vp); 2290 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2291 vp->v_cache_dd == NULL)) { 2292 mtx_lock(vlp); 2293 cache_purge_impl(vp); 2294 mtx_assert(vlp, MA_NOTOWNED); 2295 return; 2296 } 2297 2298 /* 2299 * All the NULL pointer state we found above may be transient. 2300 * Serialize against a possible thread doing cache_purge. 2301 */ 2302 mtx_wait_unlocked(vlp); 2303 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2304 vp->v_cache_dd == NULL)) { 2305 mtx_lock(vlp); 2306 cache_purge_impl(vp); 2307 mtx_assert(vlp, MA_NOTOWNED); 2308 return; 2309 } 2310 return; 2311 } 2312 2313 /* 2314 * Invalidate all negative entries for a particular directory vnode. 
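 * Only entries hanging off v_cache_src which are marked NCF_NEGATIVE are
 * removed; positive entries naming the directory's children are left intact.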
2315 */ 2316 void 2317 cache_purge_negative(struct vnode *vp) 2318 { 2319 TAILQ_HEAD(, namecache) ncps; 2320 struct namecache *ncp, *nnp; 2321 struct mtx *vlp; 2322 2323 CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); 2324 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2325 if (LIST_EMPTY(&vp->v_cache_src)) 2326 return; 2327 TAILQ_INIT(&ncps); 2328 vlp = VP2VNODELOCK(vp); 2329 mtx_lock(vlp); 2330 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2331 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2332 continue; 2333 cache_zap_negative_locked_vnode_kl(ncp, vp); 2334 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2335 } 2336 mtx_unlock(vlp); 2337 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2338 cache_free(ncp); 2339 } 2340 } 2341 2342 void 2343 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2344 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2345 { 2346 2347 ASSERT_VOP_IN_SEQC(fdvp); 2348 ASSERT_VOP_IN_SEQC(fvp); 2349 ASSERT_VOP_IN_SEQC(tdvp); 2350 if (tvp != NULL) 2351 ASSERT_VOP_IN_SEQC(tvp); 2352 2353 cache_purge(fvp); 2354 if (tvp != NULL) { 2355 cache_purge(tvp); 2356 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2357 ("%s: lingering negative entry", __func__)); 2358 } else { 2359 cache_remove_cnp(tdvp, tcnp); 2360 } 2361 } 2362 2363 /* 2364 * Flush all entries referencing a particular filesystem. 2365 */ 2366 void 2367 cache_purgevfs(struct mount *mp, bool force) 2368 { 2369 TAILQ_HEAD(, namecache) ncps; 2370 struct mtx *vlp1, *vlp2; 2371 struct rwlock *blp; 2372 struct nchashhead *bucket; 2373 struct namecache *ncp, *nnp; 2374 u_long i, j, n_nchash; 2375 int error; 2376 2377 /* Scan hash tables for applicable entries */ 2378 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2379 if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) 2380 return; 2381 TAILQ_INIT(&ncps); 2382 n_nchash = nchash + 1; 2383 vlp1 = vlp2 = NULL; 2384 for (i = 0; i < numbucketlocks; i++) { 2385 blp = (struct rwlock *)&bucketlocks[i]; 2386 rw_wlock(blp); 2387 for (j = i; j < n_nchash; j += numbucketlocks) { 2388 retry: 2389 bucket = &nchashtbl[j]; 2390 CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { 2391 cache_assert_bucket_locked(ncp, RA_WLOCKED); 2392 if (ncp->nc_dvp->v_mount != mp) 2393 continue; 2394 error = cache_zap_wlocked_bucket_kl(ncp, blp, 2395 &vlp1, &vlp2); 2396 if (error != 0) 2397 goto retry; 2398 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); 2399 } 2400 } 2401 rw_wunlock(blp); 2402 if (vlp1 == NULL && vlp2 == NULL) 2403 cache_maybe_yield(); 2404 } 2405 if (vlp1 != NULL) 2406 mtx_unlock(vlp1); 2407 if (vlp2 != NULL) 2408 mtx_unlock(vlp2); 2409 2410 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2411 cache_free(ncp); 2412 } 2413 } 2414 2415 /* 2416 * Perform canonical checks and cache lookup and pass on to filesystem 2417 * through the vop_cachedlookup only if needed. 
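 *
 * Illustrative opt-in (the "foofs" names are hypothetical, but e.g. UFS
 * follows this pattern): a filesystem points vop_lookup at this routine and
 * supplies its real lookup as vop_cachedlookup:
 *
 *	struct vop_vector foofs_vnodeops = {
 *		.vop_lookup =		vfs_cache_lookup,
 *		.vop_cachedlookup =	foofs_lookup,
 *		...
 *	};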
2418 */ 2419 2420 int 2421 vfs_cache_lookup(struct vop_lookup_args *ap) 2422 { 2423 struct vnode *dvp; 2424 int error; 2425 struct vnode **vpp = ap->a_vpp; 2426 struct componentname *cnp = ap->a_cnp; 2427 int flags = cnp->cn_flags; 2428 2429 *vpp = NULL; 2430 dvp = ap->a_dvp; 2431 2432 if (dvp->v_type != VDIR) 2433 return (ENOTDIR); 2434 2435 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2436 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2437 return (EROFS); 2438 2439 error = vn_dir_check_exec(dvp, cnp); 2440 if (error != 0) 2441 return (error); 2442 2443 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2444 if (error == 0) 2445 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2446 if (error == -1) 2447 return (0); 2448 return (error); 2449 } 2450 2451 /* Implementation of the getcwd syscall. */ 2452 int 2453 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2454 { 2455 char *buf, *retbuf; 2456 size_t buflen; 2457 int error; 2458 2459 buflen = uap->buflen; 2460 if (__predict_false(buflen < 2)) 2461 return (EINVAL); 2462 if (buflen > MAXPATHLEN) 2463 buflen = MAXPATHLEN; 2464 2465 buf = uma_zalloc(namei_zone, M_WAITOK); 2466 error = vn_getcwd(buf, &retbuf, &buflen); 2467 if (error == 0) 2468 error = copyout(retbuf, uap->buf, buflen); 2469 uma_zfree(namei_zone, buf); 2470 return (error); 2471 } 2472 2473 int 2474 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2475 { 2476 struct pwd *pwd; 2477 int error; 2478 2479 pwd = pwd_hold(curthread); 2480 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen); 2481 pwd_drop(pwd); 2482 2483 #ifdef KTRACE 2484 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2485 ktrnamei(*retbuf); 2486 #endif 2487 return (error); 2488 } 2489 2490 static int 2491 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2492 size_t size, int flags, enum uio_seg pathseg) 2493 { 2494 struct nameidata nd; 2495 char *retbuf, *freebuf; 2496 int error; 2497 2498 if (flags != 0) 2499 return (EINVAL); 2500 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2501 pathseg, path, fd, &cap_fstat_rights, td); 2502 if ((error = namei(&nd)) != 0) 2503 return (error); 2504 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2505 if (error == 0) { 2506 error = copyout(retbuf, buf, size); 2507 free(freebuf, M_TEMP); 2508 } 2509 NDFREE(&nd, 0); 2510 return (error); 2511 } 2512 2513 int 2514 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2515 { 2516 2517 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2518 uap->flags, UIO_USERSPACE)); 2519 } 2520 2521 /* 2522 * Retrieve the full filesystem path that correspond to a vnode from the name 2523 * cache (if available) 2524 */ 2525 int 2526 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2527 { 2528 struct pwd *pwd; 2529 char *buf; 2530 size_t buflen; 2531 int error; 2532 2533 if (__predict_false(vp == NULL)) 2534 return (EINVAL); 2535 2536 buflen = MAXPATHLEN; 2537 buf = malloc(buflen, M_TEMP, M_WAITOK); 2538 pwd = pwd_hold(curthread); 2539 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); 2540 pwd_drop(pwd); 2541 if (error == 0) 2542 *freebuf = buf; 2543 else 2544 free(buf, M_TEMP); 2545 return (error); 2546 } 2547 2548 /* 2549 * This function is similar to vn_fullpath, but it attempts to lookup the 2550 * pathname relative to the global root mount point. 
This is required for the 2551 * auditing sub-system, as audited pathnames must be absolute, relative to the 2552 * global root mount point. 2553 */ 2554 int 2555 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2556 { 2557 char *buf; 2558 size_t buflen; 2559 int error; 2560 2561 if (__predict_false(vp == NULL)) 2562 return (EINVAL); 2563 buflen = MAXPATHLEN; 2564 buf = malloc(buflen, M_TEMP, M_WAITOK); 2565 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2566 if (error == 0) 2567 *freebuf = buf; 2568 else 2569 free(buf, M_TEMP); 2570 return (error); 2571 } 2572 2573 static struct namecache * 2574 vn_dd_from_dst(struct vnode *vp) 2575 { 2576 struct namecache *ncp; 2577 2578 cache_assert_vnode_locked(vp); 2579 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2580 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2581 return (ncp); 2582 } 2583 return (NULL); 2584 } 2585 2586 int 2587 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2588 { 2589 struct vnode *dvp; 2590 struct namecache *ncp; 2591 struct mtx *vlp; 2592 int error; 2593 2594 vlp = VP2VNODELOCK(*vp); 2595 mtx_lock(vlp); 2596 ncp = (*vp)->v_cache_dd; 2597 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 2598 KASSERT(ncp == vn_dd_from_dst(*vp), 2599 ("%s: mismatch for dd entry (%p != %p)", __func__, 2600 ncp, vn_dd_from_dst(*vp))); 2601 } else { 2602 ncp = vn_dd_from_dst(*vp); 2603 } 2604 if (ncp != NULL) { 2605 if (*buflen < ncp->nc_nlen) { 2606 mtx_unlock(vlp); 2607 vrele(*vp); 2608 counter_u64_add(numfullpathfail4, 1); 2609 error = ENOMEM; 2610 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2611 vp, NULL); 2612 return (error); 2613 } 2614 *buflen -= ncp->nc_nlen; 2615 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2616 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2617 ncp->nc_name, vp); 2618 dvp = *vp; 2619 *vp = ncp->nc_dvp; 2620 vref(*vp); 2621 mtx_unlock(vlp); 2622 vrele(dvp); 2623 return (0); 2624 } 2625 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2626 2627 mtx_unlock(vlp); 2628 vn_lock(*vp, LK_SHARED | LK_RETRY); 2629 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2630 vput(*vp); 2631 if (error) { 2632 counter_u64_add(numfullpathfail2, 1); 2633 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2634 return (error); 2635 } 2636 2637 *vp = dvp; 2638 if (VN_IS_DOOMED(dvp)) { 2639 /* forced unmount */ 2640 vrele(dvp); 2641 error = ENOENT; 2642 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2643 return (error); 2644 } 2645 /* 2646 * *vp has its use count incremented still. 2647 */ 2648 2649 return (0); 2650 } 2651 2652 /* 2653 * Resolve a directory to a pathname. 2654 * 2655 * The name of the directory can always be found in the namecache or fetched 2656 * from the filesystem. There is also guaranteed to be only one parent, meaning 2657 * we can just follow vnodes up until we find the root. 2658 * 2659 * The vnode must be referenced. 
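 *
 * The path is assembled back to front: each component is prepended at the
 * tail of buf and *retbuf ends up pointing into the middle of the buffer.
 * For example (illustrative), resolving a directory named "local" living
 * under "/usr" leaves:
 *
 *	buf:     [ <unused space> /usr/local\0 ]
 *	*retbuf: points at the leading '/' of "/usr/local"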
2660 */ 2661 static int 2662 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2663 size_t *len, bool slash_prefixed, size_t addend) 2664 { 2665 #ifdef KDTRACE_HOOKS 2666 struct vnode *startvp = vp; 2667 #endif 2668 struct vnode *vp1; 2669 size_t buflen; 2670 int error; 2671 2672 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2673 VNPASS(vp->v_usecount > 0, vp); 2674 2675 buflen = *len; 2676 2677 if (!slash_prefixed) { 2678 MPASS(*len >= 2); 2679 buflen--; 2680 buf[buflen] = '\0'; 2681 } 2682 2683 error = 0; 2684 2685 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2686 counter_u64_add(numfullpathcalls, 1); 2687 while (vp != rdir && vp != rootvnode) { 2688 /* 2689 * The vp vnode must be already fully constructed, 2690 * since it is either found in namecache or obtained 2691 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2692 * without obtaining the vnode lock. 2693 */ 2694 if ((vp->v_vflag & VV_ROOT) != 0) { 2695 vn_lock(vp, LK_RETRY | LK_SHARED); 2696 2697 /* 2698 * With the vnode locked, check for races with 2699 * unmount, forced or not. Note that we 2700 * already verified that vp is not equal to 2701 * the root vnode, which means that 2702 * mnt_vnodecovered can be NULL only for the 2703 * case of unmount. 2704 */ 2705 if (VN_IS_DOOMED(vp) || 2706 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2707 vp1->v_mountedhere != vp->v_mount) { 2708 vput(vp); 2709 error = ENOENT; 2710 SDT_PROBE3(vfs, namecache, fullpath, return, 2711 error, vp, NULL); 2712 break; 2713 } 2714 2715 vref(vp1); 2716 vput(vp); 2717 vp = vp1; 2718 continue; 2719 } 2720 if (vp->v_type != VDIR) { 2721 vrele(vp); 2722 counter_u64_add(numfullpathfail1, 1); 2723 error = ENOTDIR; 2724 SDT_PROBE3(vfs, namecache, fullpath, return, 2725 error, vp, NULL); 2726 break; 2727 } 2728 error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen); 2729 if (error) 2730 break; 2731 if (buflen == 0) { 2732 vrele(vp); 2733 error = ENOMEM; 2734 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2735 startvp, NULL); 2736 break; 2737 } 2738 buf[--buflen] = '/'; 2739 slash_prefixed = true; 2740 } 2741 if (error) 2742 return (error); 2743 if (!slash_prefixed) { 2744 if (buflen == 0) { 2745 vrele(vp); 2746 counter_u64_add(numfullpathfail4, 1); 2747 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2748 startvp, NULL); 2749 return (ENOMEM); 2750 } 2751 buf[--buflen] = '/'; 2752 } 2753 counter_u64_add(numfullpathfound, 1); 2754 vrele(vp); 2755 2756 *retbuf = buf + buflen; 2757 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2758 *len -= buflen; 2759 *len += addend; 2760 return (0); 2761 } 2762 2763 /* 2764 * Resolve an arbitrary vnode to a pathname. 
2765 * 2766 * Note 2 caveats: 2767 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2768 * resolve to a different path than the one used to find it 2769 * - namecache is not mandatory, meaning names are not guaranteed to be added 2770 * (in which case resolving fails) 2771 */ 2772 static int 2773 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2774 size_t *buflen) 2775 { 2776 size_t orig_buflen; 2777 bool slash_prefixed; 2778 int error; 2779 2780 if (*buflen < 2) 2781 return (EINVAL); 2782 2783 orig_buflen = *buflen; 2784 2785 vref(vp); 2786 slash_prefixed = false; 2787 if (vp->v_type != VDIR) { 2788 *buflen -= 1; 2789 buf[*buflen] = '\0'; 2790 error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen); 2791 if (error) 2792 return (error); 2793 if (*buflen == 0) { 2794 vrele(vp); 2795 return (ENOMEM); 2796 } 2797 *buflen -= 1; 2798 buf[*buflen] = '/'; 2799 slash_prefixed = true; 2800 } 2801 2802 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed, 2803 orig_buflen - *buflen)); 2804 } 2805 2806 /* 2807 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2808 * 2809 * Since the namecache does not track hardlinks, the caller is expected to first 2810 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 2811 * 2812 * Then we have 2 cases: 2813 * - if the found vnode is a directory, the path can be constructed just by 2814 * following names up the chain 2815 * - otherwise we populate the buffer with the saved name and start resolving 2816 * from the parent 2817 */ 2818 static int 2819 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 2820 size_t *buflen) 2821 { 2822 char *buf, *tmpbuf; 2823 struct pwd *pwd; 2824 struct componentname *cnp; 2825 struct vnode *vp; 2826 size_t addend; 2827 int error; 2828 bool slash_prefixed; 2829 enum vtype type; 2830 2831 if (*buflen < 2) 2832 return (EINVAL); 2833 if (*buflen > MAXPATHLEN) 2834 *buflen = MAXPATHLEN; 2835 2836 slash_prefixed = false; 2837 2838 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2839 pwd = pwd_hold(curthread); 2840 2841 addend = 0; 2842 vp = ndp->ni_vp; 2843 /* 2844 * Check for VBAD to work around the vp_crossmp bug in lookup(). 2845 * 2846 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 2847 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 2848 * If the type is VDIR (like in this very case) we can skip looking 2849 * at ni_dvp in the first place. However, since vnodes get passed here 2850 * unlocked the target may transition to doomed state (type == VBAD) 2851 * before we get to evaluate the condition. If this happens, we will 2852 * populate part of the buffer and descend to vn_fullpath_dir with 2853 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 2854 * 2855 * This should be atomic_load(&vp->v_type) but it is illegal to take 2856 * an address of a bit field, even if said field is sized to char. 2857 * Work around the problem by reading the value into a full-sized enum 2858 * and then re-reading it with atomic_load which will still prevent 2859 * the compiler from re-reading down the road.
2860 */ 2861 type = vp->v_type; 2862 type = atomic_load_int(&type); 2863 if (type == VBAD) { 2864 error = ENOENT; 2865 goto out_bad; 2866 } 2867 if (type != VDIR) { 2868 cnp = &ndp->ni_cnd; 2869 addend = cnp->cn_namelen + 2; 2870 if (*buflen < addend) { 2871 error = ENOMEM; 2872 goto out_bad; 2873 } 2874 *buflen -= addend; 2875 tmpbuf = buf + *buflen; 2876 tmpbuf[0] = '/'; 2877 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 2878 tmpbuf[addend - 1] = '\0'; 2879 slash_prefixed = true; 2880 vp = ndp->ni_dvp; 2881 } 2882 2883 vref(vp); 2884 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 2885 slash_prefixed, addend); 2886 if (error != 0) 2887 goto out_bad; 2888 2889 pwd_drop(pwd); 2890 *freebuf = buf; 2891 2892 return (0); 2893 out_bad: 2894 pwd_drop(pwd); 2895 free(buf, M_TEMP); 2896 return (error); 2897 } 2898 2899 struct vnode * 2900 vn_dir_dd_ino(struct vnode *vp) 2901 { 2902 struct namecache *ncp; 2903 struct vnode *ddvp; 2904 struct mtx *vlp; 2905 enum vgetstate vs; 2906 2907 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 2908 vlp = VP2VNODELOCK(vp); 2909 mtx_lock(vlp); 2910 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 2911 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 2912 continue; 2913 ddvp = ncp->nc_dvp; 2914 vs = vget_prep(ddvp); 2915 mtx_unlock(vlp); 2916 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 2917 return (NULL); 2918 return (ddvp); 2919 } 2920 mtx_unlock(vlp); 2921 return (NULL); 2922 } 2923 2924 int 2925 vn_commname(struct vnode *vp, char *buf, u_int buflen) 2926 { 2927 struct namecache *ncp; 2928 struct mtx *vlp; 2929 int l; 2930 2931 vlp = VP2VNODELOCK(vp); 2932 mtx_lock(vlp); 2933 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 2934 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2935 break; 2936 if (ncp == NULL) { 2937 mtx_unlock(vlp); 2938 return (ENOENT); 2939 } 2940 l = min(ncp->nc_nlen, buflen - 1); 2941 memcpy(buf, ncp->nc_name, l); 2942 mtx_unlock(vlp); 2943 buf[l] = '\0'; 2944 return (0); 2945 } 2946 2947 /* 2948 * This function updates path string to vnode's full global path 2949 * and checks the size of the new path string against the pathlen argument. 2950 * 2951 * Requires a locked, referenced vnode. 2952 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 2953 * 2954 * If vp is a directory, the call to vn_fullpath_global() always succeeds 2955 * because it falls back to the ".." lookup if the namecache lookup fails. 2956 */ 2957 int 2958 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 2959 u_int pathlen) 2960 { 2961 struct nameidata nd; 2962 struct vnode *vp1; 2963 char *rpath, *fbuf; 2964 int error; 2965 2966 ASSERT_VOP_ELOCKED(vp, __func__); 2967 2968 /* Construct global filesystem path from vp. */ 2969 VOP_UNLOCK(vp); 2970 error = vn_fullpath_global(vp, &rpath, &fbuf); 2971 2972 if (error != 0) { 2973 vrele(vp); 2974 return (error); 2975 } 2976 2977 if (strlen(rpath) >= pathlen) { 2978 vrele(vp); 2979 error = ENAMETOOLONG; 2980 goto out; 2981 } 2982 2983 /* 2984 * Re-lookup the vnode by path to detect a possible rename. 2985 * As a side effect, the vnode is relocked. 2986 * If vnode was renamed, return ENOENT. 
2987 */ 2988 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 2989 UIO_SYSSPACE, path, td); 2990 error = namei(&nd); 2991 if (error != 0) { 2992 vrele(vp); 2993 goto out; 2994 } 2995 NDFREE(&nd, NDF_ONLY_PNBUF); 2996 vp1 = nd.ni_vp; 2997 vrele(vp); 2998 if (vp1 == vp) 2999 strcpy(path, rpath); 3000 else { 3001 vput(vp1); 3002 error = ENOENT; 3003 } 3004 3005 out: 3006 free(fbuf, M_TEMP); 3007 return (error); 3008 } 3009 3010 #ifdef DDB 3011 static void 3012 db_print_vpath(struct vnode *vp) 3013 { 3014 3015 while (vp != NULL) { 3016 db_printf("%p: ", vp); 3017 if (vp == rootvnode) { 3018 db_printf("/"); 3019 vp = NULL; 3020 } else { 3021 if (vp->v_vflag & VV_ROOT) { 3022 db_printf("<mount point>"); 3023 vp = vp->v_mount->mnt_vnodecovered; 3024 } else { 3025 struct namecache *ncp; 3026 char *ncn; 3027 int i; 3028 3029 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3030 if (ncp != NULL) { 3031 ncn = ncp->nc_name; 3032 for (i = 0; i < ncp->nc_nlen; i++) 3033 db_printf("%c", *ncn++); 3034 vp = ncp->nc_dvp; 3035 } else { 3036 vp = NULL; 3037 } 3038 } 3039 } 3040 db_printf("\n"); 3041 } 3042 3043 return; 3044 } 3045 3046 DB_SHOW_COMMAND(vpath, db_show_vpath) 3047 { 3048 struct vnode *vp; 3049 3050 if (!have_addr) { 3051 db_printf("usage: show vpath <struct vnode *>\n"); 3052 return; 3053 } 3054 3055 vp = (struct vnode *)addr; 3056 db_print_vpath(vp); 3057 } 3058 3059 #endif 3060 3061 static bool __read_frequently cache_fast_lookup = true; 3062 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3063 &cache_fast_lookup, 0, ""); 3064 3065 #define CACHE_FPL_FAILED -2020 3066 3067 static void 3068 cache_fpl_cleanup_cnp(struct componentname *cnp) 3069 { 3070 3071 uma_zfree(namei_zone, cnp->cn_pnbuf); 3072 #ifdef DIAGNOSTIC 3073 cnp->cn_pnbuf = NULL; 3074 cnp->cn_nameptr = NULL; 3075 #endif 3076 } 3077 3078 static void 3079 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3080 { 3081 struct componentname *cnp; 3082 3083 cnp = &ndp->ni_cnd; 3084 while (*(cnp->cn_nameptr) == '/') { 3085 cnp->cn_nameptr++; 3086 ndp->ni_pathlen--; 3087 } 3088 3089 *dpp = ndp->ni_rootdir; 3090 } 3091 3092 /* 3093 * Components of nameidata (or objects it can point to) which may 3094 * need restoring in case fast path lookup fails. 
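 *
 * A minimal sketch of the intended use (see cache_fpl_checkpoint() and
 * cache_fpl_restore() below):
 *
 *	cache_fpl_checkpoint(fpl, &fpl->snd);
 *	(speculative parsing advances cn_nameptr, cn_namelen, ni_pathlen, ...)
 *	cache_fpl_restore(fpl, &fpl->snd);	(only when falling back)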
3095 */ 3096 struct nameidata_saved { 3097 long cn_namelen; 3098 char *cn_nameptr; 3099 size_t ni_pathlen; 3100 int cn_flags; 3101 }; 3102 3103 struct cache_fpl { 3104 struct nameidata *ndp; 3105 struct componentname *cnp; 3106 struct pwd *pwd; 3107 struct vnode *dvp; 3108 struct vnode *tvp; 3109 seqc_t dvp_seqc; 3110 seqc_t tvp_seqc; 3111 struct nameidata_saved snd; 3112 int line; 3113 enum cache_fpl_status status:8; 3114 bool in_smr; 3115 }; 3116 3117 static void 3118 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3119 { 3120 3121 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3122 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3123 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3124 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3125 } 3126 3127 static void 3128 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3129 { 3130 3131 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3132 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3133 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3134 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3135 } 3136 3137 #ifdef INVARIANTS 3138 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3139 struct cache_fpl *_fpl = (fpl); \ 3140 MPASS(_fpl->in_smr == true); \ 3141 VFS_SMR_ASSERT_ENTERED(); \ 3142 }) 3143 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3144 struct cache_fpl *_fpl = (fpl); \ 3145 MPASS(_fpl->in_smr == false); \ 3146 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3147 }) 3148 #else 3149 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3150 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3151 #endif 3152 3153 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3154 struct cache_fpl *_fpl = (fpl); \ 3155 vfs_smr_enter(); \ 3156 _fpl->in_smr = true; \ 3157 }) 3158 3159 #define cache_fpl_smr_enter(fpl) ({ \ 3160 struct cache_fpl *_fpl = (fpl); \ 3161 MPASS(_fpl->in_smr == false); \ 3162 vfs_smr_enter(); \ 3163 _fpl->in_smr = true; \ 3164 }) 3165 3166 #define cache_fpl_smr_exit(fpl) ({ \ 3167 struct cache_fpl *_fpl = (fpl); \ 3168 MPASS(_fpl->in_smr == true); \ 3169 vfs_smr_exit(); \ 3170 _fpl->in_smr = false; \ 3171 }) 3172 3173 static int 3174 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3175 { 3176 3177 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3178 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3179 ("%s: converting to abort from %d at %d, set at %d\n", 3180 __func__, fpl->status, line, fpl->line)); 3181 } 3182 fpl->status = CACHE_FPL_STATUS_ABORTED; 3183 fpl->line = line; 3184 return (CACHE_FPL_FAILED); 3185 } 3186 3187 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3188 3189 static int 3190 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3191 { 3192 3193 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3194 ("%s: setting to partial at %d, but already set to %d at %d\n", 3195 __func__, line, fpl->status, fpl->line)); 3196 cache_fpl_smr_assert_entered(fpl); 3197 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3198 fpl->line = line; 3199 return (CACHE_FPL_FAILED); 3200 } 3201 3202 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3203 3204 static int 3205 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3206 { 3207 3208 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3209 ("%s: setting to handled at %d, but already set to %d at %d\n", 3210 __func__, line, fpl->status, fpl->line)); 3211 cache_fpl_smr_assert_not_entered(fpl); 3212 MPASS(error != CACHE_FPL_FAILED); 3213 fpl->status = CACHE_FPL_STATUS_HANDLED; 3214 fpl->line = line; 3215 return (error); 
3216 } 3217 3218 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3219 3220 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3221 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ 3222 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3223 3224 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3225 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3226 3227 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3228 "supported and internal flags overlap"); 3229 3230 static bool 3231 cache_fpl_islastcn(struct nameidata *ndp) 3232 { 3233 3234 return (*ndp->ni_next == 0); 3235 } 3236 3237 static bool 3238 cache_fpl_isdotdot(struct componentname *cnp) 3239 { 3240 3241 if (cnp->cn_namelen == 2 && 3242 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3243 return (true); 3244 return (false); 3245 } 3246 3247 static bool 3248 cache_can_fplookup(struct cache_fpl *fpl) 3249 { 3250 struct nameidata *ndp; 3251 struct componentname *cnp; 3252 struct thread *td; 3253 3254 ndp = fpl->ndp; 3255 cnp = fpl->cnp; 3256 td = cnp->cn_thread; 3257 3258 if (!cache_fast_lookup) { 3259 cache_fpl_aborted(fpl); 3260 return (false); 3261 } 3262 #ifdef MAC 3263 if (mac_vnode_check_lookup_enabled()) { 3264 cache_fpl_aborted(fpl); 3265 return (false); 3266 } 3267 #endif 3268 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3269 cache_fpl_aborted(fpl); 3270 return (false); 3271 } 3272 if (ndp->ni_dirfd != AT_FDCWD) { 3273 cache_fpl_aborted(fpl); 3274 return (false); 3275 } 3276 if (IN_CAPABILITY_MODE(td)) { 3277 cache_fpl_aborted(fpl); 3278 return (false); 3279 } 3280 if (AUDITING_TD(td)) { 3281 cache_fpl_aborted(fpl); 3282 return (false); 3283 } 3284 if (ndp->ni_startdir != NULL) { 3285 cache_fpl_aborted(fpl); 3286 return (false); 3287 } 3288 return (true); 3289 } 3290 3291 static bool 3292 cache_fplookup_vnode_supported(struct vnode *vp) 3293 { 3294 3295 return (vp->v_type != VLNK); 3296 } 3297 3298 /* 3299 * Move a negative entry to the hot list. 3300 * 3301 * We have to take locks, but they may be contended and in the worst 3302 * case we may need to go off CPU. We don't want to spin within the 3303 * smr section and we can't block with it. Instead we are going to 3304 * look up the entry again. 3305 */ 3306 static int __noinline 3307 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3308 uint32_t hash) 3309 { 3310 struct componentname *cnp; 3311 struct namecache *ncp; 3312 struct neglist *neglist; 3313 struct negstate *negstate; 3314 struct vnode *dvp; 3315 u_char nc_flag; 3316 3317 cnp = fpl->cnp; 3318 dvp = fpl->dvp; 3319 3320 if (!vhold_smr(dvp)) 3321 return (cache_fpl_aborted(fpl)); 3322 3323 neglist = NCP2NEGLIST(oncp); 3324 cache_fpl_smr_exit(fpl); 3325 3326 mtx_lock(&ncneg_hot.nl_lock); 3327 mtx_lock(&neglist->nl_lock); 3328 /* 3329 * For hash iteration. 3330 */ 3331 cache_fpl_smr_enter(fpl); 3332 3333 /* 3334 * Avoid all surprises by only succeeding if we got the same entry and 3335 * bailing completely otherwise. 3336 * 3337 * In particular at this point there can be a new ncp which matches the 3338 * search but hashes to a different neglist. 3339 */ 3340 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3341 if (ncp == oncp) 3342 break; 3343 } 3344 3345 /* 3346 * No match to begin with. 3347 */ 3348 if (__predict_false(ncp == NULL)) { 3349 goto out_abort; 3350 } 3351 3352 /* 3353 * The newly found entry may be something different... 
3354 */ 3355 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3356 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 3357 goto out_abort; 3358 } 3359 3360 /* 3361 * ... and not even negative. 3362 */ 3363 nc_flag = atomic_load_char(&ncp->nc_flag); 3364 if ((nc_flag & NCF_NEGATIVE) == 0) { 3365 goto out_abort; 3366 } 3367 3368 if (__predict_false(!cache_ncp_canuse(ncp))) { 3369 goto out_abort; 3370 } 3371 3372 negstate = NCP2NEGSTATE(ncp); 3373 if ((negstate->neg_flag & NEG_HOT) == 0) { 3374 numhotneg++; 3375 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 3376 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 3377 negstate->neg_flag |= NEG_HOT; 3378 } 3379 3380 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 3381 counter_u64_add(numneghits, 1); 3382 cache_fpl_smr_exit(fpl); 3383 mtx_unlock(&neglist->nl_lock); 3384 mtx_unlock(&ncneg_hot.nl_lock); 3385 vdrop(dvp); 3386 return (cache_fpl_handled(fpl, ENOENT)); 3387 out_abort: 3388 cache_fpl_smr_exit(fpl); 3389 mtx_unlock(&neglist->nl_lock); 3390 mtx_unlock(&ncneg_hot.nl_lock); 3391 vdrop(dvp); 3392 return (cache_fpl_aborted(fpl)); 3393 } 3394 3395 /* 3396 * The target vnode is not supported, prepare for the slow path to take over. 3397 */ 3398 static int __noinline 3399 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3400 { 3401 struct nameidata *ndp; 3402 struct componentname *cnp; 3403 enum vgetstate dvs; 3404 struct vnode *dvp; 3405 struct pwd *pwd; 3406 seqc_t dvp_seqc; 3407 3408 ndp = fpl->ndp; 3409 cnp = fpl->cnp; 3410 dvp = fpl->dvp; 3411 dvp_seqc = fpl->dvp_seqc; 3412 3413 dvs = vget_prep_smr(dvp); 3414 if (__predict_false(dvs == VGET_NONE)) { 3415 cache_fpl_smr_exit(fpl); 3416 return (cache_fpl_aborted(fpl)); 3417 } 3418 3419 cache_fpl_smr_exit(fpl); 3420 3421 vget_finish_ref(dvp, dvs); 3422 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3423 vrele(dvp); 3424 return (cache_fpl_aborted(fpl)); 3425 } 3426 3427 pwd = pwd_hold(curthread); 3428 if (fpl->pwd != pwd) { 3429 vrele(dvp); 3430 pwd_drop(pwd); 3431 return (cache_fpl_aborted(fpl)); 3432 } 3433 3434 cache_fpl_restore(fpl, &fpl->snd); 3435 3436 ndp->ni_startdir = dvp; 3437 cnp->cn_flags |= MAKEENTRY; 3438 if (cache_fpl_islastcn(ndp)) 3439 cnp->cn_flags |= ISLASTCN; 3440 if (cache_fpl_isdotdot(cnp)) 3441 cnp->cn_flags |= ISDOTDOT; 3442 3443 return (0); 3444 } 3445 3446 static int 3447 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3448 { 3449 struct componentname *cnp; 3450 struct vnode *tvp; 3451 seqc_t tvp_seqc; 3452 int error, lkflags; 3453 3454 cnp = fpl->cnp; 3455 tvp = fpl->tvp; 3456 tvp_seqc = fpl->tvp_seqc; 3457 3458 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3459 lkflags = LK_SHARED; 3460 if ((cnp->cn_flags & LOCKSHARED) == 0) 3461 lkflags = LK_EXCLUSIVE; 3462 error = vget_finish(tvp, lkflags, tvs); 3463 if (__predict_false(error != 0)) { 3464 return (cache_fpl_aborted(fpl)); 3465 } 3466 } else { 3467 vget_finish_ref(tvp, tvs); 3468 } 3469 3470 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3471 if ((cnp->cn_flags & LOCKLEAF) != 0) 3472 vput(tvp); 3473 else 3474 vrele(tvp); 3475 return (cache_fpl_aborted(fpl)); 3476 } 3477 3478 return (cache_fpl_handled(fpl, 0)); 3479 } 3480 3481 /* 3482 * They want to possibly modify the state of the namecache. 3483 * 3484 * Don't try to match the API contract, just leave. 
3485 * TODO: this leaves scalability on the table 3486 */ 3487 static int 3488 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3489 { 3490 struct componentname *cnp; 3491 3492 cnp = fpl->cnp; 3493 MPASS(cnp->cn_nameiop != LOOKUP); 3494 return (cache_fpl_partial(fpl)); 3495 } 3496 3497 static int __noinline 3498 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3499 { 3500 struct componentname *cnp; 3501 enum vgetstate dvs, tvs; 3502 struct vnode *dvp, *tvp; 3503 seqc_t dvp_seqc, tvp_seqc; 3504 int error; 3505 3506 cnp = fpl->cnp; 3507 dvp = fpl->dvp; 3508 dvp_seqc = fpl->dvp_seqc; 3509 tvp = fpl->tvp; 3510 tvp_seqc = fpl->tvp_seqc; 3511 3512 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3513 3514 /* 3515 * This is less efficient than it can be for simplicity. 3516 */ 3517 dvs = vget_prep_smr(dvp); 3518 if (__predict_false(dvs == VGET_NONE)) { 3519 return (cache_fpl_aborted(fpl)); 3520 } 3521 tvs = vget_prep_smr(tvp); 3522 if (__predict_false(tvs == VGET_NONE)) { 3523 cache_fpl_smr_exit(fpl); 3524 vget_abort(dvp, dvs); 3525 return (cache_fpl_aborted(fpl)); 3526 } 3527 3528 cache_fpl_smr_exit(fpl); 3529 3530 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3531 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3532 if (__predict_false(error != 0)) { 3533 vget_abort(tvp, tvs); 3534 return (cache_fpl_aborted(fpl)); 3535 } 3536 } else { 3537 vget_finish_ref(dvp, dvs); 3538 } 3539 3540 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3541 vget_abort(tvp, tvs); 3542 if ((cnp->cn_flags & LOCKPARENT) != 0) 3543 vput(dvp); 3544 else 3545 vrele(dvp); 3546 return (cache_fpl_aborted(fpl)); 3547 } 3548 3549 error = cache_fplookup_final_child(fpl, tvs); 3550 if (__predict_false(error != 0)) { 3551 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3552 if ((cnp->cn_flags & LOCKPARENT) != 0) 3553 vput(dvp); 3554 else 3555 vrele(dvp); 3556 return (error); 3557 } 3558 3559 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3560 return (0); 3561 } 3562 3563 static int 3564 cache_fplookup_final(struct cache_fpl *fpl) 3565 { 3566 struct componentname *cnp; 3567 enum vgetstate tvs; 3568 struct vnode *dvp, *tvp; 3569 seqc_t dvp_seqc, tvp_seqc; 3570 3571 cnp = fpl->cnp; 3572 dvp = fpl->dvp; 3573 dvp_seqc = fpl->dvp_seqc; 3574 tvp = fpl->tvp; 3575 tvp_seqc = fpl->tvp_seqc; 3576 3577 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3578 3579 if (cnp->cn_nameiop != LOOKUP) { 3580 return (cache_fplookup_final_modifying(fpl)); 3581 } 3582 3583 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3584 return (cache_fplookup_final_withparent(fpl)); 3585 3586 tvs = vget_prep_smr(tvp); 3587 if (__predict_false(tvs == VGET_NONE)) { 3588 return (cache_fpl_partial(fpl)); 3589 } 3590 3591 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3592 cache_fpl_smr_exit(fpl); 3593 vget_abort(tvp, tvs); 3594 return (cache_fpl_aborted(fpl)); 3595 } 3596 3597 cache_fpl_smr_exit(fpl); 3598 return (cache_fplookup_final_child(fpl, tvs)); 3599 } 3600 3601 static int __noinline 3602 cache_fplookup_dot(struct cache_fpl *fpl) 3603 { 3604 struct vnode *dvp; 3605 3606 dvp = fpl->dvp; 3607 3608 fpl->tvp = dvp; 3609 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3610 if (seqc_in_modify(fpl->tvp_seqc)) { 3611 return (cache_fpl_aborted(fpl)); 3612 } 3613 3614 counter_u64_add(dothits, 1); 3615 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3616 3617 return (0); 3618 } 3619 3620 static int __noinline 3621 cache_fplookup_dotdot(struct cache_fpl *fpl) 3622 { 3623 struct nameidata *ndp; 3624 struct componentname *cnp; 3625 struct namecache *ncp; 3626 struct vnode 
*dvp; 3627 struct prison *pr; 3628 u_char nc_flag; 3629 3630 ndp = fpl->ndp; 3631 cnp = fpl->cnp; 3632 dvp = fpl->dvp; 3633 3634 /* 3635 * XXX this is racy the same way regular lookup is 3636 */ 3637 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3638 pr = pr->pr_parent) 3639 if (dvp == pr->pr_root) 3640 break; 3641 3642 if (dvp == ndp->ni_rootdir || 3643 dvp == ndp->ni_topdir || 3644 dvp == rootvnode || 3645 pr != NULL) { 3646 fpl->tvp = dvp; 3647 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3648 if (seqc_in_modify(fpl->tvp_seqc)) { 3649 return (cache_fpl_aborted(fpl)); 3650 } 3651 return (0); 3652 } 3653 3654 if ((dvp->v_vflag & VV_ROOT) != 0) { 3655 /* 3656 * TODO 3657 * The opposite of climb mount is needed here. 3658 */ 3659 return (cache_fpl_aborted(fpl)); 3660 } 3661 3662 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3663 if (ncp == NULL) { 3664 return (cache_fpl_aborted(fpl)); 3665 } 3666 3667 nc_flag = atomic_load_char(&ncp->nc_flag); 3668 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3669 if ((nc_flag & NCF_NEGATIVE) != 0) 3670 return (cache_fpl_aborted(fpl)); 3671 fpl->tvp = ncp->nc_vp; 3672 } else { 3673 fpl->tvp = ncp->nc_dvp; 3674 } 3675 3676 if (__predict_false(!cache_ncp_canuse(ncp))) { 3677 return (cache_fpl_aborted(fpl)); 3678 } 3679 3680 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 3681 if (seqc_in_modify(fpl->tvp_seqc)) { 3682 return (cache_fpl_partial(fpl)); 3683 } 3684 3685 counter_u64_add(dotdothits, 1); 3686 return (0); 3687 } 3688 3689 static int 3690 cache_fplookup_next(struct cache_fpl *fpl) 3691 { 3692 struct componentname *cnp; 3693 struct namecache *ncp; 3694 struct negstate *negstate; 3695 struct vnode *dvp, *tvp; 3696 u_char nc_flag; 3697 uint32_t hash; 3698 bool neg_hot; 3699 3700 cnp = fpl->cnp; 3701 dvp = fpl->dvp; 3702 3703 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 3704 return (cache_fplookup_dot(fpl)); 3705 } 3706 3707 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3708 3709 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3710 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3711 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 3712 break; 3713 } 3714 3715 /* 3716 * If there is no entry we have to punt to the slow path to perform 3717 * actual lookup. Should there be nothing with this name a negative 3718 * entry will be created. 3719 */ 3720 if (__predict_false(ncp == NULL)) { 3721 return (cache_fpl_partial(fpl)); 3722 } 3723 3724 tvp = atomic_load_ptr(&ncp->nc_vp); 3725 nc_flag = atomic_load_char(&ncp->nc_flag); 3726 if ((nc_flag & NCF_NEGATIVE) != 0) { 3727 /* 3728 * If they want to create an entry we need to replace this one. 
3729 */ 3730 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 3731 return (cache_fpl_partial(fpl)); 3732 } 3733 negstate = NCP2NEGSTATE(ncp); 3734 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0); 3735 if (__predict_false(!cache_ncp_canuse(ncp))) { 3736 return (cache_fpl_partial(fpl)); 3737 } 3738 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 3739 return (cache_fpl_partial(fpl)); 3740 } 3741 if (!neg_hot) { 3742 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 3743 } 3744 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 3745 ncp->nc_name); 3746 counter_u64_add(numneghits, 1); 3747 cache_fpl_smr_exit(fpl); 3748 return (cache_fpl_handled(fpl, ENOENT)); 3749 } 3750 3751 if (__predict_false(!cache_ncp_canuse(ncp))) { 3752 return (cache_fpl_partial(fpl)); 3753 } 3754 3755 fpl->tvp = tvp; 3756 fpl->tvp_seqc = vn_seqc_read_any(tvp); 3757 if (seqc_in_modify(fpl->tvp_seqc)) { 3758 return (cache_fpl_partial(fpl)); 3759 } 3760 3761 if (!cache_fplookup_vnode_supported(tvp)) { 3762 return (cache_fpl_partial(fpl)); 3763 } 3764 3765 counter_u64_add(numposhits, 1); 3766 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 3767 return (0); 3768 } 3769 3770 static bool 3771 cache_fplookup_mp_supported(struct mount *mp) 3772 { 3773 3774 if (mp == NULL) 3775 return (false); 3776 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 3777 return (false); 3778 return (true); 3779 } 3780 3781 /* 3782 * Walk up the mount stack (if any). 3783 * 3784 * Correctness is provided in the following ways: 3785 * - all vnodes are protected from freeing with SMR 3786 * - struct mount objects are type stable making them always safe to access 3787 * - stability of the particular mount is provided by busying it 3788 * - relationship between the vnode which is mounted on and the mount is 3789 * verified with the vnode sequence counter after busying 3790 * - association between root vnode of the mount and the mount is protected 3791 * by busy 3792 * 3793 * From that point on we can read the sequence counter of the root vnode 3794 * and get the next mount on the stack (if any) using the same protection. 3795 * 3796 * By the end of successful walk we are guaranteed the reached state was 3797 * indeed present at least at some point which matches the regular lookup. 
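 *
 * As an illustration (hypothetical setup), with two nullfs mounts stacked on
 * the same directory:
 *
 *	mount -t nullfs /a /mnt
 *	mount -t nullfs /b /mnt
 *
 * the traversal below hops from the covered vnode to the root vnode of the
 * first mount, finds that one covered as well and hops once more, verifying
 * the sequence counter at every step.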
3798 */ 3799 static int __noinline 3800 cache_fplookup_climb_mount(struct cache_fpl *fpl) 3801 { 3802 struct mount *mp, *prev_mp; 3803 struct vnode *vp; 3804 seqc_t vp_seqc; 3805 3806 vp = fpl->tvp; 3807 vp_seqc = fpl->tvp_seqc; 3808 3809 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 3810 mp = atomic_load_ptr(&vp->v_mountedhere); 3811 if (mp == NULL) 3812 return (0); 3813 3814 prev_mp = NULL; 3815 for (;;) { 3816 if (!vfs_op_thread_enter_crit(mp)) { 3817 if (prev_mp != NULL) 3818 vfs_op_thread_exit_crit(prev_mp); 3819 return (cache_fpl_partial(fpl)); 3820 } 3821 if (prev_mp != NULL) 3822 vfs_op_thread_exit_crit(prev_mp); 3823 if (!vn_seqc_consistent(vp, vp_seqc)) { 3824 vfs_op_thread_exit_crit(mp); 3825 return (cache_fpl_partial(fpl)); 3826 } 3827 if (!cache_fplookup_mp_supported(mp)) { 3828 vfs_op_thread_exit_crit(mp); 3829 return (cache_fpl_partial(fpl)); 3830 } 3831 vp = atomic_load_ptr(&mp->mnt_rootvnode); 3832 if (vp == NULL || VN_IS_DOOMED(vp)) { 3833 vfs_op_thread_exit_crit(mp); 3834 return (cache_fpl_partial(fpl)); 3835 } 3836 vp_seqc = vn_seqc_read_any(vp); 3837 if (seqc_in_modify(vp_seqc)) { 3838 vfs_op_thread_exit_crit(mp); 3839 return (cache_fpl_partial(fpl)); 3840 } 3841 prev_mp = mp; 3842 mp = atomic_load_ptr(&vp->v_mountedhere); 3843 if (mp == NULL) 3844 break; 3845 } 3846 3847 vfs_op_thread_exit_crit(prev_mp); 3848 fpl->tvp = vp; 3849 fpl->tvp_seqc = vp_seqc; 3850 return (0); 3851 } 3852 3853 static bool 3854 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 3855 { 3856 struct mount *mp; 3857 struct vnode *vp; 3858 3859 vp = fpl->tvp; 3860 3861 /* 3862 * Hack: while this is a union, the pointer tends to be NULL so save on 3863 * a branch. 3864 */ 3865 mp = atomic_load_ptr(&vp->v_mountedhere); 3866 if (mp == NULL) 3867 return (false); 3868 if (vp->v_type == VDIR) 3869 return (true); 3870 return (false); 3871 } 3872 3873 /* 3874 * Parse the path. 3875 * 3876 * The code is mostly copy-pasted from regular lookup, see lookup(). 3877 * The structure is maintained along with comments for easier maintenance. 3878 * Deduplicating the code will become feasible after fast path lookup 3879 * becomes more feature-complete. 3880 */ 3881 static int 3882 cache_fplookup_parse(struct cache_fpl *fpl) 3883 { 3884 struct nameidata *ndp; 3885 struct componentname *cnp; 3886 char *cp; 3887 3888 ndp = fpl->ndp; 3889 cnp = fpl->cnp; 3890 3891 /* 3892 * Search a new directory. 3893 * 3894 * The last component of the filename is left accessible via 3895 * cnp->cn_nameptr for callers that need the name. Callers needing 3896 * the name set the SAVENAME flag. When done, they assume 3897 * responsibility for freeing the pathname buffer. 3898 */ 3899 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 3900 continue; 3901 cnp->cn_namelen = cp - cnp->cn_nameptr; 3902 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 3903 cache_fpl_smr_exit(fpl); 3904 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 3905 } 3906 ndp->ni_pathlen -= cnp->cn_namelen; 3907 KASSERT(ndp->ni_pathlen <= PATH_MAX, 3908 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 3909 ndp->ni_next = cp; 3910 3911 /* 3912 * Replace multiple slashes by a single slash and trailing slashes 3913 * by a null. This must be done before VOP_LOOKUP() because some 3914 * fs's don't know about trailing slashes. Remember if there were 3915 * trailing slashes to handle symlinks, existing non-directories 3916 * and non-existing files that won't be directories specially later. 
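 *
 * For example "foo//bar" parses into the component "foo" and the loop below
 * skips the duplicate '/' so that "bar" is parsed next, while a trailing
 * slash as in "foo/" is punted to the slow path (see the TODO below).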
3917 */ 3918 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 3919 cp++; 3920 ndp->ni_pathlen--; 3921 if (*cp == '\0') { 3922 /* 3923 * TODO 3924 * Regular lookup performs the following: 3925 * *ndp->ni_next = '\0'; 3926 * cnp->cn_flags |= TRAILINGSLASH; 3927 * 3928 * Which is problematic since it modifies data read 3929 * from userspace. Then if fast path lookup was to 3930 * abort we would have to either restore it or convey 3931 * the flag. Since this is a corner case just ignore 3932 * it for simplicity. 3933 */ 3934 return (cache_fpl_partial(fpl)); 3935 } 3936 } 3937 ndp->ni_next = cp; 3938 3939 /* 3940 * Check for degenerate name (e.g. / or "") 3941 * which is a way of talking about a directory, 3942 * e.g. like "/." or ".". 3943 * 3944 * TODO 3945 * Another corner case handled by the regular lookup 3946 */ 3947 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 3948 return (cache_fpl_partial(fpl)); 3949 } 3950 return (0); 3951 } 3952 3953 static void 3954 cache_fplookup_parse_advance(struct cache_fpl *fpl) 3955 { 3956 struct nameidata *ndp; 3957 struct componentname *cnp; 3958 3959 ndp = fpl->ndp; 3960 cnp = fpl->cnp; 3961 3962 cnp->cn_nameptr = ndp->ni_next; 3963 while (*cnp->cn_nameptr == '/') { 3964 cnp->cn_nameptr++; 3965 ndp->ni_pathlen--; 3966 } 3967 } 3968 3969 static int __noinline 3970 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 3971 { 3972 3973 switch (error) { 3974 case EAGAIN: 3975 /* 3976 * Can happen when racing against vgone. 3977 * */ 3978 case EOPNOTSUPP: 3979 cache_fpl_partial(fpl); 3980 break; 3981 default: 3982 /* 3983 * See the API contract for VOP_FPLOOKUP_VEXEC. 3984 */ 3985 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 3986 error = cache_fpl_aborted(fpl); 3987 } else { 3988 cache_fpl_smr_exit(fpl); 3989 cache_fpl_handled(fpl, error); 3990 } 3991 break; 3992 } 3993 return (error); 3994 } 3995 3996 static int 3997 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 3998 { 3999 struct nameidata *ndp; 4000 struct componentname *cnp; 4001 struct mount *mp; 4002 int error; 4003 4004 error = CACHE_FPL_FAILED; 4005 ndp = fpl->ndp; 4006 cnp = fpl->cnp; 4007 4008 cache_fpl_checkpoint(fpl, &fpl->snd); 4009 4010 fpl->dvp = dvp; 4011 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 4012 if (seqc_in_modify(fpl->dvp_seqc)) { 4013 cache_fpl_aborted(fpl); 4014 goto out; 4015 } 4016 mp = atomic_load_ptr(&fpl->dvp->v_mount); 4017 if (!cache_fplookup_mp_supported(mp)) { 4018 cache_fpl_aborted(fpl); 4019 goto out; 4020 } 4021 4022 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 4023 4024 for (;;) { 4025 error = cache_fplookup_parse(fpl); 4026 if (__predict_false(error != 0)) { 4027 break; 4028 } 4029 4030 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 4031 4032 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); 4033 if (__predict_false(error != 0)) { 4034 error = cache_fplookup_failed_vexec(fpl, error); 4035 break; 4036 } 4037 4038 if (__predict_false(cache_fpl_isdotdot(cnp))) { 4039 error = cache_fplookup_dotdot(fpl); 4040 if (__predict_false(error != 0)) { 4041 break; 4042 } 4043 } else { 4044 error = cache_fplookup_next(fpl); 4045 if (__predict_false(error != 0)) { 4046 break; 4047 } 4048 4049 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 4050 4051 if (cache_fplookup_need_climb_mount(fpl)) { 4052 error = cache_fplookup_climb_mount(fpl); 4053 if (__predict_false(error != 0)) { 4054 break; 4055 } 4056 } 4057 } 4058 4059 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 4060 4061 if (cache_fpl_islastcn(ndp)) { 4062 
error = cache_fplookup_final(fpl); 4063 break; 4064 } 4065 4066 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 4067 error = cache_fpl_aborted(fpl); 4068 break; 4069 } 4070 4071 fpl->dvp = fpl->tvp; 4072 fpl->dvp_seqc = fpl->tvp_seqc; 4073 4074 cache_fplookup_parse_advance(fpl); 4075 cache_fpl_checkpoint(fpl, &fpl->snd); 4076 } 4077 out: 4078 switch (fpl->status) { 4079 case CACHE_FPL_STATUS_UNSET: 4080 __assert_unreachable(); 4081 break; 4082 case CACHE_FPL_STATUS_PARTIAL: 4083 cache_fpl_smr_assert_entered(fpl); 4084 return (cache_fplookup_partial_setup(fpl)); 4085 case CACHE_FPL_STATUS_ABORTED: 4086 if (fpl->in_smr) 4087 cache_fpl_smr_exit(fpl); 4088 return (CACHE_FPL_FAILED); 4089 case CACHE_FPL_STATUS_HANDLED: 4090 MPASS(error != CACHE_FPL_FAILED); 4091 cache_fpl_smr_assert_not_entered(fpl); 4092 if (__predict_false(error != 0)) { 4093 ndp->ni_dvp = NULL; 4094 ndp->ni_vp = NULL; 4095 cache_fpl_cleanup_cnp(cnp); 4096 return (error); 4097 } 4098 ndp->ni_dvp = fpl->dvp; 4099 ndp->ni_vp = fpl->tvp; 4100 if (cnp->cn_flags & SAVENAME) 4101 cnp->cn_flags |= HASBUF; 4102 else 4103 cache_fpl_cleanup_cnp(cnp); 4104 return (error); 4105 } 4106 } 4107 4108 /* 4109 * Fast path lookup protected with SMR and sequence counters. 4110 * 4111 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 4112 * 4113 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria 4114 * outlined below. 4115 * 4116 * Traditional vnode lookup conceptually looks like this: 4117 * 4118 * vn_lock(current); 4119 * for (;;) { 4120 * next = find(); 4121 * vn_lock(next); 4122 * vn_unlock(current); 4123 * current = next; 4124 * if (last) 4125 * break; 4126 * } 4127 * return (current); 4128 * 4129 * Each jump to the next vnode is safe memory-wise and atomic with respect to 4130 * any modifications thanks to holding respective locks. 4131 * 4132 * The same guarantee can be provided with a combination of safe memory 4133 * reclamation and sequence counters instead. If all operations which affect 4134 * the relationship between the current vnode and the one we are looking for 4135 * also modify the counter, we can verify whether all the conditions held as 4136 * we made the jump. This includes things like permissions, mount points etc. 4137 * Counter modification is provided by enclosing relevant places in 4138 * vn_seqc_write_begin()/end() calls. 
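 *
 * For instance (see cache_enter_time() above), a writer updating a
 * relationship the fast path may inspect brackets the store:
 *
 *	vn_seqc_write_begin(vp);
 *	vp->v_cache_dd = ncp;
 *	vn_seqc_write_end(vp);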
 *
 * Thus this translates to:
 *
 * vfs_smr_enter();
 * dvp_seqc = seqc_read_any(dvp);
 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 *	abort();
 * for (;;) {
 *	tvp = find();
 *	tvp_seqc = seqc_read_any(tvp);
 *	if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 *		abort();
 *	if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 *		abort();
 *	dvp = tvp; // we know nothing of importance has changed
 *	dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 *	if (last)
 *		break;
 * }
 * vget(); // secure the vnode
 * if (!seqc_consistent(tvp, tvp_seqc)) // final check
 *	abort();
 * // at this point we know nothing has changed for any parent<->child pair
 * // as they were crossed during the lookup, meaning we matched the guarantee
 * // of the locked variant
 * return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection which they must never exit
 * - EAGAIN can be returned to denote that the check could not be performed;
 *   it is always valid to return it
 * - if the sequence counter has not changed, the result must be valid
 * - if the sequence counter has changed, both false positives and false
 *   negatives are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *   (an illustrative, hedged sketch of such a routine follows cache_fplookup()
 *   below)
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning that absent other
 *   means it should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 */
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	struct nameidata_saved orig;
	int error;

	MPASS(ndp->ni_lcf == 0);

	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.ndp = ndp;
	fpl.cnp = &ndp->ni_cnd;
	MPASS(curthread == fpl.cnp->cn_thread);

	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
		MPASS(fpl.cnp->cn_nameiop != LOOKUP);

	if (!cache_can_fplookup(&fpl)) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		*status = fpl.status;
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint(&fpl, &orig);

	cache_fpl_smr_enter_initial(&fpl);
	pwd = pwd_get_smr();
	fpl.pwd = pwd;
	ndp->ni_rootdir = pwd->pwd_rdir;
	ndp->ni_topdir = pwd->pwd_jdir;

	cnp = fpl.cnp;
	cnp->cn_nameptr = cnp->cn_pnbuf;
	if (cnp->cn_pnbuf[0] == '/') {
		cache_fpl_handle_root(ndp, &dvp);
	} else {
		MPASS(ndp->ni_dirfd == AT_FDCWD);
		dvp = pwd->pwd_cdir;
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);

	error = cache_fplookup_impl(dvp, &fpl);
	cache_fpl_smr_assert_not_entered(&fpl);
	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);

	*status = fpl.status;
	switch (fpl.status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_HANDLED:
		SDT_PROBE3(vfs, namei, lookup, return, error,
		    (error == 0 ? ndp->ni_vp : NULL), true);
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		*pwdp = fpl.pwd;
		/*
		 * Status restored by cache_fplookup_partial_setup.
		 */
		break;
	case CACHE_FPL_STATUS_ABORTED:
		cache_fpl_restore(&fpl, &orig);
		break;
	}
	return (error);
}
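
/*
 * Illustrative sketch (not compiled): one way a filesystem's
 * VOP_FPLOOKUP_VEXEC routine could satisfy the contract documented above
 * using only lock-free reads. Everything prefixed "example_" is a
 * hypothetical stand-in rather than real filesystem code; the only
 * pre-existing interfaces relied on are atomic_load_ptr(), the generated
 * vop_fplookup_vexec_args vector and vaccess_vexec_smr(), whose exact
 * argument list should be double-checked against sys/kern/vfs_subr.c.
 */
#if 0
struct example_node {
	/* Hypothetical in-memory metadata; a real filesystem has its own. */
	mode_t	en_mode;
	uid_t	en_uid;
	gid_t	en_gid;
};

static int
example_fplookup_vexec(struct vop_fplookup_vexec_args *ap)
{
	struct vnode *vp;
	struct example_node *np;
	mode_t mode;

	vp = ap->a_vp;
	/*
	 * The vnode is neither locked nor referenced, so VOP_RECLAIM can
	 * clear v_data at any time. Fetch it atomically and fall back to
	 * EAGAIN, which is always a valid answer; cache_fplookup_failed_vexec
	 * then downgrades to the regular, locked lookup.
	 */
	np = atomic_load_ptr(&vp->v_data);
	if (__predict_false(np == NULL))
		return (EAGAIN);
	/*
	 * A racy read of the permission bits is tolerable: if they change
	 * concurrently, the sequence counter checks in cache_fplookup_impl
	 * reject the result, so both false positives and false negatives
	 * are acceptable here.
	 */
	mode = np->en_mode;
	return (vaccess_vexec_smr(mode, np->en_uid, np->en_gid, ap->a_cred));
}
#endif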