/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	CK_LIST_ENTRY(namecache) nc_hash;/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
		struct negstate nu_neg;/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};
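
/*
 * Illustrative sketch (compiled out): how the n_un union above is meant to
 * be interpreted.  The helper below is hypothetical and not part of this
 * file; NCF_NEGATIVE and the nc_vp/nc_neg accessors are defined further
 * down.
 */
#if 0
static struct vnode *
example_ncp_target(struct namecache *ncp)
{

	/* Negative entries carry LRU state instead of a target vnode. */
	if ((ncp->nc_flag & NCF_NEGATIVE) != 0)
		return (NULL);
	return (ncp->n_un.nu_vp);
}
#endif
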
/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name and dotdot for the directory's
 * parent.
 */
struct namecache_ts {
	struct	timespec nc_time;	/* timespec provided by fs */
	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

#define	nc_vp		n_un.nu_vp
#define	nc_neg		n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40

/*
 * Flags in negstate.neg_flag
 */
#define	NEG_HOT		0x01

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	ncp->nc_flag |= NCF_INVALID;
	atomic_thread_fence_rel();
}

/*
 * Verify validity of an entry.
 *
 * All places which elide locks are supposed to call this after they are
 * done reading from an entry.
 */
static bool
cache_ncp_invalid(struct namecache *ncp)
{

	atomic_thread_fence_acq();
	return ((ncp->nc_flag & NCF_INVALID) != 0);
}

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  The cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry (i.e. for a name that is known NOT to
 * exist), the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	rwlock	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks.  The locking
 * order is lower address first.  Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be write-locked to prevent other threads from seeing the entry.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case this could
 * deadlock.  This is resolved by trylocking: if the trylock fails, the
 * already held lock is dropped, all required locks are taken in address
 * order, and the state is revalidated.
 */
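
/*
 * Illustrative sketch (compiled out) of the deadlock avoidance described
 * above.  The helper is hypothetical; the real routines used by this file
 * (cache_sort_vnodes(), cache_trylock_vnodes(), cache_lock_vnodes()) appear
 * later.
 */
#if 0
static void
example_lock_second_vnodelock(struct mtx *held, struct mtx *wanted)
{

	if (held <= wanted) {
		/* Lower address already held, ordering is preserved. */
		mtx_lock(wanted);
		return;
	}
	if (mtx_trylock(wanted))
		return;
	/*
	 * Trylock failed: back off, take both locks in address order and
	 * let the caller revalidate whatever state it computed earlier.
	 */
	mtx_unlock(held);
	mtx_lock(wanted);
	mtx_lock(held);
}
#endif
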
VFS_SMR_DECLARE;

/*
 * Structures associated with name caching.
 */
#define	NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncpurgeminvnodes;
SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
    "Number of vnodes below which purgevfs ignores the request");
static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static struct mtx __exclusive_cache_line	ncneg_shrink_lock;

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;
static struct neglist ncneg_hot;
static u_long numhotneg;

#define	numneglists (ncneghash + 1)
static u_int __read_mostly	ncneghash;
static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly	ncbuckethash;
static struct rwlock_padalign __read_mostly	*bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly	ncvnodehash;
static struct mtx __read_mostly	*vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big
 * to fit in the small cache.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

#define	CACHE_PATH_CUTOFF	35

static struct namecache *
cache_alloc(int len, int ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	if (ncp == NULL)
		return;
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL && ticksp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	if (tsp != NULL)
		*tsp = ncp_ts->nc_time;
	if (ticksp != NULL)
		*ticksp = ncp_ts->nc_ticks;
}
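
/*
 * Illustrative sketch (compiled out): how a filesystem might feed the
 * timestamps handed back by cache_out_ts().  The function below is
 * hypothetical; real consumers pass their own notion of a validation
 * timestamp to cache_enter_time(), and cache_lookup() later returns it
 * through the tsp/ticksp arguments.
 */
#if 0
static void
example_fs_cache_lookup_result(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp)
{
	struct timespec ts;

	vfs_timestamp(&ts);
	/* Timestamped entries are backed by struct namecache_ts (NCF_TS). */
	cache_enter_time(dvp, vp, cnp, &ts, NULL);
}
#endif
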
hits"); 422 STATNODE_COUNTER(nummiss, "Number of cache misses"); 423 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 424 STATNODE_COUNTER(numposzaps, 425 "Number of cache hits (positive) we do not want to cache"); 426 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 427 STATNODE_COUNTER(numnegzaps, 428 "Number of cache hits (negative) we do not want to cache"); 429 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 430 /* These count for vn_getcwd(), too. */ 431 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 432 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 433 STATNODE_COUNTER(numfullpathfail2, 434 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 435 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 436 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 437 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 438 "Number of successful removals after relocking"); 439 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 440 "Number of times zap_and_exit failed to lock"); 441 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 442 "Number of times zap_and_exit failed to lock"); 443 static long cache_lock_vnodes_cel_3_failures; 444 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 445 "Number of times 3-way vnode locking failed"); 446 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 447 STATNODE_COUNTER(numneg_evicted, 448 "Number of negative entries evicted when adding a new entry"); 449 STATNODE_COUNTER(shrinking_skipped, 450 "Number of times shrinking was already in progress"); 451 452 static void cache_zap_locked(struct namecache *ncp); 453 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 454 char **freebuf, size_t *buflen); 455 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 456 char *buf, char **retbuf, size_t *buflen); 457 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 458 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 459 460 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 461 462 static int cache_yield; 463 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, 464 "Number of times cache called yield"); 465 466 static void __noinline 467 cache_maybe_yield(void) 468 { 469 470 if (should_yield()) { 471 cache_yield++; 472 kern_yield(PRI_USER); 473 } 474 } 475 476 static inline void 477 cache_assert_vlp_locked(struct mtx *vlp) 478 { 479 480 if (vlp != NULL) 481 mtx_assert(vlp, MA_OWNED); 482 } 483 484 static inline void 485 cache_assert_vnode_locked(struct vnode *vp) 486 { 487 struct mtx *vlp; 488 489 vlp = VP2VNODELOCK(vp); 490 cache_assert_vlp_locked(vlp); 491 } 492 493 /* 494 * TODO: With the value stored we can do better than computing the hash based 495 * on the address and the choice of FNV should also be revisisted. 
496 */ 497 static void 498 cache_prehash(struct vnode *vp) 499 { 500 501 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); 502 } 503 504 static uint32_t 505 cache_get_hash(char *name, u_char len, struct vnode *dvp) 506 { 507 508 return (fnv_32_buf(name, len, dvp->v_nchash)); 509 } 510 511 static inline struct rwlock * 512 NCP2BUCKETLOCK(struct namecache *ncp) 513 { 514 uint32_t hash; 515 516 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 517 return (HASH2BUCKETLOCK(hash)); 518 } 519 520 #ifdef INVARIANTS 521 static void 522 cache_assert_bucket_locked(struct namecache *ncp, int mode) 523 { 524 struct rwlock *blp; 525 526 blp = NCP2BUCKETLOCK(ncp); 527 rw_assert(blp, mode); 528 } 529 #else 530 #define cache_assert_bucket_locked(x, y) do { } while (0) 531 #endif 532 533 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 534 static void 535 _cache_sort_vnodes(void **p1, void **p2) 536 { 537 void *tmp; 538 539 MPASS(*p1 != NULL || *p2 != NULL); 540 541 if (*p1 > *p2) { 542 tmp = *p2; 543 *p2 = *p1; 544 *p1 = tmp; 545 } 546 } 547 548 static void 549 cache_lock_all_buckets(void) 550 { 551 u_int i; 552 553 for (i = 0; i < numbucketlocks; i++) 554 rw_wlock(&bucketlocks[i]); 555 } 556 557 static void 558 cache_unlock_all_buckets(void) 559 { 560 u_int i; 561 562 for (i = 0; i < numbucketlocks; i++) 563 rw_wunlock(&bucketlocks[i]); 564 } 565 566 static void 567 cache_lock_all_vnodes(void) 568 { 569 u_int i; 570 571 for (i = 0; i < numvnodelocks; i++) 572 mtx_lock(&vnodelocks[i]); 573 } 574 575 static void 576 cache_unlock_all_vnodes(void) 577 { 578 u_int i; 579 580 for (i = 0; i < numvnodelocks; i++) 581 mtx_unlock(&vnodelocks[i]); 582 } 583 584 static int 585 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 586 { 587 588 cache_sort_vnodes(&vlp1, &vlp2); 589 590 if (vlp1 != NULL) { 591 if (!mtx_trylock(vlp1)) 592 return (EAGAIN); 593 } 594 if (!mtx_trylock(vlp2)) { 595 if (vlp1 != NULL) 596 mtx_unlock(vlp1); 597 return (EAGAIN); 598 } 599 600 return (0); 601 } 602 603 static void 604 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 605 { 606 607 MPASS(vlp1 != NULL || vlp2 != NULL); 608 MPASS(vlp1 <= vlp2); 609 610 if (vlp1 != NULL) 611 mtx_lock(vlp1); 612 if (vlp2 != NULL) 613 mtx_lock(vlp2); 614 } 615 616 static void 617 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 618 { 619 620 MPASS(vlp1 != NULL || vlp2 != NULL); 621 622 if (vlp1 != NULL) 623 mtx_unlock(vlp1); 624 if (vlp2 != NULL) 625 mtx_unlock(vlp2); 626 } 627 628 static int 629 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 630 { 631 struct nchstats snap; 632 633 if (req->oldptr == NULL) 634 return (SYSCTL_OUT(req, 0, sizeof(snap))); 635 636 snap = nchstats; 637 snap.ncs_goodhits = counter_u64_fetch(numposhits); 638 snap.ncs_neghits = counter_u64_fetch(numneghits); 639 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 640 counter_u64_fetch(numnegzaps); 641 snap.ncs_miss = counter_u64_fetch(nummisszap) + 642 counter_u64_fetch(nummiss); 643 644 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 645 } 646 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 647 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 648 "VFS cache effectiveness statistics"); 649 650 #ifdef DIAGNOSTIC 651 /* 652 * Grab an atomic snapshot of the name cache hash chain lengths 653 */ 654 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 655 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 656 "hash table stats"); 657 658 static int 659 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 660 { 661 struct 
nchashhead *ncpp; 662 struct namecache *ncp; 663 int i, error, n_nchash, *cntbuf; 664 665 retry: 666 n_nchash = nchash + 1; /* nchash is max index, not count */ 667 if (req->oldptr == NULL) 668 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 669 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 670 cache_lock_all_buckets(); 671 if (n_nchash != nchash + 1) { 672 cache_unlock_all_buckets(); 673 free(cntbuf, M_TEMP); 674 goto retry; 675 } 676 /* Scan hash tables counting entries */ 677 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 678 CK_LIST_FOREACH(ncp, ncpp, nc_hash) 679 cntbuf[i]++; 680 cache_unlock_all_buckets(); 681 for (error = 0, i = 0; i < n_nchash; i++) 682 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 683 break; 684 free(cntbuf, M_TEMP); 685 return (error); 686 } 687 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 688 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 689 "nchash chain lengths"); 690 691 static int 692 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 693 { 694 int error; 695 struct nchashhead *ncpp; 696 struct namecache *ncp; 697 int n_nchash; 698 int count, maxlength, used, pct; 699 700 if (!req->oldptr) 701 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 702 703 cache_lock_all_buckets(); 704 n_nchash = nchash + 1; /* nchash is max index, not count */ 705 used = 0; 706 maxlength = 0; 707 708 /* Scan hash tables for applicable entries */ 709 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 710 count = 0; 711 CK_LIST_FOREACH(ncp, ncpp, nc_hash) { 712 count++; 713 } 714 if (count) 715 used++; 716 if (maxlength < count) 717 maxlength = count; 718 } 719 n_nchash = nchash + 1; 720 cache_unlock_all_buckets(); 721 pct = (used * 100) / (n_nchash / 100); 722 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 723 if (error) 724 return (error); 725 error = SYSCTL_OUT(req, &used, sizeof(used)); 726 if (error) 727 return (error); 728 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 729 if (error) 730 return (error); 731 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 732 if (error) 733 return (error); 734 return (0); 735 } 736 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 737 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 738 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 739 #endif 740 741 /* 742 * Negative entries management 743 * 744 * A variation of LRU scheme is used. New entries are hashed into one of 745 * numneglists cold lists. Entries get promoted to the hot list on first hit. 746 * 747 * The shrinker will demote hot list head and evict from the cold list in a 748 * round-robin manner. 
749 */ 750 static void 751 cache_negative_init(struct namecache *ncp) 752 { 753 struct negstate *negstate; 754 755 ncp->nc_flag |= NCF_NEGATIVE; 756 negstate = NCP2NEGSTATE(ncp); 757 negstate->neg_flag = 0; 758 } 759 760 static void 761 cache_negative_hit(struct namecache *ncp) 762 { 763 struct neglist *neglist; 764 struct negstate *negstate; 765 766 negstate = NCP2NEGSTATE(ncp); 767 if ((negstate->neg_flag & NEG_HOT) != 0) 768 return; 769 neglist = NCP2NEGLIST(ncp); 770 mtx_lock(&ncneg_hot.nl_lock); 771 mtx_lock(&neglist->nl_lock); 772 if ((negstate->neg_flag & NEG_HOT) == 0) { 773 numhotneg++; 774 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 775 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 776 negstate->neg_flag |= NEG_HOT; 777 } 778 mtx_unlock(&neglist->nl_lock); 779 mtx_unlock(&ncneg_hot.nl_lock); 780 } 781 782 static void 783 cache_negative_insert(struct namecache *ncp) 784 { 785 struct neglist *neglist; 786 787 MPASS(ncp->nc_flag & NCF_NEGATIVE); 788 cache_assert_bucket_locked(ncp, RA_WLOCKED); 789 neglist = NCP2NEGLIST(ncp); 790 mtx_lock(&neglist->nl_lock); 791 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 792 mtx_unlock(&neglist->nl_lock); 793 atomic_add_rel_long(&numneg, 1); 794 } 795 796 static void 797 cache_negative_remove(struct namecache *ncp) 798 { 799 struct neglist *neglist; 800 struct negstate *negstate; 801 bool hot_locked = false; 802 bool list_locked = false; 803 804 cache_assert_bucket_locked(ncp, RA_WLOCKED); 805 neglist = NCP2NEGLIST(ncp); 806 negstate = NCP2NEGSTATE(ncp); 807 if ((negstate->neg_flag & NEG_HOT) != 0) { 808 hot_locked = true; 809 mtx_lock(&ncneg_hot.nl_lock); 810 if ((negstate->neg_flag & NEG_HOT) == 0) { 811 list_locked = true; 812 mtx_lock(&neglist->nl_lock); 813 } 814 } else { 815 list_locked = true; 816 mtx_lock(&neglist->nl_lock); 817 /* 818 * We may be racing against promotion in lockless lookup. 
819 */ 820 if ((negstate->neg_flag & NEG_HOT) != 0) { 821 mtx_unlock(&neglist->nl_lock); 822 hot_locked = true; 823 mtx_lock(&ncneg_hot.nl_lock); 824 mtx_lock(&neglist->nl_lock); 825 } 826 } 827 if ((negstate->neg_flag & NEG_HOT) != 0) { 828 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); 829 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 830 numhotneg--; 831 } else { 832 mtx_assert(&neglist->nl_lock, MA_OWNED); 833 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 834 } 835 if (list_locked) 836 mtx_unlock(&neglist->nl_lock); 837 if (hot_locked) 838 mtx_unlock(&ncneg_hot.nl_lock); 839 atomic_subtract_rel_long(&numneg, 1); 840 } 841 842 static void 843 cache_negative_shrink_select(struct namecache **ncpp, 844 struct neglist **neglistpp) 845 { 846 struct neglist *neglist; 847 struct namecache *ncp; 848 static u_int cycle; 849 u_int i; 850 851 *ncpp = ncp = NULL; 852 853 for (i = 0; i < numneglists; i++) { 854 neglist = &neglists[(cycle + i) % numneglists]; 855 if (TAILQ_FIRST(&neglist->nl_list) == NULL) 856 continue; 857 mtx_lock(&neglist->nl_lock); 858 ncp = TAILQ_FIRST(&neglist->nl_list); 859 if (ncp != NULL) 860 break; 861 mtx_unlock(&neglist->nl_lock); 862 } 863 864 *neglistpp = neglist; 865 *ncpp = ncp; 866 cycle++; 867 } 868 869 static void 870 cache_negative_zap_one(void) 871 { 872 struct namecache *ncp, *ncp2; 873 struct neglist *neglist; 874 struct negstate *negstate; 875 struct mtx *dvlp; 876 struct rwlock *blp; 877 878 if (mtx_owner(&ncneg_shrink_lock) != NULL || 879 !mtx_trylock(&ncneg_shrink_lock)) { 880 counter_u64_add(shrinking_skipped, 1); 881 return; 882 } 883 884 mtx_lock(&ncneg_hot.nl_lock); 885 ncp = TAILQ_FIRST(&ncneg_hot.nl_list); 886 if (ncp != NULL) { 887 neglist = NCP2NEGLIST(ncp); 888 negstate = NCP2NEGSTATE(ncp); 889 mtx_lock(&neglist->nl_lock); 890 MPASS((negstate->neg_flag & NEG_HOT) != 0); 891 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 892 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 893 negstate->neg_flag &= ~NEG_HOT; 894 numhotneg--; 895 mtx_unlock(&neglist->nl_lock); 896 } 897 mtx_unlock(&ncneg_hot.nl_lock); 898 899 cache_negative_shrink_select(&ncp, &neglist); 900 901 mtx_unlock(&ncneg_shrink_lock); 902 if (ncp == NULL) 903 return; 904 905 MPASS(ncp->nc_flag & NCF_NEGATIVE); 906 dvlp = VP2VNODELOCK(ncp->nc_dvp); 907 blp = NCP2BUCKETLOCK(ncp); 908 mtx_unlock(&neglist->nl_lock); 909 mtx_lock(dvlp); 910 rw_wlock(blp); 911 /* 912 * Enter SMR to safely check the negative list. 913 * Even if the found pointer matches, the entry may now be reallocated 914 * and used by a different vnode. 915 */ 916 vfs_smr_enter(); 917 ncp2 = TAILQ_FIRST(&neglist->nl_list); 918 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || 919 blp != NCP2BUCKETLOCK(ncp2)) { 920 vfs_smr_exit(); 921 ncp = NULL; 922 } else { 923 vfs_smr_exit(); 924 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp, 925 ncp->nc_name); 926 cache_zap_locked(ncp); 927 counter_u64_add(numneg_evicted, 1); 928 } 929 rw_wunlock(blp); 930 mtx_unlock(dvlp); 931 cache_free(ncp); 932 } 933 934 /* 935 * cache_zap_locked(): 936 * 937 * Removes a namecache entry from cache, whether it contains an actual 938 * pointer to a vnode or if it is just a negative cache entry. 939 */ 940 static void 941 cache_zap_locked(struct namecache *ncp) 942 { 943 944 if (!(ncp->nc_flag & NCF_NEGATIVE)) 945 cache_assert_vnode_locked(ncp->nc_vp); 946 cache_assert_vnode_locked(ncp->nc_dvp); 947 cache_assert_bucket_locked(ncp, RA_WLOCKED); 948 949 CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, 950 (ncp->nc_flag & NCF_NEGATIVE) ? 
NULL : ncp->nc_vp); 951 952 cache_ncp_invalidate(ncp); 953 954 CK_LIST_REMOVE(ncp, nc_hash); 955 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 956 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 957 ncp->nc_name, ncp->nc_vp); 958 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 959 if (ncp == ncp->nc_vp->v_cache_dd) 960 ncp->nc_vp->v_cache_dd = NULL; 961 } else { 962 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 963 ncp->nc_name); 964 cache_negative_remove(ncp); 965 } 966 if (ncp->nc_flag & NCF_ISDOTDOT) { 967 if (ncp == ncp->nc_dvp->v_cache_dd) 968 ncp->nc_dvp->v_cache_dd = NULL; 969 } else { 970 LIST_REMOVE(ncp, nc_src); 971 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 972 ncp->nc_flag |= NCF_DVDROP; 973 counter_u64_add(numcachehv, -1); 974 } 975 } 976 atomic_subtract_rel_long(&numcache, 1); 977 } 978 979 static void 980 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 981 { 982 struct rwlock *blp; 983 984 MPASS(ncp->nc_dvp == vp); 985 MPASS(ncp->nc_flag & NCF_NEGATIVE); 986 cache_assert_vnode_locked(vp); 987 988 blp = NCP2BUCKETLOCK(ncp); 989 rw_wlock(blp); 990 cache_zap_locked(ncp); 991 rw_wunlock(blp); 992 } 993 994 static bool 995 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 996 struct mtx **vlpp) 997 { 998 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 999 struct rwlock *blp; 1000 1001 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1002 cache_assert_vnode_locked(vp); 1003 1004 if (ncp->nc_flag & NCF_NEGATIVE) { 1005 if (*vlpp != NULL) { 1006 mtx_unlock(*vlpp); 1007 *vlpp = NULL; 1008 } 1009 cache_zap_negative_locked_vnode_kl(ncp, vp); 1010 return (true); 1011 } 1012 1013 pvlp = VP2VNODELOCK(vp); 1014 blp = NCP2BUCKETLOCK(ncp); 1015 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1016 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1017 1018 if (*vlpp == vlp1 || *vlpp == vlp2) { 1019 to_unlock = *vlpp; 1020 *vlpp = NULL; 1021 } else { 1022 if (*vlpp != NULL) { 1023 mtx_unlock(*vlpp); 1024 *vlpp = NULL; 1025 } 1026 cache_sort_vnodes(&vlp1, &vlp2); 1027 if (vlp1 == pvlp) { 1028 mtx_lock(vlp2); 1029 to_unlock = vlp2; 1030 } else { 1031 if (!mtx_trylock(vlp1)) 1032 goto out_relock; 1033 to_unlock = vlp1; 1034 } 1035 } 1036 rw_wlock(blp); 1037 cache_zap_locked(ncp); 1038 rw_wunlock(blp); 1039 if (to_unlock != NULL) 1040 mtx_unlock(to_unlock); 1041 return (true); 1042 1043 out_relock: 1044 mtx_unlock(vlp2); 1045 mtx_lock(vlp1); 1046 mtx_lock(vlp2); 1047 MPASS(*vlpp == NULL); 1048 *vlpp = vlp1; 1049 return (false); 1050 } 1051 1052 static int __noinline 1053 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) 1054 { 1055 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1056 struct rwlock *blp; 1057 int error = 0; 1058 1059 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1060 cache_assert_vnode_locked(vp); 1061 1062 pvlp = VP2VNODELOCK(vp); 1063 if (ncp->nc_flag & NCF_NEGATIVE) { 1064 cache_zap_negative_locked_vnode_kl(ncp, vp); 1065 goto out; 1066 } 1067 1068 blp = NCP2BUCKETLOCK(ncp); 1069 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1070 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1071 cache_sort_vnodes(&vlp1, &vlp2); 1072 if (vlp1 == pvlp) { 1073 mtx_lock(vlp2); 1074 to_unlock = vlp2; 1075 } else { 1076 if (!mtx_trylock(vlp1)) { 1077 error = EAGAIN; 1078 goto out; 1079 } 1080 to_unlock = vlp1; 1081 } 1082 rw_wlock(blp); 1083 cache_zap_locked(ncp); 1084 rw_wunlock(blp); 1085 mtx_unlock(to_unlock); 1086 out: 1087 mtx_unlock(pvlp); 1088 return (error); 1089 } 1090 1091 /* 1092 * If trylocking failed we can get here. 
We know enough to take all needed locks 1093 * in the right order and re-lookup the entry. 1094 */ 1095 static int 1096 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1097 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1098 struct rwlock *blp) 1099 { 1100 struct namecache *rncp; 1101 1102 cache_assert_bucket_locked(ncp, RA_UNLOCKED); 1103 1104 cache_sort_vnodes(&dvlp, &vlp); 1105 cache_lock_vnodes(dvlp, vlp); 1106 rw_wlock(blp); 1107 CK_LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1108 if (rncp == ncp && rncp->nc_dvp == dvp && 1109 rncp->nc_nlen == cnp->cn_namelen && 1110 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1111 break; 1112 } 1113 if (rncp != NULL) { 1114 cache_zap_locked(rncp); 1115 rw_wunlock(blp); 1116 cache_unlock_vnodes(dvlp, vlp); 1117 counter_u64_add(zap_and_exit_bucket_relock_success, 1); 1118 return (0); 1119 } 1120 1121 rw_wunlock(blp); 1122 cache_unlock_vnodes(dvlp, vlp); 1123 return (EAGAIN); 1124 } 1125 1126 static int __noinline 1127 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1128 uint32_t hash, struct rwlock *blp) 1129 { 1130 struct mtx *dvlp, *vlp; 1131 struct vnode *dvp; 1132 1133 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1134 1135 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1136 vlp = NULL; 1137 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1138 vlp = VP2VNODELOCK(ncp->nc_vp); 1139 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1140 cache_zap_locked(ncp); 1141 rw_wunlock(blp); 1142 cache_unlock_vnodes(dvlp, vlp); 1143 return (0); 1144 } 1145 1146 dvp = ncp->nc_dvp; 1147 rw_wunlock(blp); 1148 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1149 } 1150 1151 static int __noinline 1152 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1153 uint32_t hash, struct rwlock *blp) 1154 { 1155 struct mtx *dvlp, *vlp; 1156 struct vnode *dvp; 1157 1158 cache_assert_bucket_locked(ncp, RA_RLOCKED); 1159 1160 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1161 vlp = NULL; 1162 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1163 vlp = VP2VNODELOCK(ncp->nc_vp); 1164 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1165 rw_runlock(blp); 1166 rw_wlock(blp); 1167 cache_zap_locked(ncp); 1168 rw_wunlock(blp); 1169 cache_unlock_vnodes(dvlp, vlp); 1170 return (0); 1171 } 1172 1173 dvp = ncp->nc_dvp; 1174 rw_runlock(blp); 1175 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1176 } 1177 1178 static int 1179 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, 1180 struct mtx **vlpp1, struct mtx **vlpp2) 1181 { 1182 struct mtx *dvlp, *vlp; 1183 1184 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1185 1186 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1187 vlp = NULL; 1188 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1189 vlp = VP2VNODELOCK(ncp->nc_vp); 1190 cache_sort_vnodes(&dvlp, &vlp); 1191 1192 if (*vlpp1 == dvlp && *vlpp2 == vlp) { 1193 cache_zap_locked(ncp); 1194 cache_unlock_vnodes(dvlp, vlp); 1195 *vlpp1 = NULL; 1196 *vlpp2 = NULL; 1197 return (0); 1198 } 1199 1200 if (*vlpp1 != NULL) 1201 mtx_unlock(*vlpp1); 1202 if (*vlpp2 != NULL) 1203 mtx_unlock(*vlpp2); 1204 *vlpp1 = NULL; 1205 *vlpp2 = NULL; 1206 1207 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1208 cache_zap_locked(ncp); 1209 cache_unlock_vnodes(dvlp, vlp); 1210 return (0); 1211 } 1212 1213 rw_wunlock(blp); 1214 *vlpp1 = dvlp; 1215 *vlpp2 = vlp; 1216 if (*vlpp1 != NULL) 1217 mtx_lock(*vlpp1); 1218 mtx_lock(*vlpp2); 1219 rw_wlock(blp); 1220 return (EAGAIN); 1221 } 1222 1223 static void 1224 
cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) 1225 { 1226 1227 if (blp != NULL) { 1228 rw_runlock(blp); 1229 } else { 1230 mtx_unlock(vlp); 1231 } 1232 } 1233 1234 static int __noinline 1235 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1236 struct timespec *tsp, int *ticksp) 1237 { 1238 int ltype; 1239 1240 *vpp = dvp; 1241 CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", 1242 dvp, cnp->cn_nameptr); 1243 counter_u64_add(dothits, 1); 1244 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1245 if (tsp != NULL) 1246 timespecclear(tsp); 1247 if (ticksp != NULL) 1248 *ticksp = ticks; 1249 vrefact(*vpp); 1250 /* 1251 * When we lookup "." we still can be asked to lock it 1252 * differently... 1253 */ 1254 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1255 if (ltype != VOP_ISLOCKED(*vpp)) { 1256 if (ltype == LK_EXCLUSIVE) { 1257 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1258 if (VN_IS_DOOMED((*vpp))) { 1259 /* forced unmount */ 1260 vrele(*vpp); 1261 *vpp = NULL; 1262 return (ENOENT); 1263 } 1264 } else 1265 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1266 } 1267 return (-1); 1268 } 1269 1270 static __noinline int 1271 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp, 1272 struct componentname *cnp, struct timespec *tsp, int *ticksp) 1273 { 1274 struct namecache *ncp; 1275 struct rwlock *blp; 1276 struct mtx *dvlp, *dvlp2; 1277 uint32_t hash; 1278 int error; 1279 1280 if (cnp->cn_namelen == 2 && 1281 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1282 counter_u64_add(dotdothits, 1); 1283 dvlp = VP2VNODELOCK(dvp); 1284 dvlp2 = NULL; 1285 mtx_lock(dvlp); 1286 retry_dotdot: 1287 ncp = dvp->v_cache_dd; 1288 if (ncp == NULL) { 1289 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1290 "..", NULL); 1291 mtx_unlock(dvlp); 1292 if (dvlp2 != NULL) 1293 mtx_unlock(dvlp2); 1294 return (0); 1295 } 1296 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1297 if (ncp->nc_dvp != dvp) 1298 panic("dvp %p v_cache_dd %p\n", dvp, ncp); 1299 if (!cache_zap_locked_vnode_kl2(ncp, 1300 dvp, &dvlp2)) 1301 goto retry_dotdot; 1302 MPASS(dvp->v_cache_dd == NULL); 1303 mtx_unlock(dvlp); 1304 if (dvlp2 != NULL) 1305 mtx_unlock(dvlp2); 1306 cache_free(ncp); 1307 } else { 1308 dvp->v_cache_dd = NULL; 1309 mtx_unlock(dvlp); 1310 if (dvlp2 != NULL) 1311 mtx_unlock(dvlp2); 1312 } 1313 return (0); 1314 } 1315 1316 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1317 blp = HASH2BUCKETLOCK(hash); 1318 retry: 1319 if (CK_LIST_EMPTY(NCHHASH(hash))) 1320 goto out_no_entry; 1321 1322 rw_wlock(blp); 1323 1324 CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1325 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1326 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1327 break; 1328 } 1329 1330 /* We failed to find an entry */ 1331 if (ncp == NULL) { 1332 rw_wunlock(blp); 1333 goto out_no_entry; 1334 } 1335 1336 error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp); 1337 if (__predict_false(error != 0)) { 1338 zap_and_exit_bucket_fail++; 1339 cache_maybe_yield(); 1340 goto retry; 1341 } 1342 counter_u64_add(numposzaps, 1); 1343 cache_free(ncp); 1344 return (0); 1345 out_no_entry: 1346 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); 1347 counter_u64_add(nummisszap, 1); 1348 return (0); 1349 } 1350 1351 /** 1352 * Lookup a name in the name cache 1353 * 1354 * # Arguments 1355 * 1356 * - dvp: Parent directory in which to search. 1357 * - vpp: Return argument. Will contain desired vnode on cache hit. 
1358 * - cnp: Parameters of the name search. The most interesting bits of 1359 * the cn_flags field have the following meanings: 1360 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1361 * it up. 1362 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1363 * - tsp: Return storage for cache timestamp. On a successful (positive 1364 * or negative) lookup, tsp will be filled with any timespec that 1365 * was stored when this cache entry was created. However, it will 1366 * be clear for "." entries. 1367 * - ticks: Return storage for alternate cache timestamp. On a successful 1368 * (positive or negative) lookup, it will contain the ticks value 1369 * that was current when the cache entry was created, unless cnp 1370 * was ".". 1371 * 1372 * # Returns 1373 * 1374 * - -1: A positive cache hit. vpp will contain the desired vnode. 1375 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1376 * to a forced unmount. vpp will not be modified. If the entry 1377 * is a whiteout, then the ISWHITEOUT flag will be set in 1378 * cnp->cn_flags. 1379 * - 0: A cache miss. vpp will not be modified. 1380 * 1381 * # Locking 1382 * 1383 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1384 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1385 * lock is not recursively acquired. 1386 */ 1387 int 1388 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1389 struct timespec *tsp, int *ticksp) 1390 { 1391 struct namecache_ts *ncp_ts; 1392 struct namecache *ncp; 1393 struct negstate *negstate; 1394 struct rwlock *blp; 1395 struct mtx *dvlp; 1396 uint32_t hash; 1397 enum vgetstate vs; 1398 int error, ltype; 1399 bool try_smr, doing_smr, whiteout; 1400 1401 #ifdef DEBUG_CACHE 1402 if (__predict_false(!doingcache)) { 1403 cnp->cn_flags &= ~MAKEENTRY; 1404 return (0); 1405 } 1406 #endif 1407 1408 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) 1409 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1410 1411 if ((cnp->cn_flags & MAKEENTRY) == 0) 1412 return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp)); 1413 1414 try_smr = true; 1415 if (cnp->cn_nameiop == CREATE) 1416 try_smr = false; 1417 retry: 1418 doing_smr = false; 1419 blp = NULL; 1420 dvlp = NULL; 1421 error = 0; 1422 if (cnp->cn_namelen == 2 && 1423 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1424 counter_u64_add(dotdothits, 1); 1425 dvlp = VP2VNODELOCK(dvp); 1426 mtx_lock(dvlp); 1427 ncp = dvp->v_cache_dd; 1428 if (ncp == NULL) { 1429 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1430 "..", NULL); 1431 mtx_unlock(dvlp); 1432 return (0); 1433 } 1434 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1435 if (ncp->nc_flag & NCF_NEGATIVE) 1436 *vpp = NULL; 1437 else 1438 *vpp = ncp->nc_vp; 1439 } else 1440 *vpp = ncp->nc_dvp; 1441 /* Return failure if negative entry was found. 
*/ 1442 if (*vpp == NULL) 1443 goto negative_success; 1444 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", 1445 dvp, cnp->cn_nameptr, *vpp); 1446 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", 1447 *vpp); 1448 cache_out_ts(ncp, tsp, ticksp); 1449 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1450 NCF_DTS && tsp != NULL) { 1451 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1452 *tsp = ncp_ts->nc_dotdottime; 1453 } 1454 goto success; 1455 } 1456 1457 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1458 retry_hashed: 1459 if (try_smr) { 1460 vfs_smr_enter(); 1461 doing_smr = true; 1462 try_smr = false; 1463 } else { 1464 blp = HASH2BUCKETLOCK(hash); 1465 rw_rlock(blp); 1466 } 1467 1468 CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1469 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1470 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1471 break; 1472 } 1473 1474 /* We failed to find an entry */ 1475 if (__predict_false(ncp == NULL)) { 1476 if (doing_smr) 1477 vfs_smr_exit(); 1478 else 1479 rw_runlock(blp); 1480 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1481 NULL); 1482 counter_u64_add(nummiss, 1); 1483 return (0); 1484 } 1485 1486 if (ncp->nc_flag & NCF_NEGATIVE) 1487 goto negative_success; 1488 1489 /* We found a "positive" match, return the vnode */ 1490 counter_u64_add(numposhits, 1); 1491 *vpp = ncp->nc_vp; 1492 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", 1493 dvp, cnp->cn_nameptr, *vpp, ncp); 1494 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, 1495 *vpp); 1496 cache_out_ts(ncp, tsp, ticksp); 1497 success: 1498 /* 1499 * On success we return a locked and ref'd vnode as per the lookup 1500 * protocol. 1501 */ 1502 MPASS(dvp != *vpp); 1503 ltype = 0; /* silence gcc warning */ 1504 if (cnp->cn_flags & ISDOTDOT) { 1505 ltype = VOP_ISLOCKED(dvp); 1506 VOP_UNLOCK(dvp); 1507 } 1508 if (doing_smr) { 1509 if (cache_ncp_invalid(ncp)) { 1510 vfs_smr_exit(); 1511 *vpp = NULL; 1512 goto retry; 1513 } 1514 vs = vget_prep_smr(*vpp); 1515 vfs_smr_exit(); 1516 if (vs == VGET_NONE) { 1517 *vpp = NULL; 1518 goto retry; 1519 } 1520 } else { 1521 vs = vget_prep(*vpp); 1522 cache_lookup_unlock(blp, dvlp); 1523 } 1524 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1525 if (cnp->cn_flags & ISDOTDOT) { 1526 vn_lock(dvp, ltype | LK_RETRY); 1527 if (VN_IS_DOOMED(dvp)) { 1528 if (error == 0) 1529 vput(*vpp); 1530 *vpp = NULL; 1531 return (ENOENT); 1532 } 1533 } 1534 if (error) { 1535 *vpp = NULL; 1536 goto retry; 1537 } 1538 if ((cnp->cn_flags & ISLASTCN) && 1539 (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { 1540 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); 1541 } 1542 return (-1); 1543 1544 negative_success: 1545 /* We found a negative match, and want to create it, so purge */ 1546 if (cnp->cn_nameiop == CREATE) { 1547 MPASS(!doing_smr); 1548 counter_u64_add(numnegzaps, 1); 1549 goto zap_and_exit; 1550 } 1551 1552 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 1553 cache_out_ts(ncp, tsp, ticksp); 1554 counter_u64_add(numneghits, 1); 1555 whiteout = (ncp->nc_flag & NCF_WHITE); 1556 1557 if (doing_smr) { 1558 /* 1559 * We need to take locks to promote an entry. 
1560 */ 1561 negstate = NCP2NEGSTATE(ncp); 1562 if ((negstate->neg_flag & NEG_HOT) == 0 || 1563 cache_ncp_invalid(ncp)) { 1564 vfs_smr_exit(); 1565 doing_smr = false; 1566 goto retry_hashed; 1567 } 1568 vfs_smr_exit(); 1569 } else { 1570 cache_negative_hit(ncp); 1571 cache_lookup_unlock(blp, dvlp); 1572 } 1573 if (whiteout) 1574 cnp->cn_flags |= ISWHITEOUT; 1575 return (ENOENT); 1576 1577 zap_and_exit: 1578 MPASS(!doing_smr); 1579 if (blp != NULL) 1580 error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp); 1581 else 1582 error = cache_zap_locked_vnode(ncp, dvp); 1583 if (__predict_false(error != 0)) { 1584 zap_and_exit_bucket_fail2++; 1585 cache_maybe_yield(); 1586 goto retry; 1587 } 1588 cache_free(ncp); 1589 return (0); 1590 } 1591 1592 struct celockstate { 1593 struct mtx *vlp[3]; 1594 struct rwlock *blp[2]; 1595 }; 1596 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1597 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1598 1599 static inline void 1600 cache_celockstate_init(struct celockstate *cel) 1601 { 1602 1603 bzero(cel, sizeof(*cel)); 1604 } 1605 1606 static void 1607 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1608 struct vnode *dvp) 1609 { 1610 struct mtx *vlp1, *vlp2; 1611 1612 MPASS(cel->vlp[0] == NULL); 1613 MPASS(cel->vlp[1] == NULL); 1614 MPASS(cel->vlp[2] == NULL); 1615 1616 MPASS(vp != NULL || dvp != NULL); 1617 1618 vlp1 = VP2VNODELOCK(vp); 1619 vlp2 = VP2VNODELOCK(dvp); 1620 cache_sort_vnodes(&vlp1, &vlp2); 1621 1622 if (vlp1 != NULL) { 1623 mtx_lock(vlp1); 1624 cel->vlp[0] = vlp1; 1625 } 1626 mtx_lock(vlp2); 1627 cel->vlp[1] = vlp2; 1628 } 1629 1630 static void 1631 cache_unlock_vnodes_cel(struct celockstate *cel) 1632 { 1633 1634 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1635 1636 if (cel->vlp[0] != NULL) 1637 mtx_unlock(cel->vlp[0]); 1638 if (cel->vlp[1] != NULL) 1639 mtx_unlock(cel->vlp[1]); 1640 if (cel->vlp[2] != NULL) 1641 mtx_unlock(cel->vlp[2]); 1642 } 1643 1644 static bool 1645 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1646 { 1647 struct mtx *vlp; 1648 bool ret; 1649 1650 cache_assert_vlp_locked(cel->vlp[0]); 1651 cache_assert_vlp_locked(cel->vlp[1]); 1652 MPASS(cel->vlp[2] == NULL); 1653 1654 MPASS(vp != NULL); 1655 vlp = VP2VNODELOCK(vp); 1656 1657 ret = true; 1658 if (vlp >= cel->vlp[1]) { 1659 mtx_lock(vlp); 1660 } else { 1661 if (mtx_trylock(vlp)) 1662 goto out; 1663 cache_lock_vnodes_cel_3_failures++; 1664 cache_unlock_vnodes_cel(cel); 1665 if (vlp < cel->vlp[0]) { 1666 mtx_lock(vlp); 1667 mtx_lock(cel->vlp[0]); 1668 mtx_lock(cel->vlp[1]); 1669 } else { 1670 if (cel->vlp[0] != NULL) 1671 mtx_lock(cel->vlp[0]); 1672 mtx_lock(vlp); 1673 mtx_lock(cel->vlp[1]); 1674 } 1675 ret = false; 1676 } 1677 out: 1678 cel->vlp[2] = vlp; 1679 return (ret); 1680 } 1681 1682 static void 1683 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, 1684 struct rwlock *blp2) 1685 { 1686 1687 MPASS(cel->blp[0] == NULL); 1688 MPASS(cel->blp[1] == NULL); 1689 1690 cache_sort_vnodes(&blp1, &blp2); 1691 1692 if (blp1 != NULL) { 1693 rw_wlock(blp1); 1694 cel->blp[0] = blp1; 1695 } 1696 rw_wlock(blp2); 1697 cel->blp[1] = blp2; 1698 } 1699 1700 static void 1701 cache_unlock_buckets_cel(struct celockstate *cel) 1702 { 1703 1704 if (cel->blp[0] != NULL) 1705 rw_wunlock(cel->blp[0]); 1706 rw_wunlock(cel->blp[1]); 1707 } 1708 1709 /* 1710 * Lock part of the cache affected by the insertion. 1711 * 1712 * This means vnodelocks for dvp, vp and the relevant bucketlock. 
1713 * However, insertion can result in removal of an old entry. In this 1714 * case we have an additional vnode and bucketlock pair to lock. If the 1715 * entry is negative, ncelock is locked instead of the vnode. 1716 * 1717 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1718 * preserving the locking order (smaller address first). 1719 */ 1720 static void 1721 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1722 uint32_t hash) 1723 { 1724 struct namecache *ncp; 1725 struct rwlock *blps[2]; 1726 1727 blps[0] = HASH2BUCKETLOCK(hash); 1728 for (;;) { 1729 blps[1] = NULL; 1730 cache_lock_vnodes_cel(cel, dvp, vp); 1731 if (vp == NULL || vp->v_type != VDIR) 1732 break; 1733 ncp = vp->v_cache_dd; 1734 if (ncp == NULL) 1735 break; 1736 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1737 break; 1738 MPASS(ncp->nc_dvp == vp); 1739 blps[1] = NCP2BUCKETLOCK(ncp); 1740 if (ncp->nc_flag & NCF_NEGATIVE) 1741 break; 1742 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1743 break; 1744 /* 1745 * All vnodes got re-locked. Re-validate the state and if 1746 * nothing changed we are done. Otherwise restart. 1747 */ 1748 if (ncp == vp->v_cache_dd && 1749 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1750 blps[1] == NCP2BUCKETLOCK(ncp) && 1751 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1752 break; 1753 cache_unlock_vnodes_cel(cel); 1754 cel->vlp[0] = NULL; 1755 cel->vlp[1] = NULL; 1756 cel->vlp[2] = NULL; 1757 } 1758 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1759 } 1760 1761 static void 1762 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1763 uint32_t hash) 1764 { 1765 struct namecache *ncp; 1766 struct rwlock *blps[2]; 1767 1768 blps[0] = HASH2BUCKETLOCK(hash); 1769 for (;;) { 1770 blps[1] = NULL; 1771 cache_lock_vnodes_cel(cel, dvp, vp); 1772 ncp = dvp->v_cache_dd; 1773 if (ncp == NULL) 1774 break; 1775 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1776 break; 1777 MPASS(ncp->nc_dvp == dvp); 1778 blps[1] = NCP2BUCKETLOCK(ncp); 1779 if (ncp->nc_flag & NCF_NEGATIVE) 1780 break; 1781 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1782 break; 1783 if (ncp == dvp->v_cache_dd && 1784 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1785 blps[1] == NCP2BUCKETLOCK(ncp) && 1786 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1787 break; 1788 cache_unlock_vnodes_cel(cel); 1789 cel->vlp[0] = NULL; 1790 cel->vlp[1] = NULL; 1791 cel->vlp[2] = NULL; 1792 } 1793 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1794 } 1795 1796 static void 1797 cache_enter_unlock(struct celockstate *cel) 1798 { 1799 1800 cache_unlock_buckets_cel(cel); 1801 cache_unlock_vnodes_cel(cel); 1802 } 1803 1804 static void __noinline 1805 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 1806 struct componentname *cnp) 1807 { 1808 struct celockstate cel; 1809 struct namecache *ncp; 1810 uint32_t hash; 1811 int len; 1812 1813 if (dvp->v_cache_dd == NULL) 1814 return; 1815 len = cnp->cn_namelen; 1816 cache_celockstate_init(&cel); 1817 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1818 cache_enter_lock_dd(&cel, dvp, vp, hash); 1819 ncp = dvp->v_cache_dd; 1820 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 1821 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 1822 cache_zap_locked(ncp); 1823 } else { 1824 ncp = NULL; 1825 } 1826 dvp->v_cache_dd = NULL; 1827 cache_enter_unlock(&cel); 1828 cache_free(ncp); 1829 } 1830 1831 /* 1832 * Add an entry to the cache. 
1833 */ 1834 void 1835 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 1836 struct timespec *tsp, struct timespec *dtsp) 1837 { 1838 struct celockstate cel; 1839 struct namecache *ncp, *n2, *ndd; 1840 struct namecache_ts *ncp_ts, *n2_ts; 1841 struct nchashhead *ncpp; 1842 uint32_t hash; 1843 int flag; 1844 int len; 1845 u_long lnumcache; 1846 1847 CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); 1848 VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp, 1849 ("cache_enter: Adding a doomed vnode")); 1850 VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp, 1851 ("cache_enter: Doomed vnode used as src")); 1852 1853 #ifdef DEBUG_CACHE 1854 if (__predict_false(!doingcache)) 1855 return; 1856 #endif 1857 1858 flag = 0; 1859 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1860 if (cnp->cn_namelen == 1) 1861 return; 1862 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 1863 cache_enter_dotdot_prep(dvp, vp, cnp); 1864 flag = NCF_ISDOTDOT; 1865 } 1866 } 1867 1868 /* 1869 * Avoid blowout in namecache entries. 1870 */ 1871 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1872 if (__predict_false(lnumcache >= ncsize)) { 1873 atomic_add_long(&numcache, -1); 1874 counter_u64_add(numdrops, 1); 1875 return; 1876 } 1877 1878 cache_celockstate_init(&cel); 1879 ndd = NULL; 1880 ncp_ts = NULL; 1881 1882 /* 1883 * Calculate the hash key and setup as much of the new 1884 * namecache entry as possible before acquiring the lock. 1885 */ 1886 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1887 ncp->nc_flag = flag; 1888 ncp->nc_vp = vp; 1889 if (vp == NULL) 1890 cache_negative_init(ncp); 1891 ncp->nc_dvp = dvp; 1892 if (tsp != NULL) { 1893 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1894 ncp_ts->nc_time = *tsp; 1895 ncp_ts->nc_ticks = ticks; 1896 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1897 if (dtsp != NULL) { 1898 ncp_ts->nc_dotdottime = *dtsp; 1899 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1900 } 1901 } 1902 len = ncp->nc_nlen = cnp->cn_namelen; 1903 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1904 strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1); 1905 cache_enter_lock(&cel, dvp, vp, hash); 1906 1907 /* 1908 * See if this vnode or negative entry is already in the cache 1909 * with this name. This can happen with concurrent lookups of 1910 * the same path name. 1911 */ 1912 ncpp = NCHHASH(hash); 1913 CK_LIST_FOREACH(n2, ncpp, nc_hash) { 1914 if (n2->nc_dvp == dvp && 1915 n2->nc_nlen == cnp->cn_namelen && 1916 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1917 if (tsp != NULL) { 1918 KASSERT((n2->nc_flag & NCF_TS) != 0, 1919 ("no NCF_TS")); 1920 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1921 n2_ts->nc_time = ncp_ts->nc_time; 1922 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1923 if (dtsp != NULL) { 1924 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1925 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1926 } 1927 } 1928 goto out_unlock_free; 1929 } 1930 } 1931 1932 if (flag == NCF_ISDOTDOT) { 1933 /* 1934 * See if we are trying to add .. entry, but some other lookup 1935 * has populated v_cache_dd pointer already. 1936 */ 1937 if (dvp->v_cache_dd != NULL) 1938 goto out_unlock_free; 1939 KASSERT(vp == NULL || vp->v_type == VDIR, 1940 ("wrong vnode type %p", vp)); 1941 dvp->v_cache_dd = ncp; 1942 } 1943 1944 if (vp != NULL) { 1945 if (vp->v_type == VDIR) { 1946 if (flag != NCF_ISDOTDOT) { 1947 /* 1948 * For this case, the cache entry maps both the 1949 * directory name in it and the name ".." for the 1950 * directory's parent. 
1951 */ 1952 if ((ndd = vp->v_cache_dd) != NULL) { 1953 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 1954 cache_zap_locked(ndd); 1955 else 1956 ndd = NULL; 1957 } 1958 vp->v_cache_dd = ncp; 1959 } 1960 } else { 1961 vp->v_cache_dd = NULL; 1962 } 1963 } 1964 1965 if (flag != NCF_ISDOTDOT) { 1966 if (LIST_EMPTY(&dvp->v_cache_src)) { 1967 vhold(dvp); 1968 counter_u64_add(numcachehv, 1); 1969 } 1970 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 1971 } 1972 1973 /* 1974 * If the entry is "negative", we place it into the 1975 * "negative" cache queue, otherwise, we place it into the 1976 * destination vnode's cache entries queue. 1977 */ 1978 if (vp != NULL) { 1979 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 1980 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 1981 vp); 1982 } else { 1983 if (cnp->cn_flags & ISWHITEOUT) 1984 ncp->nc_flag |= NCF_WHITE; 1985 cache_negative_insert(ncp); 1986 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 1987 ncp->nc_name); 1988 } 1989 1990 atomic_thread_fence_rel(); 1991 /* 1992 * Insert the new namecache entry into the appropriate chain 1993 * within the cache entries table. 1994 */ 1995 CK_LIST_INSERT_HEAD(ncpp, ncp, nc_hash); 1996 1997 cache_enter_unlock(&cel); 1998 if (numneg * ncnegfactor > lnumcache) 1999 cache_negative_zap_one(); 2000 cache_free(ndd); 2001 return; 2002 out_unlock_free: 2003 cache_enter_unlock(&cel); 2004 cache_free(ncp); 2005 return; 2006 } 2007 2008 static u_int 2009 cache_roundup_2(u_int val) 2010 { 2011 u_int res; 2012 2013 for (res = 1; res <= val; res <<= 1) 2014 continue; 2015 2016 return (res); 2017 } 2018 2019 /* 2020 * Name cache initialization, from vfs_init() when we are booting 2021 */ 2022 static void 2023 nchinit(void *dummy __unused) 2024 { 2025 u_int i; 2026 2027 cache_zone_small = uma_zcreate("S VFS Cache", 2028 sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, 2029 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 2030 UMA_ZONE_ZINIT); 2031 cache_zone_small_ts = uma_zcreate("STS VFS Cache", 2032 sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, 2033 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 2034 UMA_ZONE_ZINIT); 2035 cache_zone_large = uma_zcreate("L VFS Cache", 2036 sizeof(struct namecache) + NAME_MAX + 1, 2037 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 2038 UMA_ZONE_ZINIT); 2039 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", 2040 sizeof(struct namecache_ts) + NAME_MAX + 1, 2041 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 2042 UMA_ZONE_ZINIT); 2043 2044 VFS_SMR_ZONE_SET(cache_zone_small); 2045 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2046 VFS_SMR_ZONE_SET(cache_zone_large); 2047 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2048 2049 ncsize = desiredvnodes * ncsizefactor; 2050 nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); 2051 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2052 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2053 ncbuckethash = 7; 2054 if (ncbuckethash > nchash) 2055 ncbuckethash = nchash; 2056 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2057 M_WAITOK | M_ZERO); 2058 for (i = 0; i < numbucketlocks; i++) 2059 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 2060 ncvnodehash = ncbuckethash; 2061 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2062 M_WAITOK | M_ZERO); 2063 for (i = 0; i < numvnodelocks; i++) 2064 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2065 ncpurgeminvnodes = numbucketlocks * 2; 2066 2067 
ncneghash = 3; 2068 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 2069 M_WAITOK | M_ZERO); 2070 for (i = 0; i < numneglists; i++) { 2071 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2072 TAILQ_INIT(&neglists[i].nl_list); 2073 } 2074 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 2075 TAILQ_INIT(&ncneg_hot.nl_list); 2076 2077 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2078 } 2079 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2080 2081 void 2082 cache_vnode_init(struct vnode *vp) 2083 { 2084 2085 LIST_INIT(&vp->v_cache_src); 2086 TAILQ_INIT(&vp->v_cache_dst); 2087 vp->v_cache_dd = NULL; 2088 cache_prehash(vp); 2089 } 2090 2091 void 2092 cache_changesize(u_long newmaxvnodes) 2093 { 2094 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2095 u_long new_nchash, old_nchash; 2096 struct namecache *ncp; 2097 uint32_t hash; 2098 u_long newncsize; 2099 int i; 2100 2101 newncsize = newmaxvnodes * ncsizefactor; 2102 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2103 if (newmaxvnodes < numbucketlocks) 2104 newmaxvnodes = numbucketlocks; 2105 2106 new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash); 2107 /* If same hash table size, nothing to do */ 2108 if (nchash == new_nchash) { 2109 free(new_nchashtbl, M_VFSCACHE); 2110 return; 2111 } 2112 /* 2113 * Move everything from the old hash table to the new table. 2114 * None of the namecache entries in the table can be removed 2115 * because to do so, they have to be removed from the hash table. 2116 */ 2117 cache_lock_all_vnodes(); 2118 cache_lock_all_buckets(); 2119 old_nchashtbl = nchashtbl; 2120 old_nchash = nchash; 2121 nchashtbl = new_nchashtbl; 2122 nchash = new_nchash; 2123 for (i = 0; i <= old_nchash; i++) { 2124 while ((ncp = CK_LIST_FIRST(&old_nchashtbl[i])) != NULL) { 2125 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2126 ncp->nc_dvp); 2127 CK_LIST_REMOVE(ncp, nc_hash); 2128 CK_LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2129 } 2130 } 2131 ncsize = newncsize; 2132 cache_unlock_all_buckets(); 2133 cache_unlock_all_vnodes(); 2134 free(old_nchashtbl, M_VFSCACHE); 2135 } 2136 2137 /* 2138 * Invalidate all entries from and to a particular vnode. 
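 *
 * This zaps every entry naming vp (v_cache_dst), every entry for a name
 * living inside vp (v_cache_src) and the cached dotdot entry, so it is
 * the heavy hammer used when a vnode is reclaimed or when a filesystem
 * decides its names are no longer trustworthy.  A typical use from a
 * remove or rename path looks roughly like the following (illustrative
 * only, fs_remove_impl() is a made-up stand-in):
 *
 *	error = fs_remove_impl(dvp, vp, cnp);
 *	if (error == 0)
 *		cache_purge(vp);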
2139 */ 2140 void 2141 cache_purge(struct vnode *vp) 2142 { 2143 TAILQ_HEAD(, namecache) ncps; 2144 struct namecache *ncp, *nnp; 2145 struct mtx *vlp, *vlp2; 2146 2147 CTR1(KTR_VFS, "cache_purge(%p)", vp); 2148 SDT_PROBE1(vfs, namecache, purge, done, vp); 2149 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2150 vp->v_cache_dd == NULL) 2151 return; 2152 TAILQ_INIT(&ncps); 2153 vlp = VP2VNODELOCK(vp); 2154 vlp2 = NULL; 2155 mtx_lock(vlp); 2156 retry: 2157 while (!LIST_EMPTY(&vp->v_cache_src)) { 2158 ncp = LIST_FIRST(&vp->v_cache_src); 2159 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2160 goto retry; 2161 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2162 } 2163 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2164 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2165 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2166 goto retry; 2167 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2168 } 2169 ncp = vp->v_cache_dd; 2170 if (ncp != NULL) { 2171 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2172 ("lost dotdot link")); 2173 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2174 goto retry; 2175 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2176 } 2177 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2178 mtx_unlock(vlp); 2179 if (vlp2 != NULL) 2180 mtx_unlock(vlp2); 2181 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2182 cache_free(ncp); 2183 } 2184 } 2185 2186 /* 2187 * Invalidate all negative entries for a particular directory vnode. 2188 */ 2189 void 2190 cache_purge_negative(struct vnode *vp) 2191 { 2192 TAILQ_HEAD(, namecache) ncps; 2193 struct namecache *ncp, *nnp; 2194 struct mtx *vlp; 2195 2196 CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); 2197 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2198 if (LIST_EMPTY(&vp->v_cache_src)) 2199 return; 2200 TAILQ_INIT(&ncps); 2201 vlp = VP2VNODELOCK(vp); 2202 mtx_lock(vlp); 2203 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2204 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2205 continue; 2206 cache_zap_negative_locked_vnode_kl(ncp, vp); 2207 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2208 } 2209 mtx_unlock(vlp); 2210 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2211 cache_free(ncp); 2212 } 2213 } 2214 2215 /* 2216 * Flush all entries referencing a particular filesystem. 
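 *
 * Every hash chain is scanned under its bucket lock, so the cost scales
 * with the total size of the cache rather than with the number of
 * entries belonging to mp.  This is why small mounts are skipped below
 * (mnt_nvnodelistsize <= ncpurgeminvnodes) unless force is true.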
2217 */ 2218 void 2219 cache_purgevfs(struct mount *mp, bool force) 2220 { 2221 TAILQ_HEAD(, namecache) ncps; 2222 struct mtx *vlp1, *vlp2; 2223 struct rwlock *blp; 2224 struct nchashhead *bucket; 2225 struct namecache *ncp, *nnp; 2226 u_long i, j, n_nchash; 2227 int error; 2228 2229 /* Scan hash tables for applicable entries */ 2230 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2231 if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) 2232 return; 2233 TAILQ_INIT(&ncps); 2234 n_nchash = nchash + 1; 2235 vlp1 = vlp2 = NULL; 2236 for (i = 0; i < numbucketlocks; i++) { 2237 blp = (struct rwlock *)&bucketlocks[i]; 2238 rw_wlock(blp); 2239 for (j = i; j < n_nchash; j += numbucketlocks) { 2240 retry: 2241 bucket = &nchashtbl[j]; 2242 CK_LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { 2243 cache_assert_bucket_locked(ncp, RA_WLOCKED); 2244 if (ncp->nc_dvp->v_mount != mp) 2245 continue; 2246 error = cache_zap_wlocked_bucket_kl(ncp, blp, 2247 &vlp1, &vlp2); 2248 if (error != 0) 2249 goto retry; 2250 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); 2251 } 2252 } 2253 rw_wunlock(blp); 2254 if (vlp1 == NULL && vlp2 == NULL) 2255 cache_maybe_yield(); 2256 } 2257 if (vlp1 != NULL) 2258 mtx_unlock(vlp1); 2259 if (vlp2 != NULL) 2260 mtx_unlock(vlp2); 2261 2262 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2263 cache_free(ncp); 2264 } 2265 } 2266 2267 /* 2268 * Perform canonical checks and cache lookup and pass on to filesystem 2269 * through the vop_cachedlookup only if needed. 2270 */ 2271 2272 int 2273 vfs_cache_lookup(struct vop_lookup_args *ap) 2274 { 2275 struct vnode *dvp; 2276 int error; 2277 struct vnode **vpp = ap->a_vpp; 2278 struct componentname *cnp = ap->a_cnp; 2279 int flags = cnp->cn_flags; 2280 2281 *vpp = NULL; 2282 dvp = ap->a_dvp; 2283 2284 if (dvp->v_type != VDIR) 2285 return (ENOTDIR); 2286 2287 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2288 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2289 return (EROFS); 2290 2291 error = vn_dir_check_exec(dvp, cnp); 2292 if (error != 0) 2293 return (error); 2294 2295 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2296 if (error == 0) 2297 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2298 if (error == -1) 2299 return (0); 2300 return (error); 2301 } 2302 2303 /* Implementation of the getcwd syscall. 
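 *
 * The work is split as follows (all in this file):
 *
 *	sys___getcwd()		bounds the buffer and copies the result out
 *	  vn_getcwd()		snapshots the current pwd (cdir/rdir)
 *	    vn_fullpath_any()	walks towards the root, see below
 *
 * Userspace normally reaches this through the getcwd(3) wrapper in libc.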
*/ 2304 int 2305 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2306 { 2307 char *buf, *retbuf; 2308 size_t buflen; 2309 int error; 2310 2311 buflen = uap->buflen; 2312 if (__predict_false(buflen < 2)) 2313 return (EINVAL); 2314 if (buflen > MAXPATHLEN) 2315 buflen = MAXPATHLEN; 2316 2317 buf = malloc(buflen, M_TEMP, M_WAITOK); 2318 error = vn_getcwd(td, buf, &retbuf, &buflen); 2319 if (error == 0) 2320 error = copyout(retbuf, uap->buf, buflen); 2321 free(buf, M_TEMP); 2322 return (error); 2323 } 2324 2325 int 2326 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen) 2327 { 2328 struct pwd *pwd; 2329 int error; 2330 2331 pwd = pwd_hold(td); 2332 error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen); 2333 pwd_drop(pwd); 2334 2335 #ifdef KTRACE 2336 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2337 ktrnamei(*retbuf); 2338 #endif 2339 return (error); 2340 } 2341 2342 static int 2343 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2344 size_t size, int flags, enum uio_seg pathseg) 2345 { 2346 struct nameidata nd; 2347 char *retbuf, *freebuf; 2348 int error; 2349 2350 if (flags != 0) 2351 return (EINVAL); 2352 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2353 pathseg, path, fd, &cap_fstat_rights, td); 2354 if ((error = namei(&nd)) != 0) 2355 return (error); 2356 error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size); 2357 if (error == 0) { 2358 error = copyout(retbuf, buf, size); 2359 free(freebuf, M_TEMP); 2360 } 2361 NDFREE(&nd, 0); 2362 return (error); 2363 } 2364 2365 int 2366 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2367 { 2368 2369 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2370 uap->flags, UIO_USERSPACE)); 2371 } 2372 2373 /* 2374 * Retrieve the full filesystem path that correspond to a vnode from the name 2375 * cache (if available) 2376 */ 2377 int 2378 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) 2379 { 2380 struct pwd *pwd; 2381 char *buf; 2382 size_t buflen; 2383 int error; 2384 2385 if (__predict_false(vn == NULL)) 2386 return (EINVAL); 2387 2388 buflen = MAXPATHLEN; 2389 buf = malloc(buflen, M_TEMP, M_WAITOK); 2390 pwd = pwd_hold(td); 2391 error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen); 2392 pwd_drop(pwd); 2393 2394 if (!error) 2395 *freebuf = buf; 2396 else 2397 free(buf, M_TEMP); 2398 return (error); 2399 } 2400 2401 /* 2402 * This function is similar to vn_fullpath, but it attempts to lookup the 2403 * pathname relative to the global root mount point. This is required for the 2404 * auditing sub-system, as audited pathnames must be absolute, relative to the 2405 * global root mount point. 
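 *
 * The difference from vn_fullpath() is only the terminating directory:
 * vn_fullpath() stops at the calling process's root (pwd_rdir), while
 * this variant walks all the way up to rootvnode, so for a jailed or
 * chrooted process the result also includes the prefix leading to its
 * root directory.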
2406 */ 2407 int 2408 vn_fullpath_global(struct thread *td, struct vnode *vn, 2409 char **retbuf, char **freebuf) 2410 { 2411 char *buf; 2412 size_t buflen; 2413 int error; 2414 2415 if (__predict_false(vn == NULL)) 2416 return (EINVAL); 2417 buflen = MAXPATHLEN; 2418 buf = malloc(buflen, M_TEMP, M_WAITOK); 2419 error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen); 2420 if (!error) 2421 *freebuf = buf; 2422 else 2423 free(buf, M_TEMP); 2424 return (error); 2425 } 2426 2427 int 2428 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2429 { 2430 struct vnode *dvp; 2431 struct namecache *ncp; 2432 struct mtx *vlp; 2433 int error; 2434 2435 vlp = VP2VNODELOCK(*vp); 2436 mtx_lock(vlp); 2437 TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { 2438 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2439 break; 2440 } 2441 if (ncp != NULL) { 2442 if (*buflen < ncp->nc_nlen) { 2443 mtx_unlock(vlp); 2444 vrele(*vp); 2445 counter_u64_add(numfullpathfail4, 1); 2446 error = ENOMEM; 2447 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2448 vp, NULL); 2449 return (error); 2450 } 2451 *buflen -= ncp->nc_nlen; 2452 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2453 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2454 ncp->nc_name, vp); 2455 dvp = *vp; 2456 *vp = ncp->nc_dvp; 2457 vref(*vp); 2458 mtx_unlock(vlp); 2459 vrele(dvp); 2460 return (0); 2461 } 2462 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2463 2464 mtx_unlock(vlp); 2465 vn_lock(*vp, LK_SHARED | LK_RETRY); 2466 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2467 vput(*vp); 2468 if (error) { 2469 counter_u64_add(numfullpathfail2, 1); 2470 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2471 return (error); 2472 } 2473 2474 *vp = dvp; 2475 if (VN_IS_DOOMED(dvp)) { 2476 /* forced unmount */ 2477 vrele(dvp); 2478 error = ENOENT; 2479 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2480 return (error); 2481 } 2482 /* 2483 * *vp has its use count incremented still. 2484 */ 2485 2486 return (0); 2487 } 2488 2489 /* 2490 * Resolve a directory to a pathname. 2491 * 2492 * The name of the directory can always be found in the namecache or fetched 2493 * from the filesystem. There is also guaranteed to be only one parent, meaning 2494 * we can just follow vnodes up until we find the root. 2495 * 2496 * The vnode must be referenced. 2497 */ 2498 static int 2499 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 2500 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend) 2501 { 2502 #ifdef KDTRACE_HOOKS 2503 struct vnode *startvp = vp; 2504 #endif 2505 struct vnode *vp1; 2506 size_t buflen; 2507 int error; 2508 2509 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2510 VNPASS(vp->v_usecount > 0, vp); 2511 2512 buflen = *len; 2513 2514 if (!slash_prefixed) { 2515 MPASS(*len >= 2); 2516 buflen--; 2517 buf[buflen] = '\0'; 2518 } 2519 2520 error = 0; 2521 2522 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2523 counter_u64_add(numfullpathcalls, 1); 2524 while (vp != rdir && vp != rootvnode) { 2525 /* 2526 * The vp vnode must be already fully constructed, 2527 * since it is either found in namecache or obtained 2528 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2529 * without obtaining the vnode lock. 2530 */ 2531 if ((vp->v_vflag & VV_ROOT) != 0) { 2532 vn_lock(vp, LK_RETRY | LK_SHARED); 2533 2534 /* 2535 * With the vnode locked, check for races with 2536 * unmount, forced or not. 
Note that we 2537 * already verified that vp is not equal to 2538 * the root vnode, which means that 2539 * mnt_vnodecovered can be NULL only for the 2540 * case of unmount. 2541 */ 2542 if (VN_IS_DOOMED(vp) || 2543 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2544 vp1->v_mountedhere != vp->v_mount) { 2545 vput(vp); 2546 error = ENOENT; 2547 SDT_PROBE3(vfs, namecache, fullpath, return, 2548 error, vp, NULL); 2549 break; 2550 } 2551 2552 vref(vp1); 2553 vput(vp); 2554 vp = vp1; 2555 continue; 2556 } 2557 if (vp->v_type != VDIR) { 2558 vrele(vp); 2559 counter_u64_add(numfullpathfail1, 1); 2560 error = ENOTDIR; 2561 SDT_PROBE3(vfs, namecache, fullpath, return, 2562 error, vp, NULL); 2563 break; 2564 } 2565 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); 2566 if (error) 2567 break; 2568 if (buflen == 0) { 2569 vrele(vp); 2570 error = ENOMEM; 2571 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2572 startvp, NULL); 2573 break; 2574 } 2575 buf[--buflen] = '/'; 2576 slash_prefixed = true; 2577 } 2578 if (error) 2579 return (error); 2580 if (!slash_prefixed) { 2581 if (buflen == 0) { 2582 vrele(vp); 2583 counter_u64_add(numfullpathfail4, 1); 2584 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2585 startvp, NULL); 2586 return (ENOMEM); 2587 } 2588 buf[--buflen] = '/'; 2589 } 2590 counter_u64_add(numfullpathfound, 1); 2591 vrele(vp); 2592 2593 *retbuf = buf + buflen; 2594 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2595 *len -= buflen; 2596 *len += addend; 2597 return (0); 2598 } 2599 2600 /* 2601 * Resolve an arbitrary vnode to a pathname. 2602 * 2603 * Note 2 caveats: 2604 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2605 * resolve to a different path than the one used to find it 2606 * - namecache is not mandatory, meaning names are not guaranteed to be added 2607 * (in which case resolving fails) 2608 */ 2609 static int 2610 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 2611 char *buf, char **retbuf, size_t *buflen) 2612 { 2613 size_t orig_buflen; 2614 bool slash_prefixed; 2615 int error; 2616 2617 if (*buflen < 2) 2618 return (EINVAL); 2619 2620 orig_buflen = *buflen; 2621 2622 vref(vp); 2623 slash_prefixed = false; 2624 if (vp->v_type != VDIR) { 2625 *buflen -= 1; 2626 buf[*buflen] = '\0'; 2627 error = vn_vptocnp(&vp, td->td_ucred, buf, buflen); 2628 if (error) 2629 return (error); 2630 if (*buflen == 0) { 2631 vrele(vp); 2632 return (ENOMEM); 2633 } 2634 *buflen -= 1; 2635 buf[*buflen] = '/'; 2636 slash_prefixed = true; 2637 } 2638 2639 return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed, 2640 orig_buflen - *buflen)); 2641 } 2642 2643 /* 2644 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2645 * 2646 * Since the namecache does not track handlings, the caller is expected to first 2647 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 
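 *
 * kern___realpathat() above is the typical caller; the expected shape is
 * roughly:
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT, UIO_SYSSPACE,
 *	    path, td);
 *	if ((error = namei(&nd)) != 0)
 *		return (error);
 *	error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &buflen);
 *	...
 *	NDFREE(&nd, 0);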
2648 * 2649 * Then we have 2 cases: 2650 * - if the found vnode is a directory, the path can be constructed just by 2651 * fullowing names up the chain 2652 * - otherwise we populate the buffer with the saved name and start resolving 2653 * from the parent 2654 */ 2655 static int 2656 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 2657 char **freebuf, size_t *buflen) 2658 { 2659 char *buf, *tmpbuf; 2660 struct pwd *pwd; 2661 struct componentname *cnp; 2662 struct vnode *vp; 2663 size_t addend; 2664 int error; 2665 bool slash_prefixed; 2666 2667 if (*buflen < 2) 2668 return (EINVAL); 2669 if (*buflen > MAXPATHLEN) 2670 *buflen = MAXPATHLEN; 2671 2672 slash_prefixed = false; 2673 2674 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2675 pwd = pwd_hold(td); 2676 2677 addend = 0; 2678 vp = ndp->ni_vp; 2679 if (vp->v_type != VDIR) { 2680 cnp = &ndp->ni_cnd; 2681 addend = cnp->cn_namelen + 2; 2682 if (*buflen < addend) { 2683 error = ENOMEM; 2684 goto out_bad; 2685 } 2686 *buflen -= addend; 2687 tmpbuf = buf + *buflen; 2688 tmpbuf[0] = '/'; 2689 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 2690 tmpbuf[addend - 1] = '\0'; 2691 slash_prefixed = true; 2692 vp = ndp->ni_dvp; 2693 } 2694 2695 vref(vp); 2696 error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen, 2697 slash_prefixed, addend); 2698 if (error != 0) 2699 goto out_bad; 2700 2701 pwd_drop(pwd); 2702 *freebuf = buf; 2703 2704 return (0); 2705 out_bad: 2706 pwd_drop(pwd); 2707 free(buf, M_TEMP); 2708 return (error); 2709 } 2710 2711 struct vnode * 2712 vn_dir_dd_ino(struct vnode *vp) 2713 { 2714 struct namecache *ncp; 2715 struct vnode *ddvp; 2716 struct mtx *vlp; 2717 enum vgetstate vs; 2718 2719 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 2720 vlp = VP2VNODELOCK(vp); 2721 mtx_lock(vlp); 2722 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 2723 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 2724 continue; 2725 ddvp = ncp->nc_dvp; 2726 vs = vget_prep(ddvp); 2727 mtx_unlock(vlp); 2728 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 2729 return (NULL); 2730 return (ddvp); 2731 } 2732 mtx_unlock(vlp); 2733 return (NULL); 2734 } 2735 2736 int 2737 vn_commname(struct vnode *vp, char *buf, u_int buflen) 2738 { 2739 struct namecache *ncp; 2740 struct mtx *vlp; 2741 int l; 2742 2743 vlp = VP2VNODELOCK(vp); 2744 mtx_lock(vlp); 2745 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 2746 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2747 break; 2748 if (ncp == NULL) { 2749 mtx_unlock(vlp); 2750 return (ENOENT); 2751 } 2752 l = min(ncp->nc_nlen, buflen - 1); 2753 memcpy(buf, ncp->nc_name, l); 2754 mtx_unlock(vlp); 2755 buf[l] = '\0'; 2756 return (0); 2757 } 2758 2759 /* 2760 * This function updates path string to vnode's full global path 2761 * and checks the size of the new path string against the pathlen argument. 2762 * 2763 * Requires a locked, referenced vnode. 2764 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 2765 * 2766 * If vp is a directory, the call to vn_fullpath_global() always succeeds 2767 * because it falls back to the ".." lookup if the namecache lookup fails. 2768 */ 2769 int 2770 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 2771 u_int pathlen) 2772 { 2773 struct nameidata nd; 2774 struct vnode *vp1; 2775 char *rpath, *fbuf; 2776 int error; 2777 2778 ASSERT_VOP_ELOCKED(vp, __func__); 2779 2780 /* Construct global filesystem path from vp. 
*/ 2781 VOP_UNLOCK(vp); 2782 error = vn_fullpath_global(td, vp, &rpath, &fbuf); 2783 2784 if (error != 0) { 2785 vrele(vp); 2786 return (error); 2787 } 2788 2789 if (strlen(rpath) >= pathlen) { 2790 vrele(vp); 2791 error = ENAMETOOLONG; 2792 goto out; 2793 } 2794 2795 /* 2796 * Re-lookup the vnode by path to detect a possible rename. 2797 * As a side effect, the vnode is relocked. 2798 * If vnode was renamed, return ENOENT. 2799 */ 2800 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 2801 UIO_SYSSPACE, path, td); 2802 error = namei(&nd); 2803 if (error != 0) { 2804 vrele(vp); 2805 goto out; 2806 } 2807 NDFREE(&nd, NDF_ONLY_PNBUF); 2808 vp1 = nd.ni_vp; 2809 vrele(vp); 2810 if (vp1 == vp) 2811 strcpy(path, rpath); 2812 else { 2813 vput(vp1); 2814 error = ENOENT; 2815 } 2816 2817 out: 2818 free(fbuf, M_TEMP); 2819 return (error); 2820 } 2821 2822 #ifdef DDB 2823 static void 2824 db_print_vpath(struct vnode *vp) 2825 { 2826 2827 while (vp != NULL) { 2828 db_printf("%p: ", vp); 2829 if (vp == rootvnode) { 2830 db_printf("/"); 2831 vp = NULL; 2832 } else { 2833 if (vp->v_vflag & VV_ROOT) { 2834 db_printf("<mount point>"); 2835 vp = vp->v_mount->mnt_vnodecovered; 2836 } else { 2837 struct namecache *ncp; 2838 char *ncn; 2839 int i; 2840 2841 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2842 if (ncp != NULL) { 2843 ncn = ncp->nc_name; 2844 for (i = 0; i < ncp->nc_nlen; i++) 2845 db_printf("%c", *ncn++); 2846 vp = ncp->nc_dvp; 2847 } else { 2848 vp = NULL; 2849 } 2850 } 2851 } 2852 db_printf("\n"); 2853 } 2854 2855 return; 2856 } 2857 2858 DB_SHOW_COMMAND(vpath, db_show_vpath) 2859 { 2860 struct vnode *vp; 2861 2862 if (!have_addr) { 2863 db_printf("usage: show vpath <struct vnode *>\n"); 2864 return; 2865 } 2866 2867 vp = (struct vnode *)addr; 2868 db_print_vpath(vp); 2869 } 2870 2871 #endif 2872 2873 extern uma_zone_t namei_zone; 2874 2875 static bool __read_frequently cache_fast_lookup = true; 2876 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 2877 &cache_fast_lookup, 0, ""); 2878 2879 #define CACHE_FPL_FAILED -2020 2880 2881 static void 2882 cache_fpl_cleanup_cnp(struct componentname *cnp) 2883 { 2884 2885 uma_zfree(namei_zone, cnp->cn_pnbuf); 2886 #ifdef DIAGNOSTIC 2887 cnp->cn_pnbuf = NULL; 2888 cnp->cn_nameptr = NULL; 2889 #endif 2890 } 2891 2892 static void 2893 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 2894 { 2895 struct componentname *cnp; 2896 2897 cnp = &ndp->ni_cnd; 2898 while (*(cnp->cn_nameptr) == '/') { 2899 cnp->cn_nameptr++; 2900 ndp->ni_pathlen--; 2901 } 2902 2903 *dpp = ndp->ni_rootdir; 2904 } 2905 2906 /* 2907 * Components of nameidata (or objects it can point to) which may 2908 * need restoring in case fast path lookup fails. 
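 *
 * The lookup loop snapshots this state with cache_fpl_checkpoint()
 * before consuming each component.  On CACHE_FPL_STATUS_PARTIAL the last
 * snapshot is restored so the slow path can continue from the current
 * component, while on CACHE_FPL_STATUS_ABORTED the original snapshot is
 * restored so namei() can redo the entire lookup.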
2909 */ 2910 struct nameidata_saved { 2911 long cn_namelen; 2912 char *cn_nameptr; 2913 size_t ni_pathlen; 2914 int cn_flags; 2915 }; 2916 2917 struct cache_fpl { 2918 struct nameidata *ndp; 2919 struct componentname *cnp; 2920 struct pwd *pwd; 2921 struct vnode *dvp; 2922 struct vnode *tvp; 2923 seqc_t dvp_seqc; 2924 seqc_t tvp_seqc; 2925 struct nameidata_saved snd; 2926 int line; 2927 enum cache_fpl_status status:8; 2928 bool in_smr; 2929 }; 2930 2931 static void 2932 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 2933 { 2934 2935 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 2936 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 2937 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 2938 snd->ni_pathlen = fpl->ndp->ni_pathlen; 2939 } 2940 2941 static void 2942 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 2943 { 2944 2945 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 2946 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 2947 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 2948 fpl->ndp->ni_pathlen = snd->ni_pathlen; 2949 } 2950 2951 #ifdef INVARIANTS 2952 #define cache_fpl_smr_assert_entered(fpl) ({ \ 2953 struct cache_fpl *_fpl = (fpl); \ 2954 MPASS(_fpl->in_smr == true); \ 2955 VFS_SMR_ASSERT_ENTERED(); \ 2956 }) 2957 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 2958 struct cache_fpl *_fpl = (fpl); \ 2959 MPASS(_fpl->in_smr == false); \ 2960 VFS_SMR_ASSERT_NOT_ENTERED(); \ 2961 }) 2962 #else 2963 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 2964 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 2965 #endif 2966 2967 #define cache_fpl_smr_enter(fpl) ({ \ 2968 struct cache_fpl *_fpl = (fpl); \ 2969 MPASS(_fpl->in_smr == false); \ 2970 vfs_smr_enter(); \ 2971 _fpl->in_smr = true; \ 2972 }) 2973 2974 #define cache_fpl_smr_exit(fpl) ({ \ 2975 struct cache_fpl *_fpl = (fpl); \ 2976 MPASS(_fpl->in_smr == true); \ 2977 vfs_smr_exit(); \ 2978 _fpl->in_smr = false; \ 2979 }) 2980 2981 static int 2982 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 2983 { 2984 2985 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 2986 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 2987 ("%s: converting to abort from %d at %d, set at %d\n", 2988 __func__, fpl->status, line, fpl->line)); 2989 } 2990 fpl->status = CACHE_FPL_STATUS_ABORTED; 2991 fpl->line = line; 2992 return (CACHE_FPL_FAILED); 2993 } 2994 2995 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 2996 2997 static int 2998 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 2999 { 3000 3001 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3002 ("%s: setting to partial at %d, but already set to %d at %d\n", 3003 __func__, line, fpl->status, fpl->line)); 3004 cache_fpl_smr_assert_entered(fpl); 3005 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3006 fpl->line = line; 3007 return (CACHE_FPL_FAILED); 3008 } 3009 3010 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3011 3012 static int 3013 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3014 { 3015 3016 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3017 ("%s: setting to handled at %d, but already set to %d at %d\n", 3018 __func__, line, fpl->status, fpl->line)); 3019 cache_fpl_smr_assert_not_entered(fpl); 3020 MPASS(error != CACHE_FPL_FAILED); 3021 fpl->status = CACHE_FPL_STATUS_HANDLED; 3022 fpl->line = line; 3023 return (error); 3024 } 3025 3026 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3027 3028 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3029 (LOCKLEAF 
| LOCKPARENT | WANTPARENT | FOLLOW | LOCKSHARED | SAVENAME | \ 3030 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2) 3031 3032 static bool 3033 cache_can_fplookup(struct cache_fpl *fpl) 3034 { 3035 struct nameidata *ndp; 3036 struct componentname *cnp; 3037 struct thread *td; 3038 3039 ndp = fpl->ndp; 3040 cnp = fpl->cnp; 3041 td = cnp->cn_thread; 3042 3043 if (!cache_fast_lookup) { 3044 cache_fpl_aborted(fpl); 3045 return (false); 3046 } 3047 #ifdef MAC 3048 if (mac_vnode_check_lookup_enabled()) { 3049 cache_fpl_aborted(fpl); 3050 return (false); 3051 } 3052 #endif 3053 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3054 cache_fpl_aborted(fpl); 3055 return (false); 3056 } 3057 if (cnp->cn_nameiop != LOOKUP) { 3058 cache_fpl_aborted(fpl); 3059 return (false); 3060 } 3061 if (ndp->ni_dirfd != AT_FDCWD) { 3062 cache_fpl_aborted(fpl); 3063 return (false); 3064 } 3065 if (IN_CAPABILITY_MODE(td)) { 3066 cache_fpl_aborted(fpl); 3067 return (false); 3068 } 3069 if (AUDITING_TD(td)) { 3070 cache_fpl_aborted(fpl); 3071 return (false); 3072 } 3073 if (ndp->ni_startdir != NULL) { 3074 cache_fpl_aborted(fpl); 3075 return (false); 3076 } 3077 return (true); 3078 } 3079 3080 static bool 3081 cache_fplookup_vnode_supported(struct vnode *vp) 3082 { 3083 3084 return (vp->v_type != VLNK); 3085 } 3086 3087 /* 3088 * Move a negative entry to the hot list. 3089 * 3090 * We have to take locks, but they may be contended and in the worst 3091 * case we may need to go off CPU. We don't want to spin within the 3092 * smr section and we can't block with it. Instead we are going to 3093 * look up the entry again. 3094 */ 3095 static int __noinline 3096 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3097 uint32_t hash) 3098 { 3099 struct componentname *cnp; 3100 struct namecache *ncp; 3101 struct neglist *neglist; 3102 struct negstate *negstate; 3103 struct vnode *dvp; 3104 u_char nc_flag; 3105 3106 cnp = fpl->cnp; 3107 dvp = fpl->dvp; 3108 3109 if (!vhold_smr(dvp)) 3110 return (cache_fpl_aborted(fpl)); 3111 3112 neglist = NCP2NEGLIST(oncp); 3113 cache_fpl_smr_exit(fpl); 3114 3115 mtx_lock(&ncneg_hot.nl_lock); 3116 mtx_lock(&neglist->nl_lock); 3117 /* 3118 * For hash iteration. 3119 */ 3120 cache_fpl_smr_enter(fpl); 3121 3122 /* 3123 * Avoid all surprises by only succeeding if we got the same entry and 3124 * bailing completely otherwise. 3125 * 3126 * In particular at this point there can be a new ncp which matches the 3127 * search but hashes to a different neglist. 3128 */ 3129 CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3130 if (ncp == oncp) 3131 break; 3132 } 3133 3134 /* 3135 * No match to begin with. 3136 */ 3137 if (__predict_false(ncp == NULL)) { 3138 goto out_abort; 3139 } 3140 3141 /* 3142 * The newly found entry may be something different... 3143 */ 3144 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3145 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 3146 goto out_abort; 3147 } 3148 3149 /* 3150 * ... and not even negative. 
3151 */ 3152 nc_flag = atomic_load_char(&ncp->nc_flag); 3153 if ((nc_flag & NCF_NEGATIVE) == 0) { 3154 goto out_abort; 3155 } 3156 3157 if (__predict_false(cache_ncp_invalid(ncp))) { 3158 goto out_abort; 3159 } 3160 3161 negstate = NCP2NEGSTATE(ncp); 3162 if ((negstate->neg_flag & NEG_HOT) == 0) { 3163 numhotneg++; 3164 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 3165 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 3166 negstate->neg_flag |= NEG_HOT; 3167 } 3168 3169 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 3170 counter_u64_add(numneghits, 1); 3171 cache_fpl_smr_exit(fpl); 3172 mtx_unlock(&neglist->nl_lock); 3173 mtx_unlock(&ncneg_hot.nl_lock); 3174 vdrop(dvp); 3175 return (cache_fpl_handled(fpl, ENOENT)); 3176 out_abort: 3177 cache_fpl_smr_exit(fpl); 3178 mtx_unlock(&neglist->nl_lock); 3179 mtx_unlock(&ncneg_hot.nl_lock); 3180 vdrop(dvp); 3181 return (cache_fpl_aborted(fpl)); 3182 } 3183 3184 /* 3185 * The target vnode is not supported, prepare for the slow path to take over. 3186 */ 3187 static int 3188 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3189 { 3190 struct componentname *cnp; 3191 enum vgetstate dvs; 3192 struct vnode *dvp; 3193 struct pwd *pwd; 3194 seqc_t dvp_seqc; 3195 3196 cnp = fpl->cnp; 3197 dvp = fpl->dvp; 3198 dvp_seqc = fpl->dvp_seqc; 3199 3200 dvs = vget_prep_smr(dvp); 3201 if (dvs == VGET_NONE) { 3202 cache_fpl_smr_exit(fpl); 3203 return (cache_fpl_aborted(fpl)); 3204 } 3205 3206 cache_fpl_smr_exit(fpl); 3207 3208 vget_finish_ref(dvp, dvs); 3209 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3210 vrele(dvp); 3211 return (cache_fpl_aborted(fpl)); 3212 } 3213 3214 pwd = pwd_hold(curthread); 3215 if (fpl->pwd != pwd) { 3216 vrele(dvp); 3217 pwd_drop(pwd); 3218 return (cache_fpl_aborted(fpl)); 3219 } 3220 3221 fpl->ndp->ni_startdir = dvp; 3222 return (0); 3223 } 3224 3225 static int 3226 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3227 { 3228 struct componentname *cnp; 3229 struct vnode *tvp; 3230 seqc_t tvp_seqc; 3231 int error; 3232 3233 cnp = fpl->cnp; 3234 tvp = fpl->tvp; 3235 tvp_seqc = fpl->tvp_seqc; 3236 3237 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3238 error = vget_finish(tvp, cnp->cn_lkflags, tvs); 3239 if (error != 0) { 3240 return (cache_fpl_aborted(fpl)); 3241 } 3242 } else { 3243 vget_finish_ref(tvp, tvs); 3244 } 3245 3246 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3247 if ((cnp->cn_flags & LOCKLEAF) != 0) 3248 vput(tvp); 3249 else 3250 vrele(tvp); 3251 return (cache_fpl_aborted(fpl)); 3252 } 3253 3254 return (cache_fpl_handled(fpl, 0)); 3255 } 3256 3257 static int __noinline 3258 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3259 { 3260 enum vgetstate dvs, tvs; 3261 struct componentname *cnp; 3262 struct vnode *dvp, *tvp; 3263 seqc_t dvp_seqc, tvp_seqc; 3264 int error; 3265 3266 cnp = fpl->cnp; 3267 dvp = fpl->dvp; 3268 dvp_seqc = fpl->dvp_seqc; 3269 tvp = fpl->tvp; 3270 tvp_seqc = fpl->tvp_seqc; 3271 3272 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3273 3274 /* 3275 * This is less efficient than it can be for simplicity. 
3276 */ 3277 dvs = vget_prep_smr(dvp); 3278 if (dvs == VGET_NONE) { 3279 return (cache_fpl_aborted(fpl)); 3280 } 3281 tvs = vget_prep_smr(tvp); 3282 if (tvs == VGET_NONE) { 3283 cache_fpl_smr_exit(fpl); 3284 vget_abort(dvp, dvs); 3285 return (cache_fpl_aborted(fpl)); 3286 } 3287 3288 cache_fpl_smr_exit(fpl); 3289 3290 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3291 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3292 if (error != 0) { 3293 vget_abort(tvp, tvs); 3294 return (cache_fpl_aborted(fpl)); 3295 } 3296 } else { 3297 vget_finish_ref(dvp, dvs); 3298 } 3299 3300 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3301 vget_abort(tvp, tvs); 3302 if ((cnp->cn_flags & LOCKPARENT) != 0) 3303 vput(dvp); 3304 else 3305 vrele(dvp); 3306 cache_fpl_aborted(fpl); 3307 return (error); 3308 } 3309 3310 error = cache_fplookup_final_child(fpl, tvs); 3311 if (error != 0) { 3312 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3313 if ((cnp->cn_flags & LOCKPARENT) != 0) 3314 vput(dvp); 3315 else 3316 vrele(dvp); 3317 return (error); 3318 } 3319 3320 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3321 return (0); 3322 } 3323 3324 static int 3325 cache_fplookup_final(struct cache_fpl *fpl) 3326 { 3327 struct componentname *cnp; 3328 enum vgetstate tvs; 3329 struct vnode *dvp, *tvp; 3330 seqc_t dvp_seqc, tvp_seqc; 3331 3332 cnp = fpl->cnp; 3333 dvp = fpl->dvp; 3334 dvp_seqc = fpl->dvp_seqc; 3335 tvp = fpl->tvp; 3336 tvp_seqc = fpl->tvp_seqc; 3337 3338 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3339 3340 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3341 return (cache_fplookup_final_withparent(fpl)); 3342 3343 tvs = vget_prep_smr(tvp); 3344 if (tvs == VGET_NONE) { 3345 return (cache_fpl_partial(fpl)); 3346 } 3347 3348 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3349 cache_fpl_smr_exit(fpl); 3350 vget_abort(tvp, tvs); 3351 return (cache_fpl_aborted(fpl)); 3352 } 3353 3354 cache_fpl_smr_exit(fpl); 3355 return (cache_fplookup_final_child(fpl, tvs)); 3356 } 3357 3358 static int 3359 cache_fplookup_next(struct cache_fpl *fpl) 3360 { 3361 struct componentname *cnp; 3362 struct namecache *ncp; 3363 struct negstate *negstate; 3364 struct vnode *dvp, *tvp; 3365 u_char nc_flag; 3366 uint32_t hash; 3367 bool neg_hot; 3368 3369 cnp = fpl->cnp; 3370 dvp = fpl->dvp; 3371 3372 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 3373 fpl->tvp = dvp; 3374 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3375 if (seqc_in_modify(fpl->tvp_seqc)) { 3376 return (cache_fpl_aborted(fpl)); 3377 } 3378 return (0); 3379 } 3380 3381 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3382 3383 CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3384 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3385 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 3386 break; 3387 } 3388 3389 /* 3390 * If there is no entry we have to punt to the slow path to perform 3391 * actual lookup. Should there be nothing with this name a negative 3392 * entry will be created. 
3393 */ 3394 if (__predict_false(ncp == NULL)) { 3395 return (cache_fpl_partial(fpl)); 3396 } 3397 3398 tvp = atomic_load_ptr(&ncp->nc_vp); 3399 nc_flag = atomic_load_char(&ncp->nc_flag); 3400 if ((nc_flag & NCF_NEGATIVE) != 0) { 3401 negstate = NCP2NEGSTATE(ncp); 3402 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0); 3403 if (__predict_false(cache_ncp_invalid(ncp))) { 3404 return (cache_fpl_partial(fpl)); 3405 } 3406 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 3407 return (cache_fpl_partial(fpl)); 3408 } 3409 if (!neg_hot) { 3410 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 3411 } 3412 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 3413 ncp->nc_name); 3414 counter_u64_add(numneghits, 1); 3415 cache_fpl_smr_exit(fpl); 3416 return (cache_fpl_handled(fpl, ENOENT)); 3417 } 3418 3419 if (__predict_false(cache_ncp_invalid(ncp))) { 3420 return (cache_fpl_partial(fpl)); 3421 } 3422 3423 fpl->tvp = tvp; 3424 fpl->tvp_seqc = vn_seqc_read_any(tvp); 3425 if (seqc_in_modify(fpl->tvp_seqc)) { 3426 return (cache_fpl_partial(fpl)); 3427 } 3428 3429 if (!cache_fplookup_vnode_supported(tvp)) { 3430 return (cache_fpl_partial(fpl)); 3431 } 3432 3433 counter_u64_add(numposhits, 1); 3434 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 3435 return (0); 3436 } 3437 3438 static bool 3439 cache_fplookup_mp_supported(struct mount *mp) 3440 { 3441 3442 if (mp == NULL) 3443 return (false); 3444 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 3445 return (false); 3446 if ((mp->mnt_flag & MNT_UNION) != 0) 3447 return (false); 3448 return (true); 3449 } 3450 3451 /* 3452 * Walk up the mount stack (if any). 3453 * 3454 * Correctness is provided in the following ways: 3455 * - all vnodes are protected from freeing with SMR 3456 * - struct mount objects are type stable making them always safe to access 3457 * - stability of the particular mount is provided by busying it 3458 * - relationship between the vnode which is mounted on and the mount is 3459 * verified with the vnode sequence counter after busying 3460 * - association between root vnode of the mount and the mount is protected 3461 * by busy 3462 * 3463 * From that point on we can read the sequence counter of the root vnode 3464 * and get the next mount on the stack (if any) using the same protection. 3465 * 3466 * By the end of successful walk we are guaranteed the reached state was 3467 * indeed present at least at some point which matches the regular lookup. 
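 *
 * Condensed sketch of the loop below (error handling and the release of
 * the previously pinned mount omitted):
 *
 *	mp = atomic_load_ptr(&vp->v_mountedhere);
 *	for (;;) {
 *		vfs_op_thread_enter(mp);	// pin the mount
 *		if (!vn_seqc_consistent(vp, vp_seqc))
 *			bail;			// covered vnode changed
 *		vp = atomic_load_ptr(&mp->mnt_rootvnode);
 *		vp_seqc = vn_seqc_read_any(vp);
 *		mp = atomic_load_ptr(&vp->v_mountedhere);
 *		if (mp == NULL)
 *			break;			// reached the top
 *	}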
3468 */ 3469 static int __noinline 3470 cache_fplookup_climb_mount(struct cache_fpl *fpl) 3471 { 3472 struct mount *mp, *prev_mp; 3473 struct vnode *vp; 3474 seqc_t vp_seqc; 3475 3476 vp = fpl->tvp; 3477 vp_seqc = fpl->tvp_seqc; 3478 3479 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 3480 mp = atomic_load_ptr(&vp->v_mountedhere); 3481 if (mp == NULL) 3482 return (0); 3483 3484 prev_mp = NULL; 3485 for (;;) { 3486 if (!vfs_op_thread_enter(mp)) { 3487 if (prev_mp != NULL) 3488 vfs_op_thread_exit(prev_mp); 3489 return (cache_fpl_partial(fpl)); 3490 } 3491 if (prev_mp != NULL) 3492 vfs_op_thread_exit(prev_mp); 3493 if (!vn_seqc_consistent(vp, vp_seqc)) { 3494 vfs_op_thread_exit(mp); 3495 return (cache_fpl_partial(fpl)); 3496 } 3497 if (!cache_fplookup_mp_supported(mp)) { 3498 vfs_op_thread_exit(mp); 3499 return (cache_fpl_partial(fpl)); 3500 } 3501 vp = atomic_load_ptr(&mp->mnt_rootvnode); 3502 if (vp == NULL || VN_IS_DOOMED(vp)) { 3503 vfs_op_thread_exit(mp); 3504 return (cache_fpl_partial(fpl)); 3505 } 3506 vp_seqc = vn_seqc_read_any(vp); 3507 if (seqc_in_modify(vp_seqc)) { 3508 vfs_op_thread_exit(mp); 3509 return (cache_fpl_partial(fpl)); 3510 } 3511 prev_mp = mp; 3512 mp = atomic_load_ptr(&vp->v_mountedhere); 3513 if (mp == NULL) 3514 break; 3515 } 3516 3517 vfs_op_thread_exit(prev_mp); 3518 fpl->tvp = vp; 3519 fpl->tvp_seqc = vp_seqc; 3520 return (0); 3521 } 3522 3523 static bool 3524 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 3525 { 3526 struct mount *mp; 3527 struct vnode *vp; 3528 3529 vp = fpl->tvp; 3530 3531 /* 3532 * Hack: while this is a union, the pointer tends to be NULL so save on 3533 * a branch. 3534 */ 3535 mp = atomic_load_ptr(&vp->v_mountedhere); 3536 if (mp == NULL) 3537 return (false); 3538 if (vp->v_type == VDIR) 3539 return (true); 3540 return (false); 3541 } 3542 3543 /* 3544 * Parse the path. 3545 * 3546 * The code is mostly copy-pasted from regular lookup, see lookup(). 3547 * The structure is maintained along with comments for easier maintenance. 3548 * Deduplicating the code will become feasible after fast path lookup 3549 * becomes more feature-complete. 3550 */ 3551 static int 3552 cache_fplookup_parse(struct cache_fpl *fpl) 3553 { 3554 struct nameidata *ndp; 3555 struct componentname *cnp; 3556 char *cp; 3557 char *prev_ni_next; /* saved ndp->ni_next */ 3558 size_t prev_ni_pathlen; /* saved ndp->ni_pathlen */ 3559 3560 ndp = fpl->ndp; 3561 cnp = fpl->cnp; 3562 3563 /* 3564 * Search a new directory. 3565 * 3566 * The last component of the filename is left accessible via 3567 * cnp->cn_nameptr for callers that need the name. Callers needing 3568 * the name set the SAVENAME flag. When done, they assume 3569 * responsibility for freeing the pathname buffer. 3570 */ 3571 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 3572 continue; 3573 cnp->cn_namelen = cp - cnp->cn_nameptr; 3574 if (cnp->cn_namelen > NAME_MAX) { 3575 cache_fpl_smr_exit(fpl); 3576 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 3577 } 3578 prev_ni_pathlen = ndp->ni_pathlen; 3579 ndp->ni_pathlen -= cnp->cn_namelen; 3580 KASSERT(ndp->ni_pathlen <= PATH_MAX, 3581 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 3582 prev_ni_next = ndp->ni_next; 3583 ndp->ni_next = cp; 3584 3585 /* 3586 * Replace multiple slashes by a single slash and trailing slashes 3587 * by a null. This must be done before VOP_LOOKUP() because some 3588 * fs's don't know about trailing slashes. 
Remember if there were 3589 * trailing slashes to handle symlinks, existing non-directories 3590 * and non-existing files that won't be directories specially later. 3591 */ 3592 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 3593 cp++; 3594 ndp->ni_pathlen--; 3595 if (*cp == '\0') { 3596 /* 3597 * TODO 3598 * Regular lookup performs the following: 3599 * *ndp->ni_next = '\0'; 3600 * cnp->cn_flags |= TRAILINGSLASH; 3601 * 3602 * Which is problematic since it modifies data read 3603 * from userspace. Then if fast path lookup was to 3604 * abort we would have to either restore it or convey 3605 * the flag. Since this is a corner case just ignore 3606 * it for simplicity. 3607 */ 3608 return (cache_fpl_partial(fpl)); 3609 } 3610 } 3611 ndp->ni_next = cp; 3612 3613 cnp->cn_flags |= MAKEENTRY; 3614 3615 if (cnp->cn_namelen == 2 && 3616 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3617 cnp->cn_flags |= ISDOTDOT; 3618 else 3619 cnp->cn_flags &= ~ISDOTDOT; 3620 if (*ndp->ni_next == 0) 3621 cnp->cn_flags |= ISLASTCN; 3622 else 3623 cnp->cn_flags &= ~ISLASTCN; 3624 3625 /* 3626 * Check for degenerate name (e.g. / or "") 3627 * which is a way of talking about a directory, 3628 * e.g. like "/." or ".". 3629 * 3630 * TODO 3631 * Another corner case handled by the regular lookup 3632 */ 3633 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 3634 return (cache_fpl_partial(fpl)); 3635 } 3636 return (0); 3637 } 3638 3639 static void 3640 cache_fplookup_parse_advance(struct cache_fpl *fpl) 3641 { 3642 struct nameidata *ndp; 3643 struct componentname *cnp; 3644 3645 ndp = fpl->ndp; 3646 cnp = fpl->cnp; 3647 3648 cnp->cn_nameptr = ndp->ni_next; 3649 while (*cnp->cn_nameptr == '/') { 3650 cnp->cn_nameptr++; 3651 ndp->ni_pathlen--; 3652 } 3653 } 3654 3655 static int 3656 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 3657 { 3658 struct nameidata *ndp; 3659 struct componentname *cnp; 3660 struct mount *mp; 3661 int error; 3662 3663 error = CACHE_FPL_FAILED; 3664 ndp = fpl->ndp; 3665 ndp->ni_lcf = 0; 3666 cnp = fpl->cnp; 3667 cnp->cn_lkflags = LK_SHARED; 3668 if ((cnp->cn_flags & LOCKSHARED) == 0) 3669 cnp->cn_lkflags = LK_EXCLUSIVE; 3670 3671 cache_fpl_checkpoint(fpl, &fpl->snd); 3672 3673 fpl->dvp = dvp; 3674 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 3675 if (seqc_in_modify(fpl->dvp_seqc)) { 3676 cache_fpl_aborted(fpl); 3677 goto out; 3678 } 3679 mp = atomic_load_ptr(&fpl->dvp->v_mount); 3680 if (!cache_fplookup_mp_supported(mp)) { 3681 cache_fpl_aborted(fpl); 3682 goto out; 3683 } 3684 3685 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 3686 3687 for (;;) { 3688 error = cache_fplookup_parse(fpl); 3689 if (__predict_false(error != 0)) { 3690 break; 3691 } 3692 3693 if (cnp->cn_flags & ISDOTDOT) { 3694 error = cache_fpl_partial(fpl); 3695 break; 3696 } 3697 3698 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 3699 3700 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread); 3701 if (__predict_false(error != 0)) { 3702 switch (error) { 3703 case EAGAIN: 3704 case EOPNOTSUPP: /* can happen when racing against vgone */ 3705 cache_fpl_partial(fpl); 3706 break; 3707 default: 3708 /* 3709 * See the API contract for VOP_FPLOOKUP_VEXEC. 
3710 */ 3711 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 3712 error = cache_fpl_aborted(fpl); 3713 } else { 3714 cache_fpl_smr_exit(fpl); 3715 cache_fpl_handled(fpl, error); 3716 } 3717 break; 3718 } 3719 break; 3720 } 3721 3722 error = cache_fplookup_next(fpl); 3723 if (__predict_false(error != 0)) { 3724 break; 3725 } 3726 3727 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 3728 3729 if (cache_fplookup_need_climb_mount(fpl)) { 3730 error = cache_fplookup_climb_mount(fpl); 3731 if (__predict_false(error != 0)) { 3732 break; 3733 } 3734 } 3735 3736 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 3737 3738 if (cnp->cn_flags & ISLASTCN) { 3739 error = cache_fplookup_final(fpl); 3740 break; 3741 } 3742 3743 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 3744 error = cache_fpl_aborted(fpl); 3745 break; 3746 } 3747 3748 fpl->dvp = fpl->tvp; 3749 fpl->dvp_seqc = fpl->tvp_seqc; 3750 3751 cache_fplookup_parse_advance(fpl); 3752 cache_fpl_checkpoint(fpl, &fpl->snd); 3753 } 3754 out: 3755 switch (fpl->status) { 3756 case CACHE_FPL_STATUS_UNSET: 3757 __assert_unreachable(); 3758 break; 3759 case CACHE_FPL_STATUS_PARTIAL: 3760 cache_fpl_smr_assert_entered(fpl); 3761 return (cache_fplookup_partial_setup(fpl)); 3762 case CACHE_FPL_STATUS_ABORTED: 3763 if (fpl->in_smr) 3764 cache_fpl_smr_exit(fpl); 3765 return (CACHE_FPL_FAILED); 3766 case CACHE_FPL_STATUS_HANDLED: 3767 cache_fpl_smr_assert_not_entered(fpl); 3768 if (__predict_false(error != 0)) { 3769 ndp->ni_dvp = NULL; 3770 ndp->ni_vp = NULL; 3771 cache_fpl_cleanup_cnp(cnp); 3772 return (error); 3773 } 3774 ndp->ni_dvp = fpl->dvp; 3775 ndp->ni_vp = fpl->tvp; 3776 if (cnp->cn_flags & SAVENAME) 3777 cnp->cn_flags |= HASBUF; 3778 else 3779 cache_fpl_cleanup_cnp(cnp); 3780 return (error); 3781 } 3782 } 3783 3784 /* 3785 * Fast path lookup protected with SMR and sequence counters. 3786 * 3787 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 3788 * 3789 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria 3790 * outlined below. 3791 * 3792 * Traditional vnode lookup conceptually looks like this: 3793 * 3794 * vn_lock(current); 3795 * for (;;) { 3796 * next = find(); 3797 * vn_lock(next); 3798 * vn_unlock(current); 3799 * current = next; 3800 * if (last) 3801 * break; 3802 * } 3803 * return (current); 3804 * 3805 * Each jump to the next vnode is safe memory-wise and atomic with respect to 3806 * any modifications thanks to holding respective locks. 3807 * 3808 * The same guarantee can be provided with a combination of safe memory 3809 * reclamation and sequence counters instead. If all operations which affect 3810 * the relationship between the current vnode and the one we are looking for 3811 * also modify the counter, we can verify whether all the conditions held as 3812 * we made the jump. This includes things like permissions, mount points etc. 3813 * Counter modification is provided by enclosing relevant places in 3814 * vn_seqc_write_begin()/end() calls. 
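 *
 * A write side changing any such property brackets the change:
 *
 *	vn_seqc_write_begin(vp);
 *	// modify permissions, the covering mount, names, ...
 *	vn_seqc_write_end(vp);
 *
 * which makes concurrent fast path lookups spanning the modification
 * fail the checks below and fall back to the locked variant.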
3815 * 3816 * Thus this translates to: 3817 * 3818 * vfs_smr_enter(); 3819 * dvp_seqc = seqc_read_any(dvp); 3820 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode 3821 * abort(); 3822 * for (;;) { 3823 * tvp = find(); 3824 * tvp_seqc = seqc_read_any(tvp); 3825 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode 3826 * abort(); 3827 * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode 3828 * abort(); 3829 * dvp = tvp; // we know nothing of importance has changed 3830 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration 3831 * if (last) 3832 * break; 3833 * } 3834 * vget(); // secure the vnode 3835 * if (!seqc_consistent(tvp, tvp_seqc) // final check 3836 * abort(); 3837 * // at this point we know nothing has changed for any parent<->child pair 3838 * // as they were crossed during the lookup, meaning we matched the guarantee 3839 * // of the locked variant 3840 * return (tvp); 3841 * 3842 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: 3843 * - they are called while within vfs_smr protection which they must never exit 3844 * - EAGAIN can be returned to denote checking could not be performed, it is 3845 * always valid to return it 3846 * - if the sequence counter has not changed the result must be valid 3847 * - if the sequence counter has changed both false positives and false negatives 3848 * are permitted (since the result will be rejected later) 3849 * - for simple cases of unix permission checks vaccess_vexec_smr can be used 3850 * 3851 * Caveats to watch out for: 3852 * - vnodes are passed unlocked and unreferenced with nothing stopping 3853 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised 3854 * to use atomic_load_ptr to fetch it. 3855 * - the aforementioned object can also get freed, meaning absent other means it 3856 * should be protected with vfs_smr 3857 * - either safely checking permissions as they are modified or guaranteeing 3858 * their stability is left to the routine 3859 */ 3860 int 3861 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, 3862 struct pwd **pwdp) 3863 { 3864 struct cache_fpl fpl; 3865 struct pwd *pwd; 3866 struct vnode *dvp; 3867 struct componentname *cnp; 3868 struct nameidata_saved orig; 3869 int error; 3870 3871 *status = CACHE_FPL_STATUS_UNSET; 3872 bzero(&fpl, sizeof(fpl)); 3873 fpl.status = CACHE_FPL_STATUS_UNSET; 3874 fpl.ndp = ndp; 3875 fpl.cnp = &ndp->ni_cnd; 3876 MPASS(curthread == fpl.cnp->cn_thread); 3877 3878 if (!cache_can_fplookup(&fpl)) { 3879 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 3880 *status = fpl.status; 3881 return (EOPNOTSUPP); 3882 } 3883 3884 cache_fpl_checkpoint(&fpl, &orig); 3885 3886 cache_fpl_smr_enter(&fpl); 3887 pwd = pwd_get_smr(); 3888 fpl.pwd = pwd; 3889 ndp->ni_rootdir = pwd->pwd_rdir; 3890 ndp->ni_topdir = pwd->pwd_jdir; 3891 3892 cnp = fpl.cnp; 3893 cnp->cn_nameptr = cnp->cn_pnbuf; 3894 if (cnp->cn_pnbuf[0] == '/') { 3895 cache_fpl_handle_root(ndp, &dvp); 3896 } else { 3897 MPASS(ndp->ni_dirfd == AT_FDCWD); 3898 dvp = pwd->pwd_cdir; 3899 } 3900 3901 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); 3902 3903 error = cache_fplookup_impl(dvp, &fpl); 3904 cache_fpl_smr_assert_not_entered(&fpl); 3905 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 3906 3907 *status = fpl.status; 3908 switch (fpl.status) { 3909 case CACHE_FPL_STATUS_UNSET: 3910 __assert_unreachable(); 3911 break; 3912 case CACHE_FPL_STATUS_HANDLED: 3913 
SDT_PROBE3(vfs, namei, lookup, return, error, 3914 (error == 0 ? ndp->ni_vp : NULL), true); 3915 break; 3916 case CACHE_FPL_STATUS_PARTIAL: 3917 *pwdp = fpl.pwd; 3918 cache_fpl_restore(&fpl, &fpl.snd); 3919 break; 3920 case CACHE_FPL_STATUS_ABORTED: 3921 cache_fpl_restore(&fpl, &orig); 3922 break; 3923 } 3924 return (error); 3925 } 3926
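/*
 * Example of the opt-in described above, as a sketch only: the names
 * examplefs_fplookup_vexec(), struct examplefs_node and its fields are
 * hypothetical placeholders, not an existing filesystem's code.
 *
 *	static int
 *	examplefs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct vnode *vp = v->a_vp;
 *		struct examplefs_node *enp;
 *
 *		enp = atomic_load_ptr(&vp->v_data);
 *		if (__predict_false(enp == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(enp->en_mode, enp->en_uid,
 *		    enp->en_gid, v->a_cred));
 *	}
 *
 * combined with setting MNTK_FPLOOKUP on the mount once the filesystem
 * is ready to honor the contract:
 *
 *	MNT_ILOCK(mp);
 *	mp->mnt_kern_flag |= MNTK_FPLOOKUP;
 *	MNT_IUNLOCK(mp);
 */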