1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ddb.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/capsicum.h> 46 #include <sys/counter.h> 47 #include <sys/filedesc.h> 48 #include <sys/fnv_hash.h> 49 #include <sys/kernel.h> 50 #include <sys/ktr.h> 51 #include <sys/lock.h> 52 #include <sys/malloc.h> 53 #include <sys/fcntl.h> 54 #include <sys/mount.h> 55 #include <sys/namei.h> 56 #include <sys/proc.h> 57 #include <sys/rwlock.h> 58 #include <sys/seqc.h> 59 #include <sys/sdt.h> 60 #include <sys/smr.h> 61 #include <sys/smp.h> 62 #include <sys/syscallsubr.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysproto.h> 65 #include <sys/vnode.h> 66 #include <ck_queue.h> 67 #ifdef KTRACE 68 #include <sys/ktrace.h> 69 #endif 70 71 #include <sys/capsicum.h> 72 73 #include <security/audit/audit.h> 74 #include <security/mac/mac_framework.h> 75 76 #ifdef DDB 77 #include <ddb/ddb.h> 78 #endif 79 80 #include <vm/uma.h> 81 82 SDT_PROVIDER_DECLARE(vfs); 83 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 84 "struct vnode *"); 85 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 86 "char *"); 87 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 88 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 89 "char *", "struct vnode *"); 90 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 91 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 92 "struct vnode *", "char *"); 93 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 94 "struct vnode *"); 95 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 96 "struct vnode *", "char *"); 97 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 98 "char *"); 99 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 100 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 101 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 102 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 103 "struct vnode *"); 104 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 105 "char *"); 106 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *", 107 "char *"); 108 109 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); 110 SDT_PROBE_DECLARE(vfs, namei, lookup, entry); 111 SDT_PROBE_DECLARE(vfs, namei, lookup, return); 112 113 /* 114 * This structure describes the elements in the cache of recent 115 * names looked up by namei. 116 */ 117 struct negstate { 118 u_char neg_flag; 119 }; 120 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), 121 "the state must fit in a union with a pointer without growing it"); 122 123 struct namecache { 124 CK_LIST_ENTRY(namecache) nc_hash;/* hash chain */ 125 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 126 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 127 struct vnode *nc_dvp; /* vnode of parent of name */ 128 union { 129 struct vnode *nu_vp; /* vnode the name refers to */ 130 struct negstate nu_neg;/* negative entry state */ 131 } n_un; 132 u_char nc_flag; /* flag bits */ 133 u_char nc_nlen; /* length of name */ 134 char nc_name[0]; /* segment name + nul */ 135 }; 136 137 /* 138 * struct namecache_ts repeats struct namecache layout up to the 139 * nc_nlen member. 
140 * struct namecache_ts is used in place of struct namecache when time(s) need 141 * to be stored. The nc_dotdottime field is used when a cache entry is mapping 142 * both a non-dotdot directory name plus dotdot for the directory's 143 * parent. 144 */ 145 struct namecache_ts { 146 struct timespec nc_time; /* timespec provided by fs */ 147 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 148 int nc_ticks; /* ticks value when entry was added */ 149 struct namecache nc_nc; 150 }; 151 152 #define nc_vp n_un.nu_vp 153 #define nc_neg n_un.nu_neg 154 155 /* 156 * Flags in namecache.nc_flag 157 */ 158 #define NCF_WHITE 0x01 159 #define NCF_ISDOTDOT 0x02 160 #define NCF_TS 0x04 161 #define NCF_DTS 0x08 162 #define NCF_DVDROP 0x10 163 #define NCF_NEGATIVE 0x20 164 #define NCF_INVALID 0x40 165 166 /* 167 * Flags in negstate.neg_flag 168 */ 169 #define NEG_HOT 0x01 170 171 /* 172 * Mark an entry as invalid. 173 * 174 * This is called before it starts getting deconstructed. 175 */ 176 static void 177 cache_ncp_invalidate(struct namecache *ncp) 178 { 179 180 KASSERT((ncp->nc_flag & NCF_INVALID) == 0, 181 ("%s: entry %p already invalid", __func__, ncp)); 182 ncp->nc_flag |= NCF_INVALID; 183 atomic_thread_fence_rel(); 184 } 185 186 /* 187 * Verify validity of an entry. 188 * 189 * All places which elide locks are supposed to call this after they are 190 * done with reading from an entry. 191 */ 192 static bool 193 cache_ncp_invalid(struct namecache *ncp) 194 { 195 196 atomic_thread_fence_acq(); 197 return ((ncp->nc_flag & NCF_INVALID) != 0); 198 } 199 200 /* 201 * Name caching works as follows: 202 * 203 * Names found by directory scans are retained in a cache 204 * for future reference. It is managed LRU, so frequently 205 * used names will hang around. Cache is indexed by hash value 206 * obtained from (dvp, name) where dvp refers to the directory 207 * containing name. 208 * 209 * If it is a "negative" entry, (i.e. for a name that is known NOT to 210 * exist) the vnode pointer will be NULL. 211 * 212 * Upon reaching the last segment of a path, if the reference 213 * is for DELETE, or NOCACHE is set (rewrite), and the 214 * name is located in the cache, it will be dropped. 215 * 216 * These locks are used (in the order in which they can be taken): 217 * NAME TYPE ROLE 218 * vnodelock mtx vnode lists and v_cache_dd field protection 219 * bucketlock rwlock for access to given set of hash buckets 220 * neglist mtx negative entry LRU management 221 * 222 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread 223 * shrinking the LRU list. 224 * 225 * It is legal to take multiple vnodelock and bucketlock locks. The locking 226 * order is lower address first. Both are recursive. 227 * 228 * "." lookups are lockless. 229 * 230 * ".." and vnode -> name lookups require vnodelock. 231 * 232 * name -> vnode lookup requires the relevant bucketlock to be held for reading. 233 * 234 * Insertions and removals of entries require involved vnodes and bucketlocks 235 * to be write-locked to prevent other threads from seeing the entry. 236 * 237 * Some lookups result in removal of the found entry (e.g. getting rid of a 238 * negative entry with the intent to create a positive one), which poses a 239 * problem when multiple threads reach the state. Similarly, two different 240 * threads can purge two different vnodes and try to remove the same name. 241 * 242 * If the already held vnode lock is lower than the second required lock, we 243 * can just take the other lock. 
However, in the opposite case, this could 244 * deadlock. As such, this is resolved by trylocking and if that fails unlocking 245 * the first node, locking everything in order and revalidating the state. 246 */ 247 248 VFS_SMR_DECLARE; 249 250 /* 251 * Structures associated with name caching. 252 */ 253 #define NCHHASH(hash) \ 254 (&nchashtbl[(hash) & nchash]) 255 static __read_mostly CK_LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 256 static u_long __read_mostly nchash; /* size of hash table */ 257 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 258 "Size of namecache hash table"); 259 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ 260 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 261 "Ratio of negative namecache entries"); 262 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 263 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 264 u_int ncsizefactor = 2; 265 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, 266 "Size factor for namecache"); 267 static u_int __read_mostly ncpurgeminvnodes; 268 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0, 269 "Number of vnodes below which purgevfs ignores the request"); 270 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ 271 272 struct nchstats nchstats; /* cache effectiveness statistics */ 273 274 static struct mtx __exclusive_cache_line ncneg_shrink_lock; 275 276 struct neglist { 277 struct mtx nl_lock; 278 TAILQ_HEAD(, namecache) nl_list; 279 } __aligned(CACHE_LINE_SIZE); 280 281 static struct neglist __read_mostly *neglists; 282 static struct neglist ncneg_hot; 283 static u_long numhotneg; 284 285 #define numneglists (ncneghash + 1) 286 static u_int __read_mostly ncneghash; 287 static inline struct neglist * 288 NCP2NEGLIST(struct namecache *ncp) 289 { 290 291 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 292 } 293 294 static inline struct negstate * 295 NCP2NEGSTATE(struct namecache *ncp) 296 { 297 298 MPASS(ncp->nc_flag & NCF_NEGATIVE); 299 return (&ncp->nc_neg); 300 } 301 302 #define numbucketlocks (ncbuckethash + 1) 303 static u_int __read_mostly ncbuckethash; 304 static struct rwlock_padalign __read_mostly *bucketlocks; 305 #define HASH2BUCKETLOCK(hash) \ 306 ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)])) 307 308 #define numvnodelocks (ncvnodehash + 1) 309 static u_int __read_mostly ncvnodehash; 310 static struct mtx __read_mostly *vnodelocks; 311 static inline struct mtx * 312 VP2VNODELOCK(struct vnode *vp) 313 { 314 315 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 316 } 317 318 /* 319 * UMA zones for the VFS cache. 320 * 321 * The small cache is used for entries with short names, which are the 322 * most common. The large cache is used for entries which are too big to 323 * fit in the small cache. 
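 *
 * A note on the split (see CACHE_PATH_CUTOFF and cache_alloc() below):
 * names of up to CACHE_PATH_CUTOFF bytes go to the small zones, longer
 * ones to the large zones, and the "_ts" variants are used when the
 * filesystem supplies timestamps, i.e. when the entry is backed by
 * struct namecache_ts rather than plain struct namecache.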
324 */ 325 static uma_zone_t __read_mostly cache_zone_small; 326 static uma_zone_t __read_mostly cache_zone_small_ts; 327 static uma_zone_t __read_mostly cache_zone_large; 328 static uma_zone_t __read_mostly cache_zone_large_ts; 329 330 #define CACHE_PATH_CUTOFF 35 331 332 static struct namecache * 333 cache_alloc(int len, int ts) 334 { 335 struct namecache_ts *ncp_ts; 336 struct namecache *ncp; 337 338 if (__predict_false(ts)) { 339 if (len <= CACHE_PATH_CUTOFF) 340 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); 341 else 342 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); 343 ncp = &ncp_ts->nc_nc; 344 } else { 345 if (len <= CACHE_PATH_CUTOFF) 346 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); 347 else 348 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); 349 } 350 return (ncp); 351 } 352 353 static void 354 cache_free(struct namecache *ncp) 355 { 356 struct namecache_ts *ncp_ts; 357 358 if (ncp == NULL) 359 return; 360 if ((ncp->nc_flag & NCF_DVDROP) != 0) 361 vdrop(ncp->nc_dvp); 362 if (__predict_false(ncp->nc_flag & NCF_TS)) { 363 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 364 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 365 uma_zfree_smr(cache_zone_small_ts, ncp_ts); 366 else 367 uma_zfree_smr(cache_zone_large_ts, ncp_ts); 368 } else { 369 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 370 uma_zfree_smr(cache_zone_small, ncp); 371 else 372 uma_zfree_smr(cache_zone_large, ncp); 373 } 374 } 375 376 static void 377 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 378 { 379 struct namecache_ts *ncp_ts; 380 381 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 382 (tsp == NULL && ticksp == NULL), 383 ("No NCF_TS")); 384 385 if (tsp == NULL && ticksp == NULL) 386 return; 387 388 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 389 if (tsp != NULL) 390 *tsp = ncp_ts->nc_time; 391 if (ticksp != NULL) 392 *ticksp = ncp_ts->nc_ticks; 393 } 394 395 #ifdef DEBUG_CACHE 396 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 397 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 398 "VFS namecache enabled"); 399 #endif 400 401 /* Export size information to userland */ 402 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 403 sizeof(struct namecache), "sizeof(struct namecache)"); 404 405 /* 406 * The new name cache statistics 407 */ 408 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 409 "Name cache statistics"); 410 #define STATNODE_ULONG(name, descr) \ 411 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); 412 #define STATNODE_COUNTER(name, descr) \ 413 static COUNTER_U64_DEFINE_EARLY(name); \ 414 SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \ 415 descr); 416 STATNODE_ULONG(numneg, "Number of negative cache entries"); 417 STATNODE_ULONG(numcache, "Number of cache entries"); 418 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held"); 419 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit"); 420 STATNODE_COUNTER(dothits, "Number of '.' hits"); 421 STATNODE_COUNTER(dotdothits, "Number of '..' 
hits"); 422 STATNODE_COUNTER(nummiss, "Number of cache misses"); 423 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 424 STATNODE_COUNTER(numposzaps, 425 "Number of cache hits (positive) we do not want to cache"); 426 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 427 STATNODE_COUNTER(numnegzaps, 428 "Number of cache hits (negative) we do not want to cache"); 429 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 430 /* These count for vn_getcwd(), too. */ 431 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 432 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 433 STATNODE_COUNTER(numfullpathfail2, 434 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 435 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 436 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 437 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 438 "Number of successful removals after relocking"); 439 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 440 "Number of times zap_and_exit failed to lock"); 441 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 442 "Number of times zap_and_exit failed to lock"); 443 static long cache_lock_vnodes_cel_3_failures; 444 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 445 "Number of times 3-way vnode locking failed"); 446 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 447 STATNODE_COUNTER(numneg_evicted, 448 "Number of negative entries evicted when adding a new entry"); 449 STATNODE_COUNTER(shrinking_skipped, 450 "Number of times shrinking was already in progress"); 451 452 static void cache_zap_locked(struct namecache *ncp); 453 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 454 char **freebuf, size_t *buflen); 455 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 456 char *buf, char **retbuf, size_t *buflen); 457 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 458 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 459 460 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 461 462 static int cache_yield; 463 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, 464 "Number of times cache called yield"); 465 466 static void __noinline 467 cache_maybe_yield(void) 468 { 469 470 if (should_yield()) { 471 cache_yield++; 472 kern_yield(PRI_USER); 473 } 474 } 475 476 static inline void 477 cache_assert_vlp_locked(struct mtx *vlp) 478 { 479 480 if (vlp != NULL) 481 mtx_assert(vlp, MA_OWNED); 482 } 483 484 static inline void 485 cache_assert_vnode_locked(struct vnode *vp) 486 { 487 struct mtx *vlp; 488 489 vlp = VP2VNODELOCK(vp); 490 cache_assert_vlp_locked(vlp); 491 } 492 493 static uint32_t 494 cache_get_hash(char *name, u_char len, struct vnode *dvp) 495 { 496 uint32_t hash; 497 498 hash = fnv_32_buf(name, len, FNV1_32_INIT); 499 hash = fnv_32_buf(&dvp, sizeof(dvp), hash); 500 return (hash); 501 } 502 503 static inline struct rwlock * 504 NCP2BUCKETLOCK(struct namecache *ncp) 505 { 506 uint32_t hash; 507 508 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 509 return (HASH2BUCKETLOCK(hash)); 510 } 511 512 #ifdef INVARIANTS 513 static void 514 cache_assert_bucket_locked(struct namecache *ncp, int mode) 515 { 516 struct rwlock *blp; 517 518 blp = 
NCP2BUCKETLOCK(ncp); 519 rw_assert(blp, mode); 520 } 521 #else 522 #define cache_assert_bucket_locked(x, y) do { } while (0) 523 #endif 524 525 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 526 static void 527 _cache_sort_vnodes(void **p1, void **p2) 528 { 529 void *tmp; 530 531 MPASS(*p1 != NULL || *p2 != NULL); 532 533 if (*p1 > *p2) { 534 tmp = *p2; 535 *p2 = *p1; 536 *p1 = tmp; 537 } 538 } 539 540 static void 541 cache_lock_all_buckets(void) 542 { 543 u_int i; 544 545 for (i = 0; i < numbucketlocks; i++) 546 rw_wlock(&bucketlocks[i]); 547 } 548 549 static void 550 cache_unlock_all_buckets(void) 551 { 552 u_int i; 553 554 for (i = 0; i < numbucketlocks; i++) 555 rw_wunlock(&bucketlocks[i]); 556 } 557 558 static void 559 cache_lock_all_vnodes(void) 560 { 561 u_int i; 562 563 for (i = 0; i < numvnodelocks; i++) 564 mtx_lock(&vnodelocks[i]); 565 } 566 567 static void 568 cache_unlock_all_vnodes(void) 569 { 570 u_int i; 571 572 for (i = 0; i < numvnodelocks; i++) 573 mtx_unlock(&vnodelocks[i]); 574 } 575 576 static int 577 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 578 { 579 580 cache_sort_vnodes(&vlp1, &vlp2); 581 582 if (vlp1 != NULL) { 583 if (!mtx_trylock(vlp1)) 584 return (EAGAIN); 585 } 586 if (!mtx_trylock(vlp2)) { 587 if (vlp1 != NULL) 588 mtx_unlock(vlp1); 589 return (EAGAIN); 590 } 591 592 return (0); 593 } 594 595 static void 596 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 597 { 598 599 MPASS(vlp1 != NULL || vlp2 != NULL); 600 MPASS(vlp1 <= vlp2); 601 602 if (vlp1 != NULL) 603 mtx_lock(vlp1); 604 if (vlp2 != NULL) 605 mtx_lock(vlp2); 606 } 607 608 static void 609 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 610 { 611 612 MPASS(vlp1 != NULL || vlp2 != NULL); 613 614 if (vlp1 != NULL) 615 mtx_unlock(vlp1); 616 if (vlp2 != NULL) 617 mtx_unlock(vlp2); 618 } 619 620 static int 621 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 622 { 623 struct nchstats snap; 624 625 if (req->oldptr == NULL) 626 return (SYSCTL_OUT(req, 0, sizeof(snap))); 627 628 snap = nchstats; 629 snap.ncs_goodhits = counter_u64_fetch(numposhits); 630 snap.ncs_neghits = counter_u64_fetch(numneghits); 631 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 632 counter_u64_fetch(numnegzaps); 633 snap.ncs_miss = counter_u64_fetch(nummisszap) + 634 counter_u64_fetch(nummiss); 635 636 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 637 } 638 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 639 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 640 "VFS cache effectiveness statistics"); 641 642 #ifdef DIAGNOSTIC 643 /* 644 * Grab an atomic snapshot of the name cache hash chain lengths 645 */ 646 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 647 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 648 "hash table stats"); 649 650 static int 651 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 652 { 653 struct nchashhead *ncpp; 654 struct namecache *ncp; 655 int i, error, n_nchash, *cntbuf; 656 657 retry: 658 n_nchash = nchash + 1; /* nchash is max index, not count */ 659 if (req->oldptr == NULL) 660 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 661 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 662 cache_lock_all_buckets(); 663 if (n_nchash != nchash + 1) { 664 cache_unlock_all_buckets(); 665 free(cntbuf, M_TEMP); 666 goto retry; 667 } 668 /* Scan hash tables counting entries */ 669 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 670 CK_LIST_FOREACH(ncp, ncpp, nc_hash) 671 cntbuf[i]++; 672 cache_unlock_all_buckets(); 673 for 
(error = 0, i = 0; i < n_nchash; i++) 674 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 675 break; 676 free(cntbuf, M_TEMP); 677 return (error); 678 } 679 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 680 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 681 "nchash chain lengths"); 682 683 static int 684 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 685 { 686 int error; 687 struct nchashhead *ncpp; 688 struct namecache *ncp; 689 int n_nchash; 690 int count, maxlength, used, pct; 691 692 if (!req->oldptr) 693 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 694 695 cache_lock_all_buckets(); 696 n_nchash = nchash + 1; /* nchash is max index, not count */ 697 used = 0; 698 maxlength = 0; 699 700 /* Scan hash tables for applicable entries */ 701 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 702 count = 0; 703 CK_LIST_FOREACH(ncp, ncpp, nc_hash) { 704 count++; 705 } 706 if (count) 707 used++; 708 if (maxlength < count) 709 maxlength = count; 710 } 711 n_nchash = nchash + 1; 712 cache_unlock_all_buckets(); 713 pct = (used * 100) / (n_nchash / 100); 714 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 715 if (error) 716 return (error); 717 error = SYSCTL_OUT(req, &used, sizeof(used)); 718 if (error) 719 return (error); 720 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 721 if (error) 722 return (error); 723 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 724 if (error) 725 return (error); 726 return (0); 727 } 728 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 729 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 730 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 731 #endif 732 733 /* 734 * Negative entries management 735 * 736 * A variation of LRU scheme is used. New entries are hashed into one of 737 * numneglists cold lists. Entries get promoted to the hot list on first hit. 738 * 739 * The shrinker will demote hot list head and evict from the cold list in a 740 * round-robin manner. 
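 *
 * Promotion to the hot list is done by cache_negative_hit() below.
 * cache_negative_zap_one() acts as the shrinker: it demotes the head
 * of the hot list back to its cold list and then evicts the head of
 * one of the cold lists, advancing to the next cold list on every
 * call.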
741 */ 742 static void 743 cache_negative_init(struct namecache *ncp) 744 { 745 struct negstate *negstate; 746 747 ncp->nc_flag |= NCF_NEGATIVE; 748 negstate = NCP2NEGSTATE(ncp); 749 negstate->neg_flag = 0; 750 } 751 752 static void 753 cache_negative_hit(struct namecache *ncp) 754 { 755 struct neglist *neglist; 756 struct negstate *negstate; 757 758 negstate = NCP2NEGSTATE(ncp); 759 if ((negstate->neg_flag & NEG_HOT) != 0) 760 return; 761 neglist = NCP2NEGLIST(ncp); 762 mtx_lock(&ncneg_hot.nl_lock); 763 mtx_lock(&neglist->nl_lock); 764 if ((negstate->neg_flag & NEG_HOT) == 0) { 765 numhotneg++; 766 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 767 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 768 negstate->neg_flag |= NEG_HOT; 769 } 770 mtx_unlock(&neglist->nl_lock); 771 mtx_unlock(&ncneg_hot.nl_lock); 772 } 773 774 static void 775 cache_negative_insert(struct namecache *ncp) 776 { 777 struct neglist *neglist; 778 779 MPASS(ncp->nc_flag & NCF_NEGATIVE); 780 cache_assert_bucket_locked(ncp, RA_WLOCKED); 781 neglist = NCP2NEGLIST(ncp); 782 mtx_lock(&neglist->nl_lock); 783 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 784 mtx_unlock(&neglist->nl_lock); 785 atomic_add_rel_long(&numneg, 1); 786 } 787 788 static void 789 cache_negative_remove(struct namecache *ncp) 790 { 791 struct neglist *neglist; 792 struct negstate *negstate; 793 bool hot_locked = false; 794 bool list_locked = false; 795 796 cache_assert_bucket_locked(ncp, RA_WLOCKED); 797 neglist = NCP2NEGLIST(ncp); 798 negstate = NCP2NEGSTATE(ncp); 799 if ((negstate->neg_flag & NEG_HOT) != 0) { 800 hot_locked = true; 801 mtx_lock(&ncneg_hot.nl_lock); 802 if ((negstate->neg_flag & NEG_HOT) == 0) { 803 list_locked = true; 804 mtx_lock(&neglist->nl_lock); 805 } 806 } else { 807 list_locked = true; 808 mtx_lock(&neglist->nl_lock); 809 /* 810 * We may be racing against promotion in lockless lookup. 
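 * If the entry turned hot while we only held the cold list lock, drop
 * that lock and retake both locks with the hot list lock first, the
 * same order used by cache_negative_hit().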
811 */ 812 if ((negstate->neg_flag & NEG_HOT) != 0) { 813 mtx_unlock(&neglist->nl_lock); 814 hot_locked = true; 815 mtx_lock(&ncneg_hot.nl_lock); 816 mtx_lock(&neglist->nl_lock); 817 } 818 } 819 if ((negstate->neg_flag & NEG_HOT) != 0) { 820 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); 821 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 822 numhotneg--; 823 } else { 824 mtx_assert(&neglist->nl_lock, MA_OWNED); 825 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 826 } 827 if (list_locked) 828 mtx_unlock(&neglist->nl_lock); 829 if (hot_locked) 830 mtx_unlock(&ncneg_hot.nl_lock); 831 atomic_subtract_rel_long(&numneg, 1); 832 } 833 834 static void 835 cache_negative_shrink_select(struct namecache **ncpp, 836 struct neglist **neglistpp) 837 { 838 struct neglist *neglist; 839 struct namecache *ncp; 840 static u_int cycle; 841 u_int i; 842 843 *ncpp = ncp = NULL; 844 845 for (i = 0; i < numneglists; i++) { 846 neglist = &neglists[(cycle + i) % numneglists]; 847 if (TAILQ_FIRST(&neglist->nl_list) == NULL) 848 continue; 849 mtx_lock(&neglist->nl_lock); 850 ncp = TAILQ_FIRST(&neglist->nl_list); 851 if (ncp != NULL) 852 break; 853 mtx_unlock(&neglist->nl_lock); 854 } 855 856 *neglistpp = neglist; 857 *ncpp = ncp; 858 cycle++; 859 } 860 861 static void 862 cache_negative_zap_one(void) 863 { 864 struct namecache *ncp, *ncp2; 865 struct neglist *neglist; 866 struct negstate *negstate; 867 struct mtx *dvlp; 868 struct rwlock *blp; 869 870 if (mtx_owner(&ncneg_shrink_lock) != NULL || 871 !mtx_trylock(&ncneg_shrink_lock)) { 872 counter_u64_add(shrinking_skipped, 1); 873 return; 874 } 875 876 mtx_lock(&ncneg_hot.nl_lock); 877 ncp = TAILQ_FIRST(&ncneg_hot.nl_list); 878 if (ncp != NULL) { 879 neglist = NCP2NEGLIST(ncp); 880 negstate = NCP2NEGSTATE(ncp); 881 mtx_lock(&neglist->nl_lock); 882 MPASS((negstate->neg_flag & NEG_HOT) != 0); 883 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 884 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 885 negstate->neg_flag &= ~NEG_HOT; 886 numhotneg--; 887 mtx_unlock(&neglist->nl_lock); 888 } 889 mtx_unlock(&ncneg_hot.nl_lock); 890 891 cache_negative_shrink_select(&ncp, &neglist); 892 893 mtx_unlock(&ncneg_shrink_lock); 894 if (ncp == NULL) 895 return; 896 897 MPASS(ncp->nc_flag & NCF_NEGATIVE); 898 dvlp = VP2VNODELOCK(ncp->nc_dvp); 899 blp = NCP2BUCKETLOCK(ncp); 900 mtx_unlock(&neglist->nl_lock); 901 mtx_lock(dvlp); 902 rw_wlock(blp); 903 /* 904 * Enter SMR to safely check the negative list. 905 * Even if the found pointer matches, the entry may now be reallocated 906 * and used by a different vnode. 907 */ 908 vfs_smr_enter(); 909 ncp2 = TAILQ_FIRST(&neglist->nl_list); 910 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || 911 blp != NCP2BUCKETLOCK(ncp2)) { 912 vfs_smr_exit(); 913 ncp = NULL; 914 } else { 915 vfs_smr_exit(); 916 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp, 917 ncp->nc_name); 918 cache_zap_locked(ncp); 919 counter_u64_add(numneg_evicted, 1); 920 } 921 rw_wunlock(blp); 922 mtx_unlock(dvlp); 923 cache_free(ncp); 924 } 925 926 /* 927 * cache_zap_locked(): 928 * 929 * Removes a namecache entry from cache, whether it contains an actual 930 * pointer to a vnode or if it is just a negative cache entry. 931 */ 932 static void 933 cache_zap_locked(struct namecache *ncp) 934 { 935 936 if (!(ncp->nc_flag & NCF_NEGATIVE)) 937 cache_assert_vnode_locked(ncp->nc_vp); 938 cache_assert_vnode_locked(ncp->nc_dvp); 939 cache_assert_bucket_locked(ncp, RA_WLOCKED); 940 941 CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, 942 (ncp->nc_flag & NCF_NEGATIVE) ? 
NULL : ncp->nc_vp); 943 944 cache_ncp_invalidate(ncp); 945 946 CK_LIST_REMOVE(ncp, nc_hash); 947 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 948 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 949 ncp->nc_name, ncp->nc_vp); 950 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 951 if (ncp == ncp->nc_vp->v_cache_dd) 952 ncp->nc_vp->v_cache_dd = NULL; 953 } else { 954 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 955 ncp->nc_name); 956 cache_negative_remove(ncp); 957 } 958 if (ncp->nc_flag & NCF_ISDOTDOT) { 959 if (ncp == ncp->nc_dvp->v_cache_dd) 960 ncp->nc_dvp->v_cache_dd = NULL; 961 } else { 962 LIST_REMOVE(ncp, nc_src); 963 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 964 ncp->nc_flag |= NCF_DVDROP; 965 counter_u64_add(numcachehv, -1); 966 } 967 } 968 atomic_subtract_rel_long(&numcache, 1); 969 } 970 971 static void 972 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 973 { 974 struct rwlock *blp; 975 976 MPASS(ncp->nc_dvp == vp); 977 MPASS(ncp->nc_flag & NCF_NEGATIVE); 978 cache_assert_vnode_locked(vp); 979 980 blp = NCP2BUCKETLOCK(ncp); 981 rw_wlock(blp); 982 cache_zap_locked(ncp); 983 rw_wunlock(blp); 984 } 985 986 static bool 987 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 988 struct mtx **vlpp) 989 { 990 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 991 struct rwlock *blp; 992 993 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 994 cache_assert_vnode_locked(vp); 995 996 if (ncp->nc_flag & NCF_NEGATIVE) { 997 if (*vlpp != NULL) { 998 mtx_unlock(*vlpp); 999 *vlpp = NULL; 1000 } 1001 cache_zap_negative_locked_vnode_kl(ncp, vp); 1002 return (true); 1003 } 1004 1005 pvlp = VP2VNODELOCK(vp); 1006 blp = NCP2BUCKETLOCK(ncp); 1007 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1008 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1009 1010 if (*vlpp == vlp1 || *vlpp == vlp2) { 1011 to_unlock = *vlpp; 1012 *vlpp = NULL; 1013 } else { 1014 if (*vlpp != NULL) { 1015 mtx_unlock(*vlpp); 1016 *vlpp = NULL; 1017 } 1018 cache_sort_vnodes(&vlp1, &vlp2); 1019 if (vlp1 == pvlp) { 1020 mtx_lock(vlp2); 1021 to_unlock = vlp2; 1022 } else { 1023 if (!mtx_trylock(vlp1)) 1024 goto out_relock; 1025 to_unlock = vlp1; 1026 } 1027 } 1028 rw_wlock(blp); 1029 cache_zap_locked(ncp); 1030 rw_wunlock(blp); 1031 if (to_unlock != NULL) 1032 mtx_unlock(to_unlock); 1033 return (true); 1034 1035 out_relock: 1036 mtx_unlock(vlp2); 1037 mtx_lock(vlp1); 1038 mtx_lock(vlp2); 1039 MPASS(*vlpp == NULL); 1040 *vlpp = vlp1; 1041 return (false); 1042 } 1043 1044 static int __noinline 1045 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) 1046 { 1047 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1048 struct rwlock *blp; 1049 int error = 0; 1050 1051 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1052 cache_assert_vnode_locked(vp); 1053 1054 pvlp = VP2VNODELOCK(vp); 1055 if (ncp->nc_flag & NCF_NEGATIVE) { 1056 cache_zap_negative_locked_vnode_kl(ncp, vp); 1057 goto out; 1058 } 1059 1060 blp = NCP2BUCKETLOCK(ncp); 1061 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1062 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1063 cache_sort_vnodes(&vlp1, &vlp2); 1064 if (vlp1 == pvlp) { 1065 mtx_lock(vlp2); 1066 to_unlock = vlp2; 1067 } else { 1068 if (!mtx_trylock(vlp1)) { 1069 error = EAGAIN; 1070 goto out; 1071 } 1072 to_unlock = vlp1; 1073 } 1074 rw_wlock(blp); 1075 cache_zap_locked(ncp); 1076 rw_wunlock(blp); 1077 mtx_unlock(to_unlock); 1078 out: 1079 mtx_unlock(pvlp); 1080 return (error); 1081 } 1082 1083 /* 1084 * If trylocking failed we can get here. 
We know enough to take all needed locks 1085 * in the right order and re-lookup the entry. 1086 */ 1087 static int 1088 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1089 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1090 struct rwlock *blp) 1091 { 1092 struct namecache *rncp; 1093 1094 cache_assert_bucket_locked(ncp, RA_UNLOCKED); 1095 1096 cache_sort_vnodes(&dvlp, &vlp); 1097 cache_lock_vnodes(dvlp, vlp); 1098 rw_wlock(blp); 1099 CK_LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1100 if (rncp == ncp && rncp->nc_dvp == dvp && 1101 rncp->nc_nlen == cnp->cn_namelen && 1102 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1103 break; 1104 } 1105 if (rncp != NULL) { 1106 cache_zap_locked(rncp); 1107 rw_wunlock(blp); 1108 cache_unlock_vnodes(dvlp, vlp); 1109 counter_u64_add(zap_and_exit_bucket_relock_success, 1); 1110 return (0); 1111 } 1112 1113 rw_wunlock(blp); 1114 cache_unlock_vnodes(dvlp, vlp); 1115 return (EAGAIN); 1116 } 1117 1118 static int __noinline 1119 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1120 uint32_t hash, struct rwlock *blp) 1121 { 1122 struct mtx *dvlp, *vlp; 1123 struct vnode *dvp; 1124 1125 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1126 1127 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1128 vlp = NULL; 1129 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1130 vlp = VP2VNODELOCK(ncp->nc_vp); 1131 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1132 cache_zap_locked(ncp); 1133 rw_wunlock(blp); 1134 cache_unlock_vnodes(dvlp, vlp); 1135 return (0); 1136 } 1137 1138 dvp = ncp->nc_dvp; 1139 rw_wunlock(blp); 1140 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1141 } 1142 1143 static int __noinline 1144 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1145 uint32_t hash, struct rwlock *blp) 1146 { 1147 struct mtx *dvlp, *vlp; 1148 struct vnode *dvp; 1149 1150 cache_assert_bucket_locked(ncp, RA_RLOCKED); 1151 1152 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1153 vlp = NULL; 1154 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1155 vlp = VP2VNODELOCK(ncp->nc_vp); 1156 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1157 rw_runlock(blp); 1158 rw_wlock(blp); 1159 cache_zap_locked(ncp); 1160 rw_wunlock(blp); 1161 cache_unlock_vnodes(dvlp, vlp); 1162 return (0); 1163 } 1164 1165 dvp = ncp->nc_dvp; 1166 rw_runlock(blp); 1167 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1168 } 1169 1170 static int 1171 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, 1172 struct mtx **vlpp1, struct mtx **vlpp2) 1173 { 1174 struct mtx *dvlp, *vlp; 1175 1176 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1177 1178 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1179 vlp = NULL; 1180 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1181 vlp = VP2VNODELOCK(ncp->nc_vp); 1182 cache_sort_vnodes(&dvlp, &vlp); 1183 1184 if (*vlpp1 == dvlp && *vlpp2 == vlp) { 1185 cache_zap_locked(ncp); 1186 cache_unlock_vnodes(dvlp, vlp); 1187 *vlpp1 = NULL; 1188 *vlpp2 = NULL; 1189 return (0); 1190 } 1191 1192 if (*vlpp1 != NULL) 1193 mtx_unlock(*vlpp1); 1194 if (*vlpp2 != NULL) 1195 mtx_unlock(*vlpp2); 1196 *vlpp1 = NULL; 1197 *vlpp2 = NULL; 1198 1199 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1200 cache_zap_locked(ncp); 1201 cache_unlock_vnodes(dvlp, vlp); 1202 return (0); 1203 } 1204 1205 rw_wunlock(blp); 1206 *vlpp1 = dvlp; 1207 *vlpp2 = vlp; 1208 if (*vlpp1 != NULL) 1209 mtx_lock(*vlpp1); 1210 mtx_lock(*vlpp2); 1211 rw_wlock(blp); 1212 return (EAGAIN); 1213 } 1214 1215 static void 1216 
cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) 1217 { 1218 1219 if (blp != NULL) { 1220 rw_runlock(blp); 1221 } else { 1222 mtx_unlock(vlp); 1223 } 1224 } 1225 1226 static int __noinline 1227 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1228 struct timespec *tsp, int *ticksp) 1229 { 1230 int ltype; 1231 1232 *vpp = dvp; 1233 CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", 1234 dvp, cnp->cn_nameptr); 1235 counter_u64_add(dothits, 1); 1236 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1237 if (tsp != NULL) 1238 timespecclear(tsp); 1239 if (ticksp != NULL) 1240 *ticksp = ticks; 1241 vrefact(*vpp); 1242 /* 1243 * When we lookup "." we still can be asked to lock it 1244 * differently... 1245 */ 1246 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1247 if (ltype != VOP_ISLOCKED(*vpp)) { 1248 if (ltype == LK_EXCLUSIVE) { 1249 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1250 if (VN_IS_DOOMED((*vpp))) { 1251 /* forced unmount */ 1252 vrele(*vpp); 1253 *vpp = NULL; 1254 return (ENOENT); 1255 } 1256 } else 1257 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1258 } 1259 return (-1); 1260 } 1261 1262 static __noinline int 1263 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp, 1264 struct componentname *cnp, struct timespec *tsp, int *ticksp) 1265 { 1266 struct namecache *ncp; 1267 struct rwlock *blp; 1268 struct mtx *dvlp, *dvlp2; 1269 uint32_t hash; 1270 int error; 1271 1272 if (cnp->cn_namelen == 2 && 1273 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1274 counter_u64_add(dotdothits, 1); 1275 dvlp = VP2VNODELOCK(dvp); 1276 dvlp2 = NULL; 1277 mtx_lock(dvlp); 1278 retry_dotdot: 1279 ncp = dvp->v_cache_dd; 1280 if (ncp == NULL) { 1281 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1282 "..", NULL); 1283 mtx_unlock(dvlp); 1284 if (dvlp2 != NULL) 1285 mtx_unlock(dvlp2); 1286 return (0); 1287 } 1288 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1289 if (ncp->nc_dvp != dvp) 1290 panic("dvp %p v_cache_dd %p\n", dvp, ncp); 1291 if (!cache_zap_locked_vnode_kl2(ncp, 1292 dvp, &dvlp2)) 1293 goto retry_dotdot; 1294 MPASS(dvp->v_cache_dd == NULL); 1295 mtx_unlock(dvlp); 1296 if (dvlp2 != NULL) 1297 mtx_unlock(dvlp2); 1298 cache_free(ncp); 1299 } else { 1300 dvp->v_cache_dd = NULL; 1301 mtx_unlock(dvlp); 1302 if (dvlp2 != NULL) 1303 mtx_unlock(dvlp2); 1304 } 1305 return (0); 1306 } 1307 1308 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1309 blp = HASH2BUCKETLOCK(hash); 1310 retry: 1311 if (CK_LIST_EMPTY(NCHHASH(hash))) 1312 goto out_no_entry; 1313 1314 rw_wlock(blp); 1315 1316 CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1317 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1318 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1319 break; 1320 } 1321 1322 /* We failed to find an entry */ 1323 if (ncp == NULL) { 1324 rw_wunlock(blp); 1325 goto out_no_entry; 1326 } 1327 1328 error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp); 1329 if (__predict_false(error != 0)) { 1330 zap_and_exit_bucket_fail++; 1331 cache_maybe_yield(); 1332 goto retry; 1333 } 1334 counter_u64_add(numposzaps, 1); 1335 cache_free(ncp); 1336 return (0); 1337 out_no_entry: 1338 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); 1339 counter_u64_add(nummisszap, 1); 1340 return (0); 1341 } 1342 1343 /** 1344 * Lookup a name in the name cache 1345 * 1346 * # Arguments 1347 * 1348 * - dvp: Parent directory in which to search. 1349 * - vpp: Return argument. Will contain desired vnode on cache hit. 
1350 * - cnp: Parameters of the name search. The most interesting bits of 1351 * the cn_flags field have the following meanings: 1352 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1353 * it up. 1354 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1355 * - tsp: Return storage for cache timestamp. On a successful (positive 1356 * or negative) lookup, tsp will be filled with any timespec that 1357 * was stored when this cache entry was created. However, it will 1358 * be clear for "." entries. 1359 * - ticks: Return storage for alternate cache timestamp. On a successful 1360 * (positive or negative) lookup, it will contain the ticks value 1361 * that was current when the cache entry was created, unless cnp 1362 * was ".". 1363 * 1364 * # Returns 1365 * 1366 * - -1: A positive cache hit. vpp will contain the desired vnode. 1367 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1368 * to a forced unmount. vpp will not be modified. If the entry 1369 * is a whiteout, then the ISWHITEOUT flag will be set in 1370 * cnp->cn_flags. 1371 * - 0: A cache miss. vpp will not be modified. 1372 * 1373 * # Locking 1374 * 1375 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1376 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1377 * lock is not recursively acquired. 1378 */ 1379 int 1380 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1381 struct timespec *tsp, int *ticksp) 1382 { 1383 struct namecache_ts *ncp_ts; 1384 struct namecache *ncp; 1385 struct negstate *negstate; 1386 struct rwlock *blp; 1387 struct mtx *dvlp; 1388 uint32_t hash; 1389 enum vgetstate vs; 1390 int error, ltype; 1391 bool try_smr, doing_smr, whiteout; 1392 1393 #ifdef DEBUG_CACHE 1394 if (__predict_false(!doingcache)) { 1395 cnp->cn_flags &= ~MAKEENTRY; 1396 return (0); 1397 } 1398 #endif 1399 1400 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) 1401 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1402 1403 if ((cnp->cn_flags & MAKEENTRY) == 0) 1404 return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp)); 1405 1406 try_smr = true; 1407 if (cnp->cn_nameiop == CREATE) 1408 try_smr = false; 1409 retry: 1410 doing_smr = false; 1411 blp = NULL; 1412 dvlp = NULL; 1413 error = 0; 1414 if (cnp->cn_namelen == 2 && 1415 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1416 counter_u64_add(dotdothits, 1); 1417 dvlp = VP2VNODELOCK(dvp); 1418 mtx_lock(dvlp); 1419 ncp = dvp->v_cache_dd; 1420 if (ncp == NULL) { 1421 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1422 "..", NULL); 1423 mtx_unlock(dvlp); 1424 return (0); 1425 } 1426 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1427 if (ncp->nc_flag & NCF_NEGATIVE) 1428 *vpp = NULL; 1429 else 1430 *vpp = ncp->nc_vp; 1431 } else 1432 *vpp = ncp->nc_dvp; 1433 /* Return failure if negative entry was found. 
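 * (a negative entry carries no vnode, so *vpp was set to NULL above)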
*/ 1434 if (*vpp == NULL) 1435 goto negative_success; 1436 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", 1437 dvp, cnp->cn_nameptr, *vpp); 1438 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", 1439 *vpp); 1440 cache_out_ts(ncp, tsp, ticksp); 1441 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1442 NCF_DTS && tsp != NULL) { 1443 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1444 *tsp = ncp_ts->nc_dotdottime; 1445 } 1446 goto success; 1447 } 1448 1449 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1450 retry_hashed: 1451 if (try_smr) { 1452 vfs_smr_enter(); 1453 doing_smr = true; 1454 try_smr = false; 1455 } else { 1456 blp = HASH2BUCKETLOCK(hash); 1457 rw_rlock(blp); 1458 } 1459 1460 CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1461 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1462 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1463 break; 1464 } 1465 1466 /* We failed to find an entry */ 1467 if (__predict_false(ncp == NULL)) { 1468 if (doing_smr) 1469 vfs_smr_exit(); 1470 else 1471 rw_runlock(blp); 1472 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1473 NULL); 1474 counter_u64_add(nummiss, 1); 1475 return (0); 1476 } 1477 1478 if (ncp->nc_flag & NCF_NEGATIVE) 1479 goto negative_success; 1480 1481 /* We found a "positive" match, return the vnode */ 1482 counter_u64_add(numposhits, 1); 1483 *vpp = ncp->nc_vp; 1484 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", 1485 dvp, cnp->cn_nameptr, *vpp, ncp); 1486 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, 1487 *vpp); 1488 cache_out_ts(ncp, tsp, ticksp); 1489 success: 1490 /* 1491 * On success we return a locked and ref'd vnode as per the lookup 1492 * protocol. 1493 */ 1494 MPASS(dvp != *vpp); 1495 ltype = 0; /* silence gcc warning */ 1496 if (cnp->cn_flags & ISDOTDOT) { 1497 ltype = VOP_ISLOCKED(dvp); 1498 VOP_UNLOCK(dvp); 1499 } 1500 if (doing_smr) { 1501 if (cache_ncp_invalid(ncp)) { 1502 vfs_smr_exit(); 1503 *vpp = NULL; 1504 goto retry; 1505 } 1506 vs = vget_prep_smr(*vpp); 1507 vfs_smr_exit(); 1508 if (vs == VGET_NONE) { 1509 *vpp = NULL; 1510 goto retry; 1511 } 1512 } else { 1513 vs = vget_prep(*vpp); 1514 cache_lookup_unlock(blp, dvlp); 1515 } 1516 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1517 if (cnp->cn_flags & ISDOTDOT) { 1518 vn_lock(dvp, ltype | LK_RETRY); 1519 if (VN_IS_DOOMED(dvp)) { 1520 if (error == 0) 1521 vput(*vpp); 1522 *vpp = NULL; 1523 return (ENOENT); 1524 } 1525 } 1526 if (error) { 1527 *vpp = NULL; 1528 goto retry; 1529 } 1530 if ((cnp->cn_flags & ISLASTCN) && 1531 (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { 1532 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); 1533 } 1534 return (-1); 1535 1536 negative_success: 1537 /* We found a negative match, and want to create it, so purge */ 1538 if (cnp->cn_nameiop == CREATE) { 1539 MPASS(!doing_smr); 1540 counter_u64_add(numnegzaps, 1); 1541 goto zap_and_exit; 1542 } 1543 1544 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 1545 cache_out_ts(ncp, tsp, ticksp); 1546 counter_u64_add(numneghits, 1); 1547 whiteout = (ncp->nc_flag & NCF_WHITE); 1548 1549 if (doing_smr) { 1550 /* 1551 * We need to take locks to promote an entry. 
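 * If the entry is not hot yet, or it got invalidated while we looked
 * at it, leave SMR and redo the hashed lookup with the bucket lock
 * held; the locked path below promotes via cache_negative_hit().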
1552 */ 1553 negstate = NCP2NEGSTATE(ncp); 1554 if ((negstate->neg_flag & NEG_HOT) == 0 || 1555 cache_ncp_invalid(ncp)) { 1556 vfs_smr_exit(); 1557 doing_smr = false; 1558 goto retry_hashed; 1559 } 1560 vfs_smr_exit(); 1561 } else { 1562 cache_negative_hit(ncp); 1563 cache_lookup_unlock(blp, dvlp); 1564 } 1565 if (whiteout) 1566 cnp->cn_flags |= ISWHITEOUT; 1567 return (ENOENT); 1568 1569 zap_and_exit: 1570 MPASS(!doing_smr); 1571 if (blp != NULL) 1572 error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp); 1573 else 1574 error = cache_zap_locked_vnode(ncp, dvp); 1575 if (__predict_false(error != 0)) { 1576 zap_and_exit_bucket_fail2++; 1577 cache_maybe_yield(); 1578 goto retry; 1579 } 1580 cache_free(ncp); 1581 return (0); 1582 } 1583 1584 struct celockstate { 1585 struct mtx *vlp[3]; 1586 struct rwlock *blp[2]; 1587 }; 1588 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1589 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1590 1591 static inline void 1592 cache_celockstate_init(struct celockstate *cel) 1593 { 1594 1595 bzero(cel, sizeof(*cel)); 1596 } 1597 1598 static void 1599 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1600 struct vnode *dvp) 1601 { 1602 struct mtx *vlp1, *vlp2; 1603 1604 MPASS(cel->vlp[0] == NULL); 1605 MPASS(cel->vlp[1] == NULL); 1606 MPASS(cel->vlp[2] == NULL); 1607 1608 MPASS(vp != NULL || dvp != NULL); 1609 1610 vlp1 = VP2VNODELOCK(vp); 1611 vlp2 = VP2VNODELOCK(dvp); 1612 cache_sort_vnodes(&vlp1, &vlp2); 1613 1614 if (vlp1 != NULL) { 1615 mtx_lock(vlp1); 1616 cel->vlp[0] = vlp1; 1617 } 1618 mtx_lock(vlp2); 1619 cel->vlp[1] = vlp2; 1620 } 1621 1622 static void 1623 cache_unlock_vnodes_cel(struct celockstate *cel) 1624 { 1625 1626 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1627 1628 if (cel->vlp[0] != NULL) 1629 mtx_unlock(cel->vlp[0]); 1630 if (cel->vlp[1] != NULL) 1631 mtx_unlock(cel->vlp[1]); 1632 if (cel->vlp[2] != NULL) 1633 mtx_unlock(cel->vlp[2]); 1634 } 1635 1636 static bool 1637 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1638 { 1639 struct mtx *vlp; 1640 bool ret; 1641 1642 cache_assert_vlp_locked(cel->vlp[0]); 1643 cache_assert_vlp_locked(cel->vlp[1]); 1644 MPASS(cel->vlp[2] == NULL); 1645 1646 MPASS(vp != NULL); 1647 vlp = VP2VNODELOCK(vp); 1648 1649 ret = true; 1650 if (vlp >= cel->vlp[1]) { 1651 mtx_lock(vlp); 1652 } else { 1653 if (mtx_trylock(vlp)) 1654 goto out; 1655 cache_lock_vnodes_cel_3_failures++; 1656 cache_unlock_vnodes_cel(cel); 1657 if (vlp < cel->vlp[0]) { 1658 mtx_lock(vlp); 1659 mtx_lock(cel->vlp[0]); 1660 mtx_lock(cel->vlp[1]); 1661 } else { 1662 if (cel->vlp[0] != NULL) 1663 mtx_lock(cel->vlp[0]); 1664 mtx_lock(vlp); 1665 mtx_lock(cel->vlp[1]); 1666 } 1667 ret = false; 1668 } 1669 out: 1670 cel->vlp[2] = vlp; 1671 return (ret); 1672 } 1673 1674 static void 1675 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, 1676 struct rwlock *blp2) 1677 { 1678 1679 MPASS(cel->blp[0] == NULL); 1680 MPASS(cel->blp[1] == NULL); 1681 1682 cache_sort_vnodes(&blp1, &blp2); 1683 1684 if (blp1 != NULL) { 1685 rw_wlock(blp1); 1686 cel->blp[0] = blp1; 1687 } 1688 rw_wlock(blp2); 1689 cel->blp[1] = blp2; 1690 } 1691 1692 static void 1693 cache_unlock_buckets_cel(struct celockstate *cel) 1694 { 1695 1696 if (cel->blp[0] != NULL) 1697 rw_wunlock(cel->blp[0]); 1698 rw_wunlock(cel->blp[1]); 1699 } 1700 1701 /* 1702 * Lock part of the cache affected by the insertion. 1703 * 1704 * This means vnodelocks for dvp, vp and the relevant bucketlock. 
1705 * However, insertion can result in removal of an old entry. In this 1706 * case we have an additional vnode and bucketlock pair to lock. If the 1707 * entry is negative, ncelock is locked instead of the vnode. 1708 * 1709 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1710 * preserving the locking order (smaller address first). 1711 */ 1712 static void 1713 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1714 uint32_t hash) 1715 { 1716 struct namecache *ncp; 1717 struct rwlock *blps[2]; 1718 1719 blps[0] = HASH2BUCKETLOCK(hash); 1720 for (;;) { 1721 blps[1] = NULL; 1722 cache_lock_vnodes_cel(cel, dvp, vp); 1723 if (vp == NULL || vp->v_type != VDIR) 1724 break; 1725 ncp = vp->v_cache_dd; 1726 if (ncp == NULL) 1727 break; 1728 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1729 break; 1730 MPASS(ncp->nc_dvp == vp); 1731 blps[1] = NCP2BUCKETLOCK(ncp); 1732 if (ncp->nc_flag & NCF_NEGATIVE) 1733 break; 1734 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1735 break; 1736 /* 1737 * All vnodes got re-locked. Re-validate the state and if 1738 * nothing changed we are done. Otherwise restart. 1739 */ 1740 if (ncp == vp->v_cache_dd && 1741 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1742 blps[1] == NCP2BUCKETLOCK(ncp) && 1743 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1744 break; 1745 cache_unlock_vnodes_cel(cel); 1746 cel->vlp[0] = NULL; 1747 cel->vlp[1] = NULL; 1748 cel->vlp[2] = NULL; 1749 } 1750 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1751 } 1752 1753 static void 1754 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1755 uint32_t hash) 1756 { 1757 struct namecache *ncp; 1758 struct rwlock *blps[2]; 1759 1760 blps[0] = HASH2BUCKETLOCK(hash); 1761 for (;;) { 1762 blps[1] = NULL; 1763 cache_lock_vnodes_cel(cel, dvp, vp); 1764 ncp = dvp->v_cache_dd; 1765 if (ncp == NULL) 1766 break; 1767 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1768 break; 1769 MPASS(ncp->nc_dvp == dvp); 1770 blps[1] = NCP2BUCKETLOCK(ncp); 1771 if (ncp->nc_flag & NCF_NEGATIVE) 1772 break; 1773 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1774 break; 1775 if (ncp == dvp->v_cache_dd && 1776 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1777 blps[1] == NCP2BUCKETLOCK(ncp) && 1778 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1779 break; 1780 cache_unlock_vnodes_cel(cel); 1781 cel->vlp[0] = NULL; 1782 cel->vlp[1] = NULL; 1783 cel->vlp[2] = NULL; 1784 } 1785 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1786 } 1787 1788 static void 1789 cache_enter_unlock(struct celockstate *cel) 1790 { 1791 1792 cache_unlock_buckets_cel(cel); 1793 cache_unlock_vnodes_cel(cel); 1794 } 1795 1796 static void __noinline 1797 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 1798 struct componentname *cnp) 1799 { 1800 struct celockstate cel; 1801 struct namecache *ncp; 1802 uint32_t hash; 1803 int len; 1804 1805 if (dvp->v_cache_dd == NULL) 1806 return; 1807 len = cnp->cn_namelen; 1808 cache_celockstate_init(&cel); 1809 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1810 cache_enter_lock_dd(&cel, dvp, vp, hash); 1811 ncp = dvp->v_cache_dd; 1812 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 1813 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 1814 cache_zap_locked(ncp); 1815 } else { 1816 ncp = NULL; 1817 } 1818 dvp->v_cache_dd = NULL; 1819 cache_enter_unlock(&cel); 1820 cache_free(ncp); 1821 } 1822 1823 /* 1824 * Add an entry to the cache. 
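 *
 * A rough usage sketch for a filesystem VOP_LOOKUP implementation
 * (cache_enter() is the wrapper that passes NULL timestamps):
 *
 *	if (cnp->cn_flags & MAKEENTRY)
 *		cache_enter(dvp, *vpp, cnp);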
1825 */ 1826 void 1827 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 1828 struct timespec *tsp, struct timespec *dtsp) 1829 { 1830 struct celockstate cel; 1831 struct namecache *ncp, *n2, *ndd; 1832 struct namecache_ts *ncp_ts, *n2_ts; 1833 struct nchashhead *ncpp; 1834 uint32_t hash; 1835 int flag; 1836 int len; 1837 u_long lnumcache; 1838 1839 CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); 1840 VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp, 1841 ("cache_enter: Adding a doomed vnode")); 1842 VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp, 1843 ("cache_enter: Doomed vnode used as src")); 1844 1845 #ifdef DEBUG_CACHE 1846 if (__predict_false(!doingcache)) 1847 return; 1848 #endif 1849 1850 flag = 0; 1851 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1852 if (cnp->cn_namelen == 1) 1853 return; 1854 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 1855 cache_enter_dotdot_prep(dvp, vp, cnp); 1856 flag = NCF_ISDOTDOT; 1857 } 1858 } 1859 1860 /* 1861 * Avoid blowout in namecache entries. 1862 */ 1863 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1864 if (__predict_false(lnumcache >= ncsize)) { 1865 atomic_add_long(&numcache, -1); 1866 counter_u64_add(numdrops, 1); 1867 return; 1868 } 1869 1870 cache_celockstate_init(&cel); 1871 ndd = NULL; 1872 ncp_ts = NULL; 1873 1874 /* 1875 * Calculate the hash key and setup as much of the new 1876 * namecache entry as possible before acquiring the lock. 1877 */ 1878 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1879 ncp->nc_flag = flag; 1880 ncp->nc_vp = vp; 1881 if (vp == NULL) 1882 cache_negative_init(ncp); 1883 ncp->nc_dvp = dvp; 1884 if (tsp != NULL) { 1885 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1886 ncp_ts->nc_time = *tsp; 1887 ncp_ts->nc_ticks = ticks; 1888 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1889 if (dtsp != NULL) { 1890 ncp_ts->nc_dotdottime = *dtsp; 1891 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1892 } 1893 } 1894 len = ncp->nc_nlen = cnp->cn_namelen; 1895 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1896 strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1); 1897 cache_enter_lock(&cel, dvp, vp, hash); 1898 1899 /* 1900 * See if this vnode or negative entry is already in the cache 1901 * with this name. This can happen with concurrent lookups of 1902 * the same path name. 1903 */ 1904 ncpp = NCHHASH(hash); 1905 CK_LIST_FOREACH(n2, ncpp, nc_hash) { 1906 if (n2->nc_dvp == dvp && 1907 n2->nc_nlen == cnp->cn_namelen && 1908 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1909 if (tsp != NULL) { 1910 KASSERT((n2->nc_flag & NCF_TS) != 0, 1911 ("no NCF_TS")); 1912 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1913 n2_ts->nc_time = ncp_ts->nc_time; 1914 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1915 if (dtsp != NULL) { 1916 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1917 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1918 } 1919 } 1920 goto out_unlock_free; 1921 } 1922 } 1923 1924 if (flag == NCF_ISDOTDOT) { 1925 /* 1926 * See if we are trying to add .. entry, but some other lookup 1927 * has populated v_cache_dd pointer already. 1928 */ 1929 if (dvp->v_cache_dd != NULL) 1930 goto out_unlock_free; 1931 KASSERT(vp == NULL || vp->v_type == VDIR, 1932 ("wrong vnode type %p", vp)); 1933 dvp->v_cache_dd = ncp; 1934 } 1935 1936 if (vp != NULL) { 1937 if (vp->v_type == VDIR) { 1938 if (flag != NCF_ISDOTDOT) { 1939 /* 1940 * For this case, the cache entry maps both the 1941 * directory name in it and the name ".." for the 1942 * directory's parent. 
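 * Any previous ".." entry hanging off the child directory is zapped
 * below before v_cache_dd is pointed at the new entry.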
1943 */ 1944 if ((ndd = vp->v_cache_dd) != NULL) { 1945 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 1946 cache_zap_locked(ndd); 1947 else 1948 ndd = NULL; 1949 } 1950 vp->v_cache_dd = ncp; 1951 } 1952 } else { 1953 vp->v_cache_dd = NULL; 1954 } 1955 } 1956 1957 if (flag != NCF_ISDOTDOT) { 1958 if (LIST_EMPTY(&dvp->v_cache_src)) { 1959 vhold(dvp); 1960 counter_u64_add(numcachehv, 1); 1961 } 1962 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 1963 } 1964 1965 /* 1966 * If the entry is "negative", we place it into the 1967 * "negative" cache queue, otherwise, we place it into the 1968 * destination vnode's cache entries queue. 1969 */ 1970 if (vp != NULL) { 1971 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 1972 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 1973 vp); 1974 } else { 1975 if (cnp->cn_flags & ISWHITEOUT) 1976 ncp->nc_flag |= NCF_WHITE; 1977 cache_negative_insert(ncp); 1978 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 1979 ncp->nc_name); 1980 } 1981 1982 atomic_thread_fence_rel(); 1983 /* 1984 * Insert the new namecache entry into the appropriate chain 1985 * within the cache entries table. 1986 */ 1987 CK_LIST_INSERT_HEAD(ncpp, ncp, nc_hash); 1988 1989 cache_enter_unlock(&cel); 1990 if (numneg * ncnegfactor > lnumcache) 1991 cache_negative_zap_one(); 1992 cache_free(ndd); 1993 return; 1994 out_unlock_free: 1995 cache_enter_unlock(&cel); 1996 cache_free(ncp); 1997 return; 1998 } 1999 2000 static u_int 2001 cache_roundup_2(u_int val) 2002 { 2003 u_int res; 2004 2005 for (res = 1; res <= val; res <<= 1) 2006 continue; 2007 2008 return (res); 2009 } 2010 2011 /* 2012 * Name cache initialization, from vfs_init() when we are booting 2013 */ 2014 static void 2015 nchinit(void *dummy __unused) 2016 { 2017 u_int i; 2018 2019 cache_zone_small = uma_zcreate("S VFS Cache", 2020 sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, 2021 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 2022 UMA_ZONE_ZINIT); 2023 cache_zone_small_ts = uma_zcreate("STS VFS Cache", 2024 sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, 2025 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 2026 UMA_ZONE_ZINIT); 2027 cache_zone_large = uma_zcreate("L VFS Cache", 2028 sizeof(struct namecache) + NAME_MAX + 1, 2029 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 2030 UMA_ZONE_ZINIT); 2031 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", 2032 sizeof(struct namecache_ts) + NAME_MAX + 1, 2033 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 2034 UMA_ZONE_ZINIT); 2035 2036 VFS_SMR_ZONE_SET(cache_zone_small); 2037 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2038 VFS_SMR_ZONE_SET(cache_zone_large); 2039 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2040 2041 ncsize = desiredvnodes * ncsizefactor; 2042 nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); 2043 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2044 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2045 ncbuckethash = 7; 2046 if (ncbuckethash > nchash) 2047 ncbuckethash = nchash; 2048 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2049 M_WAITOK | M_ZERO); 2050 for (i = 0; i < numbucketlocks; i++) 2051 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 2052 ncvnodehash = ncbuckethash; 2053 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2054 M_WAITOK | M_ZERO); 2055 for (i = 0; i < numvnodelocks; i++) 2056 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2057 ncpurgeminvnodes = numbucketlocks * 2; 2058 2059 
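	/*
	 * ncneghash of 3 gives numneglists == 4 cold lists for negative
	 * entries; see NCP2NEGLIST() above.
	 */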
ncneghash = 3; 2060 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 2061 M_WAITOK | M_ZERO); 2062 for (i = 0; i < numneglists; i++) { 2063 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2064 TAILQ_INIT(&neglists[i].nl_list); 2065 } 2066 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 2067 TAILQ_INIT(&ncneg_hot.nl_list); 2068 2069 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2070 } 2071 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2072 2073 void 2074 cache_changesize(u_long newmaxvnodes) 2075 { 2076 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2077 u_long new_nchash, old_nchash; 2078 struct namecache *ncp; 2079 uint32_t hash; 2080 u_long newncsize; 2081 int i; 2082 2083 newncsize = newmaxvnodes * ncsizefactor; 2084 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2085 if (newmaxvnodes < numbucketlocks) 2086 newmaxvnodes = numbucketlocks; 2087 2088 new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash); 2089 /* If same hash table size, nothing to do */ 2090 if (nchash == new_nchash) { 2091 free(new_nchashtbl, M_VFSCACHE); 2092 return; 2093 } 2094 /* 2095 * Move everything from the old hash table to the new table. 2096 * None of the namecache entries in the table can be removed 2097 * because to do so, they have to be removed from the hash table. 2098 */ 2099 cache_lock_all_vnodes(); 2100 cache_lock_all_buckets(); 2101 old_nchashtbl = nchashtbl; 2102 old_nchash = nchash; 2103 nchashtbl = new_nchashtbl; 2104 nchash = new_nchash; 2105 for (i = 0; i <= old_nchash; i++) { 2106 while ((ncp = CK_LIST_FIRST(&old_nchashtbl[i])) != NULL) { 2107 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2108 ncp->nc_dvp); 2109 CK_LIST_REMOVE(ncp, nc_hash); 2110 CK_LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2111 } 2112 } 2113 ncsize = newncsize; 2114 cache_unlock_all_buckets(); 2115 cache_unlock_all_vnodes(); 2116 free(old_nchashtbl, M_VFSCACHE); 2117 } 2118 2119 /* 2120 * Invalidate all entries from and to a particular vnode. 2121 */ 2122 void 2123 cache_purge(struct vnode *vp) 2124 { 2125 TAILQ_HEAD(, namecache) ncps; 2126 struct namecache *ncp, *nnp; 2127 struct mtx *vlp, *vlp2; 2128 2129 CTR1(KTR_VFS, "cache_purge(%p)", vp); 2130 SDT_PROBE1(vfs, namecache, purge, done, vp); 2131 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2132 vp->v_cache_dd == NULL) 2133 return; 2134 TAILQ_INIT(&ncps); 2135 vlp = VP2VNODELOCK(vp); 2136 vlp2 = NULL; 2137 mtx_lock(vlp); 2138 retry: 2139 while (!LIST_EMPTY(&vp->v_cache_src)) { 2140 ncp = LIST_FIRST(&vp->v_cache_src); 2141 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2142 goto retry; 2143 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2144 } 2145 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2146 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2147 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2148 goto retry; 2149 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2150 } 2151 ncp = vp->v_cache_dd; 2152 if (ncp != NULL) { 2153 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2154 ("lost dotdot link")); 2155 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2156 goto retry; 2157 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2158 } 2159 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2160 mtx_unlock(vlp); 2161 if (vlp2 != NULL) 2162 mtx_unlock(vlp2); 2163 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2164 cache_free(ncp); 2165 } 2166 } 2167 2168 /* 2169 * Invalidate all negative entries for a particular directory vnode. 
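 *
 * Callers typically reach for this when names may have changed under the
 * directory without passing through cache_enter(), leaving any cached
 * negative entries suspect.  A hypothetical, purely illustrative caller:
 *
 *	// somefs_rename() is an assumed stand-in, not a real function
 *	if (somefs_rename(fdvp, fcnp, tdvp, tcnp) == 0)
 *		cache_purge_negative(tdvp);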
2170 */ 2171 void 2172 cache_purge_negative(struct vnode *vp) 2173 { 2174 TAILQ_HEAD(, namecache) ncps; 2175 struct namecache *ncp, *nnp; 2176 struct mtx *vlp; 2177 2178 CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); 2179 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2180 if (LIST_EMPTY(&vp->v_cache_src)) 2181 return; 2182 TAILQ_INIT(&ncps); 2183 vlp = VP2VNODELOCK(vp); 2184 mtx_lock(vlp); 2185 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2186 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2187 continue; 2188 cache_zap_negative_locked_vnode_kl(ncp, vp); 2189 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2190 } 2191 mtx_unlock(vlp); 2192 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2193 cache_free(ncp); 2194 } 2195 } 2196 2197 /* 2198 * Flush all entries referencing a particular filesystem. 2199 */ 2200 void 2201 cache_purgevfs(struct mount *mp, bool force) 2202 { 2203 TAILQ_HEAD(, namecache) ncps; 2204 struct mtx *vlp1, *vlp2; 2205 struct rwlock *blp; 2206 struct nchashhead *bucket; 2207 struct namecache *ncp, *nnp; 2208 u_long i, j, n_nchash; 2209 int error; 2210 2211 /* Scan hash tables for applicable entries */ 2212 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2213 if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) 2214 return; 2215 TAILQ_INIT(&ncps); 2216 n_nchash = nchash + 1; 2217 vlp1 = vlp2 = NULL; 2218 for (i = 0; i < numbucketlocks; i++) { 2219 blp = (struct rwlock *)&bucketlocks[i]; 2220 rw_wlock(blp); 2221 for (j = i; j < n_nchash; j += numbucketlocks) { 2222 retry: 2223 bucket = &nchashtbl[j]; 2224 CK_LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { 2225 cache_assert_bucket_locked(ncp, RA_WLOCKED); 2226 if (ncp->nc_dvp->v_mount != mp) 2227 continue; 2228 error = cache_zap_wlocked_bucket_kl(ncp, blp, 2229 &vlp1, &vlp2); 2230 if (error != 0) 2231 goto retry; 2232 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); 2233 } 2234 } 2235 rw_wunlock(blp); 2236 if (vlp1 == NULL && vlp2 == NULL) 2237 cache_maybe_yield(); 2238 } 2239 if (vlp1 != NULL) 2240 mtx_unlock(vlp1); 2241 if (vlp2 != NULL) 2242 mtx_unlock(vlp2); 2243 2244 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2245 cache_free(ncp); 2246 } 2247 } 2248 2249 /* 2250 * Perform canonical checks and cache lookup and pass on to filesystem 2251 * through the vop_cachedlookup only if needed. 2252 */ 2253 2254 int 2255 vfs_cache_lookup(struct vop_lookup_args *ap) 2256 { 2257 struct vnode *dvp; 2258 int error; 2259 struct vnode **vpp = ap->a_vpp; 2260 struct componentname *cnp = ap->a_cnp; 2261 int flags = cnp->cn_flags; 2262 2263 *vpp = NULL; 2264 dvp = ap->a_dvp; 2265 2266 if (dvp->v_type != VDIR) 2267 return (ENOTDIR); 2268 2269 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2270 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2271 return (EROFS); 2272 2273 error = vn_dir_check_exec(dvp, cnp); 2274 if (error != 0) 2275 return (error); 2276 2277 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2278 if (error == 0) 2279 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2280 if (error == -1) 2281 return (0); 2282 return (error); 2283 } 2284 2285 /* Implementation of the getcwd syscall. 
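 *
 * The path is assembled kernel-side and only then copied out; in sketch
 * form the flow below is:
 *
 *	sys___getcwd()            // reject buflen < 2, cap at MAXPATHLEN, malloc
 *	    vn_getcwd()           // resolve pwd_cdir against pwd_rdir
 *	        vn_fullpath_any() // walk towards the root via the namecache
 *	    copyout()             // hand the finished string to userspace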
*/ 2286 int 2287 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2288 { 2289 char *buf, *retbuf; 2290 size_t buflen; 2291 int error; 2292 2293 buflen = uap->buflen; 2294 if (__predict_false(buflen < 2)) 2295 return (EINVAL); 2296 if (buflen > MAXPATHLEN) 2297 buflen = MAXPATHLEN; 2298 2299 buf = malloc(buflen, M_TEMP, M_WAITOK); 2300 error = vn_getcwd(td, buf, &retbuf, &buflen); 2301 if (error == 0) 2302 error = copyout(retbuf, uap->buf, buflen); 2303 free(buf, M_TEMP); 2304 return (error); 2305 } 2306 2307 int 2308 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen) 2309 { 2310 struct pwd *pwd; 2311 int error; 2312 2313 pwd = pwd_hold(td); 2314 error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen); 2315 pwd_drop(pwd); 2316 2317 #ifdef KTRACE 2318 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2319 ktrnamei(*retbuf); 2320 #endif 2321 return (error); 2322 } 2323 2324 static int 2325 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2326 size_t size, int flags, enum uio_seg pathseg) 2327 { 2328 struct nameidata nd; 2329 char *retbuf, *freebuf; 2330 int error; 2331 2332 if (flags != 0) 2333 return (EINVAL); 2334 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2335 pathseg, path, fd, &cap_fstat_rights, td); 2336 if ((error = namei(&nd)) != 0) 2337 return (error); 2338 error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size); 2339 if (error == 0) { 2340 error = copyout(retbuf, buf, size); 2341 free(freebuf, M_TEMP); 2342 } 2343 NDFREE(&nd, 0); 2344 return (error); 2345 } 2346 2347 int 2348 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2349 { 2350 2351 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2352 uap->flags, UIO_USERSPACE)); 2353 } 2354 2355 /* 2356 * Retrieve the full filesystem path that correspond to a vnode from the name 2357 * cache (if available) 2358 */ 2359 int 2360 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) 2361 { 2362 struct pwd *pwd; 2363 char *buf; 2364 size_t buflen; 2365 int error; 2366 2367 if (__predict_false(vn == NULL)) 2368 return (EINVAL); 2369 2370 buflen = MAXPATHLEN; 2371 buf = malloc(buflen, M_TEMP, M_WAITOK); 2372 pwd = pwd_hold(td); 2373 error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen); 2374 pwd_drop(pwd); 2375 2376 if (!error) 2377 *freebuf = buf; 2378 else 2379 free(buf, M_TEMP); 2380 return (error); 2381 } 2382 2383 /* 2384 * This function is similar to vn_fullpath, but it attempts to lookup the 2385 * pathname relative to the global root mount point. This is required for the 2386 * auditing sub-system, as audited pathnames must be absolute, relative to the 2387 * global root mount point. 
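 *
 * For example (purely illustrative): for a process jailed under /jail,
 * with pwd_rdir pointing at that directory, vn_fullpath() on the vnode
 * for /jail/tmp/f would typically return "/tmp/f", while this function
 * returns "/jail/tmp/f" because it resolves against rootvnode rather
 * than the per-process root.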
2388 */ 2389 int 2390 vn_fullpath_global(struct thread *td, struct vnode *vn, 2391 char **retbuf, char **freebuf) 2392 { 2393 char *buf; 2394 size_t buflen; 2395 int error; 2396 2397 if (__predict_false(vn == NULL)) 2398 return (EINVAL); 2399 buflen = MAXPATHLEN; 2400 buf = malloc(buflen, M_TEMP, M_WAITOK); 2401 error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen); 2402 if (!error) 2403 *freebuf = buf; 2404 else 2405 free(buf, M_TEMP); 2406 return (error); 2407 } 2408 2409 int 2410 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2411 { 2412 struct vnode *dvp; 2413 struct namecache *ncp; 2414 struct mtx *vlp; 2415 int error; 2416 2417 vlp = VP2VNODELOCK(*vp); 2418 mtx_lock(vlp); 2419 TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { 2420 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2421 break; 2422 } 2423 if (ncp != NULL) { 2424 if (*buflen < ncp->nc_nlen) { 2425 mtx_unlock(vlp); 2426 vrele(*vp); 2427 counter_u64_add(numfullpathfail4, 1); 2428 error = ENOMEM; 2429 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2430 vp, NULL); 2431 return (error); 2432 } 2433 *buflen -= ncp->nc_nlen; 2434 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2435 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2436 ncp->nc_name, vp); 2437 dvp = *vp; 2438 *vp = ncp->nc_dvp; 2439 vref(*vp); 2440 mtx_unlock(vlp); 2441 vrele(dvp); 2442 return (0); 2443 } 2444 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2445 2446 mtx_unlock(vlp); 2447 vn_lock(*vp, LK_SHARED | LK_RETRY); 2448 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2449 vput(*vp); 2450 if (error) { 2451 counter_u64_add(numfullpathfail2, 1); 2452 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2453 return (error); 2454 } 2455 2456 *vp = dvp; 2457 if (VN_IS_DOOMED(dvp)) { 2458 /* forced unmount */ 2459 vrele(dvp); 2460 error = ENOENT; 2461 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2462 return (error); 2463 } 2464 /* 2465 * *vp has its use count incremented still. 2466 */ 2467 2468 return (0); 2469 } 2470 2471 /* 2472 * Resolve a directory to a pathname. 2473 * 2474 * The name of the directory can always be found in the namecache or fetched 2475 * from the filesystem. There is also guaranteed to be only one parent, meaning 2476 * we can just follow vnodes up until we find the root. 2477 * 2478 * The vnode must be referenced. 2479 */ 2480 static int 2481 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 2482 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend) 2483 { 2484 #ifdef KDTRACE_HOOKS 2485 struct vnode *startvp = vp; 2486 #endif 2487 struct vnode *vp1; 2488 size_t buflen; 2489 int error; 2490 2491 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2492 VNPASS(vp->v_usecount > 0, vp); 2493 2494 buflen = *len; 2495 2496 if (!slash_prefixed) { 2497 MPASS(*len >= 2); 2498 buflen--; 2499 buf[buflen] = '\0'; 2500 } 2501 2502 error = 0; 2503 2504 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2505 counter_u64_add(numfullpathcalls, 1); 2506 while (vp != rdir && vp != rootvnode) { 2507 /* 2508 * The vp vnode must be already fully constructed, 2509 * since it is either found in namecache or obtained 2510 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2511 * without obtaining the vnode lock. 2512 */ 2513 if ((vp->v_vflag & VV_ROOT) != 0) { 2514 vn_lock(vp, LK_RETRY | LK_SHARED); 2515 2516 /* 2517 * With the vnode locked, check for races with 2518 * unmount, forced or not. 
Note that we 2519 * already verified that vp is not equal to 2520 * the root vnode, which means that 2521 * mnt_vnodecovered can be NULL only for the 2522 * case of unmount. 2523 */ 2524 if (VN_IS_DOOMED(vp) || 2525 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2526 vp1->v_mountedhere != vp->v_mount) { 2527 vput(vp); 2528 error = ENOENT; 2529 SDT_PROBE3(vfs, namecache, fullpath, return, 2530 error, vp, NULL); 2531 break; 2532 } 2533 2534 vref(vp1); 2535 vput(vp); 2536 vp = vp1; 2537 continue; 2538 } 2539 if (vp->v_type != VDIR) { 2540 vrele(vp); 2541 counter_u64_add(numfullpathfail1, 1); 2542 error = ENOTDIR; 2543 SDT_PROBE3(vfs, namecache, fullpath, return, 2544 error, vp, NULL); 2545 break; 2546 } 2547 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); 2548 if (error) 2549 break; 2550 if (buflen == 0) { 2551 vrele(vp); 2552 error = ENOMEM; 2553 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2554 startvp, NULL); 2555 break; 2556 } 2557 buf[--buflen] = '/'; 2558 slash_prefixed = true; 2559 } 2560 if (error) 2561 return (error); 2562 if (!slash_prefixed) { 2563 if (buflen == 0) { 2564 vrele(vp); 2565 counter_u64_add(numfullpathfail4, 1); 2566 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2567 startvp, NULL); 2568 return (ENOMEM); 2569 } 2570 buf[--buflen] = '/'; 2571 } 2572 counter_u64_add(numfullpathfound, 1); 2573 vrele(vp); 2574 2575 *retbuf = buf + buflen; 2576 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2577 *len -= buflen; 2578 *len += addend; 2579 return (0); 2580 } 2581 2582 /* 2583 * Resolve an arbitrary vnode to a pathname. 2584 * 2585 * Note 2 caveats: 2586 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2587 * resolve to a different path than the one used to find it 2588 * - namecache is not mandatory, meaning names are not guaranteed to be added 2589 * (in which case resolving fails) 2590 */ 2591 static int 2592 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 2593 char *buf, char **retbuf, size_t *buflen) 2594 { 2595 size_t orig_buflen; 2596 bool slash_prefixed; 2597 int error; 2598 2599 if (*buflen < 2) 2600 return (EINVAL); 2601 2602 orig_buflen = *buflen; 2603 2604 vref(vp); 2605 slash_prefixed = false; 2606 if (vp->v_type != VDIR) { 2607 *buflen -= 1; 2608 buf[*buflen] = '\0'; 2609 error = vn_vptocnp(&vp, td->td_ucred, buf, buflen); 2610 if (error) 2611 return (error); 2612 if (*buflen == 0) { 2613 vrele(vp); 2614 return (ENOMEM); 2615 } 2616 *buflen -= 1; 2617 buf[*buflen] = '/'; 2618 slash_prefixed = true; 2619 } 2620 2621 return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed, 2622 orig_buflen - *buflen)); 2623 } 2624 2625 /* 2626 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2627 * 2628 * Since the namecache does not track hardlinks, the caller is expected to first 2629 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
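 * (kern___realpathat() above is one such caller: it passes
 * FOLLOW | SAVENAME | WANTPARENT to NDINIT_ATRIGHTS() before handing the
 * resulting nameidata to this function.)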
2630 * 2631 * Then we have 2 cases: 2632 * - if the found vnode is a directory, the path can be constructed just by 2633 * fullowing names up the chain 2634 * - otherwise we populate the buffer with the saved name and start resolving 2635 * from the parent 2636 */ 2637 static int 2638 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 2639 char **freebuf, size_t *buflen) 2640 { 2641 char *buf, *tmpbuf; 2642 struct pwd *pwd; 2643 struct componentname *cnp; 2644 struct vnode *vp; 2645 size_t addend; 2646 int error; 2647 bool slash_prefixed; 2648 2649 if (*buflen < 2) 2650 return (EINVAL); 2651 if (*buflen > MAXPATHLEN) 2652 *buflen = MAXPATHLEN; 2653 2654 slash_prefixed = false; 2655 2656 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2657 pwd = pwd_hold(td); 2658 2659 addend = 0; 2660 vp = ndp->ni_vp; 2661 if (vp->v_type != VDIR) { 2662 cnp = &ndp->ni_cnd; 2663 addend = cnp->cn_namelen + 2; 2664 if (*buflen < addend) { 2665 error = ENOMEM; 2666 goto out_bad; 2667 } 2668 *buflen -= addend; 2669 tmpbuf = buf + *buflen; 2670 tmpbuf[0] = '/'; 2671 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 2672 tmpbuf[addend - 1] = '\0'; 2673 slash_prefixed = true; 2674 vp = ndp->ni_dvp; 2675 } 2676 2677 vref(vp); 2678 error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen, 2679 slash_prefixed, addend); 2680 if (error != 0) 2681 goto out_bad; 2682 2683 pwd_drop(pwd); 2684 *freebuf = buf; 2685 2686 return (0); 2687 out_bad: 2688 pwd_drop(pwd); 2689 free(buf, M_TEMP); 2690 return (error); 2691 } 2692 2693 struct vnode * 2694 vn_dir_dd_ino(struct vnode *vp) 2695 { 2696 struct namecache *ncp; 2697 struct vnode *ddvp; 2698 struct mtx *vlp; 2699 enum vgetstate vs; 2700 2701 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 2702 vlp = VP2VNODELOCK(vp); 2703 mtx_lock(vlp); 2704 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 2705 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 2706 continue; 2707 ddvp = ncp->nc_dvp; 2708 vs = vget_prep(ddvp); 2709 mtx_unlock(vlp); 2710 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 2711 return (NULL); 2712 return (ddvp); 2713 } 2714 mtx_unlock(vlp); 2715 return (NULL); 2716 } 2717 2718 int 2719 vn_commname(struct vnode *vp, char *buf, u_int buflen) 2720 { 2721 struct namecache *ncp; 2722 struct mtx *vlp; 2723 int l; 2724 2725 vlp = VP2VNODELOCK(vp); 2726 mtx_lock(vlp); 2727 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 2728 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2729 break; 2730 if (ncp == NULL) { 2731 mtx_unlock(vlp); 2732 return (ENOENT); 2733 } 2734 l = min(ncp->nc_nlen, buflen - 1); 2735 memcpy(buf, ncp->nc_name, l); 2736 mtx_unlock(vlp); 2737 buf[l] = '\0'; 2738 return (0); 2739 } 2740 2741 /* 2742 * This function updates path string to vnode's full global path 2743 * and checks the size of the new path string against the pathlen argument. 2744 * 2745 * Requires a locked, referenced vnode. 2746 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 2747 * 2748 * If vp is a directory, the call to vn_fullpath_global() always succeeds 2749 * because it falls back to the ".." lookup if the namecache lookup fails. 2750 */ 2751 int 2752 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 2753 u_int pathlen) 2754 { 2755 struct nameidata nd; 2756 struct vnode *vp1; 2757 char *rpath, *fbuf; 2758 int error; 2759 2760 ASSERT_VOP_ELOCKED(vp, __func__); 2761 2762 /* Construct global filesystem path from vp. 
*/ 2763 VOP_UNLOCK(vp); 2764 error = vn_fullpath_global(td, vp, &rpath, &fbuf); 2765 2766 if (error != 0) { 2767 vrele(vp); 2768 return (error); 2769 } 2770 2771 if (strlen(rpath) >= pathlen) { 2772 vrele(vp); 2773 error = ENAMETOOLONG; 2774 goto out; 2775 } 2776 2777 /* 2778 * Re-lookup the vnode by path to detect a possible rename. 2779 * As a side effect, the vnode is relocked. 2780 * If vnode was renamed, return ENOENT. 2781 */ 2782 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 2783 UIO_SYSSPACE, path, td); 2784 error = namei(&nd); 2785 if (error != 0) { 2786 vrele(vp); 2787 goto out; 2788 } 2789 NDFREE(&nd, NDF_ONLY_PNBUF); 2790 vp1 = nd.ni_vp; 2791 vrele(vp); 2792 if (vp1 == vp) 2793 strcpy(path, rpath); 2794 else { 2795 vput(vp1); 2796 error = ENOENT; 2797 } 2798 2799 out: 2800 free(fbuf, M_TEMP); 2801 return (error); 2802 } 2803 2804 #ifdef DDB 2805 static void 2806 db_print_vpath(struct vnode *vp) 2807 { 2808 2809 while (vp != NULL) { 2810 db_printf("%p: ", vp); 2811 if (vp == rootvnode) { 2812 db_printf("/"); 2813 vp = NULL; 2814 } else { 2815 if (vp->v_vflag & VV_ROOT) { 2816 db_printf("<mount point>"); 2817 vp = vp->v_mount->mnt_vnodecovered; 2818 } else { 2819 struct namecache *ncp; 2820 char *ncn; 2821 int i; 2822 2823 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2824 if (ncp != NULL) { 2825 ncn = ncp->nc_name; 2826 for (i = 0; i < ncp->nc_nlen; i++) 2827 db_printf("%c", *ncn++); 2828 vp = ncp->nc_dvp; 2829 } else { 2830 vp = NULL; 2831 } 2832 } 2833 } 2834 db_printf("\n"); 2835 } 2836 2837 return; 2838 } 2839 2840 DB_SHOW_COMMAND(vpath, db_show_vpath) 2841 { 2842 struct vnode *vp; 2843 2844 if (!have_addr) { 2845 db_printf("usage: show vpath <struct vnode *>\n"); 2846 return; 2847 } 2848 2849 vp = (struct vnode *)addr; 2850 db_print_vpath(vp); 2851 } 2852 2853 #endif 2854 2855 extern uma_zone_t namei_zone; 2856 2857 static bool __read_frequently cache_fast_lookup = true; 2858 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 2859 &cache_fast_lookup, 0, ""); 2860 2861 #define CACHE_FPL_FAILED -2020 2862 2863 static void 2864 cache_fpl_cleanup_cnp(struct componentname *cnp) 2865 { 2866 2867 uma_zfree(namei_zone, cnp->cn_pnbuf); 2868 #ifdef DIAGNOSTIC 2869 cnp->cn_pnbuf = NULL; 2870 cnp->cn_nameptr = NULL; 2871 #endif 2872 } 2873 2874 static void 2875 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 2876 { 2877 struct componentname *cnp; 2878 2879 cnp = &ndp->ni_cnd; 2880 while (*(cnp->cn_nameptr) == '/') { 2881 cnp->cn_nameptr++; 2882 ndp->ni_pathlen--; 2883 } 2884 2885 *dpp = ndp->ni_rootdir; 2886 } 2887 2888 /* 2889 * Components of nameidata (or objects it can point to) which may 2890 * need restoring in case fast path lookup fails. 
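 *
 * Sketch of how the checkpoint/restore pair below is used by
 * cache_fplookup():
 *
 *	cache_fpl_checkpoint(fpl, &snd); // stash cn_flags, nameptr, pathlen
 *	... speculative, SMR-protected walk ...
 *	cache_fpl_restore(fpl, &snd);    // on abort/partial: hand the slow
 *	                                 // path back the unmodified state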
2891 */ 2892 struct nameidata_saved { 2893 int cn_flags; 2894 long cn_namelen; 2895 char *cn_nameptr; 2896 size_t ni_pathlen; 2897 }; 2898 2899 struct cache_fpl { 2900 int line; 2901 enum cache_fpl_status status; 2902 bool in_smr; 2903 struct nameidata *ndp; 2904 struct nameidata_saved snd; 2905 struct componentname *cnp; 2906 struct vnode *dvp; 2907 seqc_t dvp_seqc; 2908 struct vnode *tvp; 2909 seqc_t tvp_seqc; 2910 struct pwd *pwd; 2911 }; 2912 2913 static void 2914 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 2915 { 2916 2917 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 2918 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 2919 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 2920 snd->ni_pathlen = fpl->ndp->ni_pathlen; 2921 } 2922 2923 static void 2924 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 2925 { 2926 2927 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 2928 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 2929 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 2930 fpl->ndp->ni_pathlen = snd->ni_pathlen; 2931 } 2932 2933 #ifdef INVARIANTS 2934 #define cache_fpl_smr_assert_entered(fpl) ({ \ 2935 struct cache_fpl *_fpl = (fpl); \ 2936 MPASS(_fpl->in_smr == true); \ 2937 VFS_SMR_ASSERT_ENTERED(); \ 2938 }) 2939 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 2940 struct cache_fpl *_fpl = (fpl); \ 2941 MPASS(_fpl->in_smr == false); \ 2942 VFS_SMR_ASSERT_NOT_ENTERED(); \ 2943 }) 2944 #else 2945 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 2946 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 2947 #endif 2948 2949 #define cache_fpl_smr_enter(fpl) ({ \ 2950 struct cache_fpl *_fpl = (fpl); \ 2951 MPASS(_fpl->in_smr == false); \ 2952 vfs_smr_enter(); \ 2953 _fpl->in_smr = true; \ 2954 }) 2955 2956 #define cache_fpl_smr_exit(fpl) ({ \ 2957 struct cache_fpl *_fpl = (fpl); \ 2958 MPASS(_fpl->in_smr == true); \ 2959 vfs_smr_exit(); \ 2960 _fpl->in_smr = false; \ 2961 }) 2962 2963 static int 2964 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 2965 { 2966 2967 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 2968 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 2969 ("%s: converting to abort from %d at %d, set at %d\n", 2970 __func__, fpl->status, line, fpl->line)); 2971 } 2972 fpl->status = CACHE_FPL_STATUS_ABORTED; 2973 fpl->line = line; 2974 return (CACHE_FPL_FAILED); 2975 } 2976 2977 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 2978 2979 static int 2980 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 2981 { 2982 2983 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 2984 ("%s: setting to partial at %d, but already set to %d at %d\n", 2985 __func__, line, fpl->status, fpl->line)); 2986 cache_fpl_smr_assert_entered(fpl); 2987 fpl->status = CACHE_FPL_STATUS_PARTIAL; 2988 fpl->line = line; 2989 return (CACHE_FPL_FAILED); 2990 } 2991 2992 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 2993 2994 static int 2995 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 2996 { 2997 2998 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 2999 ("%s: setting to handled at %d, but already set to %d at %d\n", 3000 __func__, line, fpl->status, fpl->line)); 3001 cache_fpl_smr_assert_not_entered(fpl); 3002 MPASS(error != CACHE_FPL_FAILED); 3003 fpl->status = CACHE_FPL_STATUS_HANDLED; 3004 fpl->line = line; 3005 return (error); 3006 } 3007 3008 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3009 3010 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3011 (LOCKLEAF | 
LOCKPARENT | WANTPARENT | FOLLOW | LOCKSHARED | SAVENAME | \ 3012 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2) 3013 3014 static bool 3015 cache_can_fplookup(struct cache_fpl *fpl) 3016 { 3017 struct nameidata *ndp; 3018 struct componentname *cnp; 3019 struct thread *td; 3020 3021 ndp = fpl->ndp; 3022 cnp = fpl->cnp; 3023 td = cnp->cn_thread; 3024 3025 if (!cache_fast_lookup) { 3026 cache_fpl_aborted(fpl); 3027 return (false); 3028 } 3029 #ifdef MAC 3030 if (mac_vnode_check_lookup_enabled()) { 3031 cache_fpl_aborted(fpl); 3032 return (false); 3033 } 3034 #endif 3035 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3036 cache_fpl_aborted(fpl); 3037 return (false); 3038 } 3039 if (cnp->cn_nameiop != LOOKUP) { 3040 cache_fpl_aborted(fpl); 3041 return (false); 3042 } 3043 if (ndp->ni_dirfd != AT_FDCWD) { 3044 cache_fpl_aborted(fpl); 3045 return (false); 3046 } 3047 if (IN_CAPABILITY_MODE(td)) { 3048 cache_fpl_aborted(fpl); 3049 return (false); 3050 } 3051 if (AUDITING_TD(td)) { 3052 cache_fpl_aborted(fpl); 3053 return (false); 3054 } 3055 if (ndp->ni_startdir != NULL) { 3056 cache_fpl_aborted(fpl); 3057 return (false); 3058 } 3059 return (true); 3060 } 3061 3062 static bool 3063 cache_fplookup_vnode_supported(struct vnode *vp) 3064 { 3065 3066 return (vp->v_type != VLNK); 3067 } 3068 3069 /* 3070 * Move a negative entry to the hot list. 3071 * 3072 * We have to take locks, but they may be contended and in the worst 3073 * case we may need to go off CPU. We don't want to spin within the 3074 * smr section and we can't block with it. Instead we are going to 3075 * look up the entry again. 3076 */ 3077 static int __noinline 3078 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3079 uint32_t hash) 3080 { 3081 struct componentname *cnp; 3082 struct namecache *ncp; 3083 struct neglist *neglist; 3084 struct negstate *negstate; 3085 struct vnode *dvp; 3086 u_char nc_flag; 3087 3088 cnp = fpl->cnp; 3089 dvp = fpl->dvp; 3090 3091 if (!vhold_smr(dvp)) 3092 return (cache_fpl_aborted(fpl)); 3093 3094 neglist = NCP2NEGLIST(oncp); 3095 cache_fpl_smr_exit(fpl); 3096 3097 mtx_lock(&ncneg_hot.nl_lock); 3098 mtx_lock(&neglist->nl_lock); 3099 /* 3100 * For hash iteration. 3101 */ 3102 cache_fpl_smr_enter(fpl); 3103 3104 /* 3105 * Avoid all surprises by only succeeding if we got the same entry and 3106 * bailing completely otherwise. 3107 * 3108 * In particular at this point there can be a new ncp which matches the 3109 * search but hashes to a different neglist. 3110 */ 3111 CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3112 if (ncp == oncp) 3113 break; 3114 } 3115 3116 /* 3117 * No match to begin with. 3118 */ 3119 if (__predict_false(ncp == NULL)) { 3120 goto out_abort; 3121 } 3122 3123 /* 3124 * The newly found entry may be something different... 3125 */ 3126 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3127 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 3128 goto out_abort; 3129 } 3130 3131 /* 3132 * ... and not even negative. 
3133 */ 3134 nc_flag = atomic_load_char(&ncp->nc_flag); 3135 if ((nc_flag & NCF_NEGATIVE) == 0) { 3136 goto out_abort; 3137 } 3138 3139 if (__predict_false(cache_ncp_invalid(ncp))) { 3140 goto out_abort; 3141 } 3142 3143 negstate = NCP2NEGSTATE(ncp); 3144 if ((negstate->neg_flag & NEG_HOT) == 0) { 3145 numhotneg++; 3146 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 3147 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 3148 negstate->neg_flag |= NEG_HOT; 3149 } 3150 3151 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 3152 counter_u64_add(numneghits, 1); 3153 cache_fpl_smr_exit(fpl); 3154 mtx_unlock(&neglist->nl_lock); 3155 mtx_unlock(&ncneg_hot.nl_lock); 3156 vdrop(dvp); 3157 return (cache_fpl_handled(fpl, ENOENT)); 3158 out_abort: 3159 cache_fpl_smr_exit(fpl); 3160 mtx_unlock(&neglist->nl_lock); 3161 mtx_unlock(&ncneg_hot.nl_lock); 3162 vdrop(dvp); 3163 return (cache_fpl_aborted(fpl)); 3164 } 3165 3166 /* 3167 * The target vnode is not supported, prepare for the slow path to take over. 3168 */ 3169 static int 3170 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3171 { 3172 struct componentname *cnp; 3173 enum vgetstate dvs; 3174 struct vnode *dvp; 3175 struct pwd *pwd; 3176 seqc_t dvp_seqc; 3177 3178 cnp = fpl->cnp; 3179 dvp = fpl->dvp; 3180 dvp_seqc = fpl->dvp_seqc; 3181 3182 dvs = vget_prep_smr(dvp); 3183 if (dvs == VGET_NONE) { 3184 cache_fpl_smr_exit(fpl); 3185 return (cache_fpl_aborted(fpl)); 3186 } 3187 3188 cache_fpl_smr_exit(fpl); 3189 3190 vget_finish_ref(dvp, dvs); 3191 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3192 vrele(dvp); 3193 return (cache_fpl_aborted(fpl)); 3194 } 3195 3196 pwd = pwd_hold(curthread); 3197 if (fpl->pwd != pwd) { 3198 vrele(dvp); 3199 pwd_drop(pwd); 3200 return (cache_fpl_aborted(fpl)); 3201 } 3202 3203 fpl->ndp->ni_startdir = dvp; 3204 return (0); 3205 } 3206 3207 static int 3208 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3209 { 3210 struct componentname *cnp; 3211 struct vnode *tvp; 3212 seqc_t tvp_seqc; 3213 int error; 3214 3215 cnp = fpl->cnp; 3216 tvp = fpl->tvp; 3217 tvp_seqc = fpl->tvp_seqc; 3218 3219 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3220 error = vget_finish(tvp, cnp->cn_lkflags, tvs); 3221 if (error != 0) { 3222 return (cache_fpl_aborted(fpl)); 3223 } 3224 } else { 3225 vget_finish_ref(tvp, tvs); 3226 } 3227 3228 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3229 if ((cnp->cn_flags & LOCKLEAF) != 0) 3230 vput(tvp); 3231 else 3232 vrele(tvp); 3233 return (cache_fpl_aborted(fpl)); 3234 } 3235 3236 return (cache_fpl_handled(fpl, 0)); 3237 } 3238 3239 static int __noinline 3240 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3241 { 3242 enum vgetstate dvs, tvs; 3243 struct componentname *cnp; 3244 struct vnode *dvp, *tvp; 3245 seqc_t dvp_seqc, tvp_seqc; 3246 int error; 3247 3248 cnp = fpl->cnp; 3249 dvp = fpl->dvp; 3250 dvp_seqc = fpl->dvp_seqc; 3251 tvp = fpl->tvp; 3252 tvp_seqc = fpl->tvp_seqc; 3253 3254 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3255 3256 /* 3257 * This is less efficient than it can be for simplicity. 
3258 */ 3259 dvs = vget_prep_smr(dvp); 3260 if (dvs == VGET_NONE) { 3261 return (cache_fpl_aborted(fpl)); 3262 } 3263 tvs = vget_prep_smr(tvp); 3264 if (tvs == VGET_NONE) { 3265 cache_fpl_smr_exit(fpl); 3266 vget_abort(dvp, dvs); 3267 return (cache_fpl_aborted(fpl)); 3268 } 3269 3270 cache_fpl_smr_exit(fpl); 3271 3272 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3273 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3274 if (error != 0) { 3275 vget_abort(tvp, tvs); 3276 return (cache_fpl_aborted(fpl)); 3277 } 3278 } else { 3279 vget_finish_ref(dvp, dvs); 3280 } 3281 3282 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3283 vget_abort(tvp, tvs); 3284 if ((cnp->cn_flags & LOCKPARENT) != 0) 3285 vput(dvp); 3286 else 3287 vrele(dvp); 3288 cache_fpl_aborted(fpl); 3289 return (error); 3290 } 3291 3292 error = cache_fplookup_final_child(fpl, tvs); 3293 if (error != 0) { 3294 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3295 if ((cnp->cn_flags & LOCKPARENT) != 0) 3296 vput(dvp); 3297 else 3298 vrele(dvp); 3299 return (error); 3300 } 3301 3302 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3303 return (0); 3304 } 3305 3306 static int 3307 cache_fplookup_final(struct cache_fpl *fpl) 3308 { 3309 struct componentname *cnp; 3310 enum vgetstate tvs; 3311 struct vnode *dvp, *tvp; 3312 seqc_t dvp_seqc, tvp_seqc; 3313 3314 cnp = fpl->cnp; 3315 dvp = fpl->dvp; 3316 dvp_seqc = fpl->dvp_seqc; 3317 tvp = fpl->tvp; 3318 tvp_seqc = fpl->tvp_seqc; 3319 3320 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3321 3322 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3323 return (cache_fplookup_final_withparent(fpl)); 3324 3325 tvs = vget_prep_smr(tvp); 3326 if (tvs == VGET_NONE) { 3327 return (cache_fpl_partial(fpl)); 3328 } 3329 3330 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3331 cache_fpl_smr_exit(fpl); 3332 vget_abort(tvp, tvs); 3333 return (cache_fpl_aborted(fpl)); 3334 } 3335 3336 cache_fpl_smr_exit(fpl); 3337 return (cache_fplookup_final_child(fpl, tvs)); 3338 } 3339 3340 static int 3341 cache_fplookup_next(struct cache_fpl *fpl) 3342 { 3343 struct componentname *cnp; 3344 struct namecache *ncp; 3345 struct negstate *negstate; 3346 struct vnode *dvp, *tvp; 3347 u_char nc_flag; 3348 uint32_t hash; 3349 bool neg_hot; 3350 3351 cnp = fpl->cnp; 3352 dvp = fpl->dvp; 3353 3354 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 3355 fpl->tvp = dvp; 3356 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3357 if (seqc_in_modify(fpl->tvp_seqc)) { 3358 return (cache_fpl_aborted(fpl)); 3359 } 3360 return (0); 3361 } 3362 3363 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3364 3365 CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3366 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3367 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 3368 break; 3369 } 3370 3371 /* 3372 * If there is no entry we have to punt to the slow path to perform 3373 * actual lookup. Should there be nothing with this name a negative 3374 * entry will be created. 
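 * (either way the regular lookup will normally leave an entry behind,
 * positive or negative, so a later fast-path attempt on the same name
 * can be resolved right here.)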
3375 */ 3376 if (__predict_false(ncp == NULL)) { 3377 return (cache_fpl_partial(fpl)); 3378 } 3379 3380 tvp = atomic_load_ptr(&ncp->nc_vp); 3381 nc_flag = atomic_load_char(&ncp->nc_flag); 3382 if ((nc_flag & NCF_NEGATIVE) != 0) { 3383 negstate = NCP2NEGSTATE(ncp); 3384 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0); 3385 if (__predict_false(cache_ncp_invalid(ncp))) { 3386 return (cache_fpl_partial(fpl)); 3387 } 3388 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 3389 return (cache_fpl_partial(fpl)); 3390 } 3391 if (!neg_hot) { 3392 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 3393 } 3394 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 3395 ncp->nc_name); 3396 counter_u64_add(numneghits, 1); 3397 cache_fpl_smr_exit(fpl); 3398 return (cache_fpl_handled(fpl, ENOENT)); 3399 } 3400 3401 if (__predict_false(cache_ncp_invalid(ncp))) { 3402 return (cache_fpl_partial(fpl)); 3403 } 3404 3405 fpl->tvp = tvp; 3406 fpl->tvp_seqc = vn_seqc_read_any(tvp); 3407 if (seqc_in_modify(fpl->tvp_seqc)) { 3408 return (cache_fpl_partial(fpl)); 3409 } 3410 3411 if (!cache_fplookup_vnode_supported(tvp)) { 3412 return (cache_fpl_partial(fpl)); 3413 } 3414 3415 counter_u64_add(numposhits, 1); 3416 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 3417 return (0); 3418 } 3419 3420 static bool 3421 cache_fplookup_mp_supported(struct mount *mp) 3422 { 3423 3424 if (mp == NULL) 3425 return (false); 3426 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 3427 return (false); 3428 if ((mp->mnt_flag & MNT_UNION) != 0) 3429 return (false); 3430 return (true); 3431 } 3432 3433 /* 3434 * Walk up the mount stack (if any). 3435 * 3436 * Correctness is provided in the following ways: 3437 * - all vnodes are protected from freeing with SMR 3438 * - struct mount objects are type stable making them always safe to access 3439 * - stability of the particular mount is provided by busying it 3440 * - relationship between the vnode which is mounted on and the mount is 3441 * verified with the vnode sequence counter after busying 3442 * - association between root vnode of the mount and the mount is protected 3443 * by busy 3444 * 3445 * From that point on we can read the sequence counter of the root vnode 3446 * and get the next mount on the stack (if any) using the same protection. 3447 * 3448 * By the end of successful walk we are guaranteed the reached state was 3449 * indeed present at least at some point which matches the regular lookup. 
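 *
 * In pseudo code, mirroring cache_fplookup_climb_mount() below (error
 * handling and the unbusying of the previous mount omitted):
 *
 *	while ((mp = vp->v_mountedhere) != NULL) {
 *		vfs_op_thread_enter(mp);	// cheap form of busying the mount
 *		if (!vn_seqc_consistent(vp, vp_seqc))	// still covered by mp?
 *			return (cache_fpl_partial(fpl));
 *		vp = mp->mnt_rootvnode;
 *		vp_seqc = vn_seqc_read_any(vp);	// the root may be mounted on too
 *	}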
3450 */ 3451 static int 3452 cache_fplookup_climb_mount(struct cache_fpl *fpl) 3453 { 3454 struct mount *mp, *prev_mp; 3455 struct vnode *vp; 3456 seqc_t vp_seqc; 3457 3458 vp = fpl->tvp; 3459 vp_seqc = fpl->tvp_seqc; 3460 if (vp->v_type != VDIR) 3461 return (0); 3462 3463 mp = atomic_load_ptr(&vp->v_mountedhere); 3464 if (mp == NULL) 3465 return (0); 3466 3467 prev_mp = NULL; 3468 for (;;) { 3469 if (!vfs_op_thread_enter(mp)) { 3470 if (prev_mp != NULL) 3471 vfs_op_thread_exit(prev_mp); 3472 return (cache_fpl_partial(fpl)); 3473 } 3474 if (prev_mp != NULL) 3475 vfs_op_thread_exit(prev_mp); 3476 if (!vn_seqc_consistent(vp, vp_seqc)) { 3477 vfs_op_thread_exit(mp); 3478 return (cache_fpl_partial(fpl)); 3479 } 3480 if (!cache_fplookup_mp_supported(mp)) { 3481 vfs_op_thread_exit(mp); 3482 return (cache_fpl_partial(fpl)); 3483 } 3484 vp = atomic_load_ptr(&mp->mnt_rootvnode); 3485 if (vp == NULL || VN_IS_DOOMED(vp)) { 3486 vfs_op_thread_exit(mp); 3487 return (cache_fpl_partial(fpl)); 3488 } 3489 vp_seqc = vn_seqc_read_any(vp); 3490 if (seqc_in_modify(vp_seqc)) { 3491 vfs_op_thread_exit(mp); 3492 return (cache_fpl_partial(fpl)); 3493 } 3494 prev_mp = mp; 3495 mp = atomic_load_ptr(&vp->v_mountedhere); 3496 if (mp == NULL) 3497 break; 3498 } 3499 3500 vfs_op_thread_exit(prev_mp); 3501 fpl->tvp = vp; 3502 fpl->tvp_seqc = vp_seqc; 3503 return (0); 3504 } 3505 3506 /* 3507 * Parse the path. 3508 * 3509 * The code is mostly copy-pasted from regular lookup, see lookup(). 3510 * The structure is maintained along with comments for easier maintenance. 3511 * Deduplicating the code will become feasible after fast path lookup 3512 * becomes more feature-complete. 3513 */ 3514 static int 3515 cache_fplookup_parse(struct cache_fpl *fpl) 3516 { 3517 struct nameidata *ndp; 3518 struct componentname *cnp; 3519 char *cp; 3520 char *prev_ni_next; /* saved ndp->ni_next */ 3521 size_t prev_ni_pathlen; /* saved ndp->ni_pathlen */ 3522 3523 ndp = fpl->ndp; 3524 cnp = fpl->cnp; 3525 3526 /* 3527 * Search a new directory. 3528 * 3529 * The last component of the filename is left accessible via 3530 * cnp->cn_nameptr for callers that need the name. Callers needing 3531 * the name set the SAVENAME flag. When done, they assume 3532 * responsibility for freeing the pathname buffer. 3533 */ 3534 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 3535 continue; 3536 cnp->cn_namelen = cp - cnp->cn_nameptr; 3537 if (cnp->cn_namelen > NAME_MAX) { 3538 cache_fpl_smr_exit(fpl); 3539 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 3540 } 3541 prev_ni_pathlen = ndp->ni_pathlen; 3542 ndp->ni_pathlen -= cnp->cn_namelen; 3543 KASSERT(ndp->ni_pathlen <= PATH_MAX, 3544 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 3545 prev_ni_next = ndp->ni_next; 3546 ndp->ni_next = cp; 3547 3548 /* 3549 * Replace multiple slashes by a single slash and trailing slashes 3550 * by a null. This must be done before VOP_LOOKUP() because some 3551 * fs's don't know about trailing slashes. Remember if there were 3552 * trailing slashes to handle symlinks, existing non-directories 3553 * and non-existing files that won't be directories specially later. 3554 */ 3555 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 3556 cp++; 3557 ndp->ni_pathlen--; 3558 if (*cp == '\0') { 3559 /* 3560 * TODO 3561 * Regular lookup performs the following: 3562 * *ndp->ni_next = '\0'; 3563 * cnp->cn_flags |= TRAILINGSLASH; 3564 * 3565 * Which is problematic since it modifies data read 3566 * from userspace. 
Then if fast path lookup was to 3567 * abort we would have to either restore it or convey 3568 * the flag. Since this is a corner case just ignore 3569 * it for simplicity. 3570 */ 3571 return (cache_fpl_partial(fpl)); 3572 } 3573 } 3574 ndp->ni_next = cp; 3575 3576 cnp->cn_flags |= MAKEENTRY; 3577 3578 if (cnp->cn_namelen == 2 && 3579 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3580 cnp->cn_flags |= ISDOTDOT; 3581 else 3582 cnp->cn_flags &= ~ISDOTDOT; 3583 if (*ndp->ni_next == 0) 3584 cnp->cn_flags |= ISLASTCN; 3585 else 3586 cnp->cn_flags &= ~ISLASTCN; 3587 3588 /* 3589 * Check for degenerate name (e.g. / or "") 3590 * which is a way of talking about a directory, 3591 * e.g. like "/." or ".". 3592 * 3593 * TODO 3594 * Another corner case handled by the regular lookup 3595 */ 3596 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 3597 return (cache_fpl_partial(fpl)); 3598 } 3599 return (0); 3600 } 3601 3602 static void 3603 cache_fplookup_parse_advance(struct cache_fpl *fpl) 3604 { 3605 struct nameidata *ndp; 3606 struct componentname *cnp; 3607 3608 ndp = fpl->ndp; 3609 cnp = fpl->cnp; 3610 3611 cnp->cn_nameptr = ndp->ni_next; 3612 while (*cnp->cn_nameptr == '/') { 3613 cnp->cn_nameptr++; 3614 ndp->ni_pathlen--; 3615 } 3616 } 3617 3618 static int 3619 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 3620 { 3621 struct nameidata *ndp; 3622 struct componentname *cnp; 3623 struct mount *mp; 3624 int error; 3625 3626 error = CACHE_FPL_FAILED; 3627 ndp = fpl->ndp; 3628 ndp->ni_lcf = 0; 3629 cnp = fpl->cnp; 3630 cnp->cn_lkflags = LK_SHARED; 3631 if ((cnp->cn_flags & LOCKSHARED) == 0) 3632 cnp->cn_lkflags = LK_EXCLUSIVE; 3633 3634 cache_fpl_checkpoint(fpl, &fpl->snd); 3635 3636 fpl->dvp = dvp; 3637 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 3638 if (seqc_in_modify(fpl->dvp_seqc)) { 3639 cache_fpl_aborted(fpl); 3640 goto out; 3641 } 3642 mp = atomic_load_ptr(&fpl->dvp->v_mount); 3643 if (!cache_fplookup_mp_supported(mp)) { 3644 cache_fpl_aborted(fpl); 3645 goto out; 3646 } 3647 3648 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 3649 3650 for (;;) { 3651 error = cache_fplookup_parse(fpl); 3652 if (__predict_false(error != 0)) { 3653 break; 3654 } 3655 3656 if (cnp->cn_flags & ISDOTDOT) { 3657 error = cache_fpl_partial(fpl); 3658 break; 3659 } 3660 3661 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 3662 3663 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread); 3664 if (__predict_false(error != 0)) { 3665 switch (error) { 3666 case EAGAIN: 3667 case EOPNOTSUPP: /* can happen when racing against vgone */ 3668 cache_fpl_partial(fpl); 3669 break; 3670 default: 3671 /* 3672 * See the API contract for VOP_FPLOOKUP_VEXEC. 
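 * In short (see the full contract above cache_fplookup()): once the
 * sequence counter has changed, both false positives and false
 * negatives are possible, so the error is only treated as
 * authoritative after the vn_seqc_consistent() check below.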
3673 */ 3674 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 3675 error = cache_fpl_aborted(fpl); 3676 } else { 3677 cache_fpl_smr_exit(fpl); 3678 cache_fpl_handled(fpl, error); 3679 } 3680 break; 3681 } 3682 break; 3683 } 3684 3685 error = cache_fplookup_next(fpl); 3686 if (__predict_false(error != 0)) { 3687 break; 3688 } 3689 3690 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 3691 3692 error = cache_fplookup_climb_mount(fpl); 3693 if (__predict_false(error != 0)) { 3694 break; 3695 } 3696 3697 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 3698 3699 if (cnp->cn_flags & ISLASTCN) { 3700 error = cache_fplookup_final(fpl); 3701 break; 3702 } 3703 3704 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 3705 error = cache_fpl_aborted(fpl); 3706 break; 3707 } 3708 3709 fpl->dvp = fpl->tvp; 3710 fpl->dvp_seqc = fpl->tvp_seqc; 3711 3712 cache_fplookup_parse_advance(fpl); 3713 cache_fpl_checkpoint(fpl, &fpl->snd); 3714 } 3715 out: 3716 switch (fpl->status) { 3717 case CACHE_FPL_STATUS_UNSET: 3718 __assert_unreachable(); 3719 break; 3720 case CACHE_FPL_STATUS_PARTIAL: 3721 cache_fpl_smr_assert_entered(fpl); 3722 return (cache_fplookup_partial_setup(fpl)); 3723 case CACHE_FPL_STATUS_ABORTED: 3724 if (fpl->in_smr) 3725 cache_fpl_smr_exit(fpl); 3726 return (CACHE_FPL_FAILED); 3727 case CACHE_FPL_STATUS_HANDLED: 3728 cache_fpl_smr_assert_not_entered(fpl); 3729 if (__predict_false(error != 0)) { 3730 ndp->ni_dvp = NULL; 3731 ndp->ni_vp = NULL; 3732 cache_fpl_cleanup_cnp(cnp); 3733 return (error); 3734 } 3735 ndp->ni_dvp = fpl->dvp; 3736 ndp->ni_vp = fpl->tvp; 3737 if (cnp->cn_flags & SAVENAME) 3738 cnp->cn_flags |= HASBUF; 3739 else 3740 cache_fpl_cleanup_cnp(cnp); 3741 return (error); 3742 } 3743 } 3744 3745 /* 3746 * Fast path lookup protected with SMR and sequence counters. 3747 * 3748 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 3749 * 3750 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria 3751 * outlined below. 3752 * 3753 * Traditional vnode lookup conceptually looks like this: 3754 * 3755 * vn_lock(current); 3756 * for (;;) { 3757 * next = find(); 3758 * vn_lock(next); 3759 * vn_unlock(current); 3760 * current = next; 3761 * if (last) 3762 * break; 3763 * } 3764 * return (current); 3765 * 3766 * Each jump to the next vnode is safe memory-wise and atomic with respect to 3767 * any modifications thanks to holding respective locks. 3768 * 3769 * The same guarantee can be provided with a combination of safe memory 3770 * reclamation and sequence counters instead. If all operations which affect 3771 * the relationship between the current vnode and the one we are looking for 3772 * also modify the counter, we can verify whether all the conditions held as 3773 * we made the jump. This includes things like permissions, mount points etc. 3774 * Counter modification is provided by enclosing relevant places in 3775 * vn_seqc_write_begin()/end() calls. 
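 *
 * A writer about to change such state does, in sketch form:
 *
 *	vn_seqc_write_begin(vp);	// counter becomes odd, lockless readers back off
 *	... modify permissions, mount state, namecache linkage ...
 *	vn_seqc_write_end(vp);		// even again, but with a new value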
3776 * 3777 * Thus this translates to: 3778 * 3779 * vfs_smr_enter(); 3780 * dvp_seqc = seqc_read_any(dvp); 3781 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode 3782 * abort(); 3783 * for (;;) { 3784 * tvp = find(); 3785 * tvp_seqc = seqc_read_any(tvp); 3786 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode 3787 * abort(); 3788 * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode 3789 * abort(); 3790 * dvp = tvp; // we know nothing of importance has changed 3791 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration 3792 * if (last) 3793 * break; 3794 * } 3795 * vget(); // secure the vnode 3796 * if (!seqc_consistent(tvp, tvp_seqc) // final check 3797 * abort(); 3798 * // at this point we know nothing has changed for any parent<->child pair 3799 * // as they were crossed during the lookup, meaning we matched the guarantee 3800 * // of the locked variant 3801 * return (tvp); 3802 * 3803 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: 3804 * - they are called while within vfs_smr protection which they must never exit 3805 * - EAGAIN can be returned to denote checking could not be performed, it is 3806 * always valid to return it 3807 * - if the sequence counter has not changed the result must be valid 3808 * - if the sequence counter has changed both false positives and false negatives 3809 * are permitted (since the result will be rejected later) 3810 * - for simple cases of unix permission checks vaccess_vexec_smr can be used 3811 * 3812 * Caveats to watch out for: 3813 * - vnodes are passed unlocked and unreferenced with nothing stopping 3814 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised 3815 * to use atomic_load_ptr to fetch it. 3816 * - the aforementioned object can also get freed, meaning absent other means it 3817 * should be protected with vfs_smr 3818 * - either safely checking permissions as they are modified or guaranteeing 3819 * their stability is left to the routine 3820 */ 3821 int 3822 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, 3823 struct pwd **pwdp) 3824 { 3825 struct cache_fpl fpl; 3826 struct pwd *pwd; 3827 struct vnode *dvp; 3828 struct componentname *cnp; 3829 struct nameidata_saved orig; 3830 int error; 3831 3832 *status = CACHE_FPL_STATUS_UNSET; 3833 bzero(&fpl, sizeof(fpl)); 3834 fpl.status = CACHE_FPL_STATUS_UNSET; 3835 fpl.ndp = ndp; 3836 fpl.cnp = &ndp->ni_cnd; 3837 MPASS(curthread == fpl.cnp->cn_thread); 3838 3839 if (!cache_can_fplookup(&fpl)) { 3840 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 3841 *status = fpl.status; 3842 return (EOPNOTSUPP); 3843 } 3844 3845 cache_fpl_checkpoint(&fpl, &orig); 3846 3847 cache_fpl_smr_enter(&fpl); 3848 pwd = pwd_get_smr(); 3849 fpl.pwd = pwd; 3850 ndp->ni_rootdir = pwd->pwd_rdir; 3851 ndp->ni_topdir = pwd->pwd_jdir; 3852 3853 cnp = fpl.cnp; 3854 cnp->cn_nameptr = cnp->cn_pnbuf; 3855 if (cnp->cn_pnbuf[0] == '/') { 3856 cache_fpl_handle_root(ndp, &dvp); 3857 } else { 3858 MPASS(ndp->ni_dirfd == AT_FDCWD); 3859 dvp = pwd->pwd_cdir; 3860 } 3861 3862 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); 3863 3864 error = cache_fplookup_impl(dvp, &fpl); 3865 cache_fpl_smr_assert_not_entered(&fpl); 3866 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 3867 3868 *status = fpl.status; 3869 switch (fpl.status) { 3870 case CACHE_FPL_STATUS_UNSET: 3871 __assert_unreachable(); 3872 break; 3873 case CACHE_FPL_STATUS_HANDLED: 3874 
SDT_PROBE3(vfs, namei, lookup, return, error, 3875 (error == 0 ? ndp->ni_vp : NULL), true); 3876 break; 3877 case CACHE_FPL_STATUS_PARTIAL: 3878 *pwdp = fpl.pwd; 3879 cache_fpl_restore(&fpl, &fpl.snd); 3880 break; 3881 case CACHE_FPL_STATUS_ABORTED: 3882 cache_fpl_restore(&fpl, &orig); 3883 break; 3884 } 3885 return (error); 3886 } 3887
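
/*
 * Illustrative sketch of the filesystem side of opting in, using the
 * hypothetical "somefs" as a stand-in (the node fields and helper names
 * here are assumptions, not taken from any real filesystem).  The mount
 * announces support with MNTK_FPLOOKUP and VOP_FPLOOKUP_VEXEC is
 * implemented under the contract documented above cache_fplookup(),
 * typically by leaning on vaccess_vexec_smr() for plain unix permission
 * bits:
 *
 *	static int
 *	somefs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct somefs_node *node;
 *
 *		// ->v_data may be ripped out from under us, see the caveats
 *		node = atomic_load_ptr(&v->a_vp->v_data);
 *		if (__predict_false(node == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(node->sn_mode, node->sn_uid,
 *		    node->sn_gid, v->a_cred));
 *	}
 *
 *	// ...and in somefs_mount(), once the mount is set up:
 *	MNT_ILOCK(mp);
 *	mp->mnt_kern_flag |= MNTK_FPLOOKUP;
 *	MNT_IUNLOCK(mp);
 */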