/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <sys/capsicum.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);
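
/*
 * Illustrative note: the SDT probes above are reachable from DTrace as
 * vfs:namecache:<function>:<name>.  For example, assuming a DTrace-enabled
 * kernel, a sketch of counting positive lookup hits per name:
 *
 *	dtrace -n 'vfs:namecache:lookup:hit { @[stringof(arg1)] = count(); }'
 */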

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct	vnode *nu_vp;	/* vnode the name refers to */
		struct	negstate nu_neg;/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct	timespec nc_time;	/* timespec provided by fs */
	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned. Since others
 * may be in the same spot suffer a little bit and enforce the
 * alignment for everyone. Note this is a nop for 64-bit platforms.
 */
#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
#define	CACHE_PATH_CUTOFF	39

#define CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
#define CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

#define	nc_vp		n_un.nu_vp
#define	nc_neg		n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	0x01
#define NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define NEG_HOT		0x01

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
static bool
cache_ncp_canuse(struct namecache *ncp)
{

	atomic_thread_fence_acq();
	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
}

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	mtx	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks. The locking
 * order is lower address first. Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be locked to provide safe operation against other threads modifying the
 * cache.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock. However, in the opposite case, this could
 * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 * the first node, locking everything in order and revalidating the state.
 */
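
/*
 * A minimal sketch of the trylock-or-relock dance described above, with
 * hypothetical locks vlp_held (already owned) and vlp_wanted (the one still
 * needed); the real code lives in e.g. cache_zap_locked_vnode_kl2() and
 * cache_zap_unlocked_bucket():
 *
 *	if (vlp_wanted > vlp_held) {
 *		mtx_lock(vlp_wanted);		// order preserved, just lock
 *	} else if (!mtx_trylock(vlp_wanted)) {
 *		mtx_unlock(vlp_held);		// would invert the order
 *		cache_sort_vnodes(&vlp_held, &vlp_wanted);
 *		cache_lock_vnodes(vlp_held, vlp_wanted);
 *		// ... re-lookup the entry and revalidate before use
 *	}
 */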

VFS_SMR_DECLARE;

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");

static struct mtx __exclusive_cache_line	ncneg_shrink_lock;

#define ncneghash	3
#define	numneglists	(ncneghash + 1)

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
	TAILQ_HEAD(, namecache) nl_hotlist;
	u_long			nl_hotnum;
} __aligned(CACHE_LINE_SIZE);

static struct neglist neglists[numneglists];

static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct mtx_padalign __read_mostly  *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

static struct namecache *
cache_alloc(int len, int ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	MPASS(ncp != NULL);
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	*tsp = ncp_ts->nc_time;
	*ticksp = ncp_ts->nc_ticks;
}
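
/*
 * Zone selection in the two helpers above: a name of up to CACHE_PATH_CUTOFF
 * (39) bytes is served from the "small" zone, anything longer (up to NAME_MAX)
 * from the "large" one, with the *_ts variants picked when timestamps are
 * requested.  Allocation and free must agree on the zone, which is why
 * cache_free() keys off nc_nlen and NCF_TS rather than taking a zone argument.
 */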
hits"); 450 STATNODE_COUNTER(nummiss, "Number of cache misses"); 451 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 452 STATNODE_COUNTER(numposzaps, 453 "Number of cache hits (positive) we do not want to cache"); 454 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 455 STATNODE_COUNTER(numnegzaps, 456 "Number of cache hits (negative) we do not want to cache"); 457 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 458 /* These count for vn_getcwd(), too. */ 459 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 460 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 461 STATNODE_COUNTER(numfullpathfail2, 462 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 463 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 464 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 465 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 466 "Number of successful removals after relocking"); 467 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 468 "Number of times zap_and_exit failed to lock"); 469 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 470 "Number of times zap_and_exit failed to lock"); 471 static long cache_lock_vnodes_cel_3_failures; 472 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 473 "Number of times 3-way vnode locking failed"); 474 STATNODE_COUNTER(numneg_evicted, 475 "Number of negative entries evicted when adding a new entry"); 476 STATNODE_COUNTER(shrinking_skipped, 477 "Number of times shrinking was already in progress"); 478 479 static void cache_zap_locked(struct namecache *ncp); 480 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, 481 char **freebuf, size_t *buflen); 482 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 483 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend); 484 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 485 char **retbuf, size_t *buflen); 486 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 487 char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 488 489 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 490 491 static inline void 492 cache_assert_vlp_locked(struct mtx *vlp) 493 { 494 495 if (vlp != NULL) 496 mtx_assert(vlp, MA_OWNED); 497 } 498 499 static inline void 500 cache_assert_vnode_locked(struct vnode *vp) 501 { 502 struct mtx *vlp; 503 504 vlp = VP2VNODELOCK(vp); 505 cache_assert_vlp_locked(vlp); 506 } 507 508 /* 509 * TODO: With the value stored we can do better than computing the hash based 510 * on the address. The choice of FNV should also be revisited. 

/*
 * TODO: With the value stored we can do better than computing the hash based
 * on the address. The choice of FNV should also be revisited.
 */
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}

static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct mtx *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}
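
/*
 * Putting the pieces above together, finding where a (dvp, name) pair lives
 * boils down to (a sketch of what the lookup paths below do):
 *
 *	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 *	bucket = NCHHASH(hash);		// CK_SLIST of candidate entries
 *	blp = HASH2BUCKETLOCK(hash);	// mtx covering that set of buckets
 */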

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_OWNED);
}

static void
cache_assert_bucket_unlocked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_NOTOWNED);
}
#else
#define cache_assert_bucket_locked(x) do { } while (0)
#define cache_assert_bucket_unlocked(x) do { } while (0)
#endif

#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_lock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_unlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

static int
sysctl_hotnum(SYSCTL_HANDLER_ARGS)
{
	int i, out;

	out = 0;
	for (i = 0; i < numneglists; i++)
		out += neglists[i].nl_hotnum;

	return (SYSCTL_OUT(req, &out, sizeof(out)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, hotnum, CTLTYPE_INT | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_hotnum, "I",
    "Number of hot negative entries");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif
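
/*
 * On DIAGNOSTIC kernels the handlers above can be queried from userland, e.g.
 * "sysctl debug.hashstat.nchash" reports the total and used bucket counts,
 * the longest chain and the usage percentage computed by the handler.
 */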

/*
 * Negative entries management
 *
 * A variation of LRU scheme is used. New entries are hashed into one of
 * numneglists cold lists. Entries get promoted to the hot list on first hit.
 *
 * The shrinker will demote hot list head and evict from the cold list in a
 * round-robin manner.
 */
static void
cache_negative_init(struct namecache *ncp)
{
	struct negstate *ns;

	ncp->nc_flag |= NCF_NEGATIVE;
	ns = NCP2NEGSTATE(ncp);
	ns->neg_flag = 0;
}

/*
 * Move a negative entry to the hot list.
 */
static void
cache_negative_promote(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	if ((ns->neg_flag & NEG_HOT) == 0) {
		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
		nl->nl_hotnum++;
		ns->neg_flag |= NEG_HOT;
	}
}

/*
 * Move a negative entry to the hot list if it matches the lookup.
 *
 * We have to take locks, but they may be contended and in the worst
 * case we may need to go off CPU. We don't want to spin within the
 * smr section and we can't block with it. Exiting the section means
 * the found entry could have been evicted. We are going to look it
 * up again.
 */
static bool
cache_negative_promote_cond(struct vnode *dvp, struct componentname *cnp,
    struct namecache *oncp, uint32_t hash)
{
	struct namecache *ncp;
	struct neglist *nl;
	u_char nc_flag;

	nl = NCP2NEGLIST(oncp);

	mtx_lock(&nl->nl_lock);
	/*
	 * For hash iteration.
	 */
	vfs_smr_enter();

	/*
	 * Avoid all surprises by only succeeding if we got the same entry and
	 * bailing completely otherwise.
	 * XXX There are no provisions to keep the vnode around, meaning we may
	 * end up promoting a negative entry for a *new* vnode and returning
	 * ENOENT on its account. This is the error we want to return anyway
	 * and promotion is harmless.
	 *
	 * In particular at this point there can be a new ncp which matches the
	 * search but hashes to a different neglist.
	 */
	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp == oncp)
			break;
	}

	/*
	 * No match to begin with.
	 */
	if (__predict_false(ncp == NULL)) {
		goto out_abort;
	}

	/*
	 * The newly found entry may be something different...
	 */
	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
		goto out_abort;
	}

	/*
	 * ... and not even negative.
	 */
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_NEGATIVE) == 0) {
		goto out_abort;
	}

	if (__predict_false(!cache_ncp_canuse(ncp))) {
		goto out_abort;
	}

	cache_negative_promote(ncp);

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	counter_u64_add(numneghits, 1);
	vfs_smr_exit();
	mtx_unlock(&nl->nl_lock);
	return (true);
out_abort:
	vfs_smr_exit();
	mtx_unlock(&nl->nl_lock);
	return (false);
}

static void
cache_negative_hit(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	if ((ns->neg_flag & NEG_HOT) != 0)
		return;
	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	cache_negative_promote(ncp);
	mtx_unlock(&nl->nl_lock);
}

static void
cache_negative_insert(struct namecache *ncp)
{
	struct neglist *nl;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	mtx_unlock(&nl->nl_lock);
	atomic_add_long(&numneg, 1);
}

static void
cache_negative_remove(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	ns = NCP2NEGSTATE(ncp);
	mtx_lock(&nl->nl_lock);
	if ((ns->neg_flag & NEG_HOT) != 0) {
		TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
		nl->nl_hotnum--;
	} else {
		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
	}
	mtx_unlock(&nl->nl_lock);
	atomic_subtract_long(&numneg, 1);
}

static struct neglist *
cache_negative_shrink_select(void)
{
	struct neglist *nl;
	static u_int cycle;
	u_int i;

	cycle++;
	for (i = 0; i < numneglists; i++) {
		nl = &neglists[(cycle + i) % numneglists];
		if (TAILQ_FIRST(&nl->nl_list) == NULL &&
		    TAILQ_FIRST(&nl->nl_hotlist) == NULL)
			continue;
		mtx_lock(&nl->nl_lock);
		if (TAILQ_FIRST(&nl->nl_list) != NULL ||
		    TAILQ_FIRST(&nl->nl_hotlist) != NULL)
			return (nl);
		mtx_unlock(&nl->nl_lock);
	}

	return (NULL);
}
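
/*
 * Evict a single negative entry: demote the head of the selected hot list
 * (if any) to the cold list, then zap the head of the cold list.  The neglist
 * is picked round-robin by cache_negative_shrink_select() and the victim is
 * revalidated after the lock dance before being freed.
 */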

static void
cache_negative_zap_one(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *nl;
	struct negstate *ns;
	struct mtx *dvlp;
	struct mtx *blp;

	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
	    !mtx_trylock(&ncneg_shrink_lock)) {
		counter_u64_add(shrinking_skipped, 1);
		return;
	}

	nl = cache_negative_shrink_select();
	mtx_unlock(&ncneg_shrink_lock);
	if (nl == NULL) {
		return;
	}

	ncp = TAILQ_FIRST(&nl->nl_hotlist);
	if (ncp != NULL) {
		ns = NCP2NEGSTATE(ncp);
		TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
		nl->nl_hotnum--;
		ns->neg_flag &= ~NEG_HOT;
	}
	ncp = TAILQ_FIRST(&nl->nl_list);
	MPASS(ncp != NULL);
	ns = NCP2NEGSTATE(ncp);
	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	blp = NCP2BUCKETLOCK(ncp);
	mtx_unlock(&nl->nl_lock);
	mtx_lock(dvlp);
	mtx_lock(blp);
	/*
	 * Enter SMR to safely check the negative list.
	 * Even if the found pointer matches, the entry may now be reallocated
	 * and used by a different vnode.
	 */
	vfs_smr_enter();
	ncp2 = TAILQ_FIRST(&nl->nl_list);
	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
	    blp != NCP2BUCKETLOCK(ncp2)) {
		vfs_smr_exit();
		ncp = NULL;
	} else {
		vfs_smr_exit();
		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(numneg_evicted, 1);
	}
	mtx_unlock(blp);
	mtx_unlock(dvlp);
	if (ncp != NULL)
		cache_free(ncp);
}

/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_vp);
			ncp->nc_vp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_vp);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_negative_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_dvp);
			ncp->nc_dvp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_dvp);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			counter_u64_add(numcachehv, -1);
		}
	}
	atomic_subtract_long(&numcache, 1);
}
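
/*
 * Callers of cache_zap_locked() hold the parent (and, for positive entries,
 * the target) vnodelock plus the entry's bucketlock; the entry itself is only
 * handed to cache_free() after those locks have been dropped.  The helpers
 * below exist to acquire that lock set from various starting points.
 */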

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct mtx *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

/*
 * If trylocking failed we can get here. We know enough to take all needed locks
 * in the right order and re-lookup the entry.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct mtx *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_unlocked(ncp);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	mtx_lock(blp);
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
		return (0);
	}

	mtx_unlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct mtx *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	mtx_unlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct mtx *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			vn_seqc_write_begin(dvp);
			dvp->v_cache_dd = NULL;
			vn_seqc_write_end(dvp);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
		return (1);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (ncp == NULL) {
		mtx_unlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail++;
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
	cache_free(ncp);
	return (1);
out_no_entry:
	counter_u64_add(nummisszap, 1);
	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
	return (0);
}
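
/*
 * Handle a "." lookup: return dvp itself with an extra reference, adjusting
 * the lock type if the caller asked for something other than what dvp
 * currently holds.
 */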
static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static int __noinline
cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct mtx *dvlp;
	enum vgetstate vs;
	int error, ltype;
	bool whiteout;

	MPASS((cnp->cn_flags & ISDOTDOT) != 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	counter_u64_add(dotdothits, 1);
retry:
	dvlp = VP2VNODELOCK(dvp);
	mtx_lock(dvlp);
	ncp = dvp->v_cache_dd;
	if (ncp == NULL) {
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
		mtx_unlock(dvlp);
		return (0);
	}
	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
		if (ncp->nc_flag & NCF_NEGATIVE)
			*vpp = NULL;
		else
			*vpp = ncp->nc_vp;
	} else
		*vpp = ncp->nc_dvp;
	if (*vpp == NULL)
		goto negative_success;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
	    NCF_DTS && tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		*tsp = ncp_ts->nc_dotdottime;
	}

	MPASS(dvp != *vpp);
	ltype = VOP_ISLOCKED(dvp);
	VOP_UNLOCK(dvp);
	vs = vget_prep(*vpp);
	mtx_unlock(dvlp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	vn_lock(dvp, ltype | LK_RETRY);
	if (VN_IS_DOOMED(dvp)) {
		if (error == 0)
			vput(*vpp);
		*vpp = NULL;
		return (ENOENT);
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			cache_zap_negative_locked_vnode_kl(ncp, dvp);
			mtx_unlock(dvlp);
			cache_free(ncp);
			return (0);
		}
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_negative_hit(ncp);
	mtx_unlock(dvlp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it will
 *		be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * Either both tsp and ticks have to be provided or neither of them.
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  vpp will not be modified.  If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
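
/*
 * A sketch of how a caller might consume the return values documented above
 * (illustrative only, with the rest of the lookup handler elided):
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	switch (error) {
 *	case -1:	// positive hit, *vpp is locked and referenced
 *		return (0);
 *	case ENOENT:	// negative hit, the name is known not to exist
 *		return (ENOENT);
 *	default:	// 0: miss, fall back to scanning the directory
 *		break;
 *	}
 */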

static int __noinline
cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct mtx *blp;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout;

	MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY);

retry:
	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		mtx_unlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	vs = vget_prep(*vpp);
	mtx_unlock(blp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
			if (__predict_false(error != 0)) {
				zap_and_exit_bucket_fail2++;
				goto retry;
			}
			cache_free(ncp);
			return (0);
		}
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_negative_hit(ncp);
	mtx_unlock(blp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct negstate *ns;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout, neg_hot;
	u_short nc_flag;

	MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
			return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
	}

	MPASS((cnp->cn_flags & ISDOTDOT) == 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	vfs_smr_enter();

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		vfs_smr_exit();
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	nc_flag = atomic_load_char(&ncp->nc_flag);
	if (nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	if (!cache_ncp_canuse(ncp)) {
		vfs_smr_exit();
		*vpp = NULL;
		goto out_fallback;
	}
	vs = vget_prep_smr(*vpp);
	vfs_smr_exit();
	if (__predict_false(vs == VGET_NONE)) {
		*vpp = NULL;
		goto out_fallback;
	}
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto out_fallback;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			vfs_smr_exit();
			goto out_fallback;
		}
	}

	cache_out_ts(ncp, tsp, ticksp);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	ns = NCP2NEGSTATE(ncp);
	neg_hot = ((ns->neg_flag & NEG_HOT) != 0);
	if (__predict_false(!cache_ncp_canuse(ncp))) {
		vfs_smr_exit();
		goto out_fallback;
	}
	if (!neg_hot) {
		vfs_smr_exit();
		if (!cache_negative_promote_cond(dvp, cnp, ncp, hash))
			goto out_fallback;
	} else {
		SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
		counter_u64_add(numneghits, 1);
		vfs_smr_exit();
	}
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
out_fallback:
	return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
}
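
/*
 * Lock state shared by the cache_enter*() helpers below: up to three vnode
 * locks and two bucket locks, always taken in the sorted (lower address
 * first) order described at the top of the file.
 */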
struct celockstate {
	struct mtx *vlp[3];
	struct mtx *blp[2];
};
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));

static inline void
cache_celockstate_init(struct celockstate *cel)
{

	bzero(cel, sizeof(*cel));
}

static void
cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
    struct vnode *dvp)
{
	struct mtx *vlp1, *vlp2;

	MPASS(cel->vlp[0] == NULL);
	MPASS(cel->vlp[1] == NULL);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL || dvp != NULL);

	vlp1 = VP2VNODELOCK(vp);
	vlp2 = VP2VNODELOCK(dvp);
	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		mtx_lock(vlp1);
		cel->vlp[0] = vlp1;
	}
	mtx_lock(vlp2);
	cel->vlp[1] = vlp2;
}

static void
cache_unlock_vnodes_cel(struct celockstate *cel)
{

	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);

	if (cel->vlp[0] != NULL)
		mtx_unlock(cel->vlp[0]);
	if (cel->vlp[1] != NULL)
		mtx_unlock(cel->vlp[1]);
	if (cel->vlp[2] != NULL)
		mtx_unlock(cel->vlp[2]);
}

static bool
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
{
	struct mtx *vlp;
	bool ret;

	cache_assert_vlp_locked(cel->vlp[0]);
	cache_assert_vlp_locked(cel->vlp[1]);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL);
	vlp = VP2VNODELOCK(vp);

	ret = true;
	if (vlp >= cel->vlp[1]) {
		mtx_lock(vlp);
	} else {
		if (mtx_trylock(vlp))
			goto out;
		cache_lock_vnodes_cel_3_failures++;
		cache_unlock_vnodes_cel(cel);
		if (vlp < cel->vlp[0]) {
			mtx_lock(vlp);
			mtx_lock(cel->vlp[0]);
			mtx_lock(cel->vlp[1]);
		} else {
			if (cel->vlp[0] != NULL)
				mtx_lock(cel->vlp[0]);
			mtx_lock(vlp);
			mtx_lock(cel->vlp[1]);
		}
		ret = false;
	}
out:
	cel->vlp[2] = vlp;
	return (ret);
}

static void
cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
    struct mtx *blp2)
{

	MPASS(cel->blp[0] == NULL);
	MPASS(cel->blp[1] == NULL);

	cache_sort_vnodes(&blp1, &blp2);

	if (blp1 != NULL) {
		mtx_lock(blp1);
		cel->blp[0] = blp1;
	}
	mtx_lock(blp2);
	cel->blp[1] = blp2;
}

static void
cache_unlock_buckets_cel(struct celockstate *cel)
{

	if (cel->blp[0] != NULL)
		mtx_unlock(cel->blp[0]);
	mtx_unlock(cel->blp[1]);
}

/*
 * Lock part of the cache affected by the insertion.
 *
 * This means vnodelocks for dvp, vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry. In this
 * case we have an additional vnode and bucketlock pair to lock.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 * preserving the locking order (smaller address first).
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct mtx *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		if (vp == NULL || vp->v_type != VDIR)
			break;
		ncp = vp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == vp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		/*
		 * All vnodes got re-locked. Re-validate the state and if
		 * nothing changed we are done. Otherwise restart.
		 */
		if (ncp == vp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct mtx *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == dvp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		if (ncp == dvp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}

static void __noinline
cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp)
{
	struct celockstate cel;
	struct namecache *ncp;
	uint32_t hash;
	int len;

	if (dvp->v_cache_dd == NULL)
		return;
	len = cnp->cn_namelen;
	cache_celockstate_init(&cel);
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	cache_enter_lock_dd(&cel, dvp, vp, hash);
	vn_seqc_write_begin(dvp);
	ncp = dvp->v_cache_dd;
	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
		cache_zap_locked(ncp);
	} else {
		ncp = NULL;
	}
	dvp->v_cache_dd = NULL;
	vn_seqc_write_end(dvp);
	cache_enter_unlock(&cel);
	if (ncp != NULL)
		cache_free(ncp);
}
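
/*
 * Filesystems normally reach cache_enter_time() through the cache_enter()
 * wrapper (a macro in sys/vnode.h passing NULL timestamps), typically from
 * VOP_LOOKUP once a directory scan has resolved a name or determined that it
 * does not exist.  A sketch of the common calls, assuming dvp, vp and cnp
 * come from the caller:
 *
 *	cache_enter(dvp, vp, cnp);	// positive entry
 *	cache_enter(dvp, NULL, cnp);	// negative entry: name known absent
 */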
1924 */ 1925 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1926 if (__predict_false(lnumcache >= ncsize)) { 1927 atomic_subtract_long(&numcache, 1); 1928 counter_u64_add(numdrops, 1); 1929 return; 1930 } 1931 1932 cache_celockstate_init(&cel); 1933 ndd = NULL; 1934 ncp_ts = NULL; 1935 1936 /* 1937 * Calculate the hash key and setup as much of the new 1938 * namecache entry as possible before acquiring the lock. 1939 */ 1940 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1941 ncp->nc_flag = flag | NCF_WIP; 1942 ncp->nc_vp = vp; 1943 if (vp == NULL) 1944 cache_negative_init(ncp); 1945 ncp->nc_dvp = dvp; 1946 if (tsp != NULL) { 1947 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1948 ncp_ts->nc_time = *tsp; 1949 ncp_ts->nc_ticks = ticks; 1950 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1951 if (dtsp != NULL) { 1952 ncp_ts->nc_dotdottime = *dtsp; 1953 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1954 } 1955 } 1956 len = ncp->nc_nlen = cnp->cn_namelen; 1957 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1958 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 1959 ncp->nc_name[len] = '\0'; 1960 cache_enter_lock(&cel, dvp, vp, hash); 1961 1962 /* 1963 * See if this vnode or negative entry is already in the cache 1964 * with this name. This can happen with concurrent lookups of 1965 * the same path name. 1966 */ 1967 ncpp = NCHHASH(hash); 1968 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 1969 if (n2->nc_dvp == dvp && 1970 n2->nc_nlen == cnp->cn_namelen && 1971 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1972 MPASS(cache_ncp_canuse(n2)); 1973 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 1974 KASSERT(vp == NULL, 1975 ("%s: found entry pointing to a different vnode (%p != %p)", 1976 __func__, NULL, vp)); 1977 else 1978 KASSERT(n2->nc_vp == vp, 1979 ("%s: found entry pointing to a different vnode (%p != %p)", 1980 __func__, n2->nc_vp, vp)); 1981 /* 1982 * Entries are supposed to be immutable unless in the 1983 * process of getting destroyed. Accommodating for 1984 * changing timestamps is possible but not worth it. 1985 * This should be harmless in terms of correctness, in 1986 * the worst case resulting in an earlier expiration. 1987 * Alternatively, the found entry can be replaced 1988 * altogether. 1989 */ 1990 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 1991 #if 0 1992 if (tsp != NULL) { 1993 KASSERT((n2->nc_flag & NCF_TS) != 0, 1994 ("no NCF_TS")); 1995 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1996 n2_ts->nc_time = ncp_ts->nc_time; 1997 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1998 if (dtsp != NULL) { 1999 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2000 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2001 } 2002 } 2003 #endif 2004 goto out_unlock_free; 2005 } 2006 } 2007 2008 if (flag == NCF_ISDOTDOT) { 2009 /* 2010 * See if we are trying to add .. entry, but some other lookup 2011 * has populated v_cache_dd pointer already. 2012 */ 2013 if (dvp->v_cache_dd != NULL) 2014 goto out_unlock_free; 2015 KASSERT(vp == NULL || vp->v_type == VDIR, 2016 ("wrong vnode type %p", vp)); 2017 vn_seqc_write_begin(dvp); 2018 dvp->v_cache_dd = ncp; 2019 vn_seqc_write_end(dvp); 2020 } 2021 2022 if (vp != NULL) { 2023 if (flag != NCF_ISDOTDOT) { 2024 /* 2025 * For this case, the cache entry maps both the 2026 * directory name in it and the name ".." for the 2027 * directory's parent. 
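 * To that end v_cache_dd of the child directory is pointed at the new entry,
 * and any previously installed ".." entry is zapped (and freed later as ndd).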
2028 */ 2029 vn_seqc_write_begin(vp); 2030 if ((ndd = vp->v_cache_dd) != NULL) { 2031 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2032 cache_zap_locked(ndd); 2033 else 2034 ndd = NULL; 2035 } 2036 vp->v_cache_dd = ncp; 2037 vn_seqc_write_end(vp); 2038 } else if (vp->v_type != VDIR) { 2039 if (vp->v_cache_dd != NULL) { 2040 vn_seqc_write_begin(vp); 2041 vp->v_cache_dd = NULL; 2042 vn_seqc_write_end(vp); 2043 } 2044 } 2045 } 2046 2047 if (flag != NCF_ISDOTDOT) { 2048 if (LIST_EMPTY(&dvp->v_cache_src)) { 2049 vhold(dvp); 2050 counter_u64_add(numcachehv, 1); 2051 } 2052 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2053 } 2054 2055 /* 2056 * If the entry is "negative", we place it into the 2057 * "negative" cache queue, otherwise, we place it into the 2058 * destination vnode's cache entries queue. 2059 */ 2060 if (vp != NULL) { 2061 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2062 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2063 vp); 2064 } else { 2065 if (cnp->cn_flags & ISWHITEOUT) 2066 ncp->nc_flag |= NCF_WHITE; 2067 cache_negative_insert(ncp); 2068 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2069 ncp->nc_name); 2070 } 2071 2072 /* 2073 * Insert the new namecache entry into the appropriate chain 2074 * within the cache entries table. 2075 */ 2076 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2077 2078 atomic_thread_fence_rel(); 2079 /* 2080 * Mark the entry as fully constructed. 2081 * It is immutable past this point until its removal. 2082 */ 2083 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2084 2085 cache_enter_unlock(&cel); 2086 if (numneg * ncnegfactor > lnumcache) 2087 cache_negative_zap_one(); 2088 if (ndd != NULL) 2089 cache_free(ndd); 2090 return; 2091 out_unlock_free: 2092 cache_enter_unlock(&cel); 2093 atomic_subtract_long(&numcache, 1); 2094 cache_free(ncp); 2095 return; 2096 } 2097 2098 static u_int 2099 cache_roundup_2(u_int val) 2100 { 2101 u_int res; 2102 2103 for (res = 1; res <= val; res <<= 1) 2104 continue; 2105 2106 return (res); 2107 } 2108 2109 static struct nchashhead * 2110 nchinittbl(u_long elements, u_long *hashmask) 2111 { 2112 struct nchashhead *hashtbl; 2113 u_long hashsize, i; 2114 2115 hashsize = cache_roundup_2(elements) / 2; 2116 2117 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2118 for (i = 0; i < hashsize; i++) 2119 CK_SLIST_INIT(&hashtbl[i]); 2120 *hashmask = hashsize - 1; 2121 return (hashtbl); 2122 } 2123 2124 static void 2125 ncfreetbl(struct nchashhead *hashtbl) 2126 { 2127 2128 free(hashtbl, M_VFSCACHE); 2129 } 2130 2131 /* 2132 * Name cache initialization, from vfs_init() when we are booting 2133 */ 2134 static void 2135 nchinit(void *dummy __unused) 2136 { 2137 u_int i; 2138 2139 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2140 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2141 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2142 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2143 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2144 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2145 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2146 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2147 2148 VFS_SMR_ZONE_SET(cache_zone_small); 2149 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2150 VFS_SMR_ZONE_SET(cache_zone_large); 2151 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2152 2153 ncsize = desiredvnodes * ncsizefactor; 2154 
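	/*
	 * Size the hash table off of desiredvnodes: nchinittbl() rounds the
	 * requested count up to a power of 2 and halves it, storing the
	 * resulting mask in nchash. For instance, desiredvnodes = 100000
	 * yields cache_roundup_2(200000) / 2 = 131072 chains and a mask of
	 * 131071 (the exact numbers depend on the tunable, this is just an
	 * illustration).
	 */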
nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2155 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2156 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2157 ncbuckethash = 7; 2158 if (ncbuckethash > nchash) 2159 ncbuckethash = nchash; 2160 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2161 M_WAITOK | M_ZERO); 2162 for (i = 0; i < numbucketlocks; i++) 2163 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2164 ncvnodehash = ncbuckethash; 2165 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2166 M_WAITOK | M_ZERO); 2167 for (i = 0; i < numvnodelocks; i++) 2168 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2169 2170 for (i = 0; i < numneglists; i++) { 2171 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2172 TAILQ_INIT(&neglists[i].nl_list); 2173 TAILQ_INIT(&neglists[i].nl_hotlist); 2174 } 2175 2176 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2177 } 2178 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2179 2180 void 2181 cache_vnode_init(struct vnode *vp) 2182 { 2183 2184 LIST_INIT(&vp->v_cache_src); 2185 TAILQ_INIT(&vp->v_cache_dst); 2186 vp->v_cache_dd = NULL; 2187 cache_prehash(vp); 2188 } 2189 2190 void 2191 cache_changesize(u_long newmaxvnodes) 2192 { 2193 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2194 u_long new_nchash, old_nchash; 2195 struct namecache *ncp; 2196 uint32_t hash; 2197 u_long newncsize; 2198 int i; 2199 2200 newncsize = newmaxvnodes * ncsizefactor; 2201 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2202 if (newmaxvnodes < numbucketlocks) 2203 newmaxvnodes = numbucketlocks; 2204 2205 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2206 /* If same hash table size, nothing to do */ 2207 if (nchash == new_nchash) { 2208 ncfreetbl(new_nchashtbl); 2209 return; 2210 } 2211 /* 2212 * Move everything from the old hash table to the new table. 2213 * None of the namecache entries in the table can be removed 2214 * because to do so, they have to be removed from the hash table. 2215 */ 2216 cache_lock_all_vnodes(); 2217 cache_lock_all_buckets(); 2218 old_nchashtbl = nchashtbl; 2219 old_nchash = nchash; 2220 nchashtbl = new_nchashtbl; 2221 nchash = new_nchash; 2222 for (i = 0; i <= old_nchash; i++) { 2223 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2224 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2225 ncp->nc_dvp); 2226 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2227 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2228 } 2229 } 2230 ncsize = newncsize; 2231 cache_unlock_all_buckets(); 2232 cache_unlock_all_vnodes(); 2233 ncfreetbl(old_nchashtbl); 2234 } 2235 2236 /* 2237 * Invalidate all entries from and to a particular vnode. 
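 * Entries to be freed are first collected on a local list while the relevant
 * vnode locks are held; cache_zap_locked_vnode_kl2() may fail to obtain the
 * lock of the other vnode involved, in which case the scan is restarted.
 * The actual freeing only happens after all the locks are dropped.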
2238 */ 2239 static void 2240 cache_purge_impl(struct vnode *vp) 2241 { 2242 TAILQ_HEAD(, namecache) ncps; 2243 struct namecache *ncp, *nnp; 2244 struct mtx *vlp, *vlp2; 2245 2246 TAILQ_INIT(&ncps); 2247 vlp = VP2VNODELOCK(vp); 2248 vlp2 = NULL; 2249 mtx_lock(vlp); 2250 retry: 2251 while (!LIST_EMPTY(&vp->v_cache_src)) { 2252 ncp = LIST_FIRST(&vp->v_cache_src); 2253 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2254 goto retry; 2255 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2256 } 2257 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2258 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2259 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2260 goto retry; 2261 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2262 } 2263 ncp = vp->v_cache_dd; 2264 if (ncp != NULL) { 2265 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2266 ("lost dotdot link")); 2267 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2268 goto retry; 2269 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2270 } 2271 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2272 mtx_unlock(vlp); 2273 if (vlp2 != NULL) 2274 mtx_unlock(vlp2); 2275 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2276 cache_free(ncp); 2277 } 2278 } 2279 2280 /* 2281 * Opportunistic check to see if there is anything to do. 2282 */ 2283 static bool 2284 cache_has_entries(struct vnode *vp) 2285 { 2286 2287 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2288 vp->v_cache_dd == NULL) 2289 return (false); 2290 return (true); 2291 } 2292 2293 void 2294 cache_purge(struct vnode *vp) 2295 { 2296 2297 SDT_PROBE1(vfs, namecache, purge, done, vp); 2298 if (!cache_has_entries(vp)) 2299 return; 2300 cache_purge_impl(vp); 2301 } 2302 2303 /* 2304 * Only to be used by vgone. 2305 */ 2306 void 2307 cache_purge_vgone(struct vnode *vp) 2308 { 2309 struct mtx *vlp; 2310 2311 VNPASS(VN_IS_DOOMED(vp), vp); 2312 if (cache_has_entries(vp)) { 2313 cache_purge_impl(vp); 2314 return; 2315 } 2316 2317 /* 2318 * Serialize against a potential thread doing cache_purge. 2319 */ 2320 vlp = VP2VNODELOCK(vp); 2321 mtx_wait_unlocked(vlp); 2322 if (cache_has_entries(vp)) { 2323 cache_purge_impl(vp); 2324 return; 2325 } 2326 return; 2327 } 2328 2329 /* 2330 * Invalidate all negative entries for a particular directory vnode. 2331 */ 2332 void 2333 cache_purge_negative(struct vnode *vp) 2334 { 2335 TAILQ_HEAD(, namecache) ncps; 2336 struct namecache *ncp, *nnp; 2337 struct mtx *vlp; 2338 2339 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2340 if (LIST_EMPTY(&vp->v_cache_src)) 2341 return; 2342 TAILQ_INIT(&ncps); 2343 vlp = VP2VNODELOCK(vp); 2344 mtx_lock(vlp); 2345 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2346 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2347 continue; 2348 cache_zap_negative_locked_vnode_kl(ncp, vp); 2349 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2350 } 2351 mtx_unlock(vlp); 2352 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2353 cache_free(ncp); 2354 } 2355 } 2356 2357 void 2358 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2359 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2360 { 2361 2362 ASSERT_VOP_IN_SEQC(fdvp); 2363 ASSERT_VOP_IN_SEQC(fvp); 2364 ASSERT_VOP_IN_SEQC(tdvp); 2365 if (tvp != NULL) 2366 ASSERT_VOP_IN_SEQC(tvp); 2367 2368 cache_purge(fvp); 2369 if (tvp != NULL) { 2370 cache_purge(tvp); 2371 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2372 ("%s: lingering negative entry", __func__)); 2373 } else { 2374 cache_remove_cnp(tdvp, tcnp); 2375 } 2376 } 2377 2378 /* 2379 * Flush all entries referencing a particular filesystem. 
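 * Implemented by walking all vnodes belonging to the mount and calling
 * cache_purge() on those which have any entries, see the comment in the
 * function body about the cost of the iteration.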
2380 */ 2381 void 2382 cache_purgevfs(struct mount *mp) 2383 { 2384 struct vnode *vp, *mvp; 2385 2386 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2387 /* 2388 * Somewhat wasteful iteration over all vnodes. Would be better to 2389 * support filtering and avoid the interlock to begin with. 2390 */ 2391 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2392 if (!cache_has_entries(vp)) { 2393 VI_UNLOCK(vp); 2394 continue; 2395 } 2396 vholdl(vp); 2397 VI_UNLOCK(vp); 2398 cache_purge(vp); 2399 vdrop(vp); 2400 } 2401 } 2402 2403 /* 2404 * Perform canonical checks and cache lookup and pass on to filesystem 2405 * through the vop_cachedlookup only if needed. 2406 */ 2407 2408 int 2409 vfs_cache_lookup(struct vop_lookup_args *ap) 2410 { 2411 struct vnode *dvp; 2412 int error; 2413 struct vnode **vpp = ap->a_vpp; 2414 struct componentname *cnp = ap->a_cnp; 2415 int flags = cnp->cn_flags; 2416 2417 *vpp = NULL; 2418 dvp = ap->a_dvp; 2419 2420 if (dvp->v_type != VDIR) 2421 return (ENOTDIR); 2422 2423 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2424 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2425 return (EROFS); 2426 2427 error = vn_dir_check_exec(dvp, cnp); 2428 if (error != 0) 2429 return (error); 2430 2431 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2432 if (error == 0) 2433 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2434 if (error == -1) 2435 return (0); 2436 return (error); 2437 } 2438 2439 /* Implementation of the getcwd syscall. */ 2440 int 2441 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2442 { 2443 char *buf, *retbuf; 2444 size_t buflen; 2445 int error; 2446 2447 buflen = uap->buflen; 2448 if (__predict_false(buflen < 2)) 2449 return (EINVAL); 2450 if (buflen > MAXPATHLEN) 2451 buflen = MAXPATHLEN; 2452 2453 buf = uma_zalloc(namei_zone, M_WAITOK); 2454 error = vn_getcwd(buf, &retbuf, &buflen); 2455 if (error == 0) 2456 error = copyout(retbuf, uap->buf, buflen); 2457 uma_zfree(namei_zone, buf); 2458 return (error); 2459 } 2460 2461 int 2462 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2463 { 2464 struct pwd *pwd; 2465 int error; 2466 2467 vfs_smr_enter(); 2468 pwd = pwd_get_smr(); 2469 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2470 buflen, false, 0); 2471 VFS_SMR_ASSERT_NOT_ENTERED(); 2472 if (error < 0) { 2473 pwd = pwd_hold(curthread); 2474 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2475 retbuf, buflen); 2476 pwd_drop(pwd); 2477 } 2478 2479 #ifdef KTRACE 2480 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2481 ktrnamei(*retbuf); 2482 #endif 2483 return (error); 2484 } 2485 2486 static int 2487 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2488 size_t size, int flags, enum uio_seg pathseg) 2489 { 2490 struct nameidata nd; 2491 char *retbuf, *freebuf; 2492 int error; 2493 2494 if (flags != 0) 2495 return (EINVAL); 2496 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2497 pathseg, path, fd, &cap_fstat_rights, td); 2498 if ((error = namei(&nd)) != 0) 2499 return (error); 2500 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2501 if (error == 0) { 2502 error = copyout(retbuf, buf, size); 2503 free(freebuf, M_TEMP); 2504 } 2505 NDFREE(&nd, 0); 2506 return (error); 2507 } 2508 2509 int 2510 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2511 { 2512 2513 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2514 uap->flags, UIO_USERSPACE)); 2515 } 2516 2517 /* 2518 * Retrieve the full filesystem 
path that correspond to a vnode from the name 2519 * cache (if available) 2520 */ 2521 int 2522 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2523 { 2524 struct pwd *pwd; 2525 char *buf; 2526 size_t buflen; 2527 int error; 2528 2529 if (__predict_false(vp == NULL)) 2530 return (EINVAL); 2531 2532 buflen = MAXPATHLEN; 2533 buf = malloc(buflen, M_TEMP, M_WAITOK); 2534 vfs_smr_enter(); 2535 pwd = pwd_get_smr(); 2536 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0); 2537 VFS_SMR_ASSERT_NOT_ENTERED(); 2538 if (error < 0) { 2539 pwd = pwd_hold(curthread); 2540 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); 2541 pwd_drop(pwd); 2542 } 2543 if (error == 0) 2544 *freebuf = buf; 2545 else 2546 free(buf, M_TEMP); 2547 return (error); 2548 } 2549 2550 /* 2551 * This function is similar to vn_fullpath, but it attempts to lookup the 2552 * pathname relative to the global root mount point. This is required for the 2553 * auditing sub-system, as audited pathnames must be absolute, relative to the 2554 * global root mount point. 2555 */ 2556 int 2557 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2558 { 2559 char *buf; 2560 size_t buflen; 2561 int error; 2562 2563 if (__predict_false(vp == NULL)) 2564 return (EINVAL); 2565 buflen = MAXPATHLEN; 2566 buf = malloc(buflen, M_TEMP, M_WAITOK); 2567 vfs_smr_enter(); 2568 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0); 2569 VFS_SMR_ASSERT_NOT_ENTERED(); 2570 if (error < 0) { 2571 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2572 } 2573 if (error == 0) 2574 *freebuf = buf; 2575 else 2576 free(buf, M_TEMP); 2577 return (error); 2578 } 2579 2580 static struct namecache * 2581 vn_dd_from_dst(struct vnode *vp) 2582 { 2583 struct namecache *ncp; 2584 2585 cache_assert_vnode_locked(vp); 2586 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2587 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2588 return (ncp); 2589 } 2590 return (NULL); 2591 } 2592 2593 int 2594 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2595 { 2596 struct vnode *dvp; 2597 struct namecache *ncp; 2598 struct mtx *vlp; 2599 int error; 2600 2601 vlp = VP2VNODELOCK(*vp); 2602 mtx_lock(vlp); 2603 ncp = (*vp)->v_cache_dd; 2604 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 2605 KASSERT(ncp == vn_dd_from_dst(*vp), 2606 ("%s: mismatch for dd entry (%p != %p)", __func__, 2607 ncp, vn_dd_from_dst(*vp))); 2608 } else { 2609 ncp = vn_dd_from_dst(*vp); 2610 } 2611 if (ncp != NULL) { 2612 if (*buflen < ncp->nc_nlen) { 2613 mtx_unlock(vlp); 2614 vrele(*vp); 2615 counter_u64_add(numfullpathfail4, 1); 2616 error = ENOMEM; 2617 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2618 vp, NULL); 2619 return (error); 2620 } 2621 *buflen -= ncp->nc_nlen; 2622 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2623 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2624 ncp->nc_name, vp); 2625 dvp = *vp; 2626 *vp = ncp->nc_dvp; 2627 vref(*vp); 2628 mtx_unlock(vlp); 2629 vrele(dvp); 2630 return (0); 2631 } 2632 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2633 2634 mtx_unlock(vlp); 2635 vn_lock(*vp, LK_SHARED | LK_RETRY); 2636 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2637 vput(*vp); 2638 if (error) { 2639 counter_u64_add(numfullpathfail2, 1); 2640 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2641 return (error); 2642 } 2643 2644 *vp = dvp; 2645 if (VN_IS_DOOMED(dvp)) { 2646 /* forced unmount */ 2647 vrele(dvp); 2648 error = ENOENT; 
2649 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2650 return (error); 2651 } 2652 /* 2653 * *vp has its use count incremented still. 2654 */ 2655 2656 return (0); 2657 } 2658 2659 /* 2660 * Resolve a directory to a pathname. 2661 * 2662 * The name of the directory can always be found in the namecache or fetched 2663 * from the filesystem. There is also guaranteed to be only one parent, meaning 2664 * we can just follow vnodes up until we find the root. 2665 * 2666 * The vnode must be referenced. 2667 */ 2668 static int 2669 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2670 size_t *len, bool slash_prefixed, size_t addend) 2671 { 2672 #ifdef KDTRACE_HOOKS 2673 struct vnode *startvp = vp; 2674 #endif 2675 struct vnode *vp1; 2676 size_t buflen; 2677 int error; 2678 2679 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2680 VNPASS(vp->v_usecount > 0, vp); 2681 2682 buflen = *len; 2683 2684 if (!slash_prefixed) { 2685 MPASS(*len >= 2); 2686 buflen--; 2687 buf[buflen] = '\0'; 2688 } 2689 2690 error = 0; 2691 2692 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2693 counter_u64_add(numfullpathcalls, 1); 2694 while (vp != rdir && vp != rootvnode) { 2695 /* 2696 * The vp vnode must be already fully constructed, 2697 * since it is either found in namecache or obtained 2698 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2699 * without obtaining the vnode lock. 2700 */ 2701 if ((vp->v_vflag & VV_ROOT) != 0) { 2702 vn_lock(vp, LK_RETRY | LK_SHARED); 2703 2704 /* 2705 * With the vnode locked, check for races with 2706 * unmount, forced or not. Note that we 2707 * already verified that vp is not equal to 2708 * the root vnode, which means that 2709 * mnt_vnodecovered can be NULL only for the 2710 * case of unmount. 2711 */ 2712 if (VN_IS_DOOMED(vp) || 2713 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2714 vp1->v_mountedhere != vp->v_mount) { 2715 vput(vp); 2716 error = ENOENT; 2717 SDT_PROBE3(vfs, namecache, fullpath, return, 2718 error, vp, NULL); 2719 break; 2720 } 2721 2722 vref(vp1); 2723 vput(vp); 2724 vp = vp1; 2725 continue; 2726 } 2727 if (vp->v_type != VDIR) { 2728 vrele(vp); 2729 counter_u64_add(numfullpathfail1, 1); 2730 error = ENOTDIR; 2731 SDT_PROBE3(vfs, namecache, fullpath, return, 2732 error, vp, NULL); 2733 break; 2734 } 2735 error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen); 2736 if (error) 2737 break; 2738 if (buflen == 0) { 2739 vrele(vp); 2740 error = ENOMEM; 2741 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2742 startvp, NULL); 2743 break; 2744 } 2745 buf[--buflen] = '/'; 2746 slash_prefixed = true; 2747 } 2748 if (error) 2749 return (error); 2750 if (!slash_prefixed) { 2751 if (buflen == 0) { 2752 vrele(vp); 2753 counter_u64_add(numfullpathfail4, 1); 2754 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2755 startvp, NULL); 2756 return (ENOMEM); 2757 } 2758 buf[--buflen] = '/'; 2759 } 2760 counter_u64_add(numfullpathfound, 1); 2761 vrele(vp); 2762 2763 *retbuf = buf + buflen; 2764 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2765 *len -= buflen; 2766 *len += addend; 2767 return (0); 2768 } 2769 2770 /* 2771 * Resolve an arbitrary vnode to a pathname. 
2772 * 2773 * Note 2 caveats: 2774 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2775 * resolve to a different path than the one used to find it 2776 * - namecache is not mandatory, meaning names are not guaranteed to be added 2777 * (in which case resolving fails) 2778 */ 2779 static void __inline 2780 cache_rev_failed_impl(int *reason, int line) 2781 { 2782 2783 *reason = line; 2784 } 2785 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 2786 2787 static int 2788 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 2789 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend) 2790 { 2791 #ifdef KDTRACE_HOOKS 2792 struct vnode *startvp = vp; 2793 #endif 2794 struct vnode *tvp; 2795 struct mount *mp; 2796 struct namecache *ncp; 2797 size_t orig_buflen; 2798 int reason; 2799 int error; 2800 #ifdef KDTRACE_HOOKS 2801 int i; 2802 #endif 2803 seqc_t vp_seqc, tvp_seqc; 2804 u_char nc_flag; 2805 2806 VFS_SMR_ASSERT_ENTERED(); 2807 2808 if (!cache_fast_revlookup) { 2809 vfs_smr_exit(); 2810 return (-1); 2811 } 2812 2813 orig_buflen = *buflen; 2814 2815 if (!slash_prefixed) { 2816 MPASS(*buflen >= 2); 2817 *buflen -= 1; 2818 buf[*buflen] = '\0'; 2819 } 2820 2821 if (vp == rdir || vp == rootvnode) { 2822 if (!slash_prefixed) { 2823 *buflen -= 1; 2824 buf[*buflen] = '/'; 2825 } 2826 goto out_ok; 2827 } 2828 2829 #ifdef KDTRACE_HOOKS 2830 i = 0; 2831 #endif 2832 error = -1; 2833 ncp = NULL; /* for sdt probe down below */ 2834 vp_seqc = vn_seqc_read_any(vp); 2835 if (seqc_in_modify(vp_seqc)) { 2836 cache_rev_failed(&reason); 2837 goto out_abort; 2838 } 2839 2840 for (;;) { 2841 #ifdef KDTRACE_HOOKS 2842 i++; 2843 #endif 2844 if ((vp->v_vflag & VV_ROOT) != 0) { 2845 mp = atomic_load_ptr(&vp->v_mount); 2846 if (mp == NULL) { 2847 cache_rev_failed(&reason); 2848 goto out_abort; 2849 } 2850 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 2851 tvp_seqc = vn_seqc_read_any(tvp); 2852 if (seqc_in_modify(tvp_seqc)) { 2853 cache_rev_failed(&reason); 2854 goto out_abort; 2855 } 2856 if (!vn_seqc_consistent(vp, vp_seqc)) { 2857 cache_rev_failed(&reason); 2858 goto out_abort; 2859 } 2860 vp = tvp; 2861 vp_seqc = tvp_seqc; 2862 continue; 2863 } 2864 ncp = atomic_load_ptr(&vp->v_cache_dd); 2865 if (ncp == NULL) { 2866 cache_rev_failed(&reason); 2867 goto out_abort; 2868 } 2869 nc_flag = atomic_load_char(&ncp->nc_flag); 2870 if ((nc_flag & NCF_ISDOTDOT) != 0) { 2871 cache_rev_failed(&reason); 2872 goto out_abort; 2873 } 2874 if (!cache_ncp_canuse(ncp)) { 2875 cache_rev_failed(&reason); 2876 goto out_abort; 2877 } 2878 if (ncp->nc_nlen >= *buflen) { 2879 cache_rev_failed(&reason); 2880 error = ENOMEM; 2881 goto out_abort; 2882 } 2883 *buflen -= ncp->nc_nlen; 2884 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2885 *buflen -= 1; 2886 buf[*buflen] = '/'; 2887 tvp = ncp->nc_dvp; 2888 tvp_seqc = vn_seqc_read_any(tvp); 2889 if (seqc_in_modify(tvp_seqc)) { 2890 cache_rev_failed(&reason); 2891 goto out_abort; 2892 } 2893 if (!vn_seqc_consistent(vp, vp_seqc)) { 2894 cache_rev_failed(&reason); 2895 goto out_abort; 2896 } 2897 vp = tvp; 2898 vp_seqc = tvp_seqc; 2899 if (vp == rdir || vp == rootvnode) 2900 break; 2901 } 2902 out_ok: 2903 vfs_smr_exit(); 2904 *retbuf = buf + *buflen; 2905 *buflen = orig_buflen - *buflen + addend; 2906 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 2907 return (0); 2908 2909 out_abort: 2910 *buflen = orig_buflen; 2911 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 2912 
vfs_smr_exit(); 2913 return (error); 2914 } 2915 2916 static int 2917 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2918 size_t *buflen) 2919 { 2920 size_t orig_buflen; 2921 bool slash_prefixed; 2922 int error; 2923 2924 if (*buflen < 2) 2925 return (EINVAL); 2926 2927 orig_buflen = *buflen; 2928 2929 vref(vp); 2930 slash_prefixed = false; 2931 if (vp->v_type != VDIR) { 2932 *buflen -= 1; 2933 buf[*buflen] = '\0'; 2934 error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen); 2935 if (error) 2936 return (error); 2937 if (*buflen == 0) { 2938 vrele(vp); 2939 return (ENOMEM); 2940 } 2941 *buflen -= 1; 2942 buf[*buflen] = '/'; 2943 slash_prefixed = true; 2944 } 2945 2946 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed, 2947 orig_buflen - *buflen)); 2948 } 2949 2950 /* 2951 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2952 * 2953 * Since the namecache does not track hardlinks, the caller is expected to first 2954 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 2955 * 2956 * Then we have 2 cases: 2957 * - if the found vnode is a directory, the path can be constructed just by 2958 * following names up the chain 2959 * - otherwise we populate the buffer with the saved name and start resolving 2960 * from the parent 2961 */ 2962 static int 2963 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 2964 size_t *buflen) 2965 { 2966 char *buf, *tmpbuf; 2967 struct pwd *pwd; 2968 struct componentname *cnp; 2969 struct vnode *vp; 2970 size_t addend; 2971 int error; 2972 bool slash_prefixed; 2973 enum vtype type; 2974 2975 if (*buflen < 2) 2976 return (EINVAL); 2977 if (*buflen > MAXPATHLEN) 2978 *buflen = MAXPATHLEN; 2979 2980 slash_prefixed = false; 2981 2982 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2983 2984 addend = 0; 2985 vp = ndp->ni_vp; 2986 /* 2987 * Check for VBAD to work around the vp_crossmp bug in lookup(). 2988 * 2989 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 2990 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 2991 * If the type is VDIR (like in this very case) we can skip looking 2992 * at ni_dvp in the first place. However, since vnodes get passed here 2993 * unlocked the target may transition to doomed state (type == VBAD) 2994 * before we get to evaluate the condition. If this happens, we will 2995 * populate part of the buffer and descend to vn_fullpath_dir with 2996 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 2997 * 2998 * This should be atomic_load(&vp->v_type) but it is illegal to take 2999 * an address of a bit field, even if said field is sized to char. 3000 * Work around the problem by reading the value into a full-sized enum 3001 * and then re-reading it with atomic_load which will still prevent 3002 * the compiler from re-reading down the road.
3003 */ 3004 type = vp->v_type; 3005 type = atomic_load_int(&type); 3006 if (type == VBAD) { 3007 error = ENOENT; 3008 goto out_bad; 3009 } 3010 if (type != VDIR) { 3011 cnp = &ndp->ni_cnd; 3012 addend = cnp->cn_namelen + 2; 3013 if (*buflen < addend) { 3014 error = ENOMEM; 3015 goto out_bad; 3016 } 3017 *buflen -= addend; 3018 tmpbuf = buf + *buflen; 3019 tmpbuf[0] = '/'; 3020 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 3021 tmpbuf[addend - 1] = '\0'; 3022 slash_prefixed = true; 3023 vp = ndp->ni_dvp; 3024 } 3025 3026 vfs_smr_enter(); 3027 pwd = pwd_get_smr(); 3028 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3029 slash_prefixed, addend); 3030 VFS_SMR_ASSERT_NOT_ENTERED(); 3031 if (error < 0) { 3032 pwd = pwd_hold(curthread); 3033 vref(vp); 3034 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3035 slash_prefixed, addend); 3036 pwd_drop(pwd); 3037 if (error != 0) 3038 goto out_bad; 3039 } 3040 3041 *freebuf = buf; 3042 3043 return (0); 3044 out_bad: 3045 free(buf, M_TEMP); 3046 return (error); 3047 } 3048 3049 struct vnode * 3050 vn_dir_dd_ino(struct vnode *vp) 3051 { 3052 struct namecache *ncp; 3053 struct vnode *ddvp; 3054 struct mtx *vlp; 3055 enum vgetstate vs; 3056 3057 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3058 vlp = VP2VNODELOCK(vp); 3059 mtx_lock(vlp); 3060 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3061 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3062 continue; 3063 ddvp = ncp->nc_dvp; 3064 vs = vget_prep(ddvp); 3065 mtx_unlock(vlp); 3066 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3067 return (NULL); 3068 return (ddvp); 3069 } 3070 mtx_unlock(vlp); 3071 return (NULL); 3072 } 3073 3074 int 3075 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3076 { 3077 struct namecache *ncp; 3078 struct mtx *vlp; 3079 int l; 3080 3081 vlp = VP2VNODELOCK(vp); 3082 mtx_lock(vlp); 3083 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3084 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3085 break; 3086 if (ncp == NULL) { 3087 mtx_unlock(vlp); 3088 return (ENOENT); 3089 } 3090 l = min(ncp->nc_nlen, buflen - 1); 3091 memcpy(buf, ncp->nc_name, l); 3092 mtx_unlock(vlp); 3093 buf[l] = '\0'; 3094 return (0); 3095 } 3096 3097 /* 3098 * This function updates path string to vnode's full global path 3099 * and checks the size of the new path string against the pathlen argument. 3100 * 3101 * Requires a locked, referenced vnode. 3102 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3103 * 3104 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3105 * because it falls back to the ".." lookup if the namecache lookup fails. 3106 */ 3107 int 3108 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3109 u_int pathlen) 3110 { 3111 struct nameidata nd; 3112 struct vnode *vp1; 3113 char *rpath, *fbuf; 3114 int error; 3115 3116 ASSERT_VOP_ELOCKED(vp, __func__); 3117 3118 /* Construct global filesystem path from vp. */ 3119 VOP_UNLOCK(vp); 3120 error = vn_fullpath_global(vp, &rpath, &fbuf); 3121 3122 if (error != 0) { 3123 vrele(vp); 3124 return (error); 3125 } 3126 3127 if (strlen(rpath) >= pathlen) { 3128 vrele(vp); 3129 error = ENAMETOOLONG; 3130 goto out; 3131 } 3132 3133 /* 3134 * Re-lookup the vnode by path to detect a possible rename. 3135 * As a side effect, the vnode is relocked. 3136 * If vnode was renamed, return ENOENT. 
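 * The lookup below passes LOCKLEAF, so on success nd.ni_vp comes back locked.
 * Only if it is still the same vnode is the caller-supplied buffer overwritten
 * with the global path (rpath); otherwise the freshly looked up vnode is
 * dropped and ENOENT is returned.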
3137 */ 3138 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3139 UIO_SYSSPACE, path, td); 3140 error = namei(&nd); 3141 if (error != 0) { 3142 vrele(vp); 3143 goto out; 3144 } 3145 NDFREE(&nd, NDF_ONLY_PNBUF); 3146 vp1 = nd.ni_vp; 3147 vrele(vp); 3148 if (vp1 == vp) 3149 strcpy(path, rpath); 3150 else { 3151 vput(vp1); 3152 error = ENOENT; 3153 } 3154 3155 out: 3156 free(fbuf, M_TEMP); 3157 return (error); 3158 } 3159 3160 #ifdef DDB 3161 static void 3162 db_print_vpath(struct vnode *vp) 3163 { 3164 3165 while (vp != NULL) { 3166 db_printf("%p: ", vp); 3167 if (vp == rootvnode) { 3168 db_printf("/"); 3169 vp = NULL; 3170 } else { 3171 if (vp->v_vflag & VV_ROOT) { 3172 db_printf("<mount point>"); 3173 vp = vp->v_mount->mnt_vnodecovered; 3174 } else { 3175 struct namecache *ncp; 3176 char *ncn; 3177 int i; 3178 3179 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3180 if (ncp != NULL) { 3181 ncn = ncp->nc_name; 3182 for (i = 0; i < ncp->nc_nlen; i++) 3183 db_printf("%c", *ncn++); 3184 vp = ncp->nc_dvp; 3185 } else { 3186 vp = NULL; 3187 } 3188 } 3189 } 3190 db_printf("\n"); 3191 } 3192 3193 return; 3194 } 3195 3196 DB_SHOW_COMMAND(vpath, db_show_vpath) 3197 { 3198 struct vnode *vp; 3199 3200 if (!have_addr) { 3201 db_printf("usage: show vpath <struct vnode *>\n"); 3202 return; 3203 } 3204 3205 vp = (struct vnode *)addr; 3206 db_print_vpath(vp); 3207 } 3208 3209 #endif 3210 3211 static bool __read_frequently cache_fast_lookup = true; 3212 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3213 &cache_fast_lookup, 0, ""); 3214 3215 #define CACHE_FPL_FAILED -2020 3216 3217 static void 3218 cache_fpl_cleanup_cnp(struct componentname *cnp) 3219 { 3220 3221 uma_zfree(namei_zone, cnp->cn_pnbuf); 3222 #ifdef DIAGNOSTIC 3223 cnp->cn_pnbuf = NULL; 3224 cnp->cn_nameptr = NULL; 3225 #endif 3226 } 3227 3228 static void 3229 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3230 { 3231 struct componentname *cnp; 3232 3233 cnp = &ndp->ni_cnd; 3234 while (*(cnp->cn_nameptr) == '/') { 3235 cnp->cn_nameptr++; 3236 ndp->ni_pathlen--; 3237 } 3238 3239 *dpp = ndp->ni_rootdir; 3240 } 3241 3242 /* 3243 * Components of nameidata (or objects it can point to) which may 3244 * need restoring in case fast path lookup fails. 
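 * A snapshot is taken with cache_fpl_checkpoint() before each component is
 * consumed and put back with cache_fpl_restore() when the lookup punts to the
 * slow path.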
3245 */ 3246 struct nameidata_saved { 3247 long cn_namelen; 3248 char *cn_nameptr; 3249 size_t ni_pathlen; 3250 int cn_flags; 3251 }; 3252 3253 struct cache_fpl { 3254 struct nameidata *ndp; 3255 struct componentname *cnp; 3256 struct pwd *pwd; 3257 struct vnode *dvp; 3258 struct vnode *tvp; 3259 seqc_t dvp_seqc; 3260 seqc_t tvp_seqc; 3261 struct nameidata_saved snd; 3262 int line; 3263 enum cache_fpl_status status:8; 3264 bool in_smr; 3265 bool fsearch; 3266 }; 3267 3268 static void 3269 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3270 { 3271 3272 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3273 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3274 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3275 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3276 } 3277 3278 static void 3279 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3280 { 3281 3282 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3283 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3284 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3285 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3286 } 3287 3288 #ifdef INVARIANTS 3289 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3290 struct cache_fpl *_fpl = (fpl); \ 3291 MPASS(_fpl->in_smr == true); \ 3292 VFS_SMR_ASSERT_ENTERED(); \ 3293 }) 3294 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3295 struct cache_fpl *_fpl = (fpl); \ 3296 MPASS(_fpl->in_smr == false); \ 3297 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3298 }) 3299 #else 3300 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3301 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3302 #endif 3303 3304 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3305 struct cache_fpl *_fpl = (fpl); \ 3306 vfs_smr_enter(); \ 3307 _fpl->in_smr = true; \ 3308 }) 3309 3310 #define cache_fpl_smr_enter(fpl) ({ \ 3311 struct cache_fpl *_fpl = (fpl); \ 3312 MPASS(_fpl->in_smr == false); \ 3313 vfs_smr_enter(); \ 3314 _fpl->in_smr = true; \ 3315 }) 3316 3317 #define cache_fpl_smr_exit(fpl) ({ \ 3318 struct cache_fpl *_fpl = (fpl); \ 3319 MPASS(_fpl->in_smr == true); \ 3320 vfs_smr_exit(); \ 3321 _fpl->in_smr = false; \ 3322 }) 3323 3324 static int 3325 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3326 { 3327 3328 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3329 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3330 ("%s: converting to abort from %d at %d, set at %d\n", 3331 __func__, fpl->status, line, fpl->line)); 3332 } 3333 fpl->status = CACHE_FPL_STATUS_ABORTED; 3334 fpl->line = line; 3335 return (CACHE_FPL_FAILED); 3336 } 3337 3338 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3339 3340 static int 3341 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3342 { 3343 3344 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3345 ("%s: setting to partial at %d, but already set to %d at %d\n", 3346 __func__, line, fpl->status, fpl->line)); 3347 cache_fpl_smr_assert_entered(fpl); 3348 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3349 fpl->line = line; 3350 return (CACHE_FPL_FAILED); 3351 } 3352 3353 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3354 3355 static int 3356 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3357 { 3358 3359 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3360 ("%s: setting to handled at %d, but already set to %d at %d\n", 3361 __func__, line, fpl->status, fpl->line)); 3362 cache_fpl_smr_assert_not_entered(fpl); 3363 MPASS(error != CACHE_FPL_FAILED); 3364 fpl->status = CACHE_FPL_STATUS_HANDLED; 3365 fpl->line = line; 
3366 return (error); 3367 } 3368 3369 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3370 3371 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3372 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ 3373 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3374 3375 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3376 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3377 3378 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3379 "supported and internal flags overlap"); 3380 3381 static bool 3382 cache_fpl_islastcn(struct nameidata *ndp) 3383 { 3384 3385 return (*ndp->ni_next == 0); 3386 } 3387 3388 static bool 3389 cache_fpl_isdotdot(struct componentname *cnp) 3390 { 3391 3392 if (cnp->cn_namelen == 2 && 3393 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3394 return (true); 3395 return (false); 3396 } 3397 3398 static bool 3399 cache_can_fplookup(struct cache_fpl *fpl) 3400 { 3401 struct nameidata *ndp; 3402 struct componentname *cnp; 3403 struct thread *td; 3404 3405 ndp = fpl->ndp; 3406 cnp = fpl->cnp; 3407 td = cnp->cn_thread; 3408 3409 if (!cache_fast_lookup) { 3410 cache_fpl_aborted(fpl); 3411 return (false); 3412 } 3413 #ifdef MAC 3414 if (mac_vnode_check_lookup_enabled()) { 3415 cache_fpl_aborted(fpl); 3416 return (false); 3417 } 3418 #endif 3419 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3420 cache_fpl_aborted(fpl); 3421 return (false); 3422 } 3423 if (IN_CAPABILITY_MODE(td)) { 3424 cache_fpl_aborted(fpl); 3425 return (false); 3426 } 3427 if (AUDITING_TD(td)) { 3428 cache_fpl_aborted(fpl); 3429 return (false); 3430 } 3431 if (ndp->ni_startdir != NULL) { 3432 cache_fpl_aborted(fpl); 3433 return (false); 3434 } 3435 return (true); 3436 } 3437 3438 static int 3439 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 3440 { 3441 struct nameidata *ndp; 3442 int error; 3443 bool fsearch; 3444 3445 ndp = fpl->ndp; 3446 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); 3447 if (__predict_false(error != 0)) { 3448 cache_fpl_smr_exit(fpl); 3449 return (cache_fpl_aborted(fpl)); 3450 } 3451 fpl->fsearch = fsearch; 3452 return (0); 3453 } 3454 3455 static bool 3456 cache_fplookup_vnode_supported(struct vnode *vp) 3457 { 3458 3459 return (vp->v_type != VLNK); 3460 } 3461 3462 static int __noinline 3463 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3464 uint32_t hash) 3465 { 3466 struct componentname *cnp; 3467 struct vnode *dvp; 3468 3469 cnp = fpl->cnp; 3470 dvp = fpl->dvp; 3471 3472 cache_fpl_smr_exit(fpl); 3473 if (cache_negative_promote_cond(dvp, cnp, oncp, hash)) 3474 return (cache_fpl_handled(fpl, ENOENT)); 3475 else 3476 return (cache_fpl_aborted(fpl)); 3477 } 3478 3479 /* 3480 * The target vnode is not supported, prepare for the slow path to take over. 
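 * This means acquiring references to dvp and the current pwd outside of SMR,
 * restoring the previously checkpointed nameidata state and priming it
 * (ni_startdir, MAKEENTRY, ISLASTCN, ISDOTDOT) so that the regular lookup can
 * resume from the directory the fast path stopped at.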
3481 */ 3482 static int __noinline 3483 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3484 { 3485 struct nameidata *ndp; 3486 struct componentname *cnp; 3487 enum vgetstate dvs; 3488 struct vnode *dvp; 3489 struct pwd *pwd; 3490 seqc_t dvp_seqc; 3491 3492 ndp = fpl->ndp; 3493 cnp = fpl->cnp; 3494 pwd = fpl->pwd; 3495 dvp = fpl->dvp; 3496 dvp_seqc = fpl->dvp_seqc; 3497 3498 if (!pwd_hold_smr(pwd)) { 3499 cache_fpl_smr_exit(fpl); 3500 return (cache_fpl_aborted(fpl)); 3501 } 3502 3503 dvs = vget_prep_smr(dvp); 3504 cache_fpl_smr_exit(fpl); 3505 if (__predict_false(dvs == VGET_NONE)) { 3506 pwd_drop(pwd); 3507 return (cache_fpl_aborted(fpl)); 3508 } 3509 3510 vget_finish_ref(dvp, dvs); 3511 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3512 vrele(dvp); 3513 pwd_drop(pwd); 3514 return (cache_fpl_aborted(fpl)); 3515 } 3516 3517 cache_fpl_restore(fpl, &fpl->snd); 3518 3519 ndp->ni_startdir = dvp; 3520 cnp->cn_flags |= MAKEENTRY; 3521 if (cache_fpl_islastcn(ndp)) 3522 cnp->cn_flags |= ISLASTCN; 3523 if (cache_fpl_isdotdot(cnp)) 3524 cnp->cn_flags |= ISDOTDOT; 3525 3526 return (0); 3527 } 3528 3529 static int 3530 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3531 { 3532 struct componentname *cnp; 3533 struct vnode *tvp; 3534 seqc_t tvp_seqc; 3535 int error, lkflags; 3536 3537 cnp = fpl->cnp; 3538 tvp = fpl->tvp; 3539 tvp_seqc = fpl->tvp_seqc; 3540 3541 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3542 lkflags = LK_SHARED; 3543 if ((cnp->cn_flags & LOCKSHARED) == 0) 3544 lkflags = LK_EXCLUSIVE; 3545 error = vget_finish(tvp, lkflags, tvs); 3546 if (__predict_false(error != 0)) { 3547 return (cache_fpl_aborted(fpl)); 3548 } 3549 } else { 3550 vget_finish_ref(tvp, tvs); 3551 } 3552 3553 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3554 if ((cnp->cn_flags & LOCKLEAF) != 0) 3555 vput(tvp); 3556 else 3557 vrele(tvp); 3558 return (cache_fpl_aborted(fpl)); 3559 } 3560 3561 return (cache_fpl_handled(fpl, 0)); 3562 } 3563 3564 /* 3565 * They want to possibly modify the state of the namecache. 3566 * 3567 * Don't try to match the API contract, just leave. 3568 * TODO: this leaves scalability on the table 3569 */ 3570 static int 3571 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3572 { 3573 struct componentname *cnp; 3574 3575 cnp = fpl->cnp; 3576 MPASS(cnp->cn_nameiop != LOOKUP); 3577 return (cache_fpl_partial(fpl)); 3578 } 3579 3580 static int __noinline 3581 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3582 { 3583 struct componentname *cnp; 3584 enum vgetstate dvs, tvs; 3585 struct vnode *dvp, *tvp; 3586 seqc_t dvp_seqc; 3587 int error; 3588 3589 cnp = fpl->cnp; 3590 dvp = fpl->dvp; 3591 dvp_seqc = fpl->dvp_seqc; 3592 tvp = fpl->tvp; 3593 3594 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3595 3596 /* 3597 * This is less efficient than it can be for simplicity. 
3598 */ 3599 dvs = vget_prep_smr(dvp); 3600 if (__predict_false(dvs == VGET_NONE)) { 3601 return (cache_fpl_aborted(fpl)); 3602 } 3603 tvs = vget_prep_smr(tvp); 3604 if (__predict_false(tvs == VGET_NONE)) { 3605 cache_fpl_smr_exit(fpl); 3606 vget_abort(dvp, dvs); 3607 return (cache_fpl_aborted(fpl)); 3608 } 3609 3610 cache_fpl_smr_exit(fpl); 3611 3612 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3613 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3614 if (__predict_false(error != 0)) { 3615 vget_abort(tvp, tvs); 3616 return (cache_fpl_aborted(fpl)); 3617 } 3618 } else { 3619 vget_finish_ref(dvp, dvs); 3620 } 3621 3622 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3623 vget_abort(tvp, tvs); 3624 if ((cnp->cn_flags & LOCKPARENT) != 0) 3625 vput(dvp); 3626 else 3627 vrele(dvp); 3628 return (cache_fpl_aborted(fpl)); 3629 } 3630 3631 error = cache_fplookup_final_child(fpl, tvs); 3632 if (__predict_false(error != 0)) { 3633 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3634 if ((cnp->cn_flags & LOCKPARENT) != 0) 3635 vput(dvp); 3636 else 3637 vrele(dvp); 3638 return (error); 3639 } 3640 3641 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3642 return (0); 3643 } 3644 3645 static int 3646 cache_fplookup_final(struct cache_fpl *fpl) 3647 { 3648 struct componentname *cnp; 3649 enum vgetstate tvs; 3650 struct vnode *dvp, *tvp; 3651 seqc_t dvp_seqc; 3652 3653 cnp = fpl->cnp; 3654 dvp = fpl->dvp; 3655 dvp_seqc = fpl->dvp_seqc; 3656 tvp = fpl->tvp; 3657 3658 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3659 3660 if (cnp->cn_nameiop != LOOKUP) { 3661 return (cache_fplookup_final_modifying(fpl)); 3662 } 3663 3664 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3665 return (cache_fplookup_final_withparent(fpl)); 3666 3667 tvs = vget_prep_smr(tvp); 3668 if (__predict_false(tvs == VGET_NONE)) { 3669 return (cache_fpl_partial(fpl)); 3670 } 3671 3672 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3673 cache_fpl_smr_exit(fpl); 3674 vget_abort(tvp, tvs); 3675 return (cache_fpl_aborted(fpl)); 3676 } 3677 3678 cache_fpl_smr_exit(fpl); 3679 return (cache_fplookup_final_child(fpl, tvs)); 3680 } 3681 3682 static int __noinline 3683 cache_fplookup_dot(struct cache_fpl *fpl) 3684 { 3685 struct vnode *dvp; 3686 3687 dvp = fpl->dvp; 3688 3689 fpl->tvp = dvp; 3690 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3691 if (seqc_in_modify(fpl->tvp_seqc)) { 3692 return (cache_fpl_aborted(fpl)); 3693 } 3694 3695 counter_u64_add(dothits, 1); 3696 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3697 3698 return (0); 3699 } 3700 3701 static int __noinline 3702 cache_fplookup_dotdot(struct cache_fpl *fpl) 3703 { 3704 struct nameidata *ndp; 3705 struct componentname *cnp; 3706 struct namecache *ncp; 3707 struct vnode *dvp; 3708 struct prison *pr; 3709 u_char nc_flag; 3710 3711 ndp = fpl->ndp; 3712 cnp = fpl->cnp; 3713 dvp = fpl->dvp; 3714 3715 /* 3716 * XXX this is racy the same way regular lookup is 3717 */ 3718 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3719 pr = pr->pr_parent) 3720 if (dvp == pr->pr_root) 3721 break; 3722 3723 if (dvp == ndp->ni_rootdir || 3724 dvp == ndp->ni_topdir || 3725 dvp == rootvnode || 3726 pr != NULL) { 3727 fpl->tvp = dvp; 3728 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3729 if (seqc_in_modify(fpl->tvp_seqc)) { 3730 return (cache_fpl_aborted(fpl)); 3731 } 3732 return (0); 3733 } 3734 3735 if ((dvp->v_vflag & VV_ROOT) != 0) { 3736 /* 3737 * TODO 3738 * The opposite of climb mount is needed here. 
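 * That is, when dvp is the root of a mounted filesystem, the ".." lookup has
 * to continue at the vnode covered by the mount (mnt_vnodecovered), similar
 * to what vn_fullpath_any_smr() does when walking up. Until that is
 * implemented the lockless lookup aborts here and the regular lookup takes
 * over.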
3739 */ 3740 return (cache_fpl_aborted(fpl)); 3741 } 3742 3743 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3744 if (ncp == NULL) { 3745 return (cache_fpl_aborted(fpl)); 3746 } 3747 3748 nc_flag = atomic_load_char(&ncp->nc_flag); 3749 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3750 if ((nc_flag & NCF_NEGATIVE) != 0) 3751 return (cache_fpl_aborted(fpl)); 3752 fpl->tvp = ncp->nc_vp; 3753 } else { 3754 fpl->tvp = ncp->nc_dvp; 3755 } 3756 3757 if (__predict_false(!cache_ncp_canuse(ncp))) { 3758 return (cache_fpl_aborted(fpl)); 3759 } 3760 3761 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 3762 if (seqc_in_modify(fpl->tvp_seqc)) { 3763 return (cache_fpl_partial(fpl)); 3764 } 3765 3766 counter_u64_add(dotdothits, 1); 3767 return (0); 3768 } 3769 3770 static int 3771 cache_fplookup_next(struct cache_fpl *fpl) 3772 { 3773 struct componentname *cnp; 3774 struct namecache *ncp; 3775 struct negstate *ns; 3776 struct vnode *dvp, *tvp; 3777 u_char nc_flag; 3778 uint32_t hash; 3779 bool neg_hot; 3780 3781 cnp = fpl->cnp; 3782 dvp = fpl->dvp; 3783 3784 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 3785 return (cache_fplookup_dot(fpl)); 3786 } 3787 3788 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3789 3790 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3791 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3792 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 3793 break; 3794 } 3795 3796 /* 3797 * If there is no entry we have to punt to the slow path to perform 3798 * actual lookup. Should there be nothing with this name a negative 3799 * entry will be created. 3800 */ 3801 if (__predict_false(ncp == NULL)) { 3802 return (cache_fpl_partial(fpl)); 3803 } 3804 3805 tvp = atomic_load_ptr(&ncp->nc_vp); 3806 nc_flag = atomic_load_char(&ncp->nc_flag); 3807 if ((nc_flag & NCF_NEGATIVE) != 0) { 3808 /* 3809 * If they want to create an entry we need to replace this one. 3810 */ 3811 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 3812 return (cache_fpl_partial(fpl)); 3813 } 3814 ns = NCP2NEGSTATE(ncp); 3815 neg_hot = ((ns->neg_flag & NEG_HOT) != 0); 3816 if (__predict_false(!cache_ncp_canuse(ncp))) { 3817 return (cache_fpl_partial(fpl)); 3818 } 3819 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 3820 return (cache_fpl_partial(fpl)); 3821 } 3822 if (!neg_hot) { 3823 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 3824 } 3825 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 3826 ncp->nc_name); 3827 counter_u64_add(numneghits, 1); 3828 cache_fpl_smr_exit(fpl); 3829 return (cache_fpl_handled(fpl, ENOENT)); 3830 } 3831 3832 if (__predict_false(!cache_ncp_canuse(ncp))) { 3833 return (cache_fpl_partial(fpl)); 3834 } 3835 3836 fpl->tvp = tvp; 3837 fpl->tvp_seqc = vn_seqc_read_any(tvp); 3838 if (seqc_in_modify(fpl->tvp_seqc)) { 3839 return (cache_fpl_partial(fpl)); 3840 } 3841 3842 if (!cache_fplookup_vnode_supported(tvp)) { 3843 return (cache_fpl_partial(fpl)); 3844 } 3845 3846 counter_u64_add(numposhits, 1); 3847 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 3848 return (0); 3849 } 3850 3851 static bool 3852 cache_fplookup_mp_supported(struct mount *mp) 3853 { 3854 3855 if (mp == NULL) 3856 return (false); 3857 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 3858 return (false); 3859 return (true); 3860 } 3861 3862 /* 3863 * Walk up the mount stack (if any). 
3864 * 3865 * Correctness is provided in the following ways: 3866 * - all vnodes are protected from freeing with SMR 3867 * - struct mount objects are type stable making them always safe to access 3868 * - stability of the particular mount is provided by busying it 3869 * - relationship between the vnode which is mounted on and the mount is 3870 * verified with the vnode sequence counter after busying 3871 * - association between root vnode of the mount and the mount is protected 3872 * by busy 3873 * 3874 * From that point on we can read the sequence counter of the root vnode 3875 * and get the next mount on the stack (if any) using the same protection. 3876 * 3877 * By the end of a successful walk we are guaranteed the reached state was 3878 * indeed present at least at some point which matches the regular lookup. 3879 */ 3880 static int __noinline 3881 cache_fplookup_climb_mount(struct cache_fpl *fpl) 3882 { 3883 struct mount *mp, *prev_mp; 3884 struct vnode *vp; 3885 seqc_t vp_seqc; 3886 3887 vp = fpl->tvp; 3888 vp_seqc = fpl->tvp_seqc; 3889 3890 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 3891 mp = atomic_load_ptr(&vp->v_mountedhere); 3892 if (mp == NULL) 3893 return (0); 3894 3895 prev_mp = NULL; 3896 for (;;) { 3897 if (!vfs_op_thread_enter_crit(mp)) { 3898 if (prev_mp != NULL) 3899 vfs_op_thread_exit_crit(prev_mp); 3900 return (cache_fpl_partial(fpl)); 3901 } 3902 if (prev_mp != NULL) 3903 vfs_op_thread_exit_crit(prev_mp); 3904 if (!vn_seqc_consistent(vp, vp_seqc)) { 3905 vfs_op_thread_exit_crit(mp); 3906 return (cache_fpl_partial(fpl)); 3907 } 3908 if (!cache_fplookup_mp_supported(mp)) { 3909 vfs_op_thread_exit_crit(mp); 3910 return (cache_fpl_partial(fpl)); 3911 } 3912 vp = atomic_load_ptr(&mp->mnt_rootvnode); 3913 if (vp == NULL || VN_IS_DOOMED(vp)) { 3914 vfs_op_thread_exit_crit(mp); 3915 return (cache_fpl_partial(fpl)); 3916 } 3917 vp_seqc = vn_seqc_read_any(vp); 3918 if (seqc_in_modify(vp_seqc)) { 3919 vfs_op_thread_exit_crit(mp); 3920 return (cache_fpl_partial(fpl)); 3921 } 3922 prev_mp = mp; 3923 mp = atomic_load_ptr(&vp->v_mountedhere); 3924 if (mp == NULL) 3925 break; 3926 } 3927 3928 vfs_op_thread_exit_crit(prev_mp); 3929 fpl->tvp = vp; 3930 fpl->tvp_seqc = vp_seqc; 3931 return (0); 3932 } 3933 3934 static bool 3935 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 3936 { 3937 struct mount *mp; 3938 struct vnode *vp; 3939 3940 vp = fpl->tvp; 3941 3942 /* 3943 * Hack: while this is a union, the pointer tends to be NULL so save on 3944 * a branch. 3945 */ 3946 mp = atomic_load_ptr(&vp->v_mountedhere); 3947 if (mp == NULL) 3948 return (false); 3949 if (vp->v_type == VDIR) 3950 return (true); 3951 return (false); 3952 } 3953 3954 /* 3955 * Parse the path. 3956 * 3957 * The code was originally copy-pasted from regular lookup and despite 3958 * cleanups leaves performance on the table. Any modifications here 3959 * must take into account that in case of fallback the resulting 3960 * nameidata state has to be compatible with the original. 3961 */ 3962 static int 3963 cache_fplookup_parse(struct cache_fpl *fpl) 3964 { 3965 struct nameidata *ndp; 3966 struct componentname *cnp; 3967 char *cp; 3968 3969 ndp = fpl->ndp; 3970 cnp = fpl->cnp; 3971 3972 /* 3973 * Search a new directory. 3974 * 3975 * The last component of the filename is left accessible via 3976 * cnp->cn_nameptr for callers that need the name. Callers needing 3977 * the name set the SAVENAME flag. When done, they assume 3978 * responsibility for freeing the pathname buffer.
3979 */ 3980 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 3981 continue; 3982 cnp->cn_namelen = cp - cnp->cn_nameptr; 3983 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 3984 cache_fpl_smr_exit(fpl); 3985 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 3986 } 3987 ndp->ni_pathlen -= cnp->cn_namelen; 3988 KASSERT(ndp->ni_pathlen <= PATH_MAX, 3989 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 3990 ndp->ni_next = cp; 3991 3992 /* 3993 * Replace multiple slashes by a single slash and trailing slashes 3994 * by a null. This must be done before VOP_LOOKUP() because some 3995 * fs's don't know about trailing slashes. Remember if there were 3996 * trailing slashes to handle symlinks, existing non-directories 3997 * and non-existing files that won't be directories specially later. 3998 */ 3999 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 4000 cp++; 4001 ndp->ni_pathlen--; 4002 if (*cp == '\0') { 4003 /* 4004 * TODO 4005 * Regular lookup performs the following: 4006 * *ndp->ni_next = '\0'; 4007 * cnp->cn_flags |= TRAILINGSLASH; 4008 * 4009 * Which is problematic since it modifies data read 4010 * from userspace. Then if fast path lookup was to 4011 * abort we would have to either restore it or convey 4012 * the flag. Since this is a corner case just ignore 4013 * it for simplicity. 4014 */ 4015 return (cache_fpl_partial(fpl)); 4016 } 4017 } 4018 ndp->ni_next = cp; 4019 4020 /* 4021 * Check for degenerate name (e.g. / or "") 4022 * which is a way of talking about a directory, 4023 * e.g. like "/." or ".". 4024 * 4025 * TODO 4026 * Another corner case handled by the regular lookup 4027 */ 4028 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 4029 return (cache_fpl_partial(fpl)); 4030 } 4031 return (0); 4032 } 4033 4034 static void 4035 cache_fplookup_parse_advance(struct cache_fpl *fpl) 4036 { 4037 struct nameidata *ndp; 4038 struct componentname *cnp; 4039 4040 ndp = fpl->ndp; 4041 cnp = fpl->cnp; 4042 4043 cnp->cn_nameptr = ndp->ni_next; 4044 while (*cnp->cn_nameptr == '/') { 4045 cnp->cn_nameptr++; 4046 ndp->ni_pathlen--; 4047 } 4048 } 4049 4050 /* 4051 * See the API contract for VOP_FPLOOKUP_VEXEC. 4052 */ 4053 static int __noinline 4054 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 4055 { 4056 struct componentname *cnp; 4057 struct vnode *dvp; 4058 seqc_t dvp_seqc; 4059 4060 cnp = fpl->cnp; 4061 dvp = fpl->dvp; 4062 dvp_seqc = fpl->dvp_seqc; 4063 4064 /* 4065 * Hack: they may be looking up foo/bar, where foo is a 4066 * regular file. In such a case we need to turn ENOTDIR, 4067 * but we may happen to get here with a different error. 4068 */ 4069 if (dvp->v_type != VDIR) { 4070 /* 4071 * The check here is predominantly to catch 4072 * EOPNOTSUPP from dead_vnodeops. If the vnode 4073 * gets doomed past this point it is going to 4074 * fail seqc verification. 4075 */ 4076 if (VN_IS_DOOMED(dvp)) { 4077 return (cache_fpl_aborted(fpl)); 4078 } 4079 error = ENOTDIR; 4080 } 4081 4082 /* 4083 * Hack: handle O_SEARCH. 4084 * 4085 * Open Group Base Specifications Issue 7, 2018 edition states: 4086 * If the access mode of the open file description associated with the 4087 * file descriptor is not O_SEARCH, the function shall check whether 4088 * directory searches are permitted using the current permissions of 4089 * the directory underlying the file descriptor. If the access mode is 4090 * O_SEARCH, the function shall not perform the check. 
/*
 * See the API contract for VOP_FPLOOKUP_VEXEC.
 */
static int __noinline
cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
{
	struct componentname *cnp;
	struct vnode *dvp;
	seqc_t dvp_seqc;

	cnp = fpl->cnp;
	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;

	/*
	 * Hack: they may be looking up foo/bar, where foo is a
	 * regular file. In such a case we need to return ENOTDIR,
	 * but we may happen to get here with a different error.
	 */
	if (dvp->v_type != VDIR) {
		/*
		 * The check here is predominantly to catch
		 * EOPNOTSUPP from dead_vnodeops. If the vnode
		 * gets doomed past this point it is going to
		 * fail seqc verification.
		 */
		if (VN_IS_DOOMED(dvp)) {
			return (cache_fpl_aborted(fpl));
		}
		error = ENOTDIR;
	}

	/*
	 * Hack: handle O_SEARCH.
	 *
	 * Open Group Base Specifications Issue 7, 2018 edition states:
	 * If the access mode of the open file description associated with the
	 * file descriptor is not O_SEARCH, the function shall check whether
	 * directory searches are permitted using the current permissions of
	 * the directory underlying the file descriptor. If the access mode is
	 * O_SEARCH, the function shall not perform the check.
	 *
	 * Regular lookup tests for the NOEXECCHECK flag for every path
	 * component to decide whether to do the permission check. However,
	 * since most lookups never have the flag (and when they do it is only
	 * present for the first path component), lockless lookup only acts on
	 * it if there is a permission problem. Here the flag is represented
	 * with a boolean so that we don't have to clear it on the way out.
	 *
	 * For simplicity this always aborts.
	 * TODO: check if this is the first lookup and ignore the permission
	 * problem. Note the flag has to survive fallback (if it happens to be
	 * performed).
	 */
	if (fpl->fsearch) {
		return (cache_fpl_aborted(fpl));
	}

	switch (error) {
	case EAGAIN:
		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_partial(fpl);
		}
		break;
	default:
		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_smr_exit(fpl);
			cache_fpl_handled(fpl, error);
		}
		break;
	}
	return (error);
}

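/*
 * For reference, the O_SEARCH case handled above corresponds to a userspace
 * pattern along these lines (a hypothetical example, not part of this file):
 *
 *	int dfd, fd;
 *
 *	dfd = open("/some/dir", O_SEARCH | O_DIRECTORY);
 *	fd = openat(dfd, "file", O_RDONLY);
 *
 * Even if the caller lacks search (execute) permission on "/some/dir" at the
 * time of the openat() call, the check for that first component must be
 * skipped per the quoted specification. fpl->fsearch is expected to be set
 * when the directory file descriptor was opened with O_SEARCH and, as noted
 * above, the lockless path currently just aborts to the slow path instead of
 * implementing the exemption.
 */
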
static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct mount *mp;
	int error;

	error = CACHE_FPL_FAILED;
	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cache_fpl_checkpoint(fpl, &fpl->snd);

	fpl->dvp = dvp;
	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
	if (seqc_in_modify(fpl->dvp_seqc)) {
		cache_fpl_aborted(fpl);
		goto out;
	}
	mp = atomic_load_ptr(&fpl->dvp->v_mount);
	if (!cache_fplookup_mp_supported(mp)) {
		cache_fpl_aborted(fpl);
		goto out;
	}

	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

	for (;;) {
		error = cache_fplookup_parse(fpl);
		if (__predict_false(error != 0)) {
			break;
		}

		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
		if (__predict_false(error != 0)) {
			error = cache_fplookup_failed_vexec(fpl, error);
			break;
		}

		if (__predict_false(cache_fpl_isdotdot(cnp))) {
			error = cache_fplookup_dotdot(fpl);
			if (__predict_false(error != 0)) {
				break;
			}
		} else {
			error = cache_fplookup_next(fpl);
			if (__predict_false(error != 0)) {
				break;
			}

			VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

			if (cache_fplookup_need_climb_mount(fpl)) {
				error = cache_fplookup_climb_mount(fpl);
				if (__predict_false(error != 0)) {
					break;
				}
			}
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		if (cache_fpl_islastcn(ndp)) {
			error = cache_fplookup_final(fpl);
			break;
		}

		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
			break;
		}

		fpl->dvp = fpl->tvp;
		fpl->dvp_seqc = fpl->tvp_seqc;

		cache_fplookup_parse_advance(fpl);
		cache_fpl_checkpoint(fpl, &fpl->snd);
	}
out:
	switch (fpl->status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		cache_fpl_smr_assert_entered(fpl);
		return (cache_fplookup_partial_setup(fpl));
	case CACHE_FPL_STATUS_ABORTED:
		if (fpl->in_smr)
			cache_fpl_smr_exit(fpl);
		return (CACHE_FPL_FAILED);
	case CACHE_FPL_STATUS_HANDLED:
		MPASS(error != CACHE_FPL_FAILED);
		cache_fpl_smr_assert_not_entered(fpl);
		if (__predict_false(error != 0)) {
			ndp->ni_dvp = NULL;
			ndp->ni_vp = NULL;
			cache_fpl_cleanup_cnp(cnp);
			return (error);
		}
		ndp->ni_dvp = fpl->dvp;
		ndp->ni_vp = fpl->tvp;
		if (cnp->cn_flags & SAVENAME)
			cnp->cn_flags |= HASBUF;
		else
			cache_fpl_cleanup_cnp(cnp);
		return (error);
	}
}

/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the
 * criteria outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 *	vn_lock(current);
 *	for (;;) {
 *		next = find();
 *		vn_lock(next);
 *		vn_unlock(current);
 *		current = next;
 *		if (last)
 *			break;
 *	}
 *	return (current);
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding the respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
 * we made the jump. This includes things like permissions, mount points etc.
 * Counter modification is provided by enclosing relevant places in
 * vn_seqc_write_begin()/end() calls.
 *
 * Thus this translates to:
 *
 *	vfs_smr_enter();
 *	dvp_seqc = seqc_read_any(dvp);
 *	if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 *		abort();
 *	for (;;) {
 *		tvp = find();
 *		tvp_seqc = seqc_read_any(tvp);
 *		if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 *			abort();
 *		if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 *			abort();
 *		dvp = tvp; // we know nothing of importance has changed
 *		dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 *		if (last)
 *			break;
 *	}
 *	vget(); // secure the vnode
 *	if (!seqc_consistent(tvp, tvp_seqc)) // final check
 *		abort();
 *	// at this point we know nothing has changed for any parent<->child pair
 *	// as they were crossed during the lookup, meaning we matched the guarantee
 *	// of the locked variant
 *	return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection, which they must never exit
 * - EAGAIN can be returned to denote checking could not be performed; it is
 *   always valid to return it
 * - if the sequence counter has not changed the result must be valid
 * - if the sequence counter has changed, both false positives and false negatives
 *   are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning absent other means it
 *   should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 */
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	struct nameidata_saved orig;
	int error;

	MPASS(ndp->ni_lcf == 0);

	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.ndp = ndp;
	fpl.cnp = &ndp->ni_cnd;
	MPASS(curthread == fpl.cnp->cn_thread);

	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
		MPASS(fpl.cnp->cn_nameiop != LOOKUP);

	if (!cache_can_fplookup(&fpl)) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		*status = fpl.status;
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint(&fpl, &orig);

	cache_fpl_smr_enter_initial(&fpl);
	fpl.fsearch = false;
	pwd = pwd_get_smr();
	fpl.pwd = pwd;
	ndp->ni_rootdir = pwd->pwd_rdir;
	ndp->ni_topdir = pwd->pwd_jdir;

	cnp = fpl.cnp;
	cnp->cn_nameptr = cnp->cn_pnbuf;
	if (cnp->cn_pnbuf[0] == '/') {
		cache_fpl_handle_root(ndp, &dvp);
	} else {
		if (ndp->ni_dirfd == AT_FDCWD) {
			dvp = pwd->pwd_cdir;
		} else {
			error = cache_fplookup_dirfd(&fpl, &dvp);
			if (__predict_false(error != 0)) {
				goto out;
			}
		}
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);

	error = cache_fplookup_impl(dvp, &fpl);
out:
	cache_fpl_smr_assert_not_entered(&fpl);
	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);

	*status = fpl.status;
	switch (fpl.status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_HANDLED:
		SDT_PROBE3(vfs, namei, lookup, return, error,
		    (error == 0 ? ndp->ni_vp : NULL), true);
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		*pwdp = fpl.pwd;
		/*
		 * Status restored by cache_fplookup_partial_setup.
		 */
		break;
	case CACHE_FPL_STATUS_ABORTED:
		cache_fpl_restore(&fpl, &orig);
		break;
	}
	return (error);
}

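/*
 * To illustrate the VOP_FPLOOKUP_VEXEC contract documented above
 * cache_fplookup(), a minimal filesystem-side implementation could be shaped
 * like the sketch below. "xxxfs" and its node layout are hypothetical and the
 * sketch is not compiled as part of this file; it is loosely modeled on what
 * an in-tree filesystem opting into MNTK_FPLOOKUP might do.
 *
 *	static int
 *	xxxfs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct vnode *vp = v->a_vp;
 *		struct xxxfs_node *np;
 *
 *		// ->v_data may be concurrently cleared by VOP_RECLAIM;
 *		// fetch it once with atomic_load_ptr as advised above.
 *		np = atomic_load_ptr(&vp->v_data);
 *		if (__predict_false(np == NULL))
 *			return (EAGAIN); // cannot tell, let the caller decide
 *
 *		// Delegate the plain unix permission check; never exit vfs_smr.
 *		return (vaccess_vexec_smr(np->xn_mode, np->xn_uid, np->xn_gid,
 *		    v->a_cred));
 *	}
 *
 * Returning EAGAIN is always acceptable per the contract; it merely causes
 * cache_fplookup_failed_vexec() to fall back or abort as appropriate.
 */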