/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <sys/capsicum.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache");

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
    "char *");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
	u_char neg_hit;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
		struct negstate nu_neg;/* negative entry state */
	} n_un;
	u_char nc_flag;			/* flag bits */
	u_char nc_nlen;			/* length of name */
	char nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored. The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct timespec nc_time;	/* timespec provided by fs */
	struct timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int nc_ticks;			/* ticks value when entry was added */
	struct namecache nc_nc;
};

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned. Since others
 * may be in the same spot suffer a little bit and enforce the
 * alignment for everyone. Note this is a nop for 64-bit platforms.
 */
#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
#define CACHE_PATH_CUTOFF	39

#define CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
#define CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

#define	nc_vp	n_un.nu_vp
#define	nc_neg	n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define	NEG_HOT		0x01

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}
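
/*
 * The NCF_INVALID store above is followed by a release fence, while
 * cache_ncp_canuse() below issues an acquire fence before loading the
 * flag. Lockless consumers read the entry first and check the flag last;
 * observing the flag as still clear therefore guarantees the preceding
 * reads did not see an entry that was already being torn down.
 */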

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
static bool
cache_ncp_canuse(struct namecache *ncp)
{

	atomic_thread_fence_acq();
	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
}

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference. It is managed LRU, so frequently
 * used names will hang around. Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	mtx	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * It is legal to take multiple vnodelock and bucketlock locks. The locking
 * order is lower address first. Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be locked to provide safe operation against other threads modifying the
 * cache.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state. Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock. However, in the opposite case, this could
 * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 * the first node, locking everything in order and revalidating the state.
 */

VFS_SMR_DECLARE;

static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache parameters");

static u_int __read_mostly	ncsize;	/* the size as computed on creation or resizing */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
    "Total namecache capacity");

u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");

static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");

/*
 * Negative entry % of namecache capacity above which automatic eviction is allowed.
 *
 * Check cache_neg_evict_cond for details.
 */
static u_int ncnegminpct = 3;

static u_int __read_mostly	neg_min;	/* the above recomputed against ncsize */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
    "Negative entry count above which automatic eviction is allowed");
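
/*
 * For instance, with a hypothetical ncsize of 100000 and the default
 * ncnegminpct of 3, cache_recalc_neg_min() below yields neg_min = 3000:
 * automatic eviction of negative entries is not considered until at least
 * that many of them exist.
 */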

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");

static u_int __exclusive_cache_line neg_cycle;

#define	ncneghash	3
#define	numneglists	(ncneghash + 1)

struct neglist {
	struct mtx		nl_evict_lock;
	struct mtx		nl_lock __aligned(CACHE_LINE_SIZE);
	TAILQ_HEAD(, namecache) nl_list;
	TAILQ_HEAD(, namecache) nl_hotlist;
	u_long			nl_hotnum;
} __aligned(CACHE_LINE_SIZE);

static struct neglist neglists[numneglists];

static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly	ncbuckethash;
static struct mtx_padalign __read_mostly	*bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly	ncvnodehash;
static struct mtx __read_mostly	*vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}
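
/*
 * Both lock arrays above implement the same idea: instead of a per-object
 * lock, an object is mapped to one of a fixed set of locks. A vnode picks
 * its lock from bits of its address (VP2VNODELOCK) and a hash chain picks
 * its lock from the name hash (HASH2BUCKETLOCK), so unrelated objects only
 * contend when they happen to map to the same slot.
 */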

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common. The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

static struct namecache *
cache_alloc(int len, int ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	MPASS(ncp != NULL);
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	*tsp = ncp_ts->nc_time;
	*ticksp = ncp_ts->nc_ticks;
}
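
/*
 * As an example of the zone selection above: with CACHE_PATH_CUTOFF at 39,
 * a 7-character name such as "libexec" is carved out of cache_zone_small
 * (or cache_zone_small_ts when the filesystem supplied timestamps), while
 * a name longer than 39 bytes falls back to the corresponding large zone.
 */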

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");

#define STATNODE_ULONG(name, varname, descr)					\
	SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define STATNODE_COUNTER(name, varname, descr)					\
	static COUNTER_U64_DEFINE_EARLY(varname);				\
	SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
	    descr);
STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
STATNODE_ULONG(count, numcache, "Number of cache entries");
STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, dotdothits, "Number of '..' hits");
STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(poszaps, numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(negzaps, numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
/* These count for vn_getcwd(), too. */
STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");

/*
 * Debug or developer statistics.
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache debugging");
#define DEBUGNODE_ULONG(name, varname, descr)					\
	SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define DEBUGNODE_COUNTER(name, varname, descr)					\
	static COUNTER_U64_DEFINE_EARLY(varname);				\
	SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
	    descr);
DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
    "Number of successful removals after relocking");
static long zap_bucket_fail;
DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
static long zap_bucket_fail2;
DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
static long cache_lock_vnodes_cel_3_failures;
DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");

static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
    char **freebuf, size_t *buflen);
static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend);
static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen);
static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *len, bool slash_prefixed, size_t addend);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

static inline void
cache_assert_vlp_locked(struct mtx *vlp)
{

	if (vlp != NULL)
		mtx_assert(vlp, MA_OWNED);
}

static inline void
cache_assert_vnode_locked(struct vnode *vp)
{
	struct mtx *vlp;

	vlp = VP2VNODELOCK(vp);
	cache_assert_vlp_locked(vlp);
}

/*
 * TODO: With the value stored we can do better than computing the hash based
 * on the address. The choice of FNV should also be revisited.
 */
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}

static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct mtx *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_OWNED);
}

static void
cache_assert_bucket_unlocked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_NOTOWNED);
}
#else
#define cache_assert_bucket_locked(x) do { } while (0)
#define cache_assert_bucket_unlocked(x) do { } while (0)
#endif

#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_lock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_unlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}
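
/*
 * A minimal sketch of how the helpers above are meant to be combined when
 * two vnode locks are needed at once (this mirrors what the zap routines
 * further down do):
 *
 *	dvlp = VP2VNODELOCK(dvp);
 *	vlp = VP2VNODELOCK(vp);
 *	cache_sort_vnodes(&dvlp, &vlp);		(lower address first)
 *	cache_lock_vnodes(dvlp, vlp);
 *	...
 *	cache_unlock_vnodes(dvlp, vlp);
 */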

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

static void
cache_recalc_neg_min(u_int val)
{

	neg_min = (ncsize * val) / 100;
}

static int
sysctl_negminpct(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int error;

	val = ncnegminpct;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (val == ncnegminpct)
		return (0);
	if (val > 99)
		return (EINVAL);
	ncnegminpct = val;
	cache_recalc_neg_min(val);
	return (0);
}

SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
    "I", "Negative entry % of namecache capacity above which automatic eviction is allowed");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
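
/*
 * Note on the "pct" value computed in sysctl_debug_hashstat_nchash() above:
 * dividing by (n_nchash / 100) scales the result so that full utilization
 * reports 10000, i.e. the figure is expressed in hundredths of a percent
 * rather than whole percent.
 */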
#endif

/*
 * Negative entries management
 *
 * Various workloads create plenty of negative entries and barely use them
 * afterwards. Moreover malicious users can keep performing bogus lookups
 * adding even more entries. For example "make tinderbox" as of writing this
 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 * negative.
 *
 * As such, a rather aggressive eviction method is needed. The currently
 * employed method is a placeholder.
 *
 * Entries are split over numneglists separate lists, each of which is further
 * split into hot and cold entries. Entries get promoted after getting a hit.
 * Eviction happens on addition of new entry.
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache negative entry statistics");

SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
    "Number of negative cache entries");

static COUNTER_U64_DEFINE_EARLY(neg_created);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
    "Number of created negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evicted);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
    "Number of evicted negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
    &neg_evict_skipped_empty,
    "Number of times evicting failed due to lack of entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
    &neg_evict_skipped_missed,
    "Number of times evicting failed due to target entry disappearing");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
    &neg_evict_skipped_contended,
    "Number of times evicting failed due to contention");

SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
    "Number of cache hits (negative)");

static int
sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
{
	int i, out;

	out = 0;
	for (i = 0; i < numneglists; i++)
		out += neglists[i].nl_hotnum;

	return (SYSCTL_OUT(req, &out, sizeof(out)));
}
SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
    "Number of hot negative entries");

static void
cache_neg_init(struct namecache *ncp)
{
	struct negstate *ns;

	ncp->nc_flag |= NCF_NEGATIVE;
	ns = NCP2NEGSTATE(ncp);
	ns->neg_flag = 0;
	ns->neg_hit = 0;
	counter_u64_add(neg_created, 1);
}

#define CACHE_NEG_PROMOTION_THRESH 2

static bool
cache_neg_hit_prep(struct namecache *ncp)
{
	struct negstate *ns;
	u_char n;

	ns = NCP2NEGSTATE(ncp);
	n = atomic_load_char(&ns->neg_hit);
	for (;;) {
		if (n >= CACHE_NEG_PROMOTION_THRESH)
			return (false);
		if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
			break;
	}
	return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
}
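
/*
 * In other words: the first hit on a cold negative entry merely bumps
 * neg_hit to 1, the second hit reaches CACHE_NEG_PROMOTION_THRESH and the
 * caller is asked to promote the entry to the hot list, and once the entry
 * is hot further calls return false without touching the counter.
 */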

/*
 * Nothing to do here but it is provided for completeness as some
 * cache_neg_hit_prep callers may end up returning without even
 * trying to promote.
 */
#define cache_neg_hit_abort(ncp)	do { } while (0)

static void
cache_neg_hit_finish(struct namecache *ncp)
{

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
	counter_u64_add(numneghits, 1);
}

/*
 * Move a negative entry to the hot list.
 */
static void
cache_neg_promote_locked(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	if ((ns->neg_flag & NEG_HOT) == 0) {
		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
		nl->nl_hotnum++;
		ns->neg_flag |= NEG_HOT;
	}
}

/*
 * Move a hot negative entry to the cold list.
 */
static void
cache_neg_demote_locked(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	MPASS(ns->neg_flag & NEG_HOT);
	TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	nl->nl_hotnum--;
	ns->neg_flag &= ~NEG_HOT;
	atomic_store_char(&ns->neg_hit, 0);
}

/*
 * Move a negative entry to the hot list if it matches the lookup.
 *
 * We have to take locks, but they may be contended and in the worst
 * case we may need to go off CPU. We don't want to spin within the
 * smr section and we can't block with it. Exiting the section means
 * the found entry could have been evicted. We are going to look it
 * up again.
 */
static bool
cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
    struct namecache *oncp, uint32_t hash)
{
	struct namecache *ncp;
	struct neglist *nl;
	u_char nc_flag;

	nl = NCP2NEGLIST(oncp);

	mtx_lock(&nl->nl_lock);
	/*
	 * For hash iteration.
	 */
	vfs_smr_enter();

	/*
	 * Avoid all surprises by only succeeding if we got the same entry and
	 * bailing completely otherwise.
	 * XXX There are no provisions to keep the vnode around, meaning we may
	 * end up promoting a negative entry for a *new* vnode and returning
	 * ENOENT on its account. This is the error we want to return anyway
	 * and promotion is harmless.
	 *
	 * In particular at this point there can be a new ncp which matches the
	 * search but hashes to a different neglist.
	 */
	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp == oncp)
			break;
	}

	/*
	 * No match to begin with.
	 */
	if (__predict_false(ncp == NULL)) {
		goto out_abort;
	}

	/*
	 * The newly found entry may be something different...
	 */
	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
		goto out_abort;
	}

	/*
	 * ... and not even negative.
	 */
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_NEGATIVE) == 0) {
		goto out_abort;
	}

	if (__predict_false(!cache_ncp_canuse(ncp))) {
		goto out_abort;
	}

	cache_neg_promote_locked(ncp);
	cache_neg_hit_finish(ncp);
	vfs_smr_exit();
	mtx_unlock(&nl->nl_lock);
	return (true);
out_abort:
	vfs_smr_exit();
	mtx_unlock(&nl->nl_lock);
	return (false);
}

static void
cache_neg_promote(struct namecache *ncp)
{
	struct neglist *nl;

	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	cache_neg_promote_locked(ncp);
	mtx_unlock(&nl->nl_lock);
}

static void
cache_neg_insert(struct namecache *ncp)
{
	struct neglist *nl;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	mtx_unlock(&nl->nl_lock);
	atomic_add_long(&numneg, 1);
}

static void
cache_neg_remove(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	ns = NCP2NEGSTATE(ncp);
	mtx_lock(&nl->nl_lock);
	if ((ns->neg_flag & NEG_HOT) != 0) {
		TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
		nl->nl_hotnum--;
	} else {
		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
	}
	mtx_unlock(&nl->nl_lock);
	atomic_subtract_long(&numneg, 1);
}

static struct neglist *
cache_neg_evict_select_list(void)
{
	struct neglist *nl;
	u_int c;

	c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
	nl = &neglists[c % numneglists];
	if (!mtx_trylock(&nl->nl_evict_lock)) {
		counter_u64_add(neg_evict_skipped_contended, 1);
		return (NULL);
	}
	return (nl);
}

static struct namecache *
cache_neg_evict_select_entry(struct neglist *nl)
{
	struct namecache *ncp, *lncp;
	struct negstate *ns, *lns;
	int i;

	mtx_assert(&nl->nl_evict_lock, MA_OWNED);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	ncp = TAILQ_FIRST(&nl->nl_list);
	if (ncp == NULL)
		return (NULL);
	lncp = ncp;
	lns = NCP2NEGSTATE(lncp);
	for (i = 1; i < 4; i++) {
		ncp = TAILQ_NEXT(ncp, nc_dst);
		if (ncp == NULL)
			break;
		ns = NCP2NEGSTATE(ncp);
		if (ns->neg_hit < lns->neg_hit) {
			lncp = ncp;
			lns = ns;
		}
	}
	return (lncp);
}

static bool
cache_neg_evict(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *nl;
	struct negstate *ns;
	struct vnode *dvp;
	struct mtx *dvlp;
	struct mtx *blp;
	uint32_t hash;
	u_char nlen;
	bool evicted;

	nl = cache_neg_evict_select_list();
	if (nl == NULL) {
		return (false);
	}

	mtx_lock(&nl->nl_lock);
	ncp = TAILQ_FIRST(&nl->nl_hotlist);
	if (ncp != NULL) {
		cache_neg_demote_locked(ncp);
	}
	ncp = cache_neg_evict_select_entry(nl);
	if (ncp == NULL) {
		counter_u64_add(neg_evict_skipped_empty, 1);
		mtx_unlock(&nl->nl_lock);
		mtx_unlock(&nl->nl_evict_lock);
		return (false);
	}
	ns = NCP2NEGSTATE(ncp);
	nlen = ncp->nc_nlen;
	dvp = ncp->nc_dvp;
	hash = cache_get_hash(ncp->nc_name, nlen, dvp);
	dvlp = VP2VNODELOCK(dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_unlock(&nl->nl_lock);
	mtx_unlock(&nl->nl_evict_lock);
	mtx_lock(dvlp);
	mtx_lock(blp);
	/*
	 * Note that since all locks were dropped above, the entry may be
	 * gone or reallocated to be something else.
	 */
	CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
		if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
		    ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
			break;
	}
	if (ncp2 == NULL) {
		counter_u64_add(neg_evict_skipped_missed, 1);
		ncp = NULL;
		evicted = false;
	} else {
		MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
		MPASS(blp == NCP2BUCKETLOCK(ncp));
		SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(neg_evicted, 1);
		evicted = true;
	}
	mtx_unlock(blp);
	mtx_unlock(dvlp);
	if (ncp != NULL)
		cache_free(ncp);
	return (evicted);
}

/*
 * Maybe evict a negative entry to create more room.
 *
 * The ncnegfactor parameter limits what fraction of the total count
 * can consist of negative entries. However, if the cache is just
 * warming up this leads to excessive evictions. As such, ncnegminpct
 * (recomputed to neg_min) dictates whether the above should be
 * applied.
 *
 * Try evicting if the cache is close to full capacity regardless of
 * other considerations.
 */
static bool
cache_neg_evict_cond(u_long lnumcache)
{
	u_long lnumneg;

	if (ncsize - 1000 < lnumcache)
		goto out_evict;
	lnumneg = atomic_load_long(&numneg);
	if (lnumneg < neg_min)
		return (false);
	if (lnumneg * ncnegfactor < lnumcache)
		return (false);
out_evict:
	return (cache_neg_evict());
}
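
/*
 * Worked example of the condition above, assuming hypothetical values:
 * with ncsize = 100000, ncnegfactor = 5 and neg_min = 3000, a cache holding
 * 50000 entries of which 12000 are negative qualifies for eviction
 * (12000 >= 3000 and 12000 * 5 >= 50000), while one holding only 2000
 * negative entries does not. Once the total comes within 1000 of ncsize,
 * eviction is attempted unconditionally.
 */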

/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_vp);
			ncp->nc_vp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_vp);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_neg_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_dvp);
			ncp->nc_dvp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_dvp);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			counter_u64_add(numcachehv, -1);
		}
	}
	atomic_subtract_long(&numcache, 1);
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct mtx *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

/*
 * If trylocking failed we can get here. We know enough to take all needed locks
 * in the right order and re-lookup the entry.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct mtx *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_unlocked(ncp);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	mtx_lock(blp);
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_bucket_relock_success, 1);
		return (0);
	}

	mtx_unlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct mtx *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	mtx_unlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct mtx *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			vn_seqc_write_begin(dvp);
			dvp->v_cache_dd = NULL;
			vn_seqc_write_end(dvp);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
		return (1);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (ncp == NULL) {
		mtx_unlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_bucket_fail++;
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
	cache_free(ncp);
	return (1);
out_no_entry:
	counter_u64_add(nummisszap, 1);
	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
	return (0);
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static int __noinline
cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct mtx *dvlp;
	enum vgetstate vs;
	int error, ltype;
	bool whiteout;

	MPASS((cnp->cn_flags & ISDOTDOT) != 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	counter_u64_add(dotdothits, 1);
retry:
	dvlp = VP2VNODELOCK(dvp);
	mtx_lock(dvlp);
	ncp = dvp->v_cache_dd;
	if (ncp == NULL) {
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
		mtx_unlock(dvlp);
		return (0);
	}
	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
		if (ncp->nc_flag & NCF_NEGATIVE)
			*vpp = NULL;
		else
			*vpp = ncp->nc_vp;
	} else
		*vpp = ncp->nc_dvp;
	if (*vpp == NULL)
		goto negative_success;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
	    NCF_DTS && tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		*tsp = ncp_ts->nc_dotdottime;
	}

	MPASS(dvp != *vpp);
	ltype = VOP_ISLOCKED(dvp);
	VOP_UNLOCK(dvp);
	vs = vget_prep(*vpp);
	mtx_unlock(dvlp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	vn_lock(dvp, ltype | LK_RETRY);
	if (VN_IS_DOOMED(dvp)) {
		if (error == 0)
			vput(*vpp);
		*vpp = NULL;
		return (ENOENT);
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			cache_zap_negative_locked_vnode_kl(ncp, dvp);
			mtx_unlock(dvlp);
			cache_free(ncp);
			return (0);
		}
	}

	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_out_ts(ncp, tsp, ticksp);
	if (cache_neg_hit_prep(ncp))
		cache_neg_promote(ncp);
	else
		cache_neg_hit_finish(ncp);
	mtx_unlock(dvlp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument. Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search. The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp. On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created. However, it will
 *		be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp. On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * Either both tsp and ticks have to be provided or neither of them.
 *
 * # Returns
 *
 * - -1:	A positive cache hit. vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount. vpp will not be modified. If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss. vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up
 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
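
/*
 * A minimal caller sketch (hypothetical, for illustration only; real
 * consumers live in the VFS lookup code):
 *
 *	error = cache_lookup(dvp, &vp, cnp, &ts, &ticks);
 *	if (error == -1)
 *		;	(positive hit, vp is locked and referenced)
 *	else if (error == ENOENT)
 *		;	(negative hit, check ISWHITEOUT in cn_flags)
 *	else
 *		;	(miss, fall back to VOP_LOOKUP())
 */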

static int __noinline
cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct mtx *blp;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout;

	MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY);

retry:
	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		mtx_unlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	vs = vget_prep(*vpp);
	mtx_unlock(blp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
			if (__predict_false(error != 0)) {
				zap_bucket_fail2++;
				goto retry;
			}
			cache_free(ncp);
			return (0);
		}
	}

	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_out_ts(ncp, tsp, ticksp);
	if (cache_neg_hit_prep(ncp))
		cache_neg_promote(ncp);
	else
		cache_neg_hit_finish(ncp);
	mtx_unlock(blp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout, neg_promote;
	u_short nc_flag;

	MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
			return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
	}

	MPASS((cnp->cn_flags & ISDOTDOT) == 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	vfs_smr_enter();

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		vfs_smr_exit();
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	nc_flag = atomic_load_char(&ncp->nc_flag);
	if (nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	if (!cache_ncp_canuse(ncp)) {
		vfs_smr_exit();
		*vpp = NULL;
		goto out_fallback;
	}
	vs = vget_prep_smr(*vpp);
	vfs_smr_exit();
	if (__predict_false(vs == VGET_NONE)) {
		*vpp = NULL;
		goto out_fallback;
	}
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto out_fallback;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			vfs_smr_exit();
			goto out_fallback;
		}
	}

	cache_out_ts(ncp, tsp, ticksp);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	neg_promote = cache_neg_hit_prep(ncp);
	if (__predict_false(!cache_ncp_canuse(ncp))) {
		cache_neg_hit_abort(ncp);
		vfs_smr_exit();
		goto out_fallback;
	}
	if (neg_promote) {
		vfs_smr_exit();
		if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
			goto out_fallback;
	} else {
		cache_neg_hit_finish(ncp);
		vfs_smr_exit();
	}
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
out_fallback:
	return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
}

struct celockstate {
	struct mtx *vlp[3];
	struct mtx *blp[2];
};
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));

static inline void
cache_celockstate_init(struct celockstate *cel)
{

	bzero(cel, sizeof(*cel));
}

static void
cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
    struct vnode *dvp)
{
	struct mtx *vlp1, *vlp2;

	MPASS(cel->vlp[0] == NULL);
	MPASS(cel->vlp[1] == NULL);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL || dvp != NULL);

	vlp1 = VP2VNODELOCK(vp);
	vlp2 = VP2VNODELOCK(dvp);
	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		mtx_lock(vlp1);
		cel->vlp[0] = vlp1;
	}
	mtx_lock(vlp2);
	cel->vlp[1] = vlp2;
}

static void
cache_unlock_vnodes_cel(struct celockstate *cel)
{

	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);

	if (cel->vlp[0] != NULL)
		mtx_unlock(cel->vlp[0]);
	if (cel->vlp[1] != NULL)
		mtx_unlock(cel->vlp[1]);
	if (cel->vlp[2] != NULL)
		mtx_unlock(cel->vlp[2]);
}
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1904 { 1905 struct mtx *vlp; 1906 bool ret; 1907 1908 cache_assert_vlp_locked(cel->vlp[0]); 1909 cache_assert_vlp_locked(cel->vlp[1]); 1910 MPASS(cel->vlp[2] == NULL); 1911 1912 MPASS(vp != NULL); 1913 vlp = VP2VNODELOCK(vp); 1914 1915 ret = true; 1916 if (vlp >= cel->vlp[1]) { 1917 mtx_lock(vlp); 1918 } else { 1919 if (mtx_trylock(vlp)) 1920 goto out; 1921 cache_lock_vnodes_cel_3_failures++; 1922 cache_unlock_vnodes_cel(cel); 1923 if (vlp < cel->vlp[0]) { 1924 mtx_lock(vlp); 1925 mtx_lock(cel->vlp[0]); 1926 mtx_lock(cel->vlp[1]); 1927 } else { 1928 if (cel->vlp[0] != NULL) 1929 mtx_lock(cel->vlp[0]); 1930 mtx_lock(vlp); 1931 mtx_lock(cel->vlp[1]); 1932 } 1933 ret = false; 1934 } 1935 out: 1936 cel->vlp[2] = vlp; 1937 return (ret); 1938 } 1939 1940 static void 1941 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 1942 struct mtx *blp2) 1943 { 1944 1945 MPASS(cel->blp[0] == NULL); 1946 MPASS(cel->blp[1] == NULL); 1947 1948 cache_sort_vnodes(&blp1, &blp2); 1949 1950 if (blp1 != NULL) { 1951 mtx_lock(blp1); 1952 cel->blp[0] = blp1; 1953 } 1954 mtx_lock(blp2); 1955 cel->blp[1] = blp2; 1956 } 1957 1958 static void 1959 cache_unlock_buckets_cel(struct celockstate *cel) 1960 { 1961 1962 if (cel->blp[0] != NULL) 1963 mtx_unlock(cel->blp[0]); 1964 mtx_unlock(cel->blp[1]); 1965 } 1966 1967 /* 1968 * Lock part of the cache affected by the insertion. 1969 * 1970 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1971 * However, insertion can result in removal of an old entry. In this 1972 * case we have an additional vnode and bucketlock pair to lock. 1973 * 1974 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1975 * preserving the locking order (smaller address first). 1976 */ 1977 static void 1978 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1979 uint32_t hash) 1980 { 1981 struct namecache *ncp; 1982 struct mtx *blps[2]; 1983 1984 blps[0] = HASH2BUCKETLOCK(hash); 1985 for (;;) { 1986 blps[1] = NULL; 1987 cache_lock_vnodes_cel(cel, dvp, vp); 1988 if (vp == NULL || vp->v_type != VDIR) 1989 break; 1990 ncp = vp->v_cache_dd; 1991 if (ncp == NULL) 1992 break; 1993 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1994 break; 1995 MPASS(ncp->nc_dvp == vp); 1996 blps[1] = NCP2BUCKETLOCK(ncp); 1997 if (ncp->nc_flag & NCF_NEGATIVE) 1998 break; 1999 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2000 break; 2001 /* 2002 * All vnodes got re-locked. Re-validate the state and if 2003 * nothing changed we are done. Otherwise restart. 
2004 */ 2005 if (ncp == vp->v_cache_dd && 2006 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2007 blps[1] == NCP2BUCKETLOCK(ncp) && 2008 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2009 break; 2010 cache_unlock_vnodes_cel(cel); 2011 cel->vlp[0] = NULL; 2012 cel->vlp[1] = NULL; 2013 cel->vlp[2] = NULL; 2014 } 2015 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2016 } 2017 2018 static void 2019 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2020 uint32_t hash) 2021 { 2022 struct namecache *ncp; 2023 struct mtx *blps[2]; 2024 2025 blps[0] = HASH2BUCKETLOCK(hash); 2026 for (;;) { 2027 blps[1] = NULL; 2028 cache_lock_vnodes_cel(cel, dvp, vp); 2029 ncp = dvp->v_cache_dd; 2030 if (ncp == NULL) 2031 break; 2032 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2033 break; 2034 MPASS(ncp->nc_dvp == dvp); 2035 blps[1] = NCP2BUCKETLOCK(ncp); 2036 if (ncp->nc_flag & NCF_NEGATIVE) 2037 break; 2038 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2039 break; 2040 if (ncp == dvp->v_cache_dd && 2041 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2042 blps[1] == NCP2BUCKETLOCK(ncp) && 2043 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2044 break; 2045 cache_unlock_vnodes_cel(cel); 2046 cel->vlp[0] = NULL; 2047 cel->vlp[1] = NULL; 2048 cel->vlp[2] = NULL; 2049 } 2050 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2051 } 2052 2053 static void 2054 cache_enter_unlock(struct celockstate *cel) 2055 { 2056 2057 cache_unlock_buckets_cel(cel); 2058 cache_unlock_vnodes_cel(cel); 2059 } 2060 2061 static void __noinline 2062 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 2063 struct componentname *cnp) 2064 { 2065 struct celockstate cel; 2066 struct namecache *ncp; 2067 uint32_t hash; 2068 int len; 2069 2070 if (dvp->v_cache_dd == NULL) 2071 return; 2072 len = cnp->cn_namelen; 2073 cache_celockstate_init(&cel); 2074 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2075 cache_enter_lock_dd(&cel, dvp, vp, hash); 2076 vn_seqc_write_begin(dvp); 2077 ncp = dvp->v_cache_dd; 2078 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 2079 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 2080 cache_zap_locked(ncp); 2081 } else { 2082 ncp = NULL; 2083 } 2084 dvp->v_cache_dd = NULL; 2085 vn_seqc_write_end(dvp); 2086 cache_enter_unlock(&cel); 2087 if (ncp != NULL) 2088 cache_free(ncp); 2089 } 2090 2091 /* 2092 * Add an entry to the cache. 2093 */ 2094 void 2095 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2096 struct timespec *tsp, struct timespec *dtsp) 2097 { 2098 struct celockstate cel; 2099 struct namecache *ncp, *n2, *ndd; 2100 struct namecache_ts *ncp_ts; 2101 struct nchashhead *ncpp; 2102 uint32_t hash; 2103 int flag; 2104 int len; 2105 u_long lnumcache; 2106 2107 VNPASS(!VN_IS_DOOMED(dvp), dvp); 2108 VNPASS(dvp->v_type != VNON, dvp); 2109 if (vp != NULL) { 2110 VNPASS(!VN_IS_DOOMED(vp), vp); 2111 VNPASS(vp->v_type != VNON, vp); 2112 } 2113 2114 #ifdef DEBUG_CACHE 2115 if (__predict_false(!doingcache)) 2116 return; 2117 #endif 2118 2119 flag = 0; 2120 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2121 if (cnp->cn_namelen == 1) 2122 return; 2123 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 2124 cache_enter_dotdot_prep(dvp, vp, cnp); 2125 flag = NCF_ISDOTDOT; 2126 } 2127 } 2128 2129 /* 2130 * Avoid blowout in namecache entries. 2131 * 2132 * Bugs: 2133 * 1. 
filesystems may end up trying to add an already existing entry 2134 * (for example this can happen after a cache miss during concurrent 2135 * lookup), in which case we will call cache_neg_evict despite not 2136 * adding anything. 2137 * 2. the routine may fail to free anything and no provisions are made 2138 * to make it try harder (see the inside for failure modes) 2139 * 3. it only ever looks at negative entries. 2140 */ 2141 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 2142 if (cache_neg_evict_cond(lnumcache)) { 2143 lnumcache = atomic_load_long(&numcache); 2144 } 2145 if (__predict_false(lnumcache >= ncsize)) { 2146 atomic_subtract_long(&numcache, 1); 2147 counter_u64_add(numdrops, 1); 2148 return; 2149 } 2150 2151 cache_celockstate_init(&cel); 2152 ndd = NULL; 2153 ncp_ts = NULL; 2154 2155 /* 2156 * Calculate the hash key and setup as much of the new 2157 * namecache entry as possible before acquiring the lock. 2158 */ 2159 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 2160 ncp->nc_flag = flag | NCF_WIP; 2161 ncp->nc_vp = vp; 2162 if (vp == NULL) 2163 cache_neg_init(ncp); 2164 ncp->nc_dvp = dvp; 2165 if (tsp != NULL) { 2166 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 2167 ncp_ts->nc_time = *tsp; 2168 ncp_ts->nc_ticks = ticks; 2169 ncp_ts->nc_nc.nc_flag |= NCF_TS; 2170 if (dtsp != NULL) { 2171 ncp_ts->nc_dotdottime = *dtsp; 2172 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 2173 } 2174 } 2175 len = ncp->nc_nlen = cnp->cn_namelen; 2176 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2177 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 2178 ncp->nc_name[len] = '\0'; 2179 cache_enter_lock(&cel, dvp, vp, hash); 2180 2181 /* 2182 * See if this vnode or negative entry is already in the cache 2183 * with this name. This can happen with concurrent lookups of 2184 * the same path name. 2185 */ 2186 ncpp = NCHHASH(hash); 2187 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 2188 if (n2->nc_dvp == dvp && 2189 n2->nc_nlen == cnp->cn_namelen && 2190 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 2191 MPASS(cache_ncp_canuse(n2)); 2192 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 2193 KASSERT(vp == NULL, 2194 ("%s: found entry pointing to a different vnode (%p != %p)", 2195 __func__, NULL, vp)); 2196 else 2197 KASSERT(n2->nc_vp == vp, 2198 ("%s: found entry pointing to a different vnode (%p != %p)", 2199 __func__, n2->nc_vp, vp)); 2200 /* 2201 * Entries are supposed to be immutable unless in the 2202 * process of getting destroyed. Accommodating for 2203 * changing timestamps is possible but not worth it. 2204 * This should be harmless in terms of correctness, in 2205 * the worst case resulting in an earlier expiration. 2206 * Alternatively, the found entry can be replaced 2207 * altogether. 2208 */ 2209 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 2210 #if 0 2211 if (tsp != NULL) { 2212 KASSERT((n2->nc_flag & NCF_TS) != 0, 2213 ("no NCF_TS")); 2214 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 2215 n2_ts->nc_time = ncp_ts->nc_time; 2216 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2217 if (dtsp != NULL) { 2218 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2219 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2220 } 2221 } 2222 #endif 2223 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, 2224 vp); 2225 goto out_unlock_free; 2226 } 2227 } 2228 2229 if (flag == NCF_ISDOTDOT) { 2230 /* 2231 * See if we are trying to add .. entry, but some other lookup 2232 * has populated v_cache_dd pointer already.
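* If it has been, the new entry is not installed and is instead released
* at out_unlock_free below.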
2233 */ 2234 if (dvp->v_cache_dd != NULL) 2235 goto out_unlock_free; 2236 KASSERT(vp == NULL || vp->v_type == VDIR, 2237 ("wrong vnode type %p", vp)); 2238 vn_seqc_write_begin(dvp); 2239 dvp->v_cache_dd = ncp; 2240 vn_seqc_write_end(dvp); 2241 } 2242 2243 if (vp != NULL) { 2244 if (flag != NCF_ISDOTDOT) { 2245 /* 2246 * For this case, the cache entry maps both the 2247 * directory name in it and the name ".." for the 2248 * directory's parent. 2249 */ 2250 vn_seqc_write_begin(vp); 2251 if ((ndd = vp->v_cache_dd) != NULL) { 2252 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2253 cache_zap_locked(ndd); 2254 else 2255 ndd = NULL; 2256 } 2257 vp->v_cache_dd = ncp; 2258 vn_seqc_write_end(vp); 2259 } else if (vp->v_type != VDIR) { 2260 if (vp->v_cache_dd != NULL) { 2261 vn_seqc_write_begin(vp); 2262 vp->v_cache_dd = NULL; 2263 vn_seqc_write_end(vp); 2264 } 2265 } 2266 } 2267 2268 if (flag != NCF_ISDOTDOT) { 2269 if (LIST_EMPTY(&dvp->v_cache_src)) { 2270 vhold(dvp); 2271 counter_u64_add(numcachehv, 1); 2272 } 2273 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2274 } 2275 2276 /* 2277 * If the entry is "negative", we place it into the 2278 * "negative" cache queue, otherwise, we place it into the 2279 * destination vnode's cache entries queue. 2280 */ 2281 if (vp != NULL) { 2282 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2283 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2284 vp); 2285 } else { 2286 if (cnp->cn_flags & ISWHITEOUT) 2287 ncp->nc_flag |= NCF_WHITE; 2288 cache_neg_insert(ncp); 2289 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2290 ncp->nc_name); 2291 } 2292 2293 /* 2294 * Insert the new namecache entry into the appropriate chain 2295 * within the cache entries table. 2296 */ 2297 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2298 2299 atomic_thread_fence_rel(); 2300 /* 2301 * Mark the entry as fully constructed. 2302 * It is immutable past this point until its removal. 
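* The release fence above pairs with the flag checks done by lockless
* readers (see cache_ncp_canuse()): once NCF_WIP is observed clear, the
* rest of the entry is guaranteed to be visible as well.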
2303 */ 2304 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2305 2306 cache_enter_unlock(&cel); 2307 if (ndd != NULL) 2308 cache_free(ndd); 2309 return; 2310 out_unlock_free: 2311 cache_enter_unlock(&cel); 2312 atomic_subtract_long(&numcache, 1); 2313 cache_free(ncp); 2314 return; 2315 } 2316 2317 static u_int 2318 cache_roundup_2(u_int val) 2319 { 2320 u_int res; 2321 2322 for (res = 1; res <= val; res <<= 1) 2323 continue; 2324 2325 return (res); 2326 } 2327 2328 static struct nchashhead * 2329 nchinittbl(u_long elements, u_long *hashmask) 2330 { 2331 struct nchashhead *hashtbl; 2332 u_long hashsize, i; 2333 2334 hashsize = cache_roundup_2(elements) / 2; 2335 2336 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2337 for (i = 0; i < hashsize; i++) 2338 CK_SLIST_INIT(&hashtbl[i]); 2339 *hashmask = hashsize - 1; 2340 return (hashtbl); 2341 } 2342 2343 static void 2344 ncfreetbl(struct nchashhead *hashtbl) 2345 { 2346 2347 free(hashtbl, M_VFSCACHE); 2348 } 2349 2350 /* 2351 * Name cache initialization, from vfs_init() when we are booting 2352 */ 2353 static void 2354 nchinit(void *dummy __unused) 2355 { 2356 u_int i; 2357 2358 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2359 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2360 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2361 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2362 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2363 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2364 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2365 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2366 2367 VFS_SMR_ZONE_SET(cache_zone_small); 2368 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2369 VFS_SMR_ZONE_SET(cache_zone_large); 2370 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2371 2372 ncsize = desiredvnodes * ncsizefactor; 2373 cache_recalc_neg_min(ncnegminpct); 2374 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2375 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2376 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2377 ncbuckethash = 7; 2378 if (ncbuckethash > nchash) 2379 ncbuckethash = nchash; 2380 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2381 M_WAITOK | M_ZERO); 2382 for (i = 0; i < numbucketlocks; i++) 2383 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2384 ncvnodehash = ncbuckethash; 2385 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2386 M_WAITOK | M_ZERO); 2387 for (i = 0; i < numvnodelocks; i++) 2388 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2389 2390 for (i = 0; i < numneglists; i++) { 2391 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); 2392 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2393 TAILQ_INIT(&neglists[i].nl_list); 2394 TAILQ_INIT(&neglists[i].nl_hotlist); 2395 } 2396 } 2397 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2398 2399 void 2400 cache_vnode_init(struct vnode *vp) 2401 { 2402 2403 LIST_INIT(&vp->v_cache_src); 2404 TAILQ_INIT(&vp->v_cache_dst); 2405 vp->v_cache_dd = NULL; 2406 cache_prehash(vp); 2407 } 2408 2409 void 2410 cache_changesize(u_long newmaxvnodes) 2411 { 2412 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2413 u_long new_nchash, old_nchash; 2414 struct namecache *ncp; 2415 uint32_t hash; 2416 u_long newncsize; 2417 int i; 2418 2419 newncsize = 
newmaxvnodes * ncsizefactor; 2420 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2421 if (newmaxvnodes < numbucketlocks) 2422 newmaxvnodes = numbucketlocks; 2423 2424 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2425 /* If same hash table size, nothing to do */ 2426 if (nchash == new_nchash) { 2427 ncfreetbl(new_nchashtbl); 2428 return; 2429 } 2430 /* 2431 * Move everything from the old hash table to the new table. 2432 * None of the namecache entries in the table can be removed 2433 * because to do so, they have to be removed from the hash table. 2434 */ 2435 cache_lock_all_vnodes(); 2436 cache_lock_all_buckets(); 2437 old_nchashtbl = nchashtbl; 2438 old_nchash = nchash; 2439 nchashtbl = new_nchashtbl; 2440 nchash = new_nchash; 2441 for (i = 0; i <= old_nchash; i++) { 2442 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2443 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2444 ncp->nc_dvp); 2445 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2446 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2447 } 2448 } 2449 ncsize = newncsize; 2450 cache_recalc_neg_min(ncnegminpct); 2451 cache_unlock_all_buckets(); 2452 cache_unlock_all_vnodes(); 2453 ncfreetbl(old_nchashtbl); 2454 } 2455 2456 /* 2457 * Invalidate all entries from and to a particular vnode. 2458 */ 2459 static void 2460 cache_purge_impl(struct vnode *vp) 2461 { 2462 TAILQ_HEAD(, namecache) ncps; 2463 struct namecache *ncp, *nnp; 2464 struct mtx *vlp, *vlp2; 2465 2466 TAILQ_INIT(&ncps); 2467 vlp = VP2VNODELOCK(vp); 2468 vlp2 = NULL; 2469 mtx_lock(vlp); 2470 retry: 2471 while (!LIST_EMPTY(&vp->v_cache_src)) { 2472 ncp = LIST_FIRST(&vp->v_cache_src); 2473 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2474 goto retry; 2475 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2476 } 2477 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2478 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2479 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2480 goto retry; 2481 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2482 } 2483 ncp = vp->v_cache_dd; 2484 if (ncp != NULL) { 2485 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2486 ("lost dotdot link")); 2487 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2488 goto retry; 2489 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2490 } 2491 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2492 mtx_unlock(vlp); 2493 if (vlp2 != NULL) 2494 mtx_unlock(vlp2); 2495 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2496 cache_free(ncp); 2497 } 2498 } 2499 2500 /* 2501 * Opportunistic check to see if there is anything to do. 2502 */ 2503 static bool 2504 cache_has_entries(struct vnode *vp) 2505 { 2506 2507 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2508 vp->v_cache_dd == NULL) 2509 return (false); 2510 return (true); 2511 } 2512 2513 void 2514 cache_purge(struct vnode *vp) 2515 { 2516 2517 SDT_PROBE1(vfs, namecache, purge, done, vp); 2518 if (!cache_has_entries(vp)) 2519 return; 2520 cache_purge_impl(vp); 2521 } 2522 2523 /* 2524 * Only to be used by vgone. 2525 */ 2526 void 2527 cache_purge_vgone(struct vnode *vp) 2528 { 2529 struct mtx *vlp; 2530 2531 VNPASS(VN_IS_DOOMED(vp), vp); 2532 if (cache_has_entries(vp)) { 2533 cache_purge_impl(vp); 2534 return; 2535 } 2536 2537 /* 2538 * Serialize against a potential thread doing cache_purge. 2539 */ 2540 vlp = VP2VNODELOCK(vp); 2541 mtx_wait_unlocked(vlp); 2542 if (cache_has_entries(vp)) { 2543 cache_purge_impl(vp); 2544 return; 2545 } 2546 return; 2547 } 2548 2549 /* 2550 * Invalidate all negative entries for a particular directory vnode. 
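*
* This is typically of use to filesystems when directory contents change
* in a way the cache cannot observe through the usual entry points (for
* example after a rename), so that stale negative entries do not shadow
* names which now exist.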
2551 */ 2552 void 2553 cache_purge_negative(struct vnode *vp) 2554 { 2555 TAILQ_HEAD(, namecache) ncps; 2556 struct namecache *ncp, *nnp; 2557 struct mtx *vlp; 2558 2559 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2560 if (LIST_EMPTY(&vp->v_cache_src)) 2561 return; 2562 TAILQ_INIT(&ncps); 2563 vlp = VP2VNODELOCK(vp); 2564 mtx_lock(vlp); 2565 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2566 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2567 continue; 2568 cache_zap_negative_locked_vnode_kl(ncp, vp); 2569 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2570 } 2571 mtx_unlock(vlp); 2572 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2573 cache_free(ncp); 2574 } 2575 } 2576 2577 void 2578 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2579 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2580 { 2581 2582 ASSERT_VOP_IN_SEQC(fdvp); 2583 ASSERT_VOP_IN_SEQC(fvp); 2584 ASSERT_VOP_IN_SEQC(tdvp); 2585 if (tvp != NULL) 2586 ASSERT_VOP_IN_SEQC(tvp); 2587 2588 cache_purge(fvp); 2589 if (tvp != NULL) { 2590 cache_purge(tvp); 2591 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2592 ("%s: lingering negative entry", __func__)); 2593 } else { 2594 cache_remove_cnp(tdvp, tcnp); 2595 } 2596 } 2597 2598 /* 2599 * Flush all entries referencing a particular filesystem. 2600 */ 2601 void 2602 cache_purgevfs(struct mount *mp) 2603 { 2604 struct vnode *vp, *mvp; 2605 2606 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2607 /* 2608 * Somewhat wasteful iteration over all vnodes. Would be better to 2609 * support filtering and avoid the interlock to begin with. 2610 */ 2611 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2612 if (!cache_has_entries(vp)) { 2613 VI_UNLOCK(vp); 2614 continue; 2615 } 2616 vholdl(vp); 2617 VI_UNLOCK(vp); 2618 cache_purge(vp); 2619 vdrop(vp); 2620 } 2621 } 2622 2623 /* 2624 * Perform canonical checks and cache lookup and pass on to filesystem 2625 * through the vop_cachedlookup only if needed. 2626 */ 2627 2628 int 2629 vfs_cache_lookup(struct vop_lookup_args *ap) 2630 { 2631 struct vnode *dvp; 2632 int error; 2633 struct vnode **vpp = ap->a_vpp; 2634 struct componentname *cnp = ap->a_cnp; 2635 int flags = cnp->cn_flags; 2636 2637 *vpp = NULL; 2638 dvp = ap->a_dvp; 2639 2640 if (dvp->v_type != VDIR) 2641 return (ENOTDIR); 2642 2643 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2644 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2645 return (EROFS); 2646 2647 error = vn_dir_check_exec(dvp, cnp); 2648 if (error != 0) 2649 return (error); 2650 2651 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2652 if (error == 0) 2653 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2654 if (error == -1) 2655 return (0); 2656 return (error); 2657 } 2658 2659 /* Implementation of the getcwd syscall. 
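*
* Note buffers smaller than 2 bytes are rejected up front since even the
* shortest possible result ("/" plus the terminating NUL) would not fit,
* while requests larger than MAXPATHLEN are clamped.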
*/ 2660 int 2661 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2662 { 2663 char *buf, *retbuf; 2664 size_t buflen; 2665 int error; 2666 2667 buflen = uap->buflen; 2668 if (__predict_false(buflen < 2)) 2669 return (EINVAL); 2670 if (buflen > MAXPATHLEN) 2671 buflen = MAXPATHLEN; 2672 2673 buf = uma_zalloc(namei_zone, M_WAITOK); 2674 error = vn_getcwd(buf, &retbuf, &buflen); 2675 if (error == 0) 2676 error = copyout(retbuf, uap->buf, buflen); 2677 uma_zfree(namei_zone, buf); 2678 return (error); 2679 } 2680 2681 int 2682 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2683 { 2684 struct pwd *pwd; 2685 int error; 2686 2687 vfs_smr_enter(); 2688 pwd = pwd_get_smr(); 2689 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2690 buflen, false, 0); 2691 VFS_SMR_ASSERT_NOT_ENTERED(); 2692 if (error < 0) { 2693 pwd = pwd_hold(curthread); 2694 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2695 retbuf, buflen); 2696 pwd_drop(pwd); 2697 } 2698 2699 #ifdef KTRACE 2700 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2701 ktrnamei(*retbuf); 2702 #endif 2703 return (error); 2704 } 2705 2706 static int 2707 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2708 size_t size, int flags, enum uio_seg pathseg) 2709 { 2710 struct nameidata nd; 2711 char *retbuf, *freebuf; 2712 int error; 2713 2714 if (flags != 0) 2715 return (EINVAL); 2716 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2717 pathseg, path, fd, &cap_fstat_rights, td); 2718 if ((error = namei(&nd)) != 0) 2719 return (error); 2720 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2721 if (error == 0) { 2722 error = copyout(retbuf, buf, size); 2723 free(freebuf, M_TEMP); 2724 } 2725 NDFREE(&nd, 0); 2726 return (error); 2727 } 2728 2729 int 2730 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2731 { 2732 2733 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2734 uap->flags, UIO_USERSPACE)); 2735 } 2736 2737 /* 2738 * Retrieve the full filesystem path that corresponds to a vnode from the name 2739 * cache (if available). 2740 */ 2741 int 2742 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2743 { 2744 struct pwd *pwd; 2745 char *buf; 2746 size_t buflen; 2747 int error; 2748 2749 if (__predict_false(vp == NULL)) 2750 return (EINVAL); 2751 2752 buflen = MAXPATHLEN; 2753 buf = malloc(buflen, M_TEMP, M_WAITOK); 2754 vfs_smr_enter(); 2755 pwd = pwd_get_smr(); 2756 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0); 2757 VFS_SMR_ASSERT_NOT_ENTERED(); 2758 if (error < 0) { 2759 pwd = pwd_hold(curthread); 2760 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); 2761 pwd_drop(pwd); 2762 } 2763 if (error == 0) 2764 *freebuf = buf; 2765 else 2766 free(buf, M_TEMP); 2767 return (error); 2768 } 2769 2770 /* 2771 * This function is similar to vn_fullpath, but it attempts to look up the 2772 * pathname relative to the global root mount point. This is required for the 2773 * auditing sub-system, as audited pathnames must be absolute, relative to the 2774 * global root mount point.
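*
* Illustrative use (mirroring vn_path_to_global_path() below); the result
* points into a separately allocated buffer which the caller must free:
*
*	error = vn_fullpath_global(vp, &rpath, &fbuf);
*	if (error == 0) {
*		... use rpath ...
*		free(fbuf, M_TEMP);
*	}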
2775 */ 2776 int 2777 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2778 { 2779 char *buf; 2780 size_t buflen; 2781 int error; 2782 2783 if (__predict_false(vp == NULL)) 2784 return (EINVAL); 2785 buflen = MAXPATHLEN; 2786 buf = malloc(buflen, M_TEMP, M_WAITOK); 2787 vfs_smr_enter(); 2788 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0); 2789 VFS_SMR_ASSERT_NOT_ENTERED(); 2790 if (error < 0) { 2791 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2792 } 2793 if (error == 0) 2794 *freebuf = buf; 2795 else 2796 free(buf, M_TEMP); 2797 return (error); 2798 } 2799 2800 static struct namecache * 2801 vn_dd_from_dst(struct vnode *vp) 2802 { 2803 struct namecache *ncp; 2804 2805 cache_assert_vnode_locked(vp); 2806 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2807 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2808 return (ncp); 2809 } 2810 return (NULL); 2811 } 2812 2813 int 2814 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) 2815 { 2816 struct vnode *dvp; 2817 struct namecache *ncp; 2818 struct mtx *vlp; 2819 int error; 2820 2821 vlp = VP2VNODELOCK(*vp); 2822 mtx_lock(vlp); 2823 ncp = (*vp)->v_cache_dd; 2824 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 2825 KASSERT(ncp == vn_dd_from_dst(*vp), 2826 ("%s: mismatch for dd entry (%p != %p)", __func__, 2827 ncp, vn_dd_from_dst(*vp))); 2828 } else { 2829 ncp = vn_dd_from_dst(*vp); 2830 } 2831 if (ncp != NULL) { 2832 if (*buflen < ncp->nc_nlen) { 2833 mtx_unlock(vlp); 2834 vrele(*vp); 2835 counter_u64_add(numfullpathfail4, 1); 2836 error = ENOMEM; 2837 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2838 vp, NULL); 2839 return (error); 2840 } 2841 *buflen -= ncp->nc_nlen; 2842 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2843 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2844 ncp->nc_name, vp); 2845 dvp = *vp; 2846 *vp = ncp->nc_dvp; 2847 vref(*vp); 2848 mtx_unlock(vlp); 2849 vrele(dvp); 2850 return (0); 2851 } 2852 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2853 2854 mtx_unlock(vlp); 2855 vn_lock(*vp, LK_SHARED | LK_RETRY); 2856 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); 2857 vput(*vp); 2858 if (error) { 2859 counter_u64_add(numfullpathfail2, 1); 2860 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2861 return (error); 2862 } 2863 2864 *vp = dvp; 2865 if (VN_IS_DOOMED(dvp)) { 2866 /* forced unmount */ 2867 vrele(dvp); 2868 error = ENOENT; 2869 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2870 return (error); 2871 } 2872 /* 2873 * *vp has its use count incremented still. 2874 */ 2875 2876 return (0); 2877 } 2878 2879 /* 2880 * Resolve a directory to a pathname. 2881 * 2882 * The name of the directory can always be found in the namecache or fetched 2883 * from the filesystem. There is also guaranteed to be only one parent, meaning 2884 * we can just follow vnodes up until we find the root. 2885 * 2886 * The vnode must be referenced. 
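* The reference is always consumed: the vnode is released on both the
* success and the error paths.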
2887 */ 2888 static int 2889 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2890 size_t *len, bool slash_prefixed, size_t addend) 2891 { 2892 #ifdef KDTRACE_HOOKS 2893 struct vnode *startvp = vp; 2894 #endif 2895 struct vnode *vp1; 2896 size_t buflen; 2897 int error; 2898 2899 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2900 VNPASS(vp->v_usecount > 0, vp); 2901 2902 buflen = *len; 2903 2904 if (!slash_prefixed) { 2905 MPASS(*len >= 2); 2906 buflen--; 2907 buf[buflen] = '\0'; 2908 } 2909 2910 error = 0; 2911 2912 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2913 counter_u64_add(numfullpathcalls, 1); 2914 while (vp != rdir && vp != rootvnode) { 2915 /* 2916 * The vp vnode must be already fully constructed, 2917 * since it is either found in namecache or obtained 2918 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2919 * without obtaining the vnode lock. 2920 */ 2921 if ((vp->v_vflag & VV_ROOT) != 0) { 2922 vn_lock(vp, LK_RETRY | LK_SHARED); 2923 2924 /* 2925 * With the vnode locked, check for races with 2926 * unmount, forced or not. Note that we 2927 * already verified that vp is not equal to 2928 * the root vnode, which means that 2929 * mnt_vnodecovered can be NULL only for the 2930 * case of unmount. 2931 */ 2932 if (VN_IS_DOOMED(vp) || 2933 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2934 vp1->v_mountedhere != vp->v_mount) { 2935 vput(vp); 2936 error = ENOENT; 2937 SDT_PROBE3(vfs, namecache, fullpath, return, 2938 error, vp, NULL); 2939 break; 2940 } 2941 2942 vref(vp1); 2943 vput(vp); 2944 vp = vp1; 2945 continue; 2946 } 2947 if (vp->v_type != VDIR) { 2948 vrele(vp); 2949 counter_u64_add(numfullpathfail1, 1); 2950 error = ENOTDIR; 2951 SDT_PROBE3(vfs, namecache, fullpath, return, 2952 error, vp, NULL); 2953 break; 2954 } 2955 error = vn_vptocnp(&vp, buf, &buflen); 2956 if (error) 2957 break; 2958 if (buflen == 0) { 2959 vrele(vp); 2960 error = ENOMEM; 2961 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2962 startvp, NULL); 2963 break; 2964 } 2965 buf[--buflen] = '/'; 2966 slash_prefixed = true; 2967 } 2968 if (error) 2969 return (error); 2970 if (!slash_prefixed) { 2971 if (buflen == 0) { 2972 vrele(vp); 2973 counter_u64_add(numfullpathfail4, 1); 2974 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2975 startvp, NULL); 2976 return (ENOMEM); 2977 } 2978 buf[--buflen] = '/'; 2979 } 2980 counter_u64_add(numfullpathfound, 1); 2981 vrele(vp); 2982 2983 *retbuf = buf + buflen; 2984 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2985 *len -= buflen; 2986 *len += addend; 2987 return (0); 2988 } 2989 2990 /* 2991 * Resolve an arbitrary vnode to a pathname. 
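* (The SMR variant below signals with a negative return value that the
* walk could not be completed locklessly and the caller should retry with
* the locked variant.)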
2992 * 2993 * Note 2 caveats: 2994 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2995 * resolve to a different path than the one used to find it 2996 * - namecache is not mandatory, meaning names are not guaranteed to be added 2997 * (in which case resolving fails) 2998 */ 2999 static void __inline 3000 cache_rev_failed_impl(int *reason, int line) 3001 { 3002 3003 *reason = line; 3004 } 3005 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 3006 3007 static int 3008 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 3009 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend) 3010 { 3011 #ifdef KDTRACE_HOOKS 3012 struct vnode *startvp = vp; 3013 #endif 3014 struct vnode *tvp; 3015 struct mount *mp; 3016 struct namecache *ncp; 3017 size_t orig_buflen; 3018 int reason; 3019 int error; 3020 #ifdef KDTRACE_HOOKS 3021 int i; 3022 #endif 3023 seqc_t vp_seqc, tvp_seqc; 3024 u_char nc_flag; 3025 3026 VFS_SMR_ASSERT_ENTERED(); 3027 3028 if (!cache_fast_revlookup) { 3029 vfs_smr_exit(); 3030 return (-1); 3031 } 3032 3033 orig_buflen = *buflen; 3034 3035 if (!slash_prefixed) { 3036 MPASS(*buflen >= 2); 3037 *buflen -= 1; 3038 buf[*buflen] = '\0'; 3039 } 3040 3041 if (vp == rdir || vp == rootvnode) { 3042 if (!slash_prefixed) { 3043 *buflen -= 1; 3044 buf[*buflen] = '/'; 3045 } 3046 goto out_ok; 3047 } 3048 3049 #ifdef KDTRACE_HOOKS 3050 i = 0; 3051 #endif 3052 error = -1; 3053 ncp = NULL; /* for sdt probe down below */ 3054 vp_seqc = vn_seqc_read_any(vp); 3055 if (seqc_in_modify(vp_seqc)) { 3056 cache_rev_failed(&reason); 3057 goto out_abort; 3058 } 3059 3060 for (;;) { 3061 #ifdef KDTRACE_HOOKS 3062 i++; 3063 #endif 3064 if ((vp->v_vflag & VV_ROOT) != 0) { 3065 mp = atomic_load_ptr(&vp->v_mount); 3066 if (mp == NULL) { 3067 cache_rev_failed(&reason); 3068 goto out_abort; 3069 } 3070 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 3071 tvp_seqc = vn_seqc_read_any(tvp); 3072 if (seqc_in_modify(tvp_seqc)) { 3073 cache_rev_failed(&reason); 3074 goto out_abort; 3075 } 3076 if (!vn_seqc_consistent(vp, vp_seqc)) { 3077 cache_rev_failed(&reason); 3078 goto out_abort; 3079 } 3080 vp = tvp; 3081 vp_seqc = tvp_seqc; 3082 continue; 3083 } 3084 ncp = atomic_load_ptr(&vp->v_cache_dd); 3085 if (ncp == NULL) { 3086 cache_rev_failed(&reason); 3087 goto out_abort; 3088 } 3089 nc_flag = atomic_load_char(&ncp->nc_flag); 3090 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3091 cache_rev_failed(&reason); 3092 goto out_abort; 3093 } 3094 if (!cache_ncp_canuse(ncp)) { 3095 cache_rev_failed(&reason); 3096 goto out_abort; 3097 } 3098 if (ncp->nc_nlen >= *buflen) { 3099 cache_rev_failed(&reason); 3100 error = ENOMEM; 3101 goto out_abort; 3102 } 3103 *buflen -= ncp->nc_nlen; 3104 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3105 *buflen -= 1; 3106 buf[*buflen] = '/'; 3107 tvp = ncp->nc_dvp; 3108 tvp_seqc = vn_seqc_read_any(tvp); 3109 if (seqc_in_modify(tvp_seqc)) { 3110 cache_rev_failed(&reason); 3111 goto out_abort; 3112 } 3113 if (!vn_seqc_consistent(vp, vp_seqc)) { 3114 cache_rev_failed(&reason); 3115 goto out_abort; 3116 } 3117 vp = tvp; 3118 vp_seqc = tvp_seqc; 3119 if (vp == rdir || vp == rootvnode) 3120 break; 3121 } 3122 out_ok: 3123 vfs_smr_exit(); 3124 *retbuf = buf + *buflen; 3125 *buflen = orig_buflen - *buflen + addend; 3126 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 3127 return (0); 3128 3129 out_abort: 3130 *buflen = orig_buflen; 3131 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 3132 
vfs_smr_exit(); 3133 return (error); 3134 } 3135 3136 static int 3137 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3138 size_t *buflen) 3139 { 3140 size_t orig_buflen; 3141 bool slash_prefixed; 3142 int error; 3143 3144 if (*buflen < 2) 3145 return (EINVAL); 3146 3147 orig_buflen = *buflen; 3148 3149 vref(vp); 3150 slash_prefixed = false; 3151 if (vp->v_type != VDIR) { 3152 *buflen -= 1; 3153 buf[*buflen] = '\0'; 3154 error = vn_vptocnp(&vp, buf, buflen); 3155 if (error) 3156 return (error); 3157 if (*buflen == 0) { 3158 vrele(vp); 3159 return (ENOMEM); 3160 } 3161 *buflen -= 1; 3162 buf[*buflen] = '/'; 3163 slash_prefixed = true; 3164 } 3165 3166 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed, 3167 orig_buflen - *buflen)); 3168 } 3169 3170 /* 3171 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 3172 * 3173 * Since the namecache does not track hardlinks, the caller is expected to first 3174 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 3175 * 3176 * Then we have 2 cases: 3177 * - if the found vnode is a directory, the path can be constructed just by 3178 * following names up the chain 3179 * - otherwise we populate the buffer with the saved name and start resolving 3180 * from the parent 3181 */ 3182 static int 3183 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 3184 size_t *buflen) 3185 { 3186 char *buf, *tmpbuf; 3187 struct pwd *pwd; 3188 struct componentname *cnp; 3189 struct vnode *vp; 3190 size_t addend; 3191 int error; 3192 bool slash_prefixed; 3193 enum vtype type; 3194 3195 if (*buflen < 2) 3196 return (EINVAL); 3197 if (*buflen > MAXPATHLEN) 3198 *buflen = MAXPATHLEN; 3199 3200 slash_prefixed = false; 3201 3202 buf = malloc(*buflen, M_TEMP, M_WAITOK); 3203 3204 addend = 0; 3205 vp = ndp->ni_vp; 3206 /* 3207 * Check for VBAD to work around the vp_crossmp bug in lookup(). 3208 * 3209 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 3210 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 3211 * If the type is VDIR (like in this very case) we can skip looking 3212 * at ni_dvp in the first place. However, since vnodes get passed here 3213 * unlocked the target may transition to doomed state (type == VBAD) 3214 * before we get to evaluate the condition. If this happens, we will 3215 * populate part of the buffer and descend to vn_fullpath_dir with 3216 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 3217 * 3218 * This should be atomic_load(&vp->v_type) but it is illegal to take 3219 * an address of a bit field, even if said field is sized to char. 3220 * Work around the problem by reading the value into a full-sized enum 3221 * and then re-reading it with atomic_load which will still prevent 3222 * the compiler from re-reading down the road.
3223 */ 3224 type = vp->v_type; 3225 type = atomic_load_int(&type); 3226 if (type == VBAD) { 3227 error = ENOENT; 3228 goto out_bad; 3229 } 3230 if (type != VDIR) { 3231 cnp = &ndp->ni_cnd; 3232 addend = cnp->cn_namelen + 2; 3233 if (*buflen < addend) { 3234 error = ENOMEM; 3235 goto out_bad; 3236 } 3237 *buflen -= addend; 3238 tmpbuf = buf + *buflen; 3239 tmpbuf[0] = '/'; 3240 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 3241 tmpbuf[addend - 1] = '\0'; 3242 slash_prefixed = true; 3243 vp = ndp->ni_dvp; 3244 } 3245 3246 vfs_smr_enter(); 3247 pwd = pwd_get_smr(); 3248 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3249 slash_prefixed, addend); 3250 VFS_SMR_ASSERT_NOT_ENTERED(); 3251 if (error < 0) { 3252 pwd = pwd_hold(curthread); 3253 vref(vp); 3254 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3255 slash_prefixed, addend); 3256 pwd_drop(pwd); 3257 if (error != 0) 3258 goto out_bad; 3259 } 3260 3261 *freebuf = buf; 3262 3263 return (0); 3264 out_bad: 3265 free(buf, M_TEMP); 3266 return (error); 3267 } 3268 3269 struct vnode * 3270 vn_dir_dd_ino(struct vnode *vp) 3271 { 3272 struct namecache *ncp; 3273 struct vnode *ddvp; 3274 struct mtx *vlp; 3275 enum vgetstate vs; 3276 3277 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3278 vlp = VP2VNODELOCK(vp); 3279 mtx_lock(vlp); 3280 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3281 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3282 continue; 3283 ddvp = ncp->nc_dvp; 3284 vs = vget_prep(ddvp); 3285 mtx_unlock(vlp); 3286 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3287 return (NULL); 3288 return (ddvp); 3289 } 3290 mtx_unlock(vlp); 3291 return (NULL); 3292 } 3293 3294 int 3295 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3296 { 3297 struct namecache *ncp; 3298 struct mtx *vlp; 3299 int l; 3300 3301 vlp = VP2VNODELOCK(vp); 3302 mtx_lock(vlp); 3303 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3304 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3305 break; 3306 if (ncp == NULL) { 3307 mtx_unlock(vlp); 3308 return (ENOENT); 3309 } 3310 l = min(ncp->nc_nlen, buflen - 1); 3311 memcpy(buf, ncp->nc_name, l); 3312 mtx_unlock(vlp); 3313 buf[l] = '\0'; 3314 return (0); 3315 } 3316 3317 /* 3318 * This function updates path string to vnode's full global path 3319 * and checks the size of the new path string against the pathlen argument. 3320 * 3321 * Requires a locked, referenced vnode. 3322 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3323 * 3324 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3325 * because it falls back to the ".." lookup if the namecache lookup fails. 3326 */ 3327 int 3328 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3329 u_int pathlen) 3330 { 3331 struct nameidata nd; 3332 struct vnode *vp1; 3333 char *rpath, *fbuf; 3334 int error; 3335 3336 ASSERT_VOP_ELOCKED(vp, __func__); 3337 3338 /* Construct global filesystem path from vp. */ 3339 VOP_UNLOCK(vp); 3340 error = vn_fullpath_global(vp, &rpath, &fbuf); 3341 3342 if (error != 0) { 3343 vrele(vp); 3344 return (error); 3345 } 3346 3347 if (strlen(rpath) >= pathlen) { 3348 vrele(vp); 3349 error = ENAMETOOLONG; 3350 goto out; 3351 } 3352 3353 /* 3354 * Re-lookup the vnode by path to detect a possible rename. 3355 * As a side effect, the vnode is relocked. 3356 * If vnode was renamed, return ENOENT. 
3357 */ 3358 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3359 UIO_SYSSPACE, path, td); 3360 error = namei(&nd); 3361 if (error != 0) { 3362 vrele(vp); 3363 goto out; 3364 } 3365 NDFREE(&nd, NDF_ONLY_PNBUF); 3366 vp1 = nd.ni_vp; 3367 vrele(vp); 3368 if (vp1 == vp) 3369 strcpy(path, rpath); 3370 else { 3371 vput(vp1); 3372 error = ENOENT; 3373 } 3374 3375 out: 3376 free(fbuf, M_TEMP); 3377 return (error); 3378 } 3379 3380 #ifdef DDB 3381 static void 3382 db_print_vpath(struct vnode *vp) 3383 { 3384 3385 while (vp != NULL) { 3386 db_printf("%p: ", vp); 3387 if (vp == rootvnode) { 3388 db_printf("/"); 3389 vp = NULL; 3390 } else { 3391 if (vp->v_vflag & VV_ROOT) { 3392 db_printf("<mount point>"); 3393 vp = vp->v_mount->mnt_vnodecovered; 3394 } else { 3395 struct namecache *ncp; 3396 char *ncn; 3397 int i; 3398 3399 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3400 if (ncp != NULL) { 3401 ncn = ncp->nc_name; 3402 for (i = 0; i < ncp->nc_nlen; i++) 3403 db_printf("%c", *ncn++); 3404 vp = ncp->nc_dvp; 3405 } else { 3406 vp = NULL; 3407 } 3408 } 3409 } 3410 db_printf("\n"); 3411 } 3412 3413 return; 3414 } 3415 3416 DB_SHOW_COMMAND(vpath, db_show_vpath) 3417 { 3418 struct vnode *vp; 3419 3420 if (!have_addr) { 3421 db_printf("usage: show vpath <struct vnode *>\n"); 3422 return; 3423 } 3424 3425 vp = (struct vnode *)addr; 3426 db_print_vpath(vp); 3427 } 3428 3429 #endif 3430 3431 static bool __read_frequently cache_fast_lookup = true; 3432 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3433 &cache_fast_lookup, 0, ""); 3434 3435 #define CACHE_FPL_FAILED -2020 3436 3437 static void 3438 cache_fpl_cleanup_cnp(struct componentname *cnp) 3439 { 3440 3441 uma_zfree(namei_zone, cnp->cn_pnbuf); 3442 #ifdef DIAGNOSTIC 3443 cnp->cn_pnbuf = NULL; 3444 cnp->cn_nameptr = NULL; 3445 #endif 3446 } 3447 3448 static void 3449 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3450 { 3451 struct componentname *cnp; 3452 3453 cnp = &ndp->ni_cnd; 3454 while (*(cnp->cn_nameptr) == '/') { 3455 cnp->cn_nameptr++; 3456 ndp->ni_pathlen--; 3457 } 3458 3459 *dpp = ndp->ni_rootdir; 3460 } 3461 3462 /* 3463 * Components of nameidata (or objects it can point to) which may 3464 * need restoring in case fast path lookup fails. 
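* See cache_fpl_checkpoint() and cache_fpl_restore() below.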
3465 */ 3466 struct nameidata_saved { 3467 long cn_namelen; 3468 char *cn_nameptr; 3469 size_t ni_pathlen; 3470 int cn_flags; 3471 }; 3472 3473 struct cache_fpl { 3474 struct nameidata *ndp; 3475 struct componentname *cnp; 3476 struct pwd *pwd; 3477 struct vnode *dvp; 3478 struct vnode *tvp; 3479 seqc_t dvp_seqc; 3480 seqc_t tvp_seqc; 3481 struct nameidata_saved snd; 3482 int line; 3483 enum cache_fpl_status status:8; 3484 bool in_smr; 3485 bool fsearch; 3486 }; 3487 3488 static void 3489 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3490 { 3491 3492 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3493 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3494 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3495 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3496 } 3497 3498 static void 3499 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3500 { 3501 3502 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3503 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3504 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3505 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3506 } 3507 3508 #ifdef INVARIANTS 3509 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3510 struct cache_fpl *_fpl = (fpl); \ 3511 MPASS(_fpl->in_smr == true); \ 3512 VFS_SMR_ASSERT_ENTERED(); \ 3513 }) 3514 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3515 struct cache_fpl *_fpl = (fpl); \ 3516 MPASS(_fpl->in_smr == false); \ 3517 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3518 }) 3519 #else 3520 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3521 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3522 #endif 3523 3524 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3525 struct cache_fpl *_fpl = (fpl); \ 3526 vfs_smr_enter(); \ 3527 _fpl->in_smr = true; \ 3528 }) 3529 3530 #define cache_fpl_smr_enter(fpl) ({ \ 3531 struct cache_fpl *_fpl = (fpl); \ 3532 MPASS(_fpl->in_smr == false); \ 3533 vfs_smr_enter(); \ 3534 _fpl->in_smr = true; \ 3535 }) 3536 3537 #define cache_fpl_smr_exit(fpl) ({ \ 3538 struct cache_fpl *_fpl = (fpl); \ 3539 MPASS(_fpl->in_smr == true); \ 3540 vfs_smr_exit(); \ 3541 _fpl->in_smr = false; \ 3542 }) 3543 3544 static int 3545 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3546 { 3547 3548 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3549 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3550 ("%s: converting to abort from %d at %d, set at %d\n", 3551 __func__, fpl->status, line, fpl->line)); 3552 } 3553 fpl->status = CACHE_FPL_STATUS_ABORTED; 3554 fpl->line = line; 3555 return (CACHE_FPL_FAILED); 3556 } 3557 3558 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3559 3560 static int 3561 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3562 { 3563 3564 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3565 ("%s: setting to partial at %d, but already set to %d at %d\n", 3566 __func__, line, fpl->status, fpl->line)); 3567 cache_fpl_smr_assert_entered(fpl); 3568 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3569 fpl->line = line; 3570 return (CACHE_FPL_FAILED); 3571 } 3572 3573 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3574 3575 static int 3576 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3577 { 3578 3579 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3580 ("%s: setting to handled at %d, but already set to %d at %d\n", 3581 __func__, line, fpl->status, fpl->line)); 3582 cache_fpl_smr_assert_not_entered(fpl); 3583 MPASS(error != CACHE_FPL_FAILED); 3584 fpl->status = CACHE_FPL_STATUS_HANDLED; 3585 fpl->line = line; 
3586 return (error); 3587 } 3588 3589 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3590 3591 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3592 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ 3593 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3594 3595 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3596 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3597 3598 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3599 "supported and internal flags overlap"); 3600 3601 static bool 3602 cache_fpl_islastcn(struct nameidata *ndp) 3603 { 3604 3605 return (*ndp->ni_next == 0); 3606 } 3607 3608 static bool 3609 cache_fpl_isdotdot(struct componentname *cnp) 3610 { 3611 3612 if (cnp->cn_namelen == 2 && 3613 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3614 return (true); 3615 return (false); 3616 } 3617 3618 static bool 3619 cache_can_fplookup(struct cache_fpl *fpl) 3620 { 3621 struct nameidata *ndp; 3622 struct componentname *cnp; 3623 struct thread *td; 3624 3625 ndp = fpl->ndp; 3626 cnp = fpl->cnp; 3627 td = cnp->cn_thread; 3628 3629 if (!cache_fast_lookup) { 3630 cache_fpl_aborted(fpl); 3631 return (false); 3632 } 3633 #ifdef MAC 3634 if (mac_vnode_check_lookup_enabled()) { 3635 cache_fpl_aborted(fpl); 3636 return (false); 3637 } 3638 #endif 3639 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3640 cache_fpl_aborted(fpl); 3641 return (false); 3642 } 3643 if (IN_CAPABILITY_MODE(td)) { 3644 cache_fpl_aborted(fpl); 3645 return (false); 3646 } 3647 if (AUDITING_TD(td)) { 3648 cache_fpl_aborted(fpl); 3649 return (false); 3650 } 3651 if (ndp->ni_startdir != NULL) { 3652 cache_fpl_aborted(fpl); 3653 return (false); 3654 } 3655 return (true); 3656 } 3657 3658 static int 3659 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 3660 { 3661 struct nameidata *ndp; 3662 int error; 3663 bool fsearch; 3664 3665 ndp = fpl->ndp; 3666 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); 3667 if (__predict_false(error != 0)) { 3668 cache_fpl_smr_exit(fpl); 3669 return (cache_fpl_aborted(fpl)); 3670 } 3671 fpl->fsearch = fsearch; 3672 return (0); 3673 } 3674 3675 static bool 3676 cache_fplookup_vnode_supported(struct vnode *vp) 3677 { 3678 3679 return (vp->v_type != VLNK); 3680 } 3681 3682 static int __noinline 3683 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3684 uint32_t hash) 3685 { 3686 struct componentname *cnp; 3687 struct vnode *dvp; 3688 3689 cnp = fpl->cnp; 3690 dvp = fpl->dvp; 3691 3692 cache_fpl_smr_exit(fpl); 3693 if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) 3694 return (cache_fpl_handled(fpl, ENOENT)); 3695 else 3696 return (cache_fpl_aborted(fpl)); 3697 } 3698 3699 /* 3700 * The target vnode is not supported, prepare for the slow path to take over. 
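*
* This takes real references on the pwd and dvp, restores the nameidata
* state saved at the start of the lookup and sets ni_startdir, allowing
* the regular (locked) lookup to resume from where the fast path stopped.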
3701 */ 3702 static int __noinline 3703 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3704 { 3705 struct nameidata *ndp; 3706 struct componentname *cnp; 3707 enum vgetstate dvs; 3708 struct vnode *dvp; 3709 struct pwd *pwd; 3710 seqc_t dvp_seqc; 3711 3712 ndp = fpl->ndp; 3713 cnp = fpl->cnp; 3714 pwd = fpl->pwd; 3715 dvp = fpl->dvp; 3716 dvp_seqc = fpl->dvp_seqc; 3717 3718 if (!pwd_hold_smr(pwd)) { 3719 cache_fpl_smr_exit(fpl); 3720 return (cache_fpl_aborted(fpl)); 3721 } 3722 3723 dvs = vget_prep_smr(dvp); 3724 cache_fpl_smr_exit(fpl); 3725 if (__predict_false(dvs == VGET_NONE)) { 3726 pwd_drop(pwd); 3727 return (cache_fpl_aborted(fpl)); 3728 } 3729 3730 vget_finish_ref(dvp, dvs); 3731 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3732 vrele(dvp); 3733 pwd_drop(pwd); 3734 return (cache_fpl_aborted(fpl)); 3735 } 3736 3737 cache_fpl_restore(fpl, &fpl->snd); 3738 3739 ndp->ni_startdir = dvp; 3740 cnp->cn_flags |= MAKEENTRY; 3741 if (cache_fpl_islastcn(ndp)) 3742 cnp->cn_flags |= ISLASTCN; 3743 if (cache_fpl_isdotdot(cnp)) 3744 cnp->cn_flags |= ISDOTDOT; 3745 3746 return (0); 3747 } 3748 3749 static int 3750 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3751 { 3752 struct componentname *cnp; 3753 struct vnode *tvp; 3754 seqc_t tvp_seqc; 3755 int error, lkflags; 3756 3757 cnp = fpl->cnp; 3758 tvp = fpl->tvp; 3759 tvp_seqc = fpl->tvp_seqc; 3760 3761 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3762 lkflags = LK_SHARED; 3763 if ((cnp->cn_flags & LOCKSHARED) == 0) 3764 lkflags = LK_EXCLUSIVE; 3765 error = vget_finish(tvp, lkflags, tvs); 3766 if (__predict_false(error != 0)) { 3767 return (cache_fpl_aborted(fpl)); 3768 } 3769 } else { 3770 vget_finish_ref(tvp, tvs); 3771 } 3772 3773 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3774 if ((cnp->cn_flags & LOCKLEAF) != 0) 3775 vput(tvp); 3776 else 3777 vrele(tvp); 3778 return (cache_fpl_aborted(fpl)); 3779 } 3780 3781 return (cache_fpl_handled(fpl, 0)); 3782 } 3783 3784 /* 3785 * They want to possibly modify the state of the namecache. 3786 * 3787 * Don't try to match the API contract, just leave. 3788 * TODO: this leaves scalability on the table 3789 */ 3790 static int 3791 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3792 { 3793 struct componentname *cnp; 3794 3795 cnp = fpl->cnp; 3796 MPASS(cnp->cn_nameiop != LOOKUP); 3797 return (cache_fpl_partial(fpl)); 3798 } 3799 3800 static int __noinline 3801 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3802 { 3803 struct componentname *cnp; 3804 enum vgetstate dvs, tvs; 3805 struct vnode *dvp, *tvp; 3806 seqc_t dvp_seqc; 3807 int error; 3808 3809 cnp = fpl->cnp; 3810 dvp = fpl->dvp; 3811 dvp_seqc = fpl->dvp_seqc; 3812 tvp = fpl->tvp; 3813 3814 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3815 3816 /* 3817 * This is less efficient than it can be for simplicity. 
3818 */ 3819 dvs = vget_prep_smr(dvp); 3820 if (__predict_false(dvs == VGET_NONE)) { 3821 return (cache_fpl_aborted(fpl)); 3822 } 3823 tvs = vget_prep_smr(tvp); 3824 if (__predict_false(tvs == VGET_NONE)) { 3825 cache_fpl_smr_exit(fpl); 3826 vget_abort(dvp, dvs); 3827 return (cache_fpl_aborted(fpl)); 3828 } 3829 3830 cache_fpl_smr_exit(fpl); 3831 3832 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3833 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3834 if (__predict_false(error != 0)) { 3835 vget_abort(tvp, tvs); 3836 return (cache_fpl_aborted(fpl)); 3837 } 3838 } else { 3839 vget_finish_ref(dvp, dvs); 3840 } 3841 3842 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3843 vget_abort(tvp, tvs); 3844 if ((cnp->cn_flags & LOCKPARENT) != 0) 3845 vput(dvp); 3846 else 3847 vrele(dvp); 3848 return (cache_fpl_aborted(fpl)); 3849 } 3850 3851 error = cache_fplookup_final_child(fpl, tvs); 3852 if (__predict_false(error != 0)) { 3853 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3854 if ((cnp->cn_flags & LOCKPARENT) != 0) 3855 vput(dvp); 3856 else 3857 vrele(dvp); 3858 return (error); 3859 } 3860 3861 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3862 return (0); 3863 } 3864 3865 static int 3866 cache_fplookup_final(struct cache_fpl *fpl) 3867 { 3868 struct componentname *cnp; 3869 enum vgetstate tvs; 3870 struct vnode *dvp, *tvp; 3871 seqc_t dvp_seqc; 3872 3873 cnp = fpl->cnp; 3874 dvp = fpl->dvp; 3875 dvp_seqc = fpl->dvp_seqc; 3876 tvp = fpl->tvp; 3877 3878 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3879 3880 if (cnp->cn_nameiop != LOOKUP) { 3881 return (cache_fplookup_final_modifying(fpl)); 3882 } 3883 3884 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3885 return (cache_fplookup_final_withparent(fpl)); 3886 3887 tvs = vget_prep_smr(tvp); 3888 if (__predict_false(tvs == VGET_NONE)) { 3889 return (cache_fpl_partial(fpl)); 3890 } 3891 3892 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3893 cache_fpl_smr_exit(fpl); 3894 vget_abort(tvp, tvs); 3895 return (cache_fpl_aborted(fpl)); 3896 } 3897 3898 cache_fpl_smr_exit(fpl); 3899 return (cache_fplookup_final_child(fpl, tvs)); 3900 } 3901 3902 static int __noinline 3903 cache_fplookup_dot(struct cache_fpl *fpl) 3904 { 3905 struct vnode *dvp; 3906 3907 dvp = fpl->dvp; 3908 3909 fpl->tvp = dvp; 3910 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3911 if (seqc_in_modify(fpl->tvp_seqc)) { 3912 return (cache_fpl_aborted(fpl)); 3913 } 3914 3915 counter_u64_add(dothits, 1); 3916 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3917 3918 return (0); 3919 } 3920 3921 static int __noinline 3922 cache_fplookup_dotdot(struct cache_fpl *fpl) 3923 { 3924 struct nameidata *ndp; 3925 struct componentname *cnp; 3926 struct namecache *ncp; 3927 struct vnode *dvp; 3928 struct prison *pr; 3929 u_char nc_flag; 3930 3931 ndp = fpl->ndp; 3932 cnp = fpl->cnp; 3933 dvp = fpl->dvp; 3934 3935 /* 3936 * XXX this is racy the same way regular lookup is 3937 */ 3938 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3939 pr = pr->pr_parent) 3940 if (dvp == pr->pr_root) 3941 break; 3942 3943 if (dvp == ndp->ni_rootdir || 3944 dvp == ndp->ni_topdir || 3945 dvp == rootvnode || 3946 pr != NULL) { 3947 fpl->tvp = dvp; 3948 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3949 if (seqc_in_modify(fpl->tvp_seqc)) { 3950 return (cache_fpl_aborted(fpl)); 3951 } 3952 return (0); 3953 } 3954 3955 if ((dvp->v_vflag & VV_ROOT) != 0) { 3956 /* 3957 * TODO 3958 * The opposite of climb mount is needed here. 
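* That is, stepping from the root vnode of this mount to the vnode it
* covers before resolving "..". Until that is implemented the fast path
* bails and the lookup is restarted in the slow path.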
3959 */ 3960 return (cache_fpl_aborted(fpl)); 3961 } 3962 3963 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3964 if (ncp == NULL) { 3965 return (cache_fpl_aborted(fpl)); 3966 } 3967 3968 nc_flag = atomic_load_char(&ncp->nc_flag); 3969 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3970 if ((nc_flag & NCF_NEGATIVE) != 0) 3971 return (cache_fpl_aborted(fpl)); 3972 fpl->tvp = ncp->nc_vp; 3973 } else { 3974 fpl->tvp = ncp->nc_dvp; 3975 } 3976 3977 if (__predict_false(!cache_ncp_canuse(ncp))) { 3978 return (cache_fpl_aborted(fpl)); 3979 } 3980 3981 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 3982 if (seqc_in_modify(fpl->tvp_seqc)) { 3983 return (cache_fpl_partial(fpl)); 3984 } 3985 3986 counter_u64_add(dotdothits, 1); 3987 return (0); 3988 } 3989 3990 static int __noinline 3991 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) 3992 { 3993 u_char nc_flag; 3994 bool neg_promote; 3995 3996 nc_flag = atomic_load_char(&ncp->nc_flag); 3997 MPASS((nc_flag & NCF_NEGATIVE) != 0); 3998 /* 3999 * If they want to create an entry we need to replace this one. 4000 */ 4001 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 4002 /* 4003 * TODO 4004 * This should call something similar to 4005 * cache_fplookup_final_modifying. 4006 */ 4007 return (cache_fpl_partial(fpl)); 4008 } 4009 neg_promote = cache_neg_hit_prep(ncp); 4010 if (__predict_false(!cache_ncp_canuse(ncp))) { 4011 cache_neg_hit_abort(ncp); 4012 return (cache_fpl_partial(fpl)); 4013 } 4014 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 4015 cache_neg_hit_abort(ncp); 4016 return (cache_fpl_partial(fpl)); 4017 } 4018 if (neg_promote) { 4019 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 4020 } 4021 cache_neg_hit_finish(ncp); 4022 cache_fpl_smr_exit(fpl); 4023 return (cache_fpl_handled(fpl, ENOENT)); 4024 } 4025 4026 static int 4027 cache_fplookup_next(struct cache_fpl *fpl) 4028 { 4029 struct componentname *cnp; 4030 struct namecache *ncp; 4031 struct vnode *dvp, *tvp; 4032 u_char nc_flag; 4033 uint32_t hash; 4034 4035 cnp = fpl->cnp; 4036 dvp = fpl->dvp; 4037 4038 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 4039 return (cache_fplookup_dot(fpl)); 4040 } 4041 4042 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 4043 4044 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 4045 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 4046 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 4047 break; 4048 } 4049 4050 /* 4051 * If there is no entry we have to punt to the slow path to perform 4052 * actual lookup. Should there be nothing with this name a negative 4053 * entry will be created. 
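* (The negative entry is created by the filesystem itself, which calls
* cache_enter() with a NULL vnode when the lookup fails and MAKEENTRY is
* set.)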
4054 */ 4055 if (__predict_false(ncp == NULL)) { 4056 return (cache_fpl_partial(fpl)); 4057 } 4058 4059 tvp = atomic_load_ptr(&ncp->nc_vp); 4060 nc_flag = atomic_load_char(&ncp->nc_flag); 4061 if ((nc_flag & NCF_NEGATIVE) != 0) { 4062 return (cache_fplookup_neg(fpl, ncp, hash)); 4063 } 4064 4065 if (__predict_false(!cache_ncp_canuse(ncp))) { 4066 return (cache_fpl_partial(fpl)); 4067 } 4068 4069 fpl->tvp = tvp; 4070 fpl->tvp_seqc = vn_seqc_read_any(tvp); 4071 if (seqc_in_modify(fpl->tvp_seqc)) { 4072 return (cache_fpl_partial(fpl)); 4073 } 4074 4075 if (!cache_fplookup_vnode_supported(tvp)) { 4076 return (cache_fpl_partial(fpl)); 4077 } 4078 4079 counter_u64_add(numposhits, 1); 4080 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 4081 return (0); 4082 } 4083 4084 static bool 4085 cache_fplookup_mp_supported(struct mount *mp) 4086 { 4087 4088 if (mp == NULL) 4089 return (false); 4090 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 4091 return (false); 4092 return (true); 4093 } 4094 4095 /* 4096 * Walk up the mount stack (if any). 4097 * 4098 * Correctness is provided in the following ways: 4099 * - all vnodes are protected from freeing with SMR 4100 * - struct mount objects are type stable, making them always safe to access 4101 * - stability of the particular mount is provided by busying it 4102 * - the relationship between the vnode which is mounted on and the mount is 4103 * verified with the vnode sequence counter after busying 4104 * - the association between the root vnode of the mount and the mount is 4105 * protected by busy 4106 * 4107 * From that point on we can read the sequence counter of the root vnode 4108 * and get the next mount on the stack (if any) using the same protection. 4109 * 4110 * By the end of a successful walk we are guaranteed the reached state was 4111 * indeed present at least at some point, matching the guarantee of the regular lookup.
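 *
 * Condensed sketch of the loop below (illustrative only; error handling,
 * the exit of the previous mount and the MNTK_FPLOOKUP check are omitted):
 *
 *	mp = vp->v_mountedhere;
 *	while (mp != NULL) {
 *		vfs_op_thread_enter_crit(mp);		// pin the mount
 *		if (!vn_seqc_consistent(vp, vp_seqc))	// still covered by mp?
 *			bail();
 *		vp = mp->mnt_rootvnode;			// jump to its root
 *		vp_seqc = vn_seqc_read_any(vp);		// counter for the new vp
 *		mp = vp->v_mountedhere;			// nested mount?
 *	}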
4112 */ 4113 static int __noinline 4114 cache_fplookup_climb_mount(struct cache_fpl *fpl) 4115 { 4116 struct mount *mp, *prev_mp; 4117 struct vnode *vp; 4118 seqc_t vp_seqc; 4119 4120 vp = fpl->tvp; 4121 vp_seqc = fpl->tvp_seqc; 4122 4123 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 4124 mp = atomic_load_ptr(&vp->v_mountedhere); 4125 if (mp == NULL) 4126 return (0); 4127 4128 prev_mp = NULL; 4129 for (;;) { 4130 if (!vfs_op_thread_enter_crit(mp)) { 4131 if (prev_mp != NULL) 4132 vfs_op_thread_exit_crit(prev_mp); 4133 return (cache_fpl_partial(fpl)); 4134 } 4135 if (prev_mp != NULL) 4136 vfs_op_thread_exit_crit(prev_mp); 4137 if (!vn_seqc_consistent(vp, vp_seqc)) { 4138 vfs_op_thread_exit_crit(mp); 4139 return (cache_fpl_partial(fpl)); 4140 } 4141 if (!cache_fplookup_mp_supported(mp)) { 4142 vfs_op_thread_exit_crit(mp); 4143 return (cache_fpl_partial(fpl)); 4144 } 4145 vp = atomic_load_ptr(&mp->mnt_rootvnode); 4146 if (vp == NULL || VN_IS_DOOMED(vp)) { 4147 vfs_op_thread_exit_crit(mp); 4148 return (cache_fpl_partial(fpl)); 4149 } 4150 vp_seqc = vn_seqc_read_any(vp); 4151 if (seqc_in_modify(vp_seqc)) { 4152 vfs_op_thread_exit_crit(mp); 4153 return (cache_fpl_partial(fpl)); 4154 } 4155 prev_mp = mp; 4156 mp = atomic_load_ptr(&vp->v_mountedhere); 4157 if (mp == NULL) 4158 break; 4159 } 4160 4161 vfs_op_thread_exit_crit(prev_mp); 4162 fpl->tvp = vp; 4163 fpl->tvp_seqc = vp_seqc; 4164 return (0); 4165 } 4166 4167 static bool 4168 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 4169 { 4170 struct mount *mp; 4171 struct vnode *vp; 4172 4173 vp = fpl->tvp; 4174 4175 /* 4176 * Hack: while this is a union, the pointer tends to be NULL so save on 4177 * a branch. 4178 */ 4179 mp = atomic_load_ptr(&vp->v_mountedhere); 4180 if (mp == NULL) 4181 return (false); 4182 if (vp->v_type == VDIR) 4183 return (true); 4184 return (false); 4185 } 4186 4187 /* 4188 * Parse the path. 4189 * 4190 * The code was originally copy-pasted from the regular lookup and despite 4191 * cleanups still leaves performance on the table. Any modifications here 4192 * must take into account that in case of fallback the resulting 4193 * nameidata state has to be compatible with the original. 4194 */ 4195 static int 4196 cache_fplookup_parse(struct cache_fpl *fpl) 4197 { 4198 struct nameidata *ndp; 4199 struct componentname *cnp; 4200 char *cp; 4201 4202 ndp = fpl->ndp; 4203 cnp = fpl->cnp; 4204 4205 /* 4206 * Search a new directory. 4207 * 4208 * The last component of the filename is left accessible via 4209 * cnp->cn_nameptr for callers that need the name. Callers needing 4210 * the name set the SAVENAME flag. When done, they assume 4211 * responsibility for freeing the pathname buffer. 4212 */ 4213 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 4214 continue; 4215 cnp->cn_namelen = cp - cnp->cn_nameptr; 4216 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 4217 cache_fpl_smr_exit(fpl); 4218 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 4219 } 4220 ndp->ni_pathlen -= cnp->cn_namelen; 4221 KASSERT(ndp->ni_pathlen <= PATH_MAX, 4222 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 4223 ndp->ni_next = cp; 4224 4225 /* 4226 * Replace multiple slashes by a single slash and trailing slashes 4227 * by a null. This must be done before VOP_LOOKUP() because some 4228 * fs's don't know about trailing slashes. Remember if there were 4229 * trailing slashes to handle symlinks, existing non-directories 4230 * and non-existing files that won't be directories specially later.
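 *
 * (Worked example, not from the original comment: for "a//b" the loop
 * below steps past the redundant '/' so the next component resolves as
 * "b"; for "a/" the component is followed only by trailing slashes, a
 * case the lockless path does not handle and instead punts to the
 * regular lookup, see the TODO in the loop body.)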
4231 */ 4232 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 4233 cp++; 4234 ndp->ni_pathlen--; 4235 if (*cp == '\0') { 4236 /* 4237 * TODO 4238 * Regular lookup performs the following: 4239 * *ndp->ni_next = '\0'; 4240 * cnp->cn_flags |= TRAILINGSLASH; 4241 * 4242 * This is problematic since it modifies data read 4243 * from userspace. Then if the fast path lookup were to 4244 * abort we would have to either restore it or convey 4245 * the flag. Since this is a corner case, just ignore 4246 * it for simplicity. 4247 */ 4248 return (cache_fpl_partial(fpl)); 4249 } 4250 } 4251 ndp->ni_next = cp; 4252 4253 /* 4254 * Check for degenerate name (e.g. / or "") 4255 * which is a way of talking about a directory, 4256 * e.g. like "/." or ".". 4257 * 4258 * TODO 4259 * Another corner case handled by the regular lookup. 4260 */ 4261 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 4262 return (cache_fpl_partial(fpl)); 4263 } 4264 return (0); 4265 } 4266 4267 static void 4268 cache_fplookup_parse_advance(struct cache_fpl *fpl) 4269 { 4270 struct nameidata *ndp; 4271 struct componentname *cnp; 4272 4273 ndp = fpl->ndp; 4274 cnp = fpl->cnp; 4275 4276 cnp->cn_nameptr = ndp->ni_next; 4277 while (*cnp->cn_nameptr == '/') { 4278 cnp->cn_nameptr++; 4279 ndp->ni_pathlen--; 4280 } 4281 } 4282 4283 /* 4284 * See the API contract for VOP_FPLOOKUP_VEXEC. 4285 */ 4286 static int __noinline 4287 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 4288 { 4289 struct componentname *cnp; 4290 struct vnode *dvp; 4291 seqc_t dvp_seqc; 4292 4293 cnp = fpl->cnp; 4294 dvp = fpl->dvp; 4295 dvp_seqc = fpl->dvp_seqc; 4296 4297 /* 4298 * Hack: they may be looking up foo/bar, where foo is a 4299 * regular file. In such a case we need to return ENOTDIR, 4300 * but we may happen to get here with a different error. 4301 */ 4302 if (dvp->v_type != VDIR) { 4303 /* 4304 * The check here is predominantly to catch 4305 * EOPNOTSUPP from dead_vnodeops. If the vnode 4306 * gets doomed past this point it is going to 4307 * fail seqc verification. 4308 */ 4309 if (VN_IS_DOOMED(dvp)) { 4310 return (cache_fpl_aborted(fpl)); 4311 } 4312 error = ENOTDIR; 4313 } 4314 4315 /* 4316 * Hack: handle O_SEARCH. 4317 * 4318 * The Open Group Base Specifications Issue 7, 2018 edition states: 4319 * If the access mode of the open file description associated with the 4320 * file descriptor is not O_SEARCH, the function shall check whether 4321 * directory searches are permitted using the current permissions of 4322 * the directory underlying the file descriptor. If the access mode is 4323 * O_SEARCH, the function shall not perform the check. 4324 * 4325 * Regular lookup tests for the NOEXECCHECK flag for every path 4326 * component to decide whether to do the permission check. However, 4327 * since most lookups never have the flag (and when they do it is only 4328 * present for the first path component), lockless lookup only acts on 4329 * it if there is a permission problem. Here the flag is represented 4330 * with a boolean so that we don't have to clear it on the way out. 4331 * 4332 * For simplicity this always aborts. 4333 * TODO: check if this is the first lookup and ignore the permission 4334 * problem. Note the flag has to survive fallback (if it happens to be 4335 * performed).
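 *
 * (Illustrative example, not from the original comment: a directory is
 * opened with O_SEARCH and later used as dirfd in
 * openat(dirfd, "file", O_RDONLY) even though its mode denies execute
 * permission to the caller. Regular lookup skips the execute check for
 * that first component, so the lockless variant must not fail the
 * lookup either; aborting here hands the decision to the slow path.)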
4336 */ 4337 if (fpl->fsearch) { 4338 return (cache_fpl_aborted(fpl)); 4339 } 4340 4341 switch (error) { 4342 case EAGAIN: 4343 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4344 error = cache_fpl_aborted(fpl); 4345 } else { 4346 cache_fpl_partial(fpl); 4347 } 4348 break; 4349 default: 4350 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4351 error = cache_fpl_aborted(fpl); 4352 } else { 4353 cache_fpl_smr_exit(fpl); 4354 cache_fpl_handled(fpl, error); 4355 } 4356 break; 4357 } 4358 return (error); 4359 } 4360 4361 static int 4362 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 4363 { 4364 struct nameidata *ndp; 4365 struct componentname *cnp; 4366 struct mount *mp; 4367 int error; 4368 4369 error = CACHE_FPL_FAILED; 4370 ndp = fpl->ndp; 4371 cnp = fpl->cnp; 4372 4373 cache_fpl_checkpoint(fpl, &fpl->snd); 4374 4375 fpl->dvp = dvp; 4376 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 4377 if (seqc_in_modify(fpl->dvp_seqc)) { 4378 cache_fpl_aborted(fpl); 4379 goto out; 4380 } 4381 mp = atomic_load_ptr(&fpl->dvp->v_mount); 4382 if (!cache_fplookup_mp_supported(mp)) { 4383 cache_fpl_aborted(fpl); 4384 goto out; 4385 } 4386 4387 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 4388 4389 for (;;) { 4390 error = cache_fplookup_parse(fpl); 4391 if (__predict_false(error != 0)) { 4392 break; 4393 } 4394 4395 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 4396 4397 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); 4398 if (__predict_false(error != 0)) { 4399 error = cache_fplookup_failed_vexec(fpl, error); 4400 break; 4401 } 4402 4403 if (__predict_false(cache_fpl_isdotdot(cnp))) { 4404 error = cache_fplookup_dotdot(fpl); 4405 if (__predict_false(error != 0)) { 4406 break; 4407 } 4408 } else { 4409 error = cache_fplookup_next(fpl); 4410 if (__predict_false(error != 0)) { 4411 break; 4412 } 4413 4414 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 4415 4416 if (cache_fplookup_need_climb_mount(fpl)) { 4417 error = cache_fplookup_climb_mount(fpl); 4418 if (__predict_false(error != 0)) { 4419 break; 4420 } 4421 } 4422 } 4423 4424 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 4425 4426 if (cache_fpl_islastcn(ndp)) { 4427 error = cache_fplookup_final(fpl); 4428 break; 4429 } 4430 4431 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 4432 error = cache_fpl_aborted(fpl); 4433 break; 4434 } 4435 4436 fpl->dvp = fpl->tvp; 4437 fpl->dvp_seqc = fpl->tvp_seqc; 4438 4439 cache_fplookup_parse_advance(fpl); 4440 cache_fpl_checkpoint(fpl, &fpl->snd); 4441 } 4442 out: 4443 switch (fpl->status) { 4444 case CACHE_FPL_STATUS_UNSET: 4445 __assert_unreachable(); 4446 break; 4447 case CACHE_FPL_STATUS_PARTIAL: 4448 cache_fpl_smr_assert_entered(fpl); 4449 return (cache_fplookup_partial_setup(fpl)); 4450 case CACHE_FPL_STATUS_ABORTED: 4451 if (fpl->in_smr) 4452 cache_fpl_smr_exit(fpl); 4453 return (CACHE_FPL_FAILED); 4454 case CACHE_FPL_STATUS_HANDLED: 4455 MPASS(error != CACHE_FPL_FAILED); 4456 cache_fpl_smr_assert_not_entered(fpl); 4457 if (__predict_false(error != 0)) { 4458 ndp->ni_dvp = NULL; 4459 ndp->ni_vp = NULL; 4460 cache_fpl_cleanup_cnp(cnp); 4461 return (error); 4462 } 4463 ndp->ni_dvp = fpl->dvp; 4464 ndp->ni_vp = fpl->tvp; 4465 if (cnp->cn_flags & SAVENAME) 4466 cnp->cn_flags |= HASBUF; 4467 else 4468 cache_fpl_cleanup_cnp(cnp); 4469 return (error); 4470 } 4471 } 4472 4473 /* 4474 * Fast path lookup protected with SMR and sequence counters. 4475 * 4476 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 
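 * (An illustrative, non-authoritative sketch of such a routine, and of the
 * MNTK_FPLOOKUP opt-in, is appended under #if 0 at the end of this file.)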
4477 * 4478 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the criteria 4479 * outlined below. 4480 * 4481 * Traditional vnode lookup conceptually looks like this: 4482 * 4483 * vn_lock(current); 4484 * for (;;) { 4485 * next = find(); 4486 * vn_lock(next); 4487 * vn_unlock(current); 4488 * current = next; 4489 * if (last) 4490 * break; 4491 * } 4492 * return (current); 4493 * 4494 * Each jump to the next vnode is safe memory-wise and atomic with respect to 4495 * any modifications thanks to holding respective locks. 4496 * 4497 * The same guarantee can be provided with a combination of safe memory 4498 * reclamation and sequence counters instead. If all operations which affect 4499 * the relationship between the current vnode and the one we are looking for 4500 * also modify the counter, we can verify whether all the conditions held as 4501 * we made the jump. This includes things like permissions, mount points, etc. 4502 * Counter modification is provided by enclosing relevant places in 4503 * vn_seqc_write_begin()/end() calls. 4504 * 4505 * Thus this translates to: 4506 * 4507 * vfs_smr_enter(); 4508 * dvp_seqc = seqc_read_any(dvp); 4509 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode 4510 * abort(); 4511 * for (;;) { 4512 * tvp = find(); 4513 * tvp_seqc = seqc_read_any(tvp); 4514 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode 4515 * abort(); 4516 * if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode 4517 * abort(); 4518 * dvp = tvp; // we know nothing of importance has changed 4519 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration 4520 * if (last) 4521 * break; 4522 * } 4523 * vget(); // secure the vnode 4524 * if (!seqc_consistent(tvp, tvp_seqc)) // final check 4525 * abort(); 4526 * // at this point we know nothing has changed for any parent<->child pair 4527 * // as they were crossed during the lookup, meaning we matched the guarantee 4528 * // of the locked variant 4529 * return (tvp); 4530 * 4531 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: 4532 * - they are called while within vfs_smr protection, which they must never exit 4533 * - EAGAIN can be returned to denote that the check could not be performed; it is 4534 * always valid to return it 4535 * - if the sequence counter has not changed, the result must be valid 4536 * - if the sequence counter has changed, both false positives and false negatives 4537 * are permitted (since the result will be rejected later) 4538 * - for simple cases of unix permission checks vaccess_vexec_smr can be used 4539 * 4540 * Caveats to watch out for: 4541 * - vnodes are passed unlocked and unreferenced with nothing stopping 4542 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised 4543 * to use atomic_load_ptr to fetch it.
4544 * - the aforementioned object can also get freed, meaning absent other means it 4545 * should be protected with vfs_smr 4546 * - either safely checking permissions as they are modified or guaranteeing 4547 * their stability is left to the routine 4548 */ 4549 int 4550 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, 4551 struct pwd **pwdp) 4552 { 4553 struct cache_fpl fpl; 4554 struct pwd *pwd; 4555 struct vnode *dvp; 4556 struct componentname *cnp; 4557 struct nameidata_saved orig; 4558 int error; 4559 4560 MPASS(ndp->ni_lcf == 0); 4561 4562 fpl.status = CACHE_FPL_STATUS_UNSET; 4563 fpl.ndp = ndp; 4564 fpl.cnp = &ndp->ni_cnd; 4565 MPASS(curthread == fpl.cnp->cn_thread); 4566 4567 if ((fpl.cnp->cn_flags & SAVESTART) != 0) 4568 MPASS(fpl.cnp->cn_nameiop != LOOKUP); 4569 4570 if (!cache_can_fplookup(&fpl)) { 4571 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 4572 *status = fpl.status; 4573 return (EOPNOTSUPP); 4574 } 4575 4576 cache_fpl_checkpoint(&fpl, &orig); 4577 4578 cache_fpl_smr_enter_initial(&fpl); 4579 fpl.fsearch = false; 4580 pwd = pwd_get_smr(); 4581 fpl.pwd = pwd; 4582 ndp->ni_rootdir = pwd->pwd_rdir; 4583 ndp->ni_topdir = pwd->pwd_jdir; 4584 4585 cnp = fpl.cnp; 4586 cnp->cn_nameptr = cnp->cn_pnbuf; 4587 if (cnp->cn_pnbuf[0] == '/') { 4588 cache_fpl_handle_root(ndp, &dvp); 4589 } else { 4590 if (ndp->ni_dirfd == AT_FDCWD) { 4591 dvp = pwd->pwd_cdir; 4592 } else { 4593 error = cache_fplookup_dirfd(&fpl, &dvp); 4594 if (__predict_false(error != 0)) { 4595 goto out; 4596 } 4597 } 4598 } 4599 4600 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); 4601 4602 error = cache_fplookup_impl(dvp, &fpl); 4603 out: 4604 cache_fpl_smr_assert_not_entered(&fpl); 4605 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 4606 4607 *status = fpl.status; 4608 switch (fpl.status) { 4609 case CACHE_FPL_STATUS_UNSET: 4610 __assert_unreachable(); 4611 break; 4612 case CACHE_FPL_STATUS_HANDLED: 4613 SDT_PROBE3(vfs, namei, lookup, return, error, 4614 (error == 0 ? ndp->ni_vp : NULL), true); 4615 break; 4616 case CACHE_FPL_STATUS_PARTIAL: 4617 *pwdp = fpl.pwd; 4618 /* 4619 * Status restored by cache_fplookup_partial_setup. 4620 */ 4621 break; 4622 case CACHE_FPL_STATUS_ABORTED: 4623 cache_fpl_restore(&fpl, &orig); 4624 break; 4625 } 4626 return (error); 4627 } 4628
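/*
 * Illustrative sketch, not part of the original file (hence #if 0): roughly
 * what a filesystem opting into lockless lookup could provide to satisfy the
 * VOP_FPLOOKUP_VEXEC contract above. The "examplefs" names and the node
 * layout are hypothetical; vaccess_vexec_smr() is the helper the contract
 * mentions and is assumed here to take the file mode, owner, group and the
 * credentials. A real implementation lives in the filesystem's own vnops
 * file, must keep its nodes stable for SMR readers (e.g. by allocating them
 * from an SMR-enabled zone) and opts in at mount time by setting
 * mp->mnt_kern_flag |= MNTK_FPLOOKUP under MNT_ILOCK().
 */
#if 0
struct examplefs_node {
	mode_t	en_mode;
	uid_t	en_uid;
	gid_t	en_gid;
};

static int
examplefs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
{
	struct examplefs_node *node;
	struct vnode *vp;
	struct ucred *cred;
	mode_t all_x, mode;

	vp = v->a_vp;
	cred = v->a_cred;

	/*
	 * Nothing prevents a concurrent VOP_RECLAIM from clearing ->v_data,
	 * hence the atomic load and the EAGAIN bailout (always a valid
	 * return per the contract).
	 */
	node = atomic_load_ptr(&vp->v_data);
	if (__predict_false(node == NULL))
		return (EAGAIN);

	/*
	 * A racy read of the permission bits is tolerable: modifications are
	 * expected to be enclosed in vn_seqc_write_begin()/end(), so a stale
	 * answer is rejected by the caller's sequence counter check.
	 */
	all_x = S_IXUSR | S_IXGRP | S_IXOTH;
	mode = node->en_mode;
	if (__predict_true((mode & all_x) == all_x))
		return (0);

	return (vaccess_vexec_smr(mode, node->en_uid, node->en_gid, cred));
}
#endif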