1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ddb.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/capsicum.h> 46 #include <sys/counter.h> 47 #include <sys/filedesc.h> 48 #include <sys/fnv_hash.h> 49 #include <sys/kernel.h> 50 #include <sys/ktr.h> 51 #include <sys/lock.h> 52 #include <sys/malloc.h> 53 #include <sys/fcntl.h> 54 #include <sys/jail.h> 55 #include <sys/mount.h> 56 #include <sys/namei.h> 57 #include <sys/proc.h> 58 #include <sys/seqc.h> 59 #include <sys/sdt.h> 60 #include <sys/smr.h> 61 #include <sys/smp.h> 62 #include <sys/syscallsubr.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysproto.h> 65 #include <sys/vnode.h> 66 #include <ck_queue.h> 67 #ifdef KTRACE 68 #include <sys/ktrace.h> 69 #endif 70 71 #include <sys/capsicum.h> 72 73 #include <security/audit/audit.h> 74 #include <security/mac/mac_framework.h> 75 76 #ifdef DDB 77 #include <ddb/ddb.h> 78 #endif 79 80 #include <vm/uma.h> 81 82 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 83 "Name cache"); 84 85 SDT_PROVIDER_DECLARE(vfs); 86 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 87 "struct vnode *"); 88 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *", 89 "struct vnode *"); 90 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 91 "char *"); 92 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *", 93 "const char *"); 94 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *", 95 "struct namecache *", "int", "int"); 96 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 97 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 98 "char *", "struct vnode *"); 99 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 100 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 101 "struct vnode *", "char *"); 102 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 103 "struct vnode *"); 104 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 105 "struct vnode *", "char *"); 106 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 107 "char *"); 108 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", 109 "struct componentname *"); 110 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", 111 "struct componentname *"); 112 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 113 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 114 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 115 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 116 "struct vnode *"); 117 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 118 "char *"); 119 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *", 120 "char *"); 121 122 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); 123 SDT_PROBE_DECLARE(vfs, namei, lookup, entry); 124 SDT_PROBE_DECLARE(vfs, namei, lookup, return); 125 126 /* 127 * This structure describes the elements in the cache of recent 128 * names looked up by namei. 
 */
struct negstate {
	u_char neg_flag;
	u_char neg_hit;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct	namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct	vnode *nu_vp;	/* vnode the name refers to */
		struct	negstate nu_neg;/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct	namecache_ts {
	struct	timespec nc_time;	/* timespec provided by fs */
	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned.  Since others
 * may be in the same spot suffer a little bit and enforce the
 * alignment for everyone.  Note this is a nop for 64-bit platforms.
 */
#define	CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
#define	CACHE_PATH_CUTOFF	39

#define	CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define	CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define	CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
#define	CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

#define	nc_vp		n_un.nu_vp
#define	nc_neg		n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define	NEG_HOT		0x01

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
static bool
cache_ncp_canuse(struct namecache *ncp)
{

	atomic_thread_fence_acq();
	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
}
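
/*
 * Example of the intended lockless read protocol (an illustrative sketch
 * with a hypothetical helper name; the real consumer is cache_lookup()
 * below): copy out whatever is needed from the entry first, then call
 * cache_ncp_canuse() to decide whether the copy may be acted upon.  The
 * SMR section only delays freeing of the memory, it does not prevent the
 * entry from being invalidated concurrently.
 */
#if 0
static bool
cache_example_lockless_hit(struct vnode *dvp, char *name, u_char len)
{
	struct namecache *ncp;
	uint32_t hash;
	bool usable;

	usable = false;
	hash = cache_get_hash(name, len, dvp);
	vfs_smr_enter();
	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == len &&
		    !bcmp(ncp->nc_name, name, len))
			break;
	}
	if (ncp != NULL && (ncp->nc_flag & NCF_NEGATIVE) == 0) {
		/*
		 * Reads are done; only now check whether the entry was
		 * still valid.  A real consumer would also have to secure
		 * a reference on the vnode (see vget_prep_smr() usage in
		 * cache_lookup()) before leaving the SMR section.
		 */
		usable = cache_ncp_canuse(ncp);
	}
	vfs_smr_exit();
	return (usable);
}
#endif
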
/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	mtx	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * It is legal to take multiple vnodelock and bucketlock locks.  The locking
 * order is lower address first.  Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be locked to provide safe operation against other threads modifying the
 * cache.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach that state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case, this could
 * deadlock.  As such, this is resolved by trylocking and if that fails unlocking
 * the first node, locking everything in order and revalidating the state.
 */
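
/*
 * Illustrative sketch of the ordering rule above (hypothetical helper, not
 * used anywhere): two vnode locks are always taken lower address first.
 * A caller which already holds the higher-addressed lock cannot simply
 * block on the lower one; it has to trylock it and, on failure, drop what
 * it holds, lock both in order and revalidate the cached state it relied
 * on (see cache_zap_locked_vnode_kl2() and cache_zap_unlocked_bucket()).
 */
#if 0
static void
cache_example_lock_pair(struct vnode *xvp, struct vnode *yvp)
{
	struct mtx *vlp1, *vlp2;

	vlp1 = VP2VNODELOCK(xvp);
	vlp2 = VP2VNODELOCK(yvp);
	cache_sort_vnodes(&vlp1, &vlp2);	/* lower address ends up first */
	mtx_lock(vlp1);
	if (vlp2 != vlp1)
		mtx_lock(vlp2);
}
#endif
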
VFS_SMR_DECLARE;

static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache parameters");

static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
    "Total namecache capacity");

u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");

static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");

/*
 * Negative entry % of namecache capacity above which automatic eviction is allowed.
 *
 * Check cache_neg_evict_cond for details.
 */
static u_int ncnegminpct = 3;

static u_int __read_mostly	neg_min; /* the above recomputed against ncsize */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
    "Negative entry count above which automatic eviction is allowed");

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");

static u_int __exclusive_cache_line neg_cycle;

#define	ncneghash	3
#define	numneglists	(ncneghash + 1)

struct neglist {
	struct mtx		nl_evict_lock;
	struct mtx		nl_lock __aligned(CACHE_LINE_SIZE);
	TAILQ_HEAD(, namecache) nl_list;
	TAILQ_HEAD(, namecache) nl_hotlist;
	u_long			nl_hotnum;
} __aligned(CACHE_LINE_SIZE);

static struct neglist neglists[numneglists];

static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct mtx_padalign __read_mostly  *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
380 */ 381 static uma_zone_t __read_mostly cache_zone_small; 382 static uma_zone_t __read_mostly cache_zone_small_ts; 383 static uma_zone_t __read_mostly cache_zone_large; 384 static uma_zone_t __read_mostly cache_zone_large_ts; 385 386 static struct namecache * 387 cache_alloc(int len, int ts) 388 { 389 struct namecache_ts *ncp_ts; 390 struct namecache *ncp; 391 392 if (__predict_false(ts)) { 393 if (len <= CACHE_PATH_CUTOFF) 394 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); 395 else 396 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); 397 ncp = &ncp_ts->nc_nc; 398 } else { 399 if (len <= CACHE_PATH_CUTOFF) 400 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); 401 else 402 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); 403 } 404 return (ncp); 405 } 406 407 static void 408 cache_free(struct namecache *ncp) 409 { 410 struct namecache_ts *ncp_ts; 411 412 MPASS(ncp != NULL); 413 if ((ncp->nc_flag & NCF_DVDROP) != 0) 414 vdrop(ncp->nc_dvp); 415 if (__predict_false(ncp->nc_flag & NCF_TS)) { 416 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 417 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 418 uma_zfree_smr(cache_zone_small_ts, ncp_ts); 419 else 420 uma_zfree_smr(cache_zone_large_ts, ncp_ts); 421 } else { 422 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 423 uma_zfree_smr(cache_zone_small, ncp); 424 else 425 uma_zfree_smr(cache_zone_large, ncp); 426 } 427 } 428 429 static void 430 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 431 { 432 struct namecache_ts *ncp_ts; 433 434 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 435 (tsp == NULL && ticksp == NULL), 436 ("No NCF_TS")); 437 438 if (tsp == NULL) 439 return; 440 441 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 442 *tsp = ncp_ts->nc_time; 443 *ticksp = ncp_ts->nc_ticks; 444 } 445 446 #ifdef DEBUG_CACHE 447 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 448 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 449 "VFS namecache enabled"); 450 #endif 451 452 /* Export size information to userland */ 453 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 454 sizeof(struct namecache), "sizeof(struct namecache)"); 455 456 /* 457 * The new name cache statistics 458 */ 459 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 460 "Name cache statistics"); 461 462 #define STATNODE_ULONG(name, varname, descr) \ 463 SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 464 #define STATNODE_COUNTER(name, varname, descr) \ 465 static COUNTER_U64_DEFINE_EARLY(varname); \ 466 SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \ 467 descr); 468 STATNODE_ULONG(neg, numneg, "Number of negative cache entries"); 469 STATNODE_ULONG(count, numcache, "Number of cache entries"); 470 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held"); 471 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit"); 472 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits"); 473 STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' 
hits"); 474 STATNODE_COUNTER(miss, nummiss, "Number of cache misses"); 475 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache"); 476 STATNODE_COUNTER(posszaps, numposzaps, 477 "Number of cache hits (positive) we do not want to cache"); 478 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)"); 479 STATNODE_COUNTER(negzaps, numnegzaps, 480 "Number of cache hits (negative) we do not want to cache"); 481 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)"); 482 /* These count for vn_getcwd(), too. */ 483 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls"); 484 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 485 STATNODE_COUNTER(fullpathfail2, numfullpathfail2, 486 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 487 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 488 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls"); 489 490 /* 491 * Debug or developer statistics. 492 */ 493 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 494 "Name cache debugging"); 495 #define DEBUGNODE_ULONG(name, varname, descr) \ 496 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 497 #define DEBUGNODE_COUNTER(name, varname, descr) \ 498 static COUNTER_U64_DEFINE_EARLY(varname); \ 499 SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \ 500 descr); 501 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success, 502 "Number of successful removals after relocking"); 503 static long zap_bucket_fail; 504 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, ""); 505 static long zap_bucket_fail2; 506 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, ""); 507 static long cache_lock_vnodes_cel_3_failures; 508 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures, 509 "Number of times 3-way vnode locking failed"); 510 511 static void cache_zap_locked(struct namecache *ncp); 512 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, 513 char **freebuf, size_t *buflen); 514 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 515 char **retbuf, size_t *buflen, size_t addend); 516 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 517 char **retbuf, size_t *buflen); 518 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 519 char **retbuf, size_t *len, size_t addend); 520 521 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 522 523 static inline void 524 cache_assert_vlp_locked(struct mtx *vlp) 525 { 526 527 if (vlp != NULL) 528 mtx_assert(vlp, MA_OWNED); 529 } 530 531 static inline void 532 cache_assert_vnode_locked(struct vnode *vp) 533 { 534 struct mtx *vlp; 535 536 vlp = VP2VNODELOCK(vp); 537 cache_assert_vlp_locked(vlp); 538 } 539 540 /* 541 * TODO: With the value stored we can do better than computing the hash based 542 * on the address. The choice of FNV should also be revisited. 
543 */ 544 static void 545 cache_prehash(struct vnode *vp) 546 { 547 548 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); 549 } 550 551 static uint32_t 552 cache_get_hash(char *name, u_char len, struct vnode *dvp) 553 { 554 555 return (fnv_32_buf(name, len, dvp->v_nchash)); 556 } 557 558 static inline struct nchashhead * 559 NCP2BUCKET(struct namecache *ncp) 560 { 561 uint32_t hash; 562 563 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 564 return (NCHHASH(hash)); 565 } 566 567 static inline struct mtx * 568 NCP2BUCKETLOCK(struct namecache *ncp) 569 { 570 uint32_t hash; 571 572 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 573 return (HASH2BUCKETLOCK(hash)); 574 } 575 576 #ifdef INVARIANTS 577 static void 578 cache_assert_bucket_locked(struct namecache *ncp) 579 { 580 struct mtx *blp; 581 582 blp = NCP2BUCKETLOCK(ncp); 583 mtx_assert(blp, MA_OWNED); 584 } 585 586 static void 587 cache_assert_bucket_unlocked(struct namecache *ncp) 588 { 589 struct mtx *blp; 590 591 blp = NCP2BUCKETLOCK(ncp); 592 mtx_assert(blp, MA_NOTOWNED); 593 } 594 #else 595 #define cache_assert_bucket_locked(x) do { } while (0) 596 #define cache_assert_bucket_unlocked(x) do { } while (0) 597 #endif 598 599 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 600 static void 601 _cache_sort_vnodes(void **p1, void **p2) 602 { 603 void *tmp; 604 605 MPASS(*p1 != NULL || *p2 != NULL); 606 607 if (*p1 > *p2) { 608 tmp = *p2; 609 *p2 = *p1; 610 *p1 = tmp; 611 } 612 } 613 614 static void 615 cache_lock_all_buckets(void) 616 { 617 u_int i; 618 619 for (i = 0; i < numbucketlocks; i++) 620 mtx_lock(&bucketlocks[i]); 621 } 622 623 static void 624 cache_unlock_all_buckets(void) 625 { 626 u_int i; 627 628 for (i = 0; i < numbucketlocks; i++) 629 mtx_unlock(&bucketlocks[i]); 630 } 631 632 static void 633 cache_lock_all_vnodes(void) 634 { 635 u_int i; 636 637 for (i = 0; i < numvnodelocks; i++) 638 mtx_lock(&vnodelocks[i]); 639 } 640 641 static void 642 cache_unlock_all_vnodes(void) 643 { 644 u_int i; 645 646 for (i = 0; i < numvnodelocks; i++) 647 mtx_unlock(&vnodelocks[i]); 648 } 649 650 static int 651 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 652 { 653 654 cache_sort_vnodes(&vlp1, &vlp2); 655 656 if (vlp1 != NULL) { 657 if (!mtx_trylock(vlp1)) 658 return (EAGAIN); 659 } 660 if (!mtx_trylock(vlp2)) { 661 if (vlp1 != NULL) 662 mtx_unlock(vlp1); 663 return (EAGAIN); 664 } 665 666 return (0); 667 } 668 669 static void 670 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 671 { 672 673 MPASS(vlp1 != NULL || vlp2 != NULL); 674 MPASS(vlp1 <= vlp2); 675 676 if (vlp1 != NULL) 677 mtx_lock(vlp1); 678 if (vlp2 != NULL) 679 mtx_lock(vlp2); 680 } 681 682 static void 683 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 684 { 685 686 MPASS(vlp1 != NULL || vlp2 != NULL); 687 688 if (vlp1 != NULL) 689 mtx_unlock(vlp1); 690 if (vlp2 != NULL) 691 mtx_unlock(vlp2); 692 } 693 694 static int 695 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 696 { 697 struct nchstats snap; 698 699 if (req->oldptr == NULL) 700 return (SYSCTL_OUT(req, 0, sizeof(snap))); 701 702 snap = nchstats; 703 snap.ncs_goodhits = counter_u64_fetch(numposhits); 704 snap.ncs_neghits = counter_u64_fetch(numneghits); 705 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 706 counter_u64_fetch(numnegzaps); 707 snap.ncs_miss = counter_u64_fetch(nummisszap) + 708 counter_u64_fetch(nummiss); 709 710 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 711 } 712 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, 
    CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

static void
cache_recalc_neg_min(u_int val)
{

	neg_min = (ncsize * val) / 100;
}

static int
sysctl_negminpct(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int error;

	val = ncnegminpct;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (val == ncnegminpct)
		return (0);
	if (val < 0 || val > 99)
		return (EINVAL);
	ncnegminpct = val;
	cache_recalc_neg_min(val);
	return (0);
}

SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
    "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * Various workloads create plenty of negative entries and barely use them
 * afterwards.  Moreover malicious users can keep performing bogus lookups
 * adding even more entries.  For example "make tinderbox" as of writing this
 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 * negative.
 *
 * As such, a rather aggressive eviction method is needed.  The currently
 * employed method is a placeholder.
 *
 * Entries are split over numneglists separate lists, each of which is further
 * split into hot and cold entries.  Entries get promoted after getting a hit.
 * Eviction happens on addition of a new entry.
 */
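
/*
 * Illustrative sketch of the hot/cold scheme described above (hypothetical
 * helper; the real callers are the lookup routines below): every hit bumps
 * a per-entry counter and the hit which crosses the promotion threshold
 * moves the entry to the hot list, which eviction then steers away from.
 */
#if 0
static void
cache_example_neg_hit(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	if (cache_neg_hit_prep(ncp))
		cache_neg_promote(ncp);
	else
		cache_neg_hit_finish(ncp);
}
#endif
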
chain length, usage percentage)"); 836 #endif 837 838 /* 839 * Negative entries management 840 * 841 * Various workloads create plenty of negative entries and barely use them 842 * afterwards. Moreover malicious users can keep performing bogus lookups 843 * adding even more entries. For example "make tinderbox" as of writing this 844 * comment ends up with 2.6M namecache entries in total, 1.2M of which are 845 * negative. 846 * 847 * As such, a rather aggressive eviction method is needed. The currently 848 * employed method is a placeholder. 849 * 850 * Entries are split over numneglists separate lists, each of which is further 851 * split into hot and cold entries. Entries get promoted after getting a hit. 852 * Eviction happens on addition of new entry. 853 */ 854 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 855 "Name cache negative entry statistics"); 856 857 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0, 858 "Number of negative cache entries"); 859 860 static COUNTER_U64_DEFINE_EARLY(neg_created); 861 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created, 862 "Number of created negative entries"); 863 864 static COUNTER_U64_DEFINE_EARLY(neg_evicted); 865 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted, 866 "Number of evicted negative entries"); 867 868 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty); 869 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD, 870 &neg_evict_skipped_empty, 871 "Number of times evicting failed due to lack of entries"); 872 873 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed); 874 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD, 875 &neg_evict_skipped_missed, 876 "Number of times evicting failed due to target entry disappearing"); 877 878 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended); 879 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD, 880 &neg_evict_skipped_contended, 881 "Number of times evicting failed due to contention"); 882 883 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits, 884 "Number of cache hits (negative)"); 885 886 static int 887 sysctl_neg_hot(SYSCTL_HANDLER_ARGS) 888 { 889 int i, out; 890 891 out = 0; 892 for (i = 0; i < numneglists; i++) 893 out += neglists[i].nl_hotnum; 894 895 return (SYSCTL_OUT(req, &out, sizeof(out))); 896 } 897 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD | 898 CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I", 899 "Number of hot negative entries"); 900 901 static void 902 cache_neg_init(struct namecache *ncp) 903 { 904 struct negstate *ns; 905 906 ncp->nc_flag |= NCF_NEGATIVE; 907 ns = NCP2NEGSTATE(ncp); 908 ns->neg_flag = 0; 909 ns->neg_hit = 0; 910 counter_u64_add(neg_created, 1); 911 } 912 913 #define CACHE_NEG_PROMOTION_THRESH 2 914 915 static bool 916 cache_neg_hit_prep(struct namecache *ncp) 917 { 918 struct negstate *ns; 919 u_char n; 920 921 ns = NCP2NEGSTATE(ncp); 922 n = atomic_load_char(&ns->neg_hit); 923 for (;;) { 924 if (n >= CACHE_NEG_PROMOTION_THRESH) 925 return (false); 926 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1)) 927 break; 928 } 929 return (n + 1 == CACHE_NEG_PROMOTION_THRESH); 930 } 931 932 /* 933 * Nothing to do here but it is provided for completeness as some 934 * cache_neg_hit_prep callers may end up returning without even 935 * trying to promote. 
936 */ 937 #define cache_neg_hit_abort(ncp) do { } while (0) 938 939 static void 940 cache_neg_hit_finish(struct namecache *ncp) 941 { 942 943 SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name); 944 counter_u64_add(numneghits, 1); 945 } 946 947 /* 948 * Move a negative entry to the hot list. 949 */ 950 static void 951 cache_neg_promote_locked(struct namecache *ncp) 952 { 953 struct neglist *nl; 954 struct negstate *ns; 955 956 ns = NCP2NEGSTATE(ncp); 957 nl = NCP2NEGLIST(ncp); 958 mtx_assert(&nl->nl_lock, MA_OWNED); 959 if ((ns->neg_flag & NEG_HOT) == 0) { 960 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 961 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst); 962 nl->nl_hotnum++; 963 ns->neg_flag |= NEG_HOT; 964 } 965 } 966 967 /* 968 * Move a hot negative entry to the cold list. 969 */ 970 static void 971 cache_neg_demote_locked(struct namecache *ncp) 972 { 973 struct neglist *nl; 974 struct negstate *ns; 975 976 ns = NCP2NEGSTATE(ncp); 977 nl = NCP2NEGLIST(ncp); 978 mtx_assert(&nl->nl_lock, MA_OWNED); 979 MPASS(ns->neg_flag & NEG_HOT); 980 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 981 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 982 nl->nl_hotnum--; 983 ns->neg_flag &= ~NEG_HOT; 984 atomic_store_char(&ns->neg_hit, 0); 985 } 986 987 /* 988 * Move a negative entry to the hot list if it matches the lookup. 989 * 990 * We have to take locks, but they may be contended and in the worst 991 * case we may need to go off CPU. We don't want to spin within the 992 * smr section and we can't block with it. Exiting the section means 993 * the found entry could have been evicted. We are going to look it 994 * up again. 995 */ 996 static bool 997 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp, 998 struct namecache *oncp, uint32_t hash) 999 { 1000 struct namecache *ncp; 1001 struct neglist *nl; 1002 u_char nc_flag; 1003 1004 nl = NCP2NEGLIST(oncp); 1005 1006 mtx_lock(&nl->nl_lock); 1007 /* 1008 * For hash iteration. 1009 */ 1010 vfs_smr_enter(); 1011 1012 /* 1013 * Avoid all surprises by only succeeding if we got the same entry and 1014 * bailing completely otherwise. 1015 * XXX There are no provisions to keep the vnode around, meaning we may 1016 * end up promoting a negative entry for a *new* vnode and returning 1017 * ENOENT on its account. This is the error we want to return anyway 1018 * and promotion is harmless. 1019 * 1020 * In particular at this point there can be a new ncp which matches the 1021 * search but hashes to a different neglist. 1022 */ 1023 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1024 if (ncp == oncp) 1025 break; 1026 } 1027 1028 /* 1029 * No match to begin with. 1030 */ 1031 if (__predict_false(ncp == NULL)) { 1032 goto out_abort; 1033 } 1034 1035 /* 1036 * The newly found entry may be something different... 1037 */ 1038 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1039 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 1040 goto out_abort; 1041 } 1042 1043 /* 1044 * ... and not even negative. 
1045 */ 1046 nc_flag = atomic_load_char(&ncp->nc_flag); 1047 if ((nc_flag & NCF_NEGATIVE) == 0) { 1048 goto out_abort; 1049 } 1050 1051 if (__predict_false(!cache_ncp_canuse(ncp))) { 1052 goto out_abort; 1053 } 1054 1055 cache_neg_promote_locked(ncp); 1056 cache_neg_hit_finish(ncp); 1057 vfs_smr_exit(); 1058 mtx_unlock(&nl->nl_lock); 1059 return (true); 1060 out_abort: 1061 vfs_smr_exit(); 1062 mtx_unlock(&nl->nl_lock); 1063 return (false); 1064 } 1065 1066 static void 1067 cache_neg_promote(struct namecache *ncp) 1068 { 1069 struct neglist *nl; 1070 1071 nl = NCP2NEGLIST(ncp); 1072 mtx_lock(&nl->nl_lock); 1073 cache_neg_promote_locked(ncp); 1074 mtx_unlock(&nl->nl_lock); 1075 } 1076 1077 static void 1078 cache_neg_insert(struct namecache *ncp) 1079 { 1080 struct neglist *nl; 1081 1082 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1083 cache_assert_bucket_locked(ncp); 1084 nl = NCP2NEGLIST(ncp); 1085 mtx_lock(&nl->nl_lock); 1086 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 1087 mtx_unlock(&nl->nl_lock); 1088 atomic_add_long(&numneg, 1); 1089 } 1090 1091 static void 1092 cache_neg_remove(struct namecache *ncp) 1093 { 1094 struct neglist *nl; 1095 struct negstate *ns; 1096 1097 cache_assert_bucket_locked(ncp); 1098 nl = NCP2NEGLIST(ncp); 1099 ns = NCP2NEGSTATE(ncp); 1100 mtx_lock(&nl->nl_lock); 1101 if ((ns->neg_flag & NEG_HOT) != 0) { 1102 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 1103 nl->nl_hotnum--; 1104 } else { 1105 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 1106 } 1107 mtx_unlock(&nl->nl_lock); 1108 atomic_subtract_long(&numneg, 1); 1109 } 1110 1111 static struct neglist * 1112 cache_neg_evict_select_list(void) 1113 { 1114 struct neglist *nl; 1115 u_int c; 1116 1117 c = atomic_fetchadd_int(&neg_cycle, 1) + 1; 1118 nl = &neglists[c % numneglists]; 1119 if (!mtx_trylock(&nl->nl_evict_lock)) { 1120 counter_u64_add(neg_evict_skipped_contended, 1); 1121 return (NULL); 1122 } 1123 return (nl); 1124 } 1125 1126 static struct namecache * 1127 cache_neg_evict_select_entry(struct neglist *nl) 1128 { 1129 struct namecache *ncp, *lncp; 1130 struct negstate *ns, *lns; 1131 int i; 1132 1133 mtx_assert(&nl->nl_evict_lock, MA_OWNED); 1134 mtx_assert(&nl->nl_lock, MA_OWNED); 1135 ncp = TAILQ_FIRST(&nl->nl_list); 1136 if (ncp == NULL) 1137 return (NULL); 1138 lncp = ncp; 1139 lns = NCP2NEGSTATE(lncp); 1140 for (i = 1; i < 4; i++) { 1141 ncp = TAILQ_NEXT(ncp, nc_dst); 1142 if (ncp == NULL) 1143 break; 1144 ns = NCP2NEGSTATE(ncp); 1145 if (ns->neg_hit < lns->neg_hit) { 1146 lncp = ncp; 1147 lns = ns; 1148 } 1149 } 1150 return (lncp); 1151 } 1152 1153 static bool 1154 cache_neg_evict(void) 1155 { 1156 struct namecache *ncp, *ncp2; 1157 struct neglist *nl; 1158 struct negstate *ns; 1159 struct vnode *dvp; 1160 struct mtx *dvlp; 1161 struct mtx *blp; 1162 uint32_t hash; 1163 u_char nlen; 1164 bool evicted; 1165 1166 nl = cache_neg_evict_select_list(); 1167 if (nl == NULL) { 1168 return (false); 1169 } 1170 1171 mtx_lock(&nl->nl_lock); 1172 ncp = TAILQ_FIRST(&nl->nl_hotlist); 1173 if (ncp != NULL) { 1174 cache_neg_demote_locked(ncp); 1175 } 1176 ncp = cache_neg_evict_select_entry(nl); 1177 if (ncp == NULL) { 1178 counter_u64_add(neg_evict_skipped_empty, 1); 1179 mtx_unlock(&nl->nl_lock); 1180 mtx_unlock(&nl->nl_evict_lock); 1181 return (false); 1182 } 1183 ns = NCP2NEGSTATE(ncp); 1184 nlen = ncp->nc_nlen; 1185 dvp = ncp->nc_dvp; 1186 hash = cache_get_hash(ncp->nc_name, nlen, dvp); 1187 dvlp = VP2VNODELOCK(dvp); 1188 blp = HASH2BUCKETLOCK(hash); 1189 mtx_unlock(&nl->nl_lock); 1190 mtx_unlock(&nl->nl_evict_lock); 1191 
	mtx_lock(dvlp);
	mtx_lock(blp);
	/*
	 * Note that since all locks were dropped above, the entry may be
	 * gone or reallocated to be something else.
	 */
	CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
		if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
		    ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
			break;
	}
	if (ncp2 == NULL) {
		counter_u64_add(neg_evict_skipped_missed, 1);
		ncp = NULL;
		evicted = false;
	} else {
		MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
		MPASS(blp == NCP2BUCKETLOCK(ncp));
		SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(neg_evicted, 1);
		evicted = true;
	}
	mtx_unlock(blp);
	mtx_unlock(dvlp);
	if (ncp != NULL)
		cache_free(ncp);
	return (evicted);
}

/*
 * Maybe evict a negative entry to create more room.
 *
 * The ncnegfactor parameter limits what fraction of the total count
 * negative entries can comprise.  However, if the cache is just
 * warming up this leads to excessive evictions.  As such, ncnegminpct
 * (recomputed to neg_min) dictates whether the above should be
 * applied.
 *
 * Try evicting if the cache is close to full capacity regardless of
 * other considerations.
 */
static bool
cache_neg_evict_cond(u_long lnumcache)
{
	u_long lnumneg;

	if (ncsize - 1000 < lnumcache)
		goto out_evict;
	lnumneg = atomic_load_long(&numneg);
	if (lnumneg < neg_min)
		return (false);
	if (lnumneg * ncnegfactor < lnumcache)
		return (false);
out_evict:
	return (cache_neg_evict());
}
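
/*
 * Worked example of the condition above, with an assumed capacity: for
 * ncsize = 100000 and the default ncnegminpct = 3 (so neg_min = 3000) and
 * ncnegfactor = 5, eviction is attempted once the cache holds more than
 * 99000 entries in total, or once there are at least 3000 negative entries
 * and they make up 1/5 or more of the total.
 */
#if 0
static bool
cache_example_neg_evict_wanted(u_long total, u_long neg)
{

	/* Assumed figures mirroring the defaults described above. */
	if (100000 - 1000 < total)
		return (true);
	if (neg < 3000)
		return (false);
	return (neg * 5 >= total);
}
#endif
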
/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_vp);
			ncp->nc_vp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_vp);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_neg_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_dvp);
			ncp->nc_dvp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_dvp);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			counter_u64_add(numcachehv, -1);
		}
	}
	atomic_subtract_long(&numcache, 1);
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct mtx *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

/*
 * If trylocking failed we can get here.  We know enough to take all needed locks
 * in the right order and re-lookup the entry.
1376 */ 1377 static int 1378 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1379 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1380 struct mtx *blp) 1381 { 1382 struct namecache *rncp; 1383 1384 cache_assert_bucket_unlocked(ncp); 1385 1386 cache_sort_vnodes(&dvlp, &vlp); 1387 cache_lock_vnodes(dvlp, vlp); 1388 mtx_lock(blp); 1389 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1390 if (rncp == ncp && rncp->nc_dvp == dvp && 1391 rncp->nc_nlen == cnp->cn_namelen && 1392 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1393 break; 1394 } 1395 if (rncp != NULL) { 1396 cache_zap_locked(rncp); 1397 mtx_unlock(blp); 1398 cache_unlock_vnodes(dvlp, vlp); 1399 counter_u64_add(zap_bucket_relock_success, 1); 1400 return (0); 1401 } 1402 1403 mtx_unlock(blp); 1404 cache_unlock_vnodes(dvlp, vlp); 1405 return (EAGAIN); 1406 } 1407 1408 static int __noinline 1409 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp, 1410 uint32_t hash, struct mtx *blp) 1411 { 1412 struct mtx *dvlp, *vlp; 1413 struct vnode *dvp; 1414 1415 cache_assert_bucket_locked(ncp); 1416 1417 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1418 vlp = NULL; 1419 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1420 vlp = VP2VNODELOCK(ncp->nc_vp); 1421 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1422 cache_zap_locked(ncp); 1423 mtx_unlock(blp); 1424 cache_unlock_vnodes(dvlp, vlp); 1425 return (0); 1426 } 1427 1428 dvp = ncp->nc_dvp; 1429 mtx_unlock(blp); 1430 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1431 } 1432 1433 static __noinline int 1434 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) 1435 { 1436 struct namecache *ncp; 1437 struct mtx *blp; 1438 struct mtx *dvlp, *dvlp2; 1439 uint32_t hash; 1440 int error; 1441 1442 if (cnp->cn_namelen == 2 && 1443 cnp->cn_nameptr[0] == '.' 
&& cnp->cn_nameptr[1] == '.') { 1444 dvlp = VP2VNODELOCK(dvp); 1445 dvlp2 = NULL; 1446 mtx_lock(dvlp); 1447 retry_dotdot: 1448 ncp = dvp->v_cache_dd; 1449 if (ncp == NULL) { 1450 mtx_unlock(dvlp); 1451 if (dvlp2 != NULL) 1452 mtx_unlock(dvlp2); 1453 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1454 return (0); 1455 } 1456 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1457 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) 1458 goto retry_dotdot; 1459 MPASS(dvp->v_cache_dd == NULL); 1460 mtx_unlock(dvlp); 1461 if (dvlp2 != NULL) 1462 mtx_unlock(dvlp2); 1463 cache_free(ncp); 1464 } else { 1465 vn_seqc_write_begin(dvp); 1466 dvp->v_cache_dd = NULL; 1467 vn_seqc_write_end(dvp); 1468 mtx_unlock(dvlp); 1469 if (dvlp2 != NULL) 1470 mtx_unlock(dvlp2); 1471 } 1472 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1473 return (1); 1474 } 1475 1476 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1477 blp = HASH2BUCKETLOCK(hash); 1478 retry: 1479 if (CK_SLIST_EMPTY(NCHHASH(hash))) 1480 goto out_no_entry; 1481 1482 mtx_lock(blp); 1483 1484 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1485 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1486 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1487 break; 1488 } 1489 1490 if (ncp == NULL) { 1491 mtx_unlock(blp); 1492 goto out_no_entry; 1493 } 1494 1495 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1496 if (__predict_false(error != 0)) { 1497 zap_bucket_fail++; 1498 goto retry; 1499 } 1500 counter_u64_add(numposzaps, 1); 1501 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1502 cache_free(ncp); 1503 return (1); 1504 out_no_entry: 1505 counter_u64_add(nummisszap, 1); 1506 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1507 return (0); 1508 } 1509 1510 static int __noinline 1511 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1512 struct timespec *tsp, int *ticksp) 1513 { 1514 int ltype; 1515 1516 *vpp = dvp; 1517 counter_u64_add(dothits, 1); 1518 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1519 if (tsp != NULL) 1520 timespecclear(tsp); 1521 if (ticksp != NULL) 1522 *ticksp = ticks; 1523 vrefact(*vpp); 1524 /* 1525 * When we lookup "." we still can be asked to lock it 1526 * differently... 
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static int __noinline
cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct mtx *dvlp;
	enum vgetstate vs;
	int error, ltype;
	bool whiteout;

	MPASS((cnp->cn_flags & ISDOTDOT) != 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	counter_u64_add(dotdothits, 1);
retry:
	dvlp = VP2VNODELOCK(dvp);
	mtx_lock(dvlp);
	ncp = dvp->v_cache_dd;
	if (ncp == NULL) {
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
		mtx_unlock(dvlp);
		return (0);
	}
	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
		if (ncp->nc_flag & NCF_NEGATIVE)
			*vpp = NULL;
		else
			*vpp = ncp->nc_vp;
	} else
		*vpp = ncp->nc_dvp;
	if (*vpp == NULL)
		goto negative_success;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
	    NCF_DTS && tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		*tsp = ncp_ts->nc_dotdottime;
	}

	MPASS(dvp != *vpp);
	ltype = VOP_ISLOCKED(dvp);
	VOP_UNLOCK(dvp);
	vs = vget_prep(*vpp);
	mtx_unlock(dvlp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	vn_lock(dvp, ltype | LK_RETRY);
	if (VN_IS_DOOMED(dvp)) {
		if (error == 0)
			vput(*vpp);
		*vpp = NULL;
		return (ENOENT);
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			cache_zap_negative_locked_vnode_kl(ncp, dvp);
			mtx_unlock(dvlp);
			cache_free(ncp);
			return (0);
		}
	}

	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_out_ts(ncp, tsp, ticksp);
	if (cache_neg_hit_prep(ncp))
		cache_neg_promote(ncp);
	else
		cache_neg_hit_finish(ncp);
	mtx_unlock(dvlp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it will
 *		be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * Either both tsp and ticks have to be provided or neither of them.
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  vpp will not be modified.  If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
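
/*
 * Illustrative sketch of a consumer (hypothetical helper, loosely modeled
 * on how filesystem lookup routines use the cache; example_fs_scan_directory
 * is a stand-in for the real directory scan): -1 means a positive hit with
 * *vpp locked and referenced, ENOENT a usable negative hit, 0 a miss.
 */
#if 0
static int
cache_example_lookup(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp)
{
	int error;

	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
	if (error == -1)
		return (0);		/* positive hit, *vpp is ready */
	if (error == ENOENT)
		return (ENOENT);	/* negative hit (or doomed dvp) */
	MPASS(error == 0);
	/* Miss: nothing is known, do the real lookup and maybe cache_enter(). */
	return (example_fs_scan_directory(dvp, vpp, cnp));
}
#endif
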
static int __noinline
cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct mtx *blp;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout;

	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
	MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);

retry:
	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		mtx_unlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	vs = vget_prep(*vpp);
	mtx_unlock(blp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
			if (__predict_false(error != 0)) {
				zap_bucket_fail2++;
				goto retry;
			}
			cache_free(ncp);
			return (0);
		}
	}

	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_out_ts(ncp, tsp, ticksp);
	if (cache_neg_hit_prep(ncp))
		cache_neg_promote(ncp);
	else
		cache_neg_hit_finish(ncp);
	mtx_unlock(blp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout, neg_promote;
	u_short nc_flag;

	MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return
(cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1766 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') 1767 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); 1768 } 1769 1770 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 1771 1772 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { 1773 cache_remove_cnp(dvp, cnp); 1774 return (0); 1775 } 1776 1777 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1778 vfs_smr_enter(); 1779 1780 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1781 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1782 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1783 break; 1784 } 1785 1786 if (__predict_false(ncp == NULL)) { 1787 vfs_smr_exit(); 1788 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1789 NULL); 1790 counter_u64_add(nummiss, 1); 1791 return (0); 1792 } 1793 1794 nc_flag = atomic_load_char(&ncp->nc_flag); 1795 if (nc_flag & NCF_NEGATIVE) 1796 goto negative_success; 1797 1798 counter_u64_add(numposhits, 1); 1799 *vpp = ncp->nc_vp; 1800 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1801 cache_out_ts(ncp, tsp, ticksp); 1802 MPASS(dvp != *vpp); 1803 if (!cache_ncp_canuse(ncp)) { 1804 vfs_smr_exit(); 1805 *vpp = NULL; 1806 goto out_fallback; 1807 } 1808 vs = vget_prep_smr(*vpp); 1809 vfs_smr_exit(); 1810 if (__predict_false(vs == VGET_NONE)) { 1811 *vpp = NULL; 1812 goto out_fallback; 1813 } 1814 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1815 if (error) { 1816 *vpp = NULL; 1817 goto out_fallback; 1818 } 1819 return (-1); 1820 negative_success: 1821 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1822 if (cnp->cn_flags & ISLASTCN) { 1823 vfs_smr_exit(); 1824 goto out_fallback; 1825 } 1826 } 1827 1828 cache_out_ts(ncp, tsp, ticksp); 1829 whiteout = (ncp->nc_flag & NCF_WHITE); 1830 neg_promote = cache_neg_hit_prep(ncp); 1831 if (__predict_false(!cache_ncp_canuse(ncp))) { 1832 cache_neg_hit_abort(ncp); 1833 vfs_smr_exit(); 1834 goto out_fallback; 1835 } 1836 if (neg_promote) { 1837 vfs_smr_exit(); 1838 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) 1839 goto out_fallback; 1840 } else { 1841 cache_neg_hit_finish(ncp); 1842 vfs_smr_exit(); 1843 } 1844 if (whiteout) 1845 cnp->cn_flags |= ISWHITEOUT; 1846 return (ENOENT); 1847 out_fallback: 1848 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp)); 1849 } 1850 1851 struct celockstate { 1852 struct mtx *vlp[3]; 1853 struct mtx *blp[2]; 1854 }; 1855 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1856 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1857 1858 static inline void 1859 cache_celockstate_init(struct celockstate *cel) 1860 { 1861 1862 bzero(cel, sizeof(*cel)); 1863 } 1864 1865 static void 1866 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1867 struct vnode *dvp) 1868 { 1869 struct mtx *vlp1, *vlp2; 1870 1871 MPASS(cel->vlp[0] == NULL); 1872 MPASS(cel->vlp[1] == NULL); 1873 MPASS(cel->vlp[2] == NULL); 1874 1875 MPASS(vp != NULL || dvp != NULL); 1876 1877 vlp1 = VP2VNODELOCK(vp); 1878 vlp2 = VP2VNODELOCK(dvp); 1879 cache_sort_vnodes(&vlp1, &vlp2); 1880 1881 if (vlp1 != NULL) { 1882 mtx_lock(vlp1); 1883 cel->vlp[0] = vlp1; 1884 } 1885 mtx_lock(vlp2); 1886 cel->vlp[1] = vlp2; 1887 } 1888 1889 static void 1890 cache_unlock_vnodes_cel(struct celockstate *cel) 1891 { 1892 1893 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1894 1895 if (cel->vlp[0] != NULL) 1896 mtx_unlock(cel->vlp[0]); 1897 if (cel->vlp[1] != NULL) 1898 mtx_unlock(cel->vlp[1]); 1899 if (cel->vlp[2] != NULL) 1900 
mtx_unlock(cel->vlp[2]); 1901 } 1902 1903 static bool 1904 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1905 { 1906 struct mtx *vlp; 1907 bool ret; 1908 1909 cache_assert_vlp_locked(cel->vlp[0]); 1910 cache_assert_vlp_locked(cel->vlp[1]); 1911 MPASS(cel->vlp[2] == NULL); 1912 1913 MPASS(vp != NULL); 1914 vlp = VP2VNODELOCK(vp); 1915 1916 ret = true; 1917 if (vlp >= cel->vlp[1]) { 1918 mtx_lock(vlp); 1919 } else { 1920 if (mtx_trylock(vlp)) 1921 goto out; 1922 cache_lock_vnodes_cel_3_failures++; 1923 cache_unlock_vnodes_cel(cel); 1924 if (vlp < cel->vlp[0]) { 1925 mtx_lock(vlp); 1926 mtx_lock(cel->vlp[0]); 1927 mtx_lock(cel->vlp[1]); 1928 } else { 1929 if (cel->vlp[0] != NULL) 1930 mtx_lock(cel->vlp[0]); 1931 mtx_lock(vlp); 1932 mtx_lock(cel->vlp[1]); 1933 } 1934 ret = false; 1935 } 1936 out: 1937 cel->vlp[2] = vlp; 1938 return (ret); 1939 } 1940 1941 static void 1942 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 1943 struct mtx *blp2) 1944 { 1945 1946 MPASS(cel->blp[0] == NULL); 1947 MPASS(cel->blp[1] == NULL); 1948 1949 cache_sort_vnodes(&blp1, &blp2); 1950 1951 if (blp1 != NULL) { 1952 mtx_lock(blp1); 1953 cel->blp[0] = blp1; 1954 } 1955 mtx_lock(blp2); 1956 cel->blp[1] = blp2; 1957 } 1958 1959 static void 1960 cache_unlock_buckets_cel(struct celockstate *cel) 1961 { 1962 1963 if (cel->blp[0] != NULL) 1964 mtx_unlock(cel->blp[0]); 1965 mtx_unlock(cel->blp[1]); 1966 } 1967 1968 /* 1969 * Lock part of the cache affected by the insertion. 1970 * 1971 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1972 * However, insertion can result in removal of an old entry. In this 1973 * case we have an additional vnode and bucketlock pair to lock. 1974 * 1975 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1976 * preserving the locking order (smaller address first). 1977 */ 1978 static void 1979 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1980 uint32_t hash) 1981 { 1982 struct namecache *ncp; 1983 struct mtx *blps[2]; 1984 1985 blps[0] = HASH2BUCKETLOCK(hash); 1986 for (;;) { 1987 blps[1] = NULL; 1988 cache_lock_vnodes_cel(cel, dvp, vp); 1989 if (vp == NULL || vp->v_type != VDIR) 1990 break; 1991 ncp = vp->v_cache_dd; 1992 if (ncp == NULL) 1993 break; 1994 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1995 break; 1996 MPASS(ncp->nc_dvp == vp); 1997 blps[1] = NCP2BUCKETLOCK(ncp); 1998 if (ncp->nc_flag & NCF_NEGATIVE) 1999 break; 2000 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2001 break; 2002 /* 2003 * All vnodes got re-locked. Re-validate the state and if 2004 * nothing changed we are done. Otherwise restart. 
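 *
 * To illustrate the ordering discipline relied upon here (a sketch, not
 * part of the original code; the names are made up):
 *
 *	// vnode locks are always taken in increasing address order
 *	if (vlp3 >= highest_held_vlp) {
 *		mtx_lock(vlp3);		// order preserved, safe to block
 *	} else if (!mtx_trylock(vlp3)) {
 *		// out of order and contended: drop all held locks,
 *		// reacquire them in sorted order and re-validate below
 *	}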
2005 */ 2006 if (ncp == vp->v_cache_dd && 2007 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2008 blps[1] == NCP2BUCKETLOCK(ncp) && 2009 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2010 break; 2011 cache_unlock_vnodes_cel(cel); 2012 cel->vlp[0] = NULL; 2013 cel->vlp[1] = NULL; 2014 cel->vlp[2] = NULL; 2015 } 2016 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2017 } 2018 2019 static void 2020 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2021 uint32_t hash) 2022 { 2023 struct namecache *ncp; 2024 struct mtx *blps[2]; 2025 2026 blps[0] = HASH2BUCKETLOCK(hash); 2027 for (;;) { 2028 blps[1] = NULL; 2029 cache_lock_vnodes_cel(cel, dvp, vp); 2030 ncp = dvp->v_cache_dd; 2031 if (ncp == NULL) 2032 break; 2033 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2034 break; 2035 MPASS(ncp->nc_dvp == dvp); 2036 blps[1] = NCP2BUCKETLOCK(ncp); 2037 if (ncp->nc_flag & NCF_NEGATIVE) 2038 break; 2039 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2040 break; 2041 if (ncp == dvp->v_cache_dd && 2042 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2043 blps[1] == NCP2BUCKETLOCK(ncp) && 2044 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2045 break; 2046 cache_unlock_vnodes_cel(cel); 2047 cel->vlp[0] = NULL; 2048 cel->vlp[1] = NULL; 2049 cel->vlp[2] = NULL; 2050 } 2051 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2052 } 2053 2054 static void 2055 cache_enter_unlock(struct celockstate *cel) 2056 { 2057 2058 cache_unlock_buckets_cel(cel); 2059 cache_unlock_vnodes_cel(cel); 2060 } 2061 2062 static void __noinline 2063 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 2064 struct componentname *cnp) 2065 { 2066 struct celockstate cel; 2067 struct namecache *ncp; 2068 uint32_t hash; 2069 int len; 2070 2071 if (dvp->v_cache_dd == NULL) 2072 return; 2073 len = cnp->cn_namelen; 2074 cache_celockstate_init(&cel); 2075 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2076 cache_enter_lock_dd(&cel, dvp, vp, hash); 2077 vn_seqc_write_begin(dvp); 2078 ncp = dvp->v_cache_dd; 2079 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 2080 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 2081 cache_zap_locked(ncp); 2082 } else { 2083 ncp = NULL; 2084 } 2085 dvp->v_cache_dd = NULL; 2086 vn_seqc_write_end(dvp); 2087 cache_enter_unlock(&cel); 2088 if (ncp != NULL) 2089 cache_free(ncp); 2090 } 2091 2092 /* 2093 * Add an entry to the cache. 2094 */ 2095 void 2096 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2097 struct timespec *tsp, struct timespec *dtsp) 2098 { 2099 struct celockstate cel; 2100 struct namecache *ncp, *n2, *ndd; 2101 struct namecache_ts *ncp_ts; 2102 struct nchashhead *ncpp; 2103 uint32_t hash; 2104 int flag; 2105 int len; 2106 u_long lnumcache; 2107 2108 VNPASS(dvp != vp, dvp); 2109 VNPASS(!VN_IS_DOOMED(dvp), dvp); 2110 VNPASS(dvp->v_type != VNON, dvp); 2111 if (vp != NULL) { 2112 VNPASS(!VN_IS_DOOMED(vp), vp); 2113 VNPASS(vp->v_type != VNON, vp); 2114 } 2115 2116 #ifdef DEBUG_CACHE 2117 if (__predict_false(!doingcache)) 2118 return; 2119 #endif 2120 2121 flag = 0; 2122 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2123 if (cnp->cn_namelen == 1) 2124 return; 2125 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 2126 cache_enter_dotdot_prep(dvp, vp, cnp); 2127 flag = NCF_ISDOTDOT; 2128 } 2129 } 2130 2131 /* 2132 * Avoid blowout in namecache entries. 2133 * 2134 * Bugs: 2135 * 1. 
filesystems may end up tryng to add an already existing entry 2136 * (for example this can happen after a cache miss during concurrent 2137 * lookup), in which case we will call cache_neg_evict despite not 2138 * adding anything. 2139 * 2. the routine may fail to free anything and no provisions are made 2140 * to make it try harder (see the inside for failure modes) 2141 * 3. it only ever looks at negative entries. 2142 */ 2143 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 2144 if (cache_neg_evict_cond(lnumcache)) { 2145 lnumcache = atomic_load_long(&numcache); 2146 } 2147 if (__predict_false(lnumcache >= ncsize)) { 2148 atomic_subtract_long(&numcache, 1); 2149 counter_u64_add(numdrops, 1); 2150 return; 2151 } 2152 2153 cache_celockstate_init(&cel); 2154 ndd = NULL; 2155 ncp_ts = NULL; 2156 2157 /* 2158 * Calculate the hash key and setup as much of the new 2159 * namecache entry as possible before acquiring the lock. 2160 */ 2161 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 2162 ncp->nc_flag = flag | NCF_WIP; 2163 ncp->nc_vp = vp; 2164 if (vp == NULL) 2165 cache_neg_init(ncp); 2166 ncp->nc_dvp = dvp; 2167 if (tsp != NULL) { 2168 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 2169 ncp_ts->nc_time = *tsp; 2170 ncp_ts->nc_ticks = ticks; 2171 ncp_ts->nc_nc.nc_flag |= NCF_TS; 2172 if (dtsp != NULL) { 2173 ncp_ts->nc_dotdottime = *dtsp; 2174 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 2175 } 2176 } 2177 len = ncp->nc_nlen = cnp->cn_namelen; 2178 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2179 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 2180 ncp->nc_name[len] = '\0'; 2181 cache_enter_lock(&cel, dvp, vp, hash); 2182 2183 /* 2184 * See if this vnode or negative entry is already in the cache 2185 * with this name. This can happen with concurrent lookups of 2186 * the same path name. 2187 */ 2188 ncpp = NCHHASH(hash); 2189 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 2190 if (n2->nc_dvp == dvp && 2191 n2->nc_nlen == cnp->cn_namelen && 2192 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 2193 MPASS(cache_ncp_canuse(n2)); 2194 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 2195 KASSERT(vp == NULL, 2196 ("%s: found entry pointing to a different vnode (%p != %p)", 2197 __func__, NULL, vp)); 2198 else 2199 KASSERT(n2->nc_vp == vp, 2200 ("%s: found entry pointing to a different vnode (%p != %p)", 2201 __func__, n2->nc_vp, vp)); 2202 /* 2203 * Entries are supposed to be immutable unless in the 2204 * process of getting destroyed. Accommodating for 2205 * changing timestamps is possible but not worth it. 2206 * This should be harmless in terms of correctness, in 2207 * the worst case resulting in an earlier expiration. 2208 * Alternatively, the found entry can be replaced 2209 * altogether. 2210 */ 2211 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 2212 #if 0 2213 if (tsp != NULL) { 2214 KASSERT((n2->nc_flag & NCF_TS) != 0, 2215 ("no NCF_TS")); 2216 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 2217 n2_ts->nc_time = ncp_ts->nc_time; 2218 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2219 if (dtsp != NULL) { 2220 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2221 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2222 } 2223 } 2224 #endif 2225 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, 2226 vp); 2227 goto out_unlock_free; 2228 } 2229 } 2230 2231 if (flag == NCF_ISDOTDOT) { 2232 /* 2233 * See if we are trying to add .. entry, but some other lookup 2234 * has populated v_cache_dd pointer already. 
2235 */ 2236 if (dvp->v_cache_dd != NULL) 2237 goto out_unlock_free; 2238 KASSERT(vp == NULL || vp->v_type == VDIR, 2239 ("wrong vnode type %p", vp)); 2240 vn_seqc_write_begin(dvp); 2241 dvp->v_cache_dd = ncp; 2242 vn_seqc_write_end(dvp); 2243 } 2244 2245 if (vp != NULL) { 2246 if (flag != NCF_ISDOTDOT) { 2247 /* 2248 * For this case, the cache entry maps both the 2249 * directory name in it and the name ".." for the 2250 * directory's parent. 2251 */ 2252 vn_seqc_write_begin(vp); 2253 if ((ndd = vp->v_cache_dd) != NULL) { 2254 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2255 cache_zap_locked(ndd); 2256 else 2257 ndd = NULL; 2258 } 2259 vp->v_cache_dd = ncp; 2260 vn_seqc_write_end(vp); 2261 } else if (vp->v_type != VDIR) { 2262 if (vp->v_cache_dd != NULL) { 2263 vn_seqc_write_begin(vp); 2264 vp->v_cache_dd = NULL; 2265 vn_seqc_write_end(vp); 2266 } 2267 } 2268 } 2269 2270 if (flag != NCF_ISDOTDOT) { 2271 if (LIST_EMPTY(&dvp->v_cache_src)) { 2272 vhold(dvp); 2273 counter_u64_add(numcachehv, 1); 2274 } 2275 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2276 } 2277 2278 /* 2279 * If the entry is "negative", we place it into the 2280 * "negative" cache queue, otherwise, we place it into the 2281 * destination vnode's cache entries queue. 2282 */ 2283 if (vp != NULL) { 2284 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2285 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2286 vp); 2287 } else { 2288 if (cnp->cn_flags & ISWHITEOUT) 2289 ncp->nc_flag |= NCF_WHITE; 2290 cache_neg_insert(ncp); 2291 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2292 ncp->nc_name); 2293 } 2294 2295 /* 2296 * Insert the new namecache entry into the appropriate chain 2297 * within the cache entries table. 2298 */ 2299 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2300 2301 atomic_thread_fence_rel(); 2302 /* 2303 * Mark the entry as fully constructed. 2304 * It is immutable past this point until its removal. 
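 *
 * A sketch of the intended pairing with the lockless reader (illustrative,
 * not part of the original code):
 *
 *	writer (here):	ncp->nc_vp = vp; ...
 *			atomic_thread_fence_rel();
 *			nc_flag &= ~NCF_WIP;		// publish
 *	SMR reader:	vp = ncp->nc_vp;
 *			if (!cache_ncp_canuse(ncp))	// re-checks nc_flag with
 *				<fall back>;		// acquire semantics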
2305 */ 2306 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2307 2308 cache_enter_unlock(&cel); 2309 if (ndd != NULL) 2310 cache_free(ndd); 2311 return; 2312 out_unlock_free: 2313 cache_enter_unlock(&cel); 2314 atomic_subtract_long(&numcache, 1); 2315 cache_free(ncp); 2316 return; 2317 } 2318 2319 static u_int 2320 cache_roundup_2(u_int val) 2321 { 2322 u_int res; 2323 2324 for (res = 1; res <= val; res <<= 1) 2325 continue; 2326 2327 return (res); 2328 } 2329 2330 static struct nchashhead * 2331 nchinittbl(u_long elements, u_long *hashmask) 2332 { 2333 struct nchashhead *hashtbl; 2334 u_long hashsize, i; 2335 2336 hashsize = cache_roundup_2(elements) / 2; 2337 2338 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2339 for (i = 0; i < hashsize; i++) 2340 CK_SLIST_INIT(&hashtbl[i]); 2341 *hashmask = hashsize - 1; 2342 return (hashtbl); 2343 } 2344 2345 static void 2346 ncfreetbl(struct nchashhead *hashtbl) 2347 { 2348 2349 free(hashtbl, M_VFSCACHE); 2350 } 2351 2352 /* 2353 * Name cache initialization, from vfs_init() when we are booting 2354 */ 2355 static void 2356 nchinit(void *dummy __unused) 2357 { 2358 u_int i; 2359 2360 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2361 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2362 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2363 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2364 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2365 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2366 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2367 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2368 2369 VFS_SMR_ZONE_SET(cache_zone_small); 2370 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2371 VFS_SMR_ZONE_SET(cache_zone_large); 2372 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2373 2374 ncsize = desiredvnodes * ncsizefactor; 2375 cache_recalc_neg_min(ncnegminpct); 2376 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2377 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2378 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2379 ncbuckethash = 7; 2380 if (ncbuckethash > nchash) 2381 ncbuckethash = nchash; 2382 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2383 M_WAITOK | M_ZERO); 2384 for (i = 0; i < numbucketlocks; i++) 2385 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2386 ncvnodehash = ncbuckethash; 2387 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2388 M_WAITOK | M_ZERO); 2389 for (i = 0; i < numvnodelocks; i++) 2390 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2391 2392 for (i = 0; i < numneglists; i++) { 2393 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); 2394 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2395 TAILQ_INIT(&neglists[i].nl_list); 2396 TAILQ_INIT(&neglists[i].nl_hotlist); 2397 } 2398 } 2399 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2400 2401 void 2402 cache_vnode_init(struct vnode *vp) 2403 { 2404 2405 LIST_INIT(&vp->v_cache_src); 2406 TAILQ_INIT(&vp->v_cache_dst); 2407 vp->v_cache_dd = NULL; 2408 cache_prehash(vp); 2409 } 2410 2411 void 2412 cache_changesize(u_long newmaxvnodes) 2413 { 2414 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2415 u_long new_nchash, old_nchash; 2416 struct namecache *ncp; 2417 uint32_t hash; 2418 u_long newncsize; 2419 int i; 2420 2421 newncsize = 
newmaxvnodes * ncsizefactor; 2422 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2423 if (newmaxvnodes < numbucketlocks) 2424 newmaxvnodes = numbucketlocks; 2425 2426 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2427 /* If same hash table size, nothing to do */ 2428 if (nchash == new_nchash) { 2429 ncfreetbl(new_nchashtbl); 2430 return; 2431 } 2432 /* 2433 * Move everything from the old hash table to the new table. 2434 * None of the namecache entries in the table can be removed 2435 * because to do so, they have to be removed from the hash table. 2436 */ 2437 cache_lock_all_vnodes(); 2438 cache_lock_all_buckets(); 2439 old_nchashtbl = nchashtbl; 2440 old_nchash = nchash; 2441 nchashtbl = new_nchashtbl; 2442 nchash = new_nchash; 2443 for (i = 0; i <= old_nchash; i++) { 2444 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2445 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2446 ncp->nc_dvp); 2447 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2448 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2449 } 2450 } 2451 ncsize = newncsize; 2452 cache_recalc_neg_min(ncnegminpct); 2453 cache_unlock_all_buckets(); 2454 cache_unlock_all_vnodes(); 2455 ncfreetbl(old_nchashtbl); 2456 } 2457 2458 /* 2459 * Invalidate all entries from and to a particular vnode. 2460 */ 2461 static void 2462 cache_purge_impl(struct vnode *vp) 2463 { 2464 TAILQ_HEAD(, namecache) ncps; 2465 struct namecache *ncp, *nnp; 2466 struct mtx *vlp, *vlp2; 2467 2468 TAILQ_INIT(&ncps); 2469 vlp = VP2VNODELOCK(vp); 2470 vlp2 = NULL; 2471 mtx_lock(vlp); 2472 retry: 2473 while (!LIST_EMPTY(&vp->v_cache_src)) { 2474 ncp = LIST_FIRST(&vp->v_cache_src); 2475 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2476 goto retry; 2477 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2478 } 2479 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2480 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2481 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2482 goto retry; 2483 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2484 } 2485 ncp = vp->v_cache_dd; 2486 if (ncp != NULL) { 2487 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2488 ("lost dotdot link")); 2489 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2490 goto retry; 2491 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2492 } 2493 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2494 mtx_unlock(vlp); 2495 if (vlp2 != NULL) 2496 mtx_unlock(vlp2); 2497 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2498 cache_free(ncp); 2499 } 2500 } 2501 2502 /* 2503 * Opportunistic check to see if there is anything to do. 2504 */ 2505 static bool 2506 cache_has_entries(struct vnode *vp) 2507 { 2508 2509 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2510 vp->v_cache_dd == NULL) 2511 return (false); 2512 return (true); 2513 } 2514 2515 void 2516 cache_purge(struct vnode *vp) 2517 { 2518 2519 SDT_PROBE1(vfs, namecache, purge, done, vp); 2520 if (!cache_has_entries(vp)) 2521 return; 2522 cache_purge_impl(vp); 2523 } 2524 2525 /* 2526 * Only to be used by vgone. 2527 */ 2528 void 2529 cache_purge_vgone(struct vnode *vp) 2530 { 2531 struct mtx *vlp; 2532 2533 VNPASS(VN_IS_DOOMED(vp), vp); 2534 if (cache_has_entries(vp)) { 2535 cache_purge_impl(vp); 2536 return; 2537 } 2538 2539 /* 2540 * Serialize against a potential thread doing cache_purge. 2541 */ 2542 vlp = VP2VNODELOCK(vp); 2543 mtx_wait_unlocked(vlp); 2544 if (cache_has_entries(vp)) { 2545 cache_purge_impl(vp); 2546 return; 2547 } 2548 return; 2549 } 2550 2551 /* 2552 * Invalidate all negative entries for a particular directory vnode. 
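 *
 * An illustrative (hypothetical) caller: once the directory gains a name
 * which earlier lookups cached as non-existent, e.g. as the target of a
 * rename, the stale negative entries must not keep producing ENOENT:
 *
 *	cache_purge_negative(tdvp);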
2553 */ 2554 void 2555 cache_purge_negative(struct vnode *vp) 2556 { 2557 TAILQ_HEAD(, namecache) ncps; 2558 struct namecache *ncp, *nnp; 2559 struct mtx *vlp; 2560 2561 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2562 if (LIST_EMPTY(&vp->v_cache_src)) 2563 return; 2564 TAILQ_INIT(&ncps); 2565 vlp = VP2VNODELOCK(vp); 2566 mtx_lock(vlp); 2567 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2568 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2569 continue; 2570 cache_zap_negative_locked_vnode_kl(ncp, vp); 2571 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2572 } 2573 mtx_unlock(vlp); 2574 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2575 cache_free(ncp); 2576 } 2577 } 2578 2579 void 2580 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2581 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2582 { 2583 2584 ASSERT_VOP_IN_SEQC(fdvp); 2585 ASSERT_VOP_IN_SEQC(fvp); 2586 ASSERT_VOP_IN_SEQC(tdvp); 2587 if (tvp != NULL) 2588 ASSERT_VOP_IN_SEQC(tvp); 2589 2590 cache_purge(fvp); 2591 if (tvp != NULL) { 2592 cache_purge(tvp); 2593 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2594 ("%s: lingering negative entry", __func__)); 2595 } else { 2596 cache_remove_cnp(tdvp, tcnp); 2597 } 2598 } 2599 2600 #ifdef INVARIANTS 2601 /* 2602 * Validate that if an entry exists it matches. 2603 */ 2604 void 2605 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) 2606 { 2607 struct namecache *ncp; 2608 struct mtx *blp; 2609 uint32_t hash; 2610 2611 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2612 if (CK_SLIST_EMPTY(NCHHASH(hash))) 2613 return; 2614 blp = HASH2BUCKETLOCK(hash); 2615 mtx_lock(blp); 2616 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 2617 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 2618 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) { 2619 if (ncp->nc_vp != vp) 2620 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n", 2621 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp, 2622 ncp->nc_vp); 2623 } 2624 } 2625 mtx_unlock(blp); 2626 } 2627 #endif 2628 2629 /* 2630 * Flush all entries referencing a particular filesystem. 2631 */ 2632 void 2633 cache_purgevfs(struct mount *mp) 2634 { 2635 struct vnode *vp, *mvp; 2636 2637 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2638 /* 2639 * Somewhat wasteful iteration over all vnodes. Would be better to 2640 * support filtering and avoid the interlock to begin with. 2641 */ 2642 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2643 if (!cache_has_entries(vp)) { 2644 VI_UNLOCK(vp); 2645 continue; 2646 } 2647 vholdl(vp); 2648 VI_UNLOCK(vp); 2649 cache_purge(vp); 2650 vdrop(vp); 2651 } 2652 } 2653 2654 /* 2655 * Perform canonical checks and cache lookup and pass on to filesystem 2656 * through the vop_cachedlookup only if needed. 
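 *
 * A filesystem opts in by pointing its lookup slot here and doing the real
 * work in the cachedlookup slot, e.g. (a sketch mirroring what UFS does;
 * "foofs" is a made-up name):
 *
 *	struct vop_vector foofs_vnodeops = {
 *		...
 *		.vop_lookup =		vfs_cache_lookup,
 *		.vop_cachedlookup =	foofs_lookup,
 *		...
 *	};
 *
 * so that VOP_LOOKUP() consults the cache first and VOP_CACHEDLOOKUP() only
 * runs on a miss.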
2657 */ 2658 2659 int 2660 vfs_cache_lookup(struct vop_lookup_args *ap) 2661 { 2662 struct vnode *dvp; 2663 int error; 2664 struct vnode **vpp = ap->a_vpp; 2665 struct componentname *cnp = ap->a_cnp; 2666 int flags = cnp->cn_flags; 2667 2668 *vpp = NULL; 2669 dvp = ap->a_dvp; 2670 2671 if (dvp->v_type != VDIR) 2672 return (ENOTDIR); 2673 2674 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2675 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2676 return (EROFS); 2677 2678 error = vn_dir_check_exec(dvp, cnp); 2679 if (error != 0) 2680 return (error); 2681 2682 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2683 if (error == 0) 2684 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2685 if (error == -1) 2686 return (0); 2687 return (error); 2688 } 2689 2690 /* Implementation of the getcwd syscall. */ 2691 int 2692 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2693 { 2694 char *buf, *retbuf; 2695 size_t buflen; 2696 int error; 2697 2698 buflen = uap->buflen; 2699 if (__predict_false(buflen < 2)) 2700 return (EINVAL); 2701 if (buflen > MAXPATHLEN) 2702 buflen = MAXPATHLEN; 2703 2704 buf = uma_zalloc(namei_zone, M_WAITOK); 2705 error = vn_getcwd(buf, &retbuf, &buflen); 2706 if (error == 0) 2707 error = copyout(retbuf, uap->buf, buflen); 2708 uma_zfree(namei_zone, buf); 2709 return (error); 2710 } 2711 2712 int 2713 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2714 { 2715 struct pwd *pwd; 2716 int error; 2717 2718 vfs_smr_enter(); 2719 pwd = pwd_get_smr(); 2720 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2721 buflen, 0); 2722 VFS_SMR_ASSERT_NOT_ENTERED(); 2723 if (error < 0) { 2724 pwd = pwd_hold(curthread); 2725 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2726 retbuf, buflen); 2727 pwd_drop(pwd); 2728 } 2729 2730 #ifdef KTRACE 2731 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2732 ktrnamei(*retbuf); 2733 #endif 2734 return (error); 2735 } 2736 2737 static int 2738 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2739 size_t size, int flags, enum uio_seg pathseg) 2740 { 2741 struct nameidata nd; 2742 char *retbuf, *freebuf; 2743 int error; 2744 2745 if (flags != 0) 2746 return (EINVAL); 2747 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2748 pathseg, path, fd, &cap_fstat_rights, td); 2749 if ((error = namei(&nd)) != 0) 2750 return (error); 2751 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2752 if (error == 0) { 2753 error = copyout(retbuf, buf, size); 2754 free(freebuf, M_TEMP); 2755 } 2756 NDFREE(&nd, 0); 2757 return (error); 2758 } 2759 2760 int 2761 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2762 { 2763 2764 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2765 uap->flags, UIO_USERSPACE)); 2766 } 2767 2768 /* 2769 * Retrieve the full filesystem path that correspond to a vnode from the name 2770 * cache (if available) 2771 */ 2772 int 2773 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2774 { 2775 struct pwd *pwd; 2776 char *buf; 2777 size_t buflen; 2778 int error; 2779 2780 if (__predict_false(vp == NULL)) 2781 return (EINVAL); 2782 2783 buflen = MAXPATHLEN; 2784 buf = malloc(buflen, M_TEMP, M_WAITOK); 2785 vfs_smr_enter(); 2786 pwd = pwd_get_smr(); 2787 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); 2788 VFS_SMR_ASSERT_NOT_ENTERED(); 2789 if (error < 0) { 2790 pwd = pwd_hold(curthread); 2791 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, 
&buflen); 2792 pwd_drop(pwd); 2793 } 2794 if (error == 0) 2795 *freebuf = buf; 2796 else 2797 free(buf, M_TEMP); 2798 return (error); 2799 } 2800 2801 /* 2802 * This function is similar to vn_fullpath, but it attempts to lookup the 2803 * pathname relative to the global root mount point. This is required for the 2804 * auditing sub-system, as audited pathnames must be absolute, relative to the 2805 * global root mount point. 2806 */ 2807 int 2808 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2809 { 2810 char *buf; 2811 size_t buflen; 2812 int error; 2813 2814 if (__predict_false(vp == NULL)) 2815 return (EINVAL); 2816 buflen = MAXPATHLEN; 2817 buf = malloc(buflen, M_TEMP, M_WAITOK); 2818 vfs_smr_enter(); 2819 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); 2820 VFS_SMR_ASSERT_NOT_ENTERED(); 2821 if (error < 0) { 2822 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2823 } 2824 if (error == 0) 2825 *freebuf = buf; 2826 else 2827 free(buf, M_TEMP); 2828 return (error); 2829 } 2830 2831 static struct namecache * 2832 vn_dd_from_dst(struct vnode *vp) 2833 { 2834 struct namecache *ncp; 2835 2836 cache_assert_vnode_locked(vp); 2837 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2838 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2839 return (ncp); 2840 } 2841 return (NULL); 2842 } 2843 2844 int 2845 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) 2846 { 2847 struct vnode *dvp; 2848 struct namecache *ncp; 2849 struct mtx *vlp; 2850 int error; 2851 2852 vlp = VP2VNODELOCK(*vp); 2853 mtx_lock(vlp); 2854 ncp = (*vp)->v_cache_dd; 2855 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 2856 KASSERT(ncp == vn_dd_from_dst(*vp), 2857 ("%s: mismatch for dd entry (%p != %p)", __func__, 2858 ncp, vn_dd_from_dst(*vp))); 2859 } else { 2860 ncp = vn_dd_from_dst(*vp); 2861 } 2862 if (ncp != NULL) { 2863 if (*buflen < ncp->nc_nlen) { 2864 mtx_unlock(vlp); 2865 vrele(*vp); 2866 counter_u64_add(numfullpathfail4, 1); 2867 error = ENOMEM; 2868 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2869 vp, NULL); 2870 return (error); 2871 } 2872 *buflen -= ncp->nc_nlen; 2873 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2874 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2875 ncp->nc_name, vp); 2876 dvp = *vp; 2877 *vp = ncp->nc_dvp; 2878 vref(*vp); 2879 mtx_unlock(vlp); 2880 vrele(dvp); 2881 return (0); 2882 } 2883 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2884 2885 mtx_unlock(vlp); 2886 vn_lock(*vp, LK_SHARED | LK_RETRY); 2887 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); 2888 vput(*vp); 2889 if (error) { 2890 counter_u64_add(numfullpathfail2, 1); 2891 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2892 return (error); 2893 } 2894 2895 *vp = dvp; 2896 if (VN_IS_DOOMED(dvp)) { 2897 /* forced unmount */ 2898 vrele(dvp); 2899 error = ENOENT; 2900 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2901 return (error); 2902 } 2903 /* 2904 * *vp has its use count incremented still. 2905 */ 2906 2907 return (0); 2908 } 2909 2910 /* 2911 * Resolve a directory to a pathname. 2912 * 2913 * The name of the directory can always be found in the namecache or fetched 2914 * from the filesystem. There is also guaranteed to be only one parent, meaning 2915 * we can just follow vnodes up until we find the root. 2916 * 2917 * The vnode must be referenced. 
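 *
 * The buffer is filled backwards: each name found is copied in front of what
 * has been assembled so far and prefixed with '/'. For a directory reachable
 * as /usr/local the assembly proceeds roughly as (illustrative):
 *
 *	"local" -> "/local" -> "usr/local" -> "/usr/local"
 *
 * with *retbuf left pointing at the resulting leading '/'.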
2918 */ 2919 static int 2920 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2921 size_t *len, size_t addend) 2922 { 2923 #ifdef KDTRACE_HOOKS 2924 struct vnode *startvp = vp; 2925 #endif 2926 struct vnode *vp1; 2927 size_t buflen; 2928 int error; 2929 bool slash_prefixed; 2930 2931 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2932 VNPASS(vp->v_usecount > 0, vp); 2933 2934 buflen = *len; 2935 2936 slash_prefixed = true; 2937 if (addend == 0) { 2938 MPASS(*len >= 2); 2939 buflen--; 2940 buf[buflen] = '\0'; 2941 slash_prefixed = false; 2942 } 2943 2944 error = 0; 2945 2946 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2947 counter_u64_add(numfullpathcalls, 1); 2948 while (vp != rdir && vp != rootvnode) { 2949 /* 2950 * The vp vnode must be already fully constructed, 2951 * since it is either found in namecache or obtained 2952 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2953 * without obtaining the vnode lock. 2954 */ 2955 if ((vp->v_vflag & VV_ROOT) != 0) { 2956 vn_lock(vp, LK_RETRY | LK_SHARED); 2957 2958 /* 2959 * With the vnode locked, check for races with 2960 * unmount, forced or not. Note that we 2961 * already verified that vp is not equal to 2962 * the root vnode, which means that 2963 * mnt_vnodecovered can be NULL only for the 2964 * case of unmount. 2965 */ 2966 if (VN_IS_DOOMED(vp) || 2967 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2968 vp1->v_mountedhere != vp->v_mount) { 2969 vput(vp); 2970 error = ENOENT; 2971 SDT_PROBE3(vfs, namecache, fullpath, return, 2972 error, vp, NULL); 2973 break; 2974 } 2975 2976 vref(vp1); 2977 vput(vp); 2978 vp = vp1; 2979 continue; 2980 } 2981 if (vp->v_type != VDIR) { 2982 vrele(vp); 2983 counter_u64_add(numfullpathfail1, 1); 2984 error = ENOTDIR; 2985 SDT_PROBE3(vfs, namecache, fullpath, return, 2986 error, vp, NULL); 2987 break; 2988 } 2989 error = vn_vptocnp(&vp, buf, &buflen); 2990 if (error) 2991 break; 2992 if (buflen == 0) { 2993 vrele(vp); 2994 error = ENOMEM; 2995 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2996 startvp, NULL); 2997 break; 2998 } 2999 buf[--buflen] = '/'; 3000 slash_prefixed = true; 3001 } 3002 if (error) 3003 return (error); 3004 if (!slash_prefixed) { 3005 if (buflen == 0) { 3006 vrele(vp); 3007 counter_u64_add(numfullpathfail4, 1); 3008 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 3009 startvp, NULL); 3010 return (ENOMEM); 3011 } 3012 buf[--buflen] = '/'; 3013 } 3014 counter_u64_add(numfullpathfound, 1); 3015 vrele(vp); 3016 3017 *retbuf = buf + buflen; 3018 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 3019 *len -= buflen; 3020 *len += addend; 3021 return (0); 3022 } 3023 3024 /* 3025 * Resolve an arbitrary vnode to a pathname. 
3026 * 3027 * Note 2 caveats: 3028 * - hardlinks are not tracked, thus if the vnode is not a directory this can 3029 * resolve to a different path than the one used to find it 3030 * - namecache is not mandatory, meaning names are not guaranteed to be added 3031 * (in which case resolving fails) 3032 */ 3033 static void __inline 3034 cache_rev_failed_impl(int *reason, int line) 3035 { 3036 3037 *reason = line; 3038 } 3039 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 3040 3041 static int 3042 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 3043 char **retbuf, size_t *buflen, size_t addend) 3044 { 3045 #ifdef KDTRACE_HOOKS 3046 struct vnode *startvp = vp; 3047 #endif 3048 struct vnode *tvp; 3049 struct mount *mp; 3050 struct namecache *ncp; 3051 size_t orig_buflen; 3052 int reason; 3053 int error; 3054 #ifdef KDTRACE_HOOKS 3055 int i; 3056 #endif 3057 seqc_t vp_seqc, tvp_seqc; 3058 u_char nc_flag; 3059 3060 VFS_SMR_ASSERT_ENTERED(); 3061 3062 if (!cache_fast_revlookup) { 3063 vfs_smr_exit(); 3064 return (-1); 3065 } 3066 3067 orig_buflen = *buflen; 3068 3069 if (addend == 0) { 3070 MPASS(*buflen >= 2); 3071 *buflen -= 1; 3072 buf[*buflen] = '\0'; 3073 } 3074 3075 if (vp == rdir || vp == rootvnode) { 3076 if (addend == 0) { 3077 *buflen -= 1; 3078 buf[*buflen] = '/'; 3079 } 3080 goto out_ok; 3081 } 3082 3083 #ifdef KDTRACE_HOOKS 3084 i = 0; 3085 #endif 3086 error = -1; 3087 ncp = NULL; /* for sdt probe down below */ 3088 vp_seqc = vn_seqc_read_any(vp); 3089 if (seqc_in_modify(vp_seqc)) { 3090 cache_rev_failed(&reason); 3091 goto out_abort; 3092 } 3093 3094 for (;;) { 3095 #ifdef KDTRACE_HOOKS 3096 i++; 3097 #endif 3098 if ((vp->v_vflag & VV_ROOT) != 0) { 3099 mp = atomic_load_ptr(&vp->v_mount); 3100 if (mp == NULL) { 3101 cache_rev_failed(&reason); 3102 goto out_abort; 3103 } 3104 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 3105 tvp_seqc = vn_seqc_read_any(tvp); 3106 if (seqc_in_modify(tvp_seqc)) { 3107 cache_rev_failed(&reason); 3108 goto out_abort; 3109 } 3110 if (!vn_seqc_consistent(vp, vp_seqc)) { 3111 cache_rev_failed(&reason); 3112 goto out_abort; 3113 } 3114 vp = tvp; 3115 vp_seqc = tvp_seqc; 3116 continue; 3117 } 3118 ncp = atomic_load_ptr(&vp->v_cache_dd); 3119 if (ncp == NULL) { 3120 cache_rev_failed(&reason); 3121 goto out_abort; 3122 } 3123 nc_flag = atomic_load_char(&ncp->nc_flag); 3124 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3125 cache_rev_failed(&reason); 3126 goto out_abort; 3127 } 3128 if (!cache_ncp_canuse(ncp)) { 3129 cache_rev_failed(&reason); 3130 goto out_abort; 3131 } 3132 if (ncp->nc_nlen >= *buflen) { 3133 cache_rev_failed(&reason); 3134 error = ENOMEM; 3135 goto out_abort; 3136 } 3137 *buflen -= ncp->nc_nlen; 3138 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3139 *buflen -= 1; 3140 buf[*buflen] = '/'; 3141 tvp = ncp->nc_dvp; 3142 tvp_seqc = vn_seqc_read_any(tvp); 3143 if (seqc_in_modify(tvp_seqc)) { 3144 cache_rev_failed(&reason); 3145 goto out_abort; 3146 } 3147 if (!vn_seqc_consistent(vp, vp_seqc)) { 3148 cache_rev_failed(&reason); 3149 goto out_abort; 3150 } 3151 vp = tvp; 3152 vp_seqc = tvp_seqc; 3153 if (vp == rdir || vp == rootvnode) 3154 break; 3155 } 3156 out_ok: 3157 vfs_smr_exit(); 3158 *retbuf = buf + *buflen; 3159 *buflen = orig_buflen - *buflen + addend; 3160 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 3161 return (0); 3162 3163 out_abort: 3164 *buflen = orig_buflen; 3165 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 3166 vfs_smr_exit(); 3167 return (error); 
}

static int
vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
    size_t *buflen)
{
	size_t orig_buflen, addend;
	int error;

	if (*buflen < 2)
		return (EINVAL);

	orig_buflen = *buflen;

	vref(vp);
	addend = 0;
	if (vp->v_type != VDIR) {
		*buflen -= 1;
		buf[*buflen] = '\0';
		error = vn_vptocnp(&vp, buf, buflen);
		if (error)
			return (error);
		if (*buflen == 0) {
			vrele(vp);
			return (ENOMEM);
		}
		*buflen -= 1;
		buf[*buflen] = '/';
		addend = orig_buflen - *buflen;
	}

	return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
}

/*
 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
 *
 * Since the namecache does not track hardlinks, the caller is expected to
 * first look up the target vnode with SAVENAME | WANTPARENT flags passed to
 * namei.
 *
 * Then we have 2 cases:
 * - if the found vnode is a directory, the path can be constructed just by
 *   following names up the chain
 * - otherwise we populate the buffer with the saved name and start resolving
 *   from the parent
 */
static int
vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
    size_t *buflen)
{
	char *buf, *tmpbuf;
	struct pwd *pwd;
	struct componentname *cnp;
	struct vnode *vp;
	size_t addend;
	int error;
	enum vtype type;

	if (*buflen < 2)
		return (EINVAL);
	if (*buflen > MAXPATHLEN)
		*buflen = MAXPATHLEN;

	buf = malloc(*buflen, M_TEMP, M_WAITOK);

	addend = 0;
	vp = ndp->ni_vp;
	/*
	 * Check for VBAD to work around the vp_crossmp bug in lookup().
	 *
	 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
	 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
	 * If the type is VDIR (like in this very case) we can skip looking
	 * at ni_dvp in the first place. However, since vnodes get passed here
	 * unlocked the target may transition to doomed state (type == VBAD)
	 * before we get to evaluate the condition. If this happens, we will
	 * populate part of the buffer and descend to vn_fullpath_dir with
	 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
	 *
	 * This should be atomic_load(&vp->v_type) but it is illegal to take
	 * an address of a bit field, even if said field is sized to char.
	 * Work around the problem by reading the value into a full-sized enum
	 * and then re-reading it with atomic_load which will still prevent
	 * the compiler from re-reading down the road.
3252 */ 3253 type = vp->v_type; 3254 type = atomic_load_int(&type); 3255 if (type == VBAD) { 3256 error = ENOENT; 3257 goto out_bad; 3258 } 3259 if (type != VDIR) { 3260 cnp = &ndp->ni_cnd; 3261 addend = cnp->cn_namelen + 2; 3262 if (*buflen < addend) { 3263 error = ENOMEM; 3264 goto out_bad; 3265 } 3266 *buflen -= addend; 3267 tmpbuf = buf + *buflen; 3268 tmpbuf[0] = '/'; 3269 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 3270 tmpbuf[addend - 1] = '\0'; 3271 vp = ndp->ni_dvp; 3272 } 3273 3274 vfs_smr_enter(); 3275 pwd = pwd_get_smr(); 3276 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3277 addend); 3278 VFS_SMR_ASSERT_NOT_ENTERED(); 3279 if (error < 0) { 3280 pwd = pwd_hold(curthread); 3281 vref(vp); 3282 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3283 addend); 3284 pwd_drop(pwd); 3285 if (error != 0) 3286 goto out_bad; 3287 } 3288 3289 *freebuf = buf; 3290 3291 return (0); 3292 out_bad: 3293 free(buf, M_TEMP); 3294 return (error); 3295 } 3296 3297 struct vnode * 3298 vn_dir_dd_ino(struct vnode *vp) 3299 { 3300 struct namecache *ncp; 3301 struct vnode *ddvp; 3302 struct mtx *vlp; 3303 enum vgetstate vs; 3304 3305 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3306 vlp = VP2VNODELOCK(vp); 3307 mtx_lock(vlp); 3308 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3309 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3310 continue; 3311 ddvp = ncp->nc_dvp; 3312 vs = vget_prep(ddvp); 3313 mtx_unlock(vlp); 3314 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3315 return (NULL); 3316 return (ddvp); 3317 } 3318 mtx_unlock(vlp); 3319 return (NULL); 3320 } 3321 3322 int 3323 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3324 { 3325 struct namecache *ncp; 3326 struct mtx *vlp; 3327 int l; 3328 3329 vlp = VP2VNODELOCK(vp); 3330 mtx_lock(vlp); 3331 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3332 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3333 break; 3334 if (ncp == NULL) { 3335 mtx_unlock(vlp); 3336 return (ENOENT); 3337 } 3338 l = min(ncp->nc_nlen, buflen - 1); 3339 memcpy(buf, ncp->nc_name, l); 3340 mtx_unlock(vlp); 3341 buf[l] = '\0'; 3342 return (0); 3343 } 3344 3345 /* 3346 * This function updates path string to vnode's full global path 3347 * and checks the size of the new path string against the pathlen argument. 3348 * 3349 * Requires a locked, referenced vnode. 3350 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3351 * 3352 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3353 * because it falls back to the ".." lookup if the namecache lookup fails. 3354 */ 3355 int 3356 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3357 u_int pathlen) 3358 { 3359 struct nameidata nd; 3360 struct vnode *vp1; 3361 char *rpath, *fbuf; 3362 int error; 3363 3364 ASSERT_VOP_ELOCKED(vp, __func__); 3365 3366 /* Construct global filesystem path from vp. */ 3367 VOP_UNLOCK(vp); 3368 error = vn_fullpath_global(vp, &rpath, &fbuf); 3369 3370 if (error != 0) { 3371 vrele(vp); 3372 return (error); 3373 } 3374 3375 if (strlen(rpath) >= pathlen) { 3376 vrele(vp); 3377 error = ENAMETOOLONG; 3378 goto out; 3379 } 3380 3381 /* 3382 * Re-lookup the vnode by path to detect a possible rename. 3383 * As a side effect, the vnode is relocked. 3384 * If vnode was renamed, return ENOENT. 
3385 */ 3386 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3387 UIO_SYSSPACE, path, td); 3388 error = namei(&nd); 3389 if (error != 0) { 3390 vrele(vp); 3391 goto out; 3392 } 3393 NDFREE(&nd, NDF_ONLY_PNBUF); 3394 vp1 = nd.ni_vp; 3395 vrele(vp); 3396 if (vp1 == vp) 3397 strcpy(path, rpath); 3398 else { 3399 vput(vp1); 3400 error = ENOENT; 3401 } 3402 3403 out: 3404 free(fbuf, M_TEMP); 3405 return (error); 3406 } 3407 3408 #ifdef DDB 3409 static void 3410 db_print_vpath(struct vnode *vp) 3411 { 3412 3413 while (vp != NULL) { 3414 db_printf("%p: ", vp); 3415 if (vp == rootvnode) { 3416 db_printf("/"); 3417 vp = NULL; 3418 } else { 3419 if (vp->v_vflag & VV_ROOT) { 3420 db_printf("<mount point>"); 3421 vp = vp->v_mount->mnt_vnodecovered; 3422 } else { 3423 struct namecache *ncp; 3424 char *ncn; 3425 int i; 3426 3427 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3428 if (ncp != NULL) { 3429 ncn = ncp->nc_name; 3430 for (i = 0; i < ncp->nc_nlen; i++) 3431 db_printf("%c", *ncn++); 3432 vp = ncp->nc_dvp; 3433 } else { 3434 vp = NULL; 3435 } 3436 } 3437 } 3438 db_printf("\n"); 3439 } 3440 3441 return; 3442 } 3443 3444 DB_SHOW_COMMAND(vpath, db_show_vpath) 3445 { 3446 struct vnode *vp; 3447 3448 if (!have_addr) { 3449 db_printf("usage: show vpath <struct vnode *>\n"); 3450 return; 3451 } 3452 3453 vp = (struct vnode *)addr; 3454 db_print_vpath(vp); 3455 } 3456 3457 #endif 3458 3459 static bool __read_frequently cache_fast_lookup = true; 3460 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3461 &cache_fast_lookup, 0, ""); 3462 3463 #define CACHE_FPL_FAILED -2020 3464 3465 static void 3466 cache_fpl_cleanup_cnp(struct componentname *cnp) 3467 { 3468 3469 uma_zfree(namei_zone, cnp->cn_pnbuf); 3470 #ifdef DIAGNOSTIC 3471 cnp->cn_pnbuf = NULL; 3472 cnp->cn_nameptr = NULL; 3473 #endif 3474 } 3475 3476 static void 3477 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3478 { 3479 struct componentname *cnp; 3480 3481 cnp = &ndp->ni_cnd; 3482 while (*(cnp->cn_nameptr) == '/') { 3483 cnp->cn_nameptr++; 3484 ndp->ni_pathlen--; 3485 } 3486 3487 *dpp = ndp->ni_rootdir; 3488 } 3489 3490 /* 3491 * Components of nameidata (or objects it can point to) which may 3492 * need restoring in case fast path lookup fails. 
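 *
 * The intended use is a plain checkpoint/restore pair (sketch):
 *
 *	cache_fpl_checkpoint(fpl, &fpl->snd);	// before the component is consumed
 *	...
 *	// on a partial result the saved state is handed back so the
 *	// regular lookup sees pristine nameidata:
 *	cache_fpl_restore(fpl, &fpl->snd);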
3493 */ 3494 struct nameidata_saved { 3495 long cn_namelen; 3496 char *cn_nameptr; 3497 size_t ni_pathlen; 3498 int cn_flags; 3499 }; 3500 3501 struct cache_fpl { 3502 struct nameidata *ndp; 3503 struct componentname *cnp; 3504 struct pwd *pwd; 3505 struct vnode *dvp; 3506 struct vnode *tvp; 3507 seqc_t dvp_seqc; 3508 seqc_t tvp_seqc; 3509 struct nameidata_saved snd; 3510 int line; 3511 enum cache_fpl_status status:8; 3512 bool in_smr; 3513 bool fsearch; 3514 }; 3515 3516 static void 3517 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3518 { 3519 3520 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3521 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3522 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3523 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3524 } 3525 3526 static void 3527 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3528 { 3529 3530 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3531 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3532 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3533 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3534 } 3535 3536 #ifdef INVARIANTS 3537 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3538 struct cache_fpl *_fpl = (fpl); \ 3539 MPASS(_fpl->in_smr == true); \ 3540 VFS_SMR_ASSERT_ENTERED(); \ 3541 }) 3542 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3543 struct cache_fpl *_fpl = (fpl); \ 3544 MPASS(_fpl->in_smr == false); \ 3545 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3546 }) 3547 #else 3548 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3549 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3550 #endif 3551 3552 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3553 struct cache_fpl *_fpl = (fpl); \ 3554 vfs_smr_enter(); \ 3555 _fpl->in_smr = true; \ 3556 }) 3557 3558 #define cache_fpl_smr_enter(fpl) ({ \ 3559 struct cache_fpl *_fpl = (fpl); \ 3560 MPASS(_fpl->in_smr == false); \ 3561 vfs_smr_enter(); \ 3562 _fpl->in_smr = true; \ 3563 }) 3564 3565 #define cache_fpl_smr_exit(fpl) ({ \ 3566 struct cache_fpl *_fpl = (fpl); \ 3567 MPASS(_fpl->in_smr == true); \ 3568 vfs_smr_exit(); \ 3569 _fpl->in_smr = false; \ 3570 }) 3571 3572 static int 3573 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3574 { 3575 3576 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3577 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3578 ("%s: converting to abort from %d at %d, set at %d\n", 3579 __func__, fpl->status, line, fpl->line)); 3580 } 3581 fpl->status = CACHE_FPL_STATUS_ABORTED; 3582 fpl->line = line; 3583 return (CACHE_FPL_FAILED); 3584 } 3585 3586 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3587 3588 static int 3589 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3590 { 3591 3592 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3593 ("%s: setting to partial at %d, but already set to %d at %d\n", 3594 __func__, line, fpl->status, fpl->line)); 3595 cache_fpl_smr_assert_entered(fpl); 3596 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3597 fpl->line = line; 3598 return (CACHE_FPL_FAILED); 3599 } 3600 3601 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3602 3603 static int 3604 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3605 { 3606 3607 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3608 ("%s: setting to handled at %d, but already set to %d at %d\n", 3609 __func__, line, fpl->status, fpl->line)); 3610 cache_fpl_smr_assert_not_entered(fpl); 3611 MPASS(error != CACHE_FPL_FAILED); 3612 fpl->status = CACHE_FPL_STATUS_HANDLED; 3613 fpl->line = line; 
3614 return (error); 3615 } 3616 3617 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3618 3619 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3620 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ 3621 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3622 3623 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3624 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3625 3626 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3627 "supported and internal flags overlap"); 3628 3629 static bool 3630 cache_fpl_islastcn(struct nameidata *ndp) 3631 { 3632 3633 return (*ndp->ni_next == 0); 3634 } 3635 3636 static bool 3637 cache_fpl_isdotdot(struct componentname *cnp) 3638 { 3639 3640 if (cnp->cn_namelen == 2 && 3641 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3642 return (true); 3643 return (false); 3644 } 3645 3646 static bool 3647 cache_can_fplookup(struct cache_fpl *fpl) 3648 { 3649 struct nameidata *ndp; 3650 struct componentname *cnp; 3651 struct thread *td; 3652 3653 ndp = fpl->ndp; 3654 cnp = fpl->cnp; 3655 td = cnp->cn_thread; 3656 3657 if (!cache_fast_lookup) { 3658 cache_fpl_aborted(fpl); 3659 return (false); 3660 } 3661 #ifdef MAC 3662 if (mac_vnode_check_lookup_enabled()) { 3663 cache_fpl_aborted(fpl); 3664 return (false); 3665 } 3666 #endif 3667 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3668 cache_fpl_aborted(fpl); 3669 return (false); 3670 } 3671 if (IN_CAPABILITY_MODE(td)) { 3672 cache_fpl_aborted(fpl); 3673 return (false); 3674 } 3675 if (AUDITING_TD(td)) { 3676 cache_fpl_aborted(fpl); 3677 return (false); 3678 } 3679 if (ndp->ni_startdir != NULL) { 3680 cache_fpl_aborted(fpl); 3681 return (false); 3682 } 3683 return (true); 3684 } 3685 3686 static int 3687 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 3688 { 3689 struct nameidata *ndp; 3690 int error; 3691 bool fsearch; 3692 3693 ndp = fpl->ndp; 3694 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); 3695 if (__predict_false(error != 0)) { 3696 cache_fpl_smr_exit(fpl); 3697 return (cache_fpl_aborted(fpl)); 3698 } 3699 fpl->fsearch = fsearch; 3700 return (0); 3701 } 3702 3703 static bool 3704 cache_fplookup_vnode_supported(struct vnode *vp) 3705 { 3706 3707 return (vp->v_type != VLNK); 3708 } 3709 3710 static int __noinline 3711 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3712 uint32_t hash) 3713 { 3714 struct componentname *cnp; 3715 struct vnode *dvp; 3716 3717 cnp = fpl->cnp; 3718 dvp = fpl->dvp; 3719 3720 cache_fpl_smr_exit(fpl); 3721 if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) 3722 return (cache_fpl_handled(fpl, ENOENT)); 3723 else 3724 return (cache_fpl_aborted(fpl)); 3725 } 3726 3727 /* 3728 * The target vnode is not supported, prepare for the slow path to take over. 
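 *
 * The caller (namei) is then expected to observe CACHE_FPL_STATUS_PARTIAL
 * and finish the walk the regular way, roughly (a sketch of the consumer
 * side):
 *
 *	error = cache_fplookup(ndp, &status, &pwd);
 *	if (status == CACHE_FPL_STATUS_PARTIAL)
 *		// regular lookup resumes from ndp->ni_startdir, which the
 *		// function below leaves referenced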
3729 */ 3730 static int __noinline 3731 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3732 { 3733 struct nameidata *ndp; 3734 struct componentname *cnp; 3735 enum vgetstate dvs; 3736 struct vnode *dvp; 3737 struct pwd *pwd; 3738 seqc_t dvp_seqc; 3739 3740 ndp = fpl->ndp; 3741 cnp = fpl->cnp; 3742 pwd = fpl->pwd; 3743 dvp = fpl->dvp; 3744 dvp_seqc = fpl->dvp_seqc; 3745 3746 if (!pwd_hold_smr(pwd)) { 3747 cache_fpl_smr_exit(fpl); 3748 return (cache_fpl_aborted(fpl)); 3749 } 3750 3751 dvs = vget_prep_smr(dvp); 3752 cache_fpl_smr_exit(fpl); 3753 if (__predict_false(dvs == VGET_NONE)) { 3754 pwd_drop(pwd); 3755 return (cache_fpl_aborted(fpl)); 3756 } 3757 3758 vget_finish_ref(dvp, dvs); 3759 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3760 vrele(dvp); 3761 pwd_drop(pwd); 3762 return (cache_fpl_aborted(fpl)); 3763 } 3764 3765 cache_fpl_restore(fpl, &fpl->snd); 3766 3767 ndp->ni_startdir = dvp; 3768 cnp->cn_flags |= MAKEENTRY; 3769 if (cache_fpl_islastcn(ndp)) 3770 cnp->cn_flags |= ISLASTCN; 3771 if (cache_fpl_isdotdot(cnp)) 3772 cnp->cn_flags |= ISDOTDOT; 3773 3774 return (0); 3775 } 3776 3777 static int 3778 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3779 { 3780 struct componentname *cnp; 3781 struct vnode *tvp; 3782 seqc_t tvp_seqc; 3783 int error, lkflags; 3784 3785 cnp = fpl->cnp; 3786 tvp = fpl->tvp; 3787 tvp_seqc = fpl->tvp_seqc; 3788 3789 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3790 lkflags = LK_SHARED; 3791 if ((cnp->cn_flags & LOCKSHARED) == 0) 3792 lkflags = LK_EXCLUSIVE; 3793 error = vget_finish(tvp, lkflags, tvs); 3794 if (__predict_false(error != 0)) { 3795 return (cache_fpl_aborted(fpl)); 3796 } 3797 } else { 3798 vget_finish_ref(tvp, tvs); 3799 } 3800 3801 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3802 if ((cnp->cn_flags & LOCKLEAF) != 0) 3803 vput(tvp); 3804 else 3805 vrele(tvp); 3806 return (cache_fpl_aborted(fpl)); 3807 } 3808 3809 return (cache_fpl_handled(fpl, 0)); 3810 } 3811 3812 /* 3813 * They want to possibly modify the state of the namecache. 3814 * 3815 * Don't try to match the API contract, just leave. 3816 * TODO: this leaves scalability on the table 3817 */ 3818 static int 3819 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3820 { 3821 struct componentname *cnp; 3822 3823 cnp = fpl->cnp; 3824 MPASS(cnp->cn_nameiop != LOOKUP); 3825 return (cache_fpl_partial(fpl)); 3826 } 3827 3828 static int __noinline 3829 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3830 { 3831 struct componentname *cnp; 3832 enum vgetstate dvs, tvs; 3833 struct vnode *dvp, *tvp; 3834 seqc_t dvp_seqc; 3835 int error; 3836 3837 cnp = fpl->cnp; 3838 dvp = fpl->dvp; 3839 dvp_seqc = fpl->dvp_seqc; 3840 tvp = fpl->tvp; 3841 3842 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3843 3844 /* 3845 * This is less efficient than it can be for simplicity. 
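 *
 * The vnode acquisition below follows the usual SMR idiom (sketch):
 *
 *	vs = vget_prep_smr(vp);		// try to secure a hold count while
 *					// still inside the SMR section
 *	vfs_smr_exit();
 *	if (vs == VGET_NONE)
 *		// the vnode was being freed; abort
 *	error = vget_finish(vp, lkflags, vs);	// lock it and convert to a
 *						// usecount reference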
3846 */ 3847 dvs = vget_prep_smr(dvp); 3848 if (__predict_false(dvs == VGET_NONE)) { 3849 return (cache_fpl_aborted(fpl)); 3850 } 3851 tvs = vget_prep_smr(tvp); 3852 if (__predict_false(tvs == VGET_NONE)) { 3853 cache_fpl_smr_exit(fpl); 3854 vget_abort(dvp, dvs); 3855 return (cache_fpl_aborted(fpl)); 3856 } 3857 3858 cache_fpl_smr_exit(fpl); 3859 3860 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3861 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3862 if (__predict_false(error != 0)) { 3863 vget_abort(tvp, tvs); 3864 return (cache_fpl_aborted(fpl)); 3865 } 3866 } else { 3867 vget_finish_ref(dvp, dvs); 3868 } 3869 3870 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3871 vget_abort(tvp, tvs); 3872 if ((cnp->cn_flags & LOCKPARENT) != 0) 3873 vput(dvp); 3874 else 3875 vrele(dvp); 3876 return (cache_fpl_aborted(fpl)); 3877 } 3878 3879 error = cache_fplookup_final_child(fpl, tvs); 3880 if (__predict_false(error != 0)) { 3881 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3882 if ((cnp->cn_flags & LOCKPARENT) != 0) 3883 vput(dvp); 3884 else 3885 vrele(dvp); 3886 return (error); 3887 } 3888 3889 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3890 return (0); 3891 } 3892 3893 static int 3894 cache_fplookup_final(struct cache_fpl *fpl) 3895 { 3896 struct componentname *cnp; 3897 enum vgetstate tvs; 3898 struct vnode *dvp, *tvp; 3899 seqc_t dvp_seqc; 3900 3901 cnp = fpl->cnp; 3902 dvp = fpl->dvp; 3903 dvp_seqc = fpl->dvp_seqc; 3904 tvp = fpl->tvp; 3905 3906 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3907 3908 if (cnp->cn_nameiop != LOOKUP) { 3909 return (cache_fplookup_final_modifying(fpl)); 3910 } 3911 3912 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3913 return (cache_fplookup_final_withparent(fpl)); 3914 3915 tvs = vget_prep_smr(tvp); 3916 if (__predict_false(tvs == VGET_NONE)) { 3917 return (cache_fpl_partial(fpl)); 3918 } 3919 3920 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3921 cache_fpl_smr_exit(fpl); 3922 vget_abort(tvp, tvs); 3923 return (cache_fpl_aborted(fpl)); 3924 } 3925 3926 cache_fpl_smr_exit(fpl); 3927 return (cache_fplookup_final_child(fpl, tvs)); 3928 } 3929 3930 static int __noinline 3931 cache_fplookup_dot(struct cache_fpl *fpl) 3932 { 3933 struct vnode *dvp; 3934 3935 dvp = fpl->dvp; 3936 3937 fpl->tvp = dvp; 3938 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3939 if (seqc_in_modify(fpl->tvp_seqc)) { 3940 return (cache_fpl_aborted(fpl)); 3941 } 3942 3943 counter_u64_add(dothits, 1); 3944 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3945 3946 return (0); 3947 } 3948 3949 static int __noinline 3950 cache_fplookup_dotdot(struct cache_fpl *fpl) 3951 { 3952 struct nameidata *ndp; 3953 struct componentname *cnp; 3954 struct namecache *ncp; 3955 struct vnode *dvp; 3956 struct prison *pr; 3957 u_char nc_flag; 3958 3959 ndp = fpl->ndp; 3960 cnp = fpl->cnp; 3961 dvp = fpl->dvp; 3962 3963 /* 3964 * XXX this is racy the same way regular lookup is 3965 */ 3966 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3967 pr = pr->pr_parent) 3968 if (dvp == pr->pr_root) 3969 break; 3970 3971 if (dvp == ndp->ni_rootdir || 3972 dvp == ndp->ni_topdir || 3973 dvp == rootvnode || 3974 pr != NULL) { 3975 fpl->tvp = dvp; 3976 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3977 if (seqc_in_modify(fpl->tvp_seqc)) { 3978 return (cache_fpl_aborted(fpl)); 3979 } 3980 return (0); 3981 } 3982 3983 if ((dvp->v_vflag & VV_ROOT) != 0) { 3984 /* 3985 * TODO 3986 * The opposite of climb mount is needed here. 
3987 */ 3988 return (cache_fpl_aborted(fpl)); 3989 } 3990 3991 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3992 if (ncp == NULL) { 3993 return (cache_fpl_aborted(fpl)); 3994 } 3995 3996 nc_flag = atomic_load_char(&ncp->nc_flag); 3997 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3998 if ((nc_flag & NCF_NEGATIVE) != 0) 3999 return (cache_fpl_aborted(fpl)); 4000 fpl->tvp = ncp->nc_vp; 4001 } else { 4002 fpl->tvp = ncp->nc_dvp; 4003 } 4004 4005 if (__predict_false(!cache_ncp_canuse(ncp))) { 4006 return (cache_fpl_aborted(fpl)); 4007 } 4008 4009 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 4010 if (seqc_in_modify(fpl->tvp_seqc)) { 4011 return (cache_fpl_partial(fpl)); 4012 } 4013 4014 counter_u64_add(dotdothits, 1); 4015 return (0); 4016 } 4017 4018 static int __noinline 4019 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) 4020 { 4021 u_char nc_flag; 4022 bool neg_promote; 4023 4024 nc_flag = atomic_load_char(&ncp->nc_flag); 4025 MPASS((nc_flag & NCF_NEGATIVE) != 0); 4026 /* 4027 * If they want to create an entry we need to replace this one. 4028 */ 4029 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 4030 /* 4031 * TODO 4032 * This should call something similar to 4033 * cache_fplookup_final_modifying. 4034 */ 4035 return (cache_fpl_partial(fpl)); 4036 } 4037 neg_promote = cache_neg_hit_prep(ncp); 4038 if (__predict_false(!cache_ncp_canuse(ncp))) { 4039 cache_neg_hit_abort(ncp); 4040 return (cache_fpl_partial(fpl)); 4041 } 4042 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 4043 cache_neg_hit_abort(ncp); 4044 return (cache_fpl_partial(fpl)); 4045 } 4046 if (neg_promote) { 4047 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 4048 } 4049 cache_neg_hit_finish(ncp); 4050 cache_fpl_smr_exit(fpl); 4051 return (cache_fpl_handled(fpl, ENOENT)); 4052 } 4053 4054 static int 4055 cache_fplookup_next(struct cache_fpl *fpl) 4056 { 4057 struct componentname *cnp; 4058 struct namecache *ncp; 4059 struct vnode *dvp, *tvp; 4060 u_char nc_flag; 4061 uint32_t hash; 4062 4063 cnp = fpl->cnp; 4064 dvp = fpl->dvp; 4065 4066 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 4067 return (cache_fplookup_dot(fpl)); 4068 } 4069 4070 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 4071 4072 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 4073 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 4074 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 4075 break; 4076 } 4077 4078 /* 4079 * If there is no entry we have to punt to the slow path to perform 4080 * actual lookup. Should there be nothing with this name a negative 4081 * entry will be created. 
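 *
 * (Negative entries are created on the slow path when the filesystem's
 * lookup routine reports ENOENT while MAKEENTRY is set, following the
 * usual filesystem idiom:
 *
 *	if ((cnp->cn_flags & MAKEENTRY) != 0)
 *		cache_enter(dvp, NULL, cnp);
 *
 * hence punting is enough to get the entry recorded.)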
4082 */ 4083 if (__predict_false(ncp == NULL)) { 4084 return (cache_fpl_partial(fpl)); 4085 } 4086 4087 tvp = atomic_load_ptr(&ncp->nc_vp); 4088 nc_flag = atomic_load_char(&ncp->nc_flag); 4089 if ((nc_flag & NCF_NEGATIVE) != 0) { 4090 return (cache_fplookup_neg(fpl, ncp, hash)); 4091 } 4092 4093 if (__predict_false(!cache_ncp_canuse(ncp))) { 4094 return (cache_fpl_partial(fpl)); 4095 } 4096 4097 fpl->tvp = tvp; 4098 fpl->tvp_seqc = vn_seqc_read_any(tvp); 4099 if (seqc_in_modify(fpl->tvp_seqc)) { 4100 return (cache_fpl_partial(fpl)); 4101 } 4102 4103 if (!cache_fplookup_vnode_supported(tvp)) { 4104 return (cache_fpl_partial(fpl)); 4105 } 4106 4107 counter_u64_add(numposhits, 1); 4108 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 4109 return (0); 4110 } 4111 4112 static bool 4113 cache_fplookup_mp_supported(struct mount *mp) 4114 { 4115 4116 if (mp == NULL) 4117 return (false); 4118 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 4119 return (false); 4120 return (true); 4121 } 4122 4123 /* 4124 * Walk up the mount stack (if any). 4125 * 4126 * Correctness is provided in the following ways: 4127 * - all vnodes are protected from freeing with SMR 4128 * - struct mount objects are type stable making them always safe to access 4129 * - stability of the particular mount is provided by busying it 4130 * - relationship between the vnode which is mounted on and the mount is 4131 * verified with the vnode sequence counter after busying 4132 * - association between root vnode of the mount and the mount is protected 4133 * by busy 4134 * 4135 * From that point on we can read the sequence counter of the root vnode 4136 * and get the next mount on the stack (if any) using the same protection. 4137 * 4138 * By the end of successful walk we are guaranteed the reached state was 4139 * indeed present at least at some point which matches the regular lookup. 
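 *
 * In outline (a condensed sketch of the loop in the function below, not a
 * separate algorithm; error handling and unbusying of the previous mount
 * are omitted):
 *
 *	while ((mp = vp->v_mountedhere) != NULL) {
 *		vfs_op_thread_enter_crit(mp);		// pin the mount
 *		if (!vn_seqc_consistent(vp, vp_seqc))	// covered vnode changed
 *			return (cache_fpl_partial(fpl));
 *		vp = mp->mnt_rootvnode;			// root of the mounted fs
 *		vp_seqc = vn_seqc_read_any(vp);		// validated next round
 *	}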
4140 */ 4141 static int __noinline 4142 cache_fplookup_climb_mount(struct cache_fpl *fpl) 4143 { 4144 struct mount *mp, *prev_mp; 4145 struct vnode *vp; 4146 seqc_t vp_seqc; 4147 4148 vp = fpl->tvp; 4149 vp_seqc = fpl->tvp_seqc; 4150 4151 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 4152 mp = atomic_load_ptr(&vp->v_mountedhere); 4153 if (mp == NULL) 4154 return (0); 4155 4156 prev_mp = NULL; 4157 for (;;) { 4158 if (!vfs_op_thread_enter_crit(mp)) { 4159 if (prev_mp != NULL) 4160 vfs_op_thread_exit_crit(prev_mp); 4161 return (cache_fpl_partial(fpl)); 4162 } 4163 if (prev_mp != NULL) 4164 vfs_op_thread_exit_crit(prev_mp); 4165 if (!vn_seqc_consistent(vp, vp_seqc)) { 4166 vfs_op_thread_exit_crit(mp); 4167 return (cache_fpl_partial(fpl)); 4168 } 4169 if (!cache_fplookup_mp_supported(mp)) { 4170 vfs_op_thread_exit_crit(mp); 4171 return (cache_fpl_partial(fpl)); 4172 } 4173 vp = atomic_load_ptr(&mp->mnt_rootvnode); 4174 if (vp == NULL || VN_IS_DOOMED(vp)) { 4175 vfs_op_thread_exit_crit(mp); 4176 return (cache_fpl_partial(fpl)); 4177 } 4178 vp_seqc = vn_seqc_read_any(vp); 4179 if (seqc_in_modify(vp_seqc)) { 4180 vfs_op_thread_exit_crit(mp); 4181 return (cache_fpl_partial(fpl)); 4182 } 4183 prev_mp = mp; 4184 mp = atomic_load_ptr(&vp->v_mountedhere); 4185 if (mp == NULL) 4186 break; 4187 } 4188 4189 vfs_op_thread_exit_crit(prev_mp); 4190 fpl->tvp = vp; 4191 fpl->tvp_seqc = vp_seqc; 4192 return (0); 4193 } 4194 4195 static bool 4196 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 4197 { 4198 struct mount *mp; 4199 struct vnode *vp; 4200 4201 vp = fpl->tvp; 4202 4203 /* 4204 * Hack: while this is a union, the pointer tends to be NULL so save on 4205 * a branch. 4206 */ 4207 mp = atomic_load_ptr(&vp->v_mountedhere); 4208 if (mp == NULL) 4209 return (false); 4210 if (vp->v_type == VDIR) 4211 return (true); 4212 return (false); 4213 } 4214 4215 /* 4216 * Parse the path. 4217 * 4218 * The code was originally copy-pasted from regular lookup and despite 4219 * cleanups leaves performance on the table. Any modifications here 4220 * must take into account that in case of fallback the resulting 4221 * nameidata state has to be compatible with the original. 4222 */ 4223 static int 4224 cache_fplookup_parse(struct cache_fpl *fpl) 4225 { 4226 struct nameidata *ndp; 4227 struct componentname *cnp; 4228 char *cp; 4229 4230 ndp = fpl->ndp; 4231 cnp = fpl->cnp; 4232 4233 /* 4234 * Search a new directory. 4235 * 4236 * The last component of the filename is left accessible via 4237 * cnp->cn_nameptr for callers that need the name. Callers needing 4238 * the name set the SAVENAME flag. When done, they assume 4239 * responsibility for freeing the pathname buffer. 4240 */ 4241 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 4242 continue; 4243 cnp->cn_namelen = cp - cnp->cn_nameptr; 4244 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 4245 cache_fpl_smr_exit(fpl); 4246 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 4247 } 4248 ndp->ni_pathlen -= cnp->cn_namelen; 4249 KASSERT(ndp->ni_pathlen <= PATH_MAX, 4250 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 4251 ndp->ni_next = cp; 4252 4253 /* 4254 * Replace multiple slashes by a single slash and trailing slashes 4255 * by a null. This must be done before VOP_LOOKUP() because some 4256 * fs's don't know about trailing slashes. Remember if there were 4257 * trailing slashes to handle symlinks, existing non-directories 4258 * and non-existing files that won't be directories specially later.
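 *
 * Worked example (illustrative): with "foo//bar" remaining in the buffer,
 * the scan above yields cn_namelen = 3 ("foo") with ni_next at the first
 * '/'; the loop below steps over the duplicate '/' and
 * cache_fplookup_parse_advance() later skips the remaining separator, so
 * the next iteration sees cn_nameptr = "bar". A trailing slash, as in
 * "foo/", reaches the '\0' case and falls back to the locked lookup (see
 * the TODO below).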
4259 */ 4260 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 4261 cp++; 4262 ndp->ni_pathlen--; 4263 if (*cp == '\0') { 4264 /* 4265 * TODO 4266 * Regular lookup performs the following: 4267 * *ndp->ni_next = '\0'; 4268 * cnp->cn_flags |= TRAILINGSLASH; 4269 * 4270 * Which is problematic since it modifies data read 4271 * from userspace. Then if fast path lookup was to 4272 * abort we would have to either restore it or convey 4273 * the flag. Since this is a corner case just ignore 4274 * it for simplicity. 4275 */ 4276 return (cache_fpl_partial(fpl)); 4277 } 4278 } 4279 ndp->ni_next = cp; 4280 4281 /* 4282 * Check for degenerate name (e.g. / or "") 4283 * which is a way of talking about a directory, 4284 * e.g. like "/." or ".". 4285 * 4286 * TODO 4287 * Another corner case handled by the regular lookup 4288 */ 4289 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 4290 return (cache_fpl_partial(fpl)); 4291 } 4292 return (0); 4293 } 4294 4295 static void 4296 cache_fplookup_parse_advance(struct cache_fpl *fpl) 4297 { 4298 struct nameidata *ndp; 4299 struct componentname *cnp; 4300 4301 ndp = fpl->ndp; 4302 cnp = fpl->cnp; 4303 4304 cnp->cn_nameptr = ndp->ni_next; 4305 while (*cnp->cn_nameptr == '/') { 4306 cnp->cn_nameptr++; 4307 ndp->ni_pathlen--; 4308 } 4309 } 4310 4311 /* 4312 * See the API contract for VOP_FPLOOKUP_VEXEC. 4313 */ 4314 static int __noinline 4315 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 4316 { 4317 struct componentname *cnp; 4318 struct vnode *dvp; 4319 seqc_t dvp_seqc; 4320 4321 cnp = fpl->cnp; 4322 dvp = fpl->dvp; 4323 dvp_seqc = fpl->dvp_seqc; 4324 4325 /* 4326 * Hack: they may be looking up foo/bar, where foo is a 4327 * regular file. In such a case we need to return ENOTDIR, 4328 * but we may happen to get here with a different error. 4329 */ 4330 if (dvp->v_type != VDIR) { 4331 /* 4332 * The check here is predominantly to catch 4333 * EOPNOTSUPP from dead_vnodeops. If the vnode 4334 * gets doomed past this point it is going to 4335 * fail seqc verification. 4336 */ 4337 if (VN_IS_DOOMED(dvp)) { 4338 return (cache_fpl_aborted(fpl)); 4339 } 4340 error = ENOTDIR; 4341 } 4342 4343 /* 4344 * Hack: handle O_SEARCH. 4345 * 4346 * Open Group Base Specifications Issue 7, 2018 edition states: 4347 * If the access mode of the open file description associated with the 4348 * file descriptor is not O_SEARCH, the function shall check whether 4349 * directory searches are permitted using the current permissions of 4350 * the directory underlying the file descriptor. If the access mode is 4351 * O_SEARCH, the function shall not perform the check. 4352 * 4353 * Regular lookup tests for the NOEXECCHECK flag for every path 4354 * component to decide whether to do the permission check. However, 4355 * since most lookups never have the flag (and when they do it is only 4356 * present for the first path component), lockless lookup only acts on 4357 * it if there is a permission problem. Here the flag is represented 4358 * with a boolean so that we don't have to clear it on the way out. 4359 * 4360 * For simplicity this always aborts. 4361 * TODO: check if this is the first lookup and ignore the permission 4362 * problem. Note the flag has to survive fallback (if it happens to be 4363 * performed).
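 *
 * Illustrative userspace scenario this corresponds to (not kernel code):
 *
 *	int dfd = open("/some/dir", O_SEARCH | O_DIRECTORY);
 *	int fd = openat(dfd, "file", O_RDONLY);
 *
 * POSIX allows skipping the search permission check on the directory
 * behind dfd in this case; the lockless lookup does not implement that
 * refinement and, as noted above, aborts so that the locked lookup can
 * honor NOEXECCHECK.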
4364 */ 4365 if (fpl->fsearch) { 4366 return (cache_fpl_aborted(fpl)); 4367 } 4368 4369 switch (error) { 4370 case EAGAIN: 4371 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4372 error = cache_fpl_aborted(fpl); 4373 } else { 4374 cache_fpl_partial(fpl); 4375 } 4376 break; 4377 default: 4378 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4379 error = cache_fpl_aborted(fpl); 4380 } else { 4381 cache_fpl_smr_exit(fpl); 4382 cache_fpl_handled(fpl, error); 4383 } 4384 break; 4385 } 4386 return (error); 4387 } 4388 4389 static int 4390 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 4391 { 4392 struct nameidata *ndp; 4393 struct componentname *cnp; 4394 struct mount *mp; 4395 int error; 4396 4397 error = CACHE_FPL_FAILED; 4398 ndp = fpl->ndp; 4399 cnp = fpl->cnp; 4400 4401 cache_fpl_checkpoint(fpl, &fpl->snd); 4402 4403 fpl->dvp = dvp; 4404 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 4405 if (seqc_in_modify(fpl->dvp_seqc)) { 4406 cache_fpl_aborted(fpl); 4407 goto out; 4408 } 4409 mp = atomic_load_ptr(&fpl->dvp->v_mount); 4410 if (!cache_fplookup_mp_supported(mp)) { 4411 cache_fpl_aborted(fpl); 4412 goto out; 4413 } 4414 4415 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 4416 4417 for (;;) { 4418 error = cache_fplookup_parse(fpl); 4419 if (__predict_false(error != 0)) { 4420 break; 4421 } 4422 4423 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 4424 4425 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); 4426 if (__predict_false(error != 0)) { 4427 error = cache_fplookup_failed_vexec(fpl, error); 4428 break; 4429 } 4430 4431 if (__predict_false(cache_fpl_isdotdot(cnp))) { 4432 error = cache_fplookup_dotdot(fpl); 4433 if (__predict_false(error != 0)) { 4434 break; 4435 } 4436 } else { 4437 error = cache_fplookup_next(fpl); 4438 if (__predict_false(error != 0)) { 4439 break; 4440 } 4441 4442 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 4443 4444 if (cache_fplookup_need_climb_mount(fpl)) { 4445 error = cache_fplookup_climb_mount(fpl); 4446 if (__predict_false(error != 0)) { 4447 break; 4448 } 4449 } 4450 } 4451 4452 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 4453 4454 if (cache_fpl_islastcn(ndp)) { 4455 error = cache_fplookup_final(fpl); 4456 break; 4457 } 4458 4459 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 4460 error = cache_fpl_aborted(fpl); 4461 break; 4462 } 4463 4464 fpl->dvp = fpl->tvp; 4465 fpl->dvp_seqc = fpl->tvp_seqc; 4466 4467 cache_fplookup_parse_advance(fpl); 4468 cache_fpl_checkpoint(fpl, &fpl->snd); 4469 } 4470 out: 4471 switch (fpl->status) { 4472 case CACHE_FPL_STATUS_UNSET: 4473 __assert_unreachable(); 4474 break; 4475 case CACHE_FPL_STATUS_PARTIAL: 4476 cache_fpl_smr_assert_entered(fpl); 4477 return (cache_fplookup_partial_setup(fpl)); 4478 case CACHE_FPL_STATUS_ABORTED: 4479 if (fpl->in_smr) 4480 cache_fpl_smr_exit(fpl); 4481 return (CACHE_FPL_FAILED); 4482 case CACHE_FPL_STATUS_HANDLED: 4483 MPASS(error != CACHE_FPL_FAILED); 4484 cache_fpl_smr_assert_not_entered(fpl); 4485 if (__predict_false(error != 0)) { 4486 ndp->ni_dvp = NULL; 4487 ndp->ni_vp = NULL; 4488 cache_fpl_cleanup_cnp(cnp); 4489 return (error); 4490 } 4491 ndp->ni_dvp = fpl->dvp; 4492 ndp->ni_vp = fpl->tvp; 4493 if (cnp->cn_flags & SAVENAME) 4494 cnp->cn_flags |= HASBUF; 4495 else 4496 cache_fpl_cleanup_cnp(cnp); 4497 return (error); 4498 } 4499 } 4500 4501 /* 4502 * Fast path lookup protected with SMR and sequence counters. 4503 * 4504 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 
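 *
 * In terms of code structure, cache_fplookup() below is the entry point
 * while cache_fplookup_impl() above drives the per-component work.
 * Schematically (a condensed sketch of that loop, not additional logic):
 *
 *	for (each path component) {
 *		cache_fplookup_parse();		// carve out the next name
 *		VOP_FPLOOKUP_VEXEC();		// lockless search permission check
 *		resolve ".", ".." or hash hit;	// sets tvp + tvp_seqc
 *		climb the mount stack if needed;
 *		if (last component)
 *			finalize;		// vget + final seqc re-check
 *		dvp = tvp; dvp_seqc = tvp_seqc;	// advance
 *	}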
4505 * 4506 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria 4507 * outlined below. 4508 * 4509 * Traditional vnode lookup conceptually looks like this: 4510 * 4511 * vn_lock(current); 4512 * for (;;) { 4513 * next = find(); 4514 * vn_lock(next); 4515 * vn_unlock(current); 4516 * current = next; 4517 * if (last) 4518 * break; 4519 * } 4520 * return (current); 4521 * 4522 * Each jump to the next vnode is safe memory-wise and atomic with respect to 4523 * any modifications thanks to holding respective locks. 4524 * 4525 * The same guarantee can be provided with a combination of safe memory 4526 * reclamation and sequence counters instead. If all operations which affect 4527 * the relationship between the current vnode and the one we are looking for 4528 * also modify the counter, we can verify whether all the conditions held as 4529 * we made the jump. This includes things like permissions, mount points etc. 4530 * Counter modification is provided by enclosing relevant places in 4531 * vn_seqc_write_begin()/end() calls. 4532 * 4533 * Thus this translates to: 4534 * 4535 * vfs_smr_enter(); 4536 * dvp_seqc = seqc_read_any(dvp); 4537 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode 4538 * abort(); 4539 * for (;;) { 4540 * tvp = find(); 4541 * tvp_seqc = seqc_read_any(tvp); 4542 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode 4543 * abort(); 4544 * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode 4545 * abort(); 4546 * dvp = tvp; // we know nothing of importance has changed 4547 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration 4548 * if (last) 4549 * break; 4550 * } 4551 * vget(); // secure the vnode 4552 * if (!seqc_consistent(tvp, tvp_seqc) // final check 4553 * abort(); 4554 * // at this point we know nothing has changed for any parent<->child pair 4555 * // as they were crossed during the lookup, meaning we matched the guarantee 4556 * // of the locked variant 4557 * return (tvp); 4558 * 4559 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: 4560 * - they are called while within vfs_smr protection which they must never exit 4561 * - EAGAIN can be returned to denote checking could not be performed, it is 4562 * always valid to return it 4563 * - if the sequence counter has not changed the result must be valid 4564 * - if the sequence counter has changed both false positives and false negatives 4565 * are permitted (since the result will be rejected later) 4566 * - for simple cases of unix permission checks vaccess_vexec_smr can be used 4567 * 4568 * Caveats to watch out for: 4569 * - vnodes are passed unlocked and unreferenced with nothing stopping 4570 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised 4571 * to use atomic_load_ptr to fetch it. 
4572 * - the aforementioned object can also get freed, meaning absent other means it 4573 * should be protected with vfs_smr 4574 * - either safely checking permissions as they are modified or guaranteeing 4575 * their stability is left to the routine 4576 */ 4577 int 4578 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, 4579 struct pwd **pwdp) 4580 { 4581 struct cache_fpl fpl; 4582 struct pwd *pwd; 4583 struct vnode *dvp; 4584 struct componentname *cnp; 4585 struct nameidata_saved orig; 4586 int error; 4587 4588 MPASS(ndp->ni_lcf == 0); 4589 4590 fpl.status = CACHE_FPL_STATUS_UNSET; 4591 fpl.ndp = ndp; 4592 fpl.cnp = &ndp->ni_cnd; 4593 MPASS(curthread == fpl.cnp->cn_thread); 4594 4595 if ((fpl.cnp->cn_flags & SAVESTART) != 0) 4596 MPASS(fpl.cnp->cn_nameiop != LOOKUP); 4597 4598 if (!cache_can_fplookup(&fpl)) { 4599 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 4600 *status = fpl.status; 4601 return (EOPNOTSUPP); 4602 } 4603 4604 cache_fpl_checkpoint(&fpl, &orig); 4605 4606 cache_fpl_smr_enter_initial(&fpl); 4607 fpl.fsearch = false; 4608 pwd = pwd_get_smr(); 4609 fpl.pwd = pwd; 4610 ndp->ni_rootdir = pwd->pwd_rdir; 4611 ndp->ni_topdir = pwd->pwd_jdir; 4612 4613 cnp = fpl.cnp; 4614 cnp->cn_nameptr = cnp->cn_pnbuf; 4615 if (cnp->cn_pnbuf[0] == '/') { 4616 cache_fpl_handle_root(ndp, &dvp); 4617 } else { 4618 if (ndp->ni_dirfd == AT_FDCWD) { 4619 dvp = pwd->pwd_cdir; 4620 } else { 4621 error = cache_fplookup_dirfd(&fpl, &dvp); 4622 if (__predict_false(error != 0)) { 4623 goto out; 4624 } 4625 } 4626 } 4627 4628 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); 4629 4630 error = cache_fplookup_impl(dvp, &fpl); 4631 out: 4632 cache_fpl_smr_assert_not_entered(&fpl); 4633 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 4634 4635 *status = fpl.status; 4636 switch (fpl.status) { 4637 case CACHE_FPL_STATUS_UNSET: 4638 __assert_unreachable(); 4639 break; 4640 case CACHE_FPL_STATUS_HANDLED: 4641 SDT_PROBE3(vfs, namei, lookup, return, error, 4642 (error == 0 ? ndp->ni_vp : NULL), true); 4643 break; 4644 case CACHE_FPL_STATUS_PARTIAL: 4645 *pwdp = fpl.pwd; 4646 /* 4647 * Status restored by cache_fplookup_partial_setup. 4648 */ 4649 break; 4650 case CACHE_FPL_STATUS_ABORTED: 4651 cache_fpl_restore(&fpl, &orig); 4652 break; 4653 } 4654 return (error); 4655 } 4656
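
/*
 * Illustrative appendix (a sketch only, not part of this file's code): a
 * minimal VOP_FPLOOKUP_VEXEC implementation following the contract
 * documented above cache_fplookup(). "examplefs" and its node fields are
 * made-up names; the pattern of loading ->v_data with atomic_load_ptr,
 * returning EAGAIN when it is gone and delegating the permission check to
 * vaccess_vexec_smr() mirrors what opted-in filesystems do.
 *
 *	static int
 *	examplefs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct vnode *vp = v->a_vp;
 *		struct examplefs_node *node;
 *
 *		// Called within vfs_smr protection; ->v_data may be cleared
 *		// by a concurrent VOP_RECLAIM, hence the atomic load and the
 *		// EAGAIN fallback instead of dereferencing blindly.
 *		node = atomic_load_ptr(&vp->v_data);
 *		if (__predict_false(node == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(node->ex_mode, node->ex_uid,
 *		    node->ex_gid, v->a_cred));
 *	}
 *
 *	// and, in the mount path once the mount point is fully set up:
 *	MNT_ILOCK(mp);
 *	mp->mnt_kern_flag |= MNTK_FPLOOKUP;
 *	MNT_IUNLOCK(mp);
 */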