/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <sys/capsicum.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache");

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
    "char *");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
	u_char neg_hit;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
		struct negstate nu_neg;/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct timespec nc_time;	/* timespec provided by fs */
	struct timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned.  Since other
 * platforms may have the same requirement, take the small hit and
 * enforce the alignment for everyone.  Note this is a nop for 64-bit
 * platforms.
 */
#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
#define CACHE_PATH_CUTOFF	39

#define CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
#define CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

#define	nc_vp	n_un.nu_vp
#define	nc_neg	n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	0x01
#define NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define NEG_HOT		0x01

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
static bool
cache_ncp_canuse(struct namecache *ncp)
{

	atomic_thread_fence_acq();
	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
}
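
/*
 * The two helpers above pair a release fence in cache_ncp_invalidate() with
 * an acquire fence in cache_ncp_canuse().  A lockless reader is expected to
 * copy out whatever it needs from the entry first and only then call
 * cache_ncp_canuse(); if the check passes, the previously read fields are
 * known to predate any concurrent teardown.  Abbreviated reader pattern,
 * condensed from the lockless path in cache_lookup() below:
 *
 *	vfs_smr_enter();
 *	vp = ncp->nc_vp;
 *	if (!cache_ncp_canuse(ncp))
 *		// fall back to the locked lookup
 */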
/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	mtx	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * It is legal to take multiple vnodelock and bucketlock locks.  The locking
 * order is lower address first.  Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be locked to provide safe operation against other threads modifying the
 * cache.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case, this could
 * deadlock.  As such, this is resolved by trylocking and if that fails
 * unlocking the first node, locking everything in order and revalidating the
 * state.
 */

VFS_SMR_DECLARE;

static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache parameters");

static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
    "Total namecache capacity");

u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");

static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");

/*
 * Negative entry % of namecache capacity above which automatic eviction is allowed.
 *
 * Check cache_neg_evict_cond for details.
 */
static u_int ncnegminpct = 3;

static u_int __read_mostly	neg_min; /* the above recomputed against ncsize */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
    "Negative entry count above which automatic eviction is allowed");
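
/*
 * neg_min is derived from ncnegminpct in cache_recalc_neg_min() further down:
 *
 *	neg_min = (ncsize * ncnegminpct) / 100;
 *
 * For example, with the default ncnegminpct of 3 and an illustrative ncsize
 * of 200000, automatic eviction is only considered once there are at least
 * 6000 negative entries (see cache_neg_evict_cond() for the full condition).
 */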

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");

static u_int __exclusive_cache_line neg_cycle;

#define ncneghash	3
#define	numneglists	(ncneghash + 1)

struct neglist {
	struct mtx		nl_evict_lock;
	struct mtx		nl_lock __aligned(CACHE_LINE_SIZE);
	TAILQ_HEAD(, namecache) nl_list;
	TAILQ_HEAD(, namecache) nl_hotlist;
	u_long			nl_hotnum;
} __aligned(CACHE_LINE_SIZE);

static struct neglist neglists[numneglists];

static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct mtx_padalign __read_mostly  *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}
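
/*
 * Both NCP2NEGLIST() and VP2VNODELOCK() above derive an index from the low
 * bits of the object address (shifted right by 8, presumably to skip bits
 * which are identical for all allocations due to alignment).  The effect is
 * that unrelated vnodes and negative entries are spread across the lock and
 * list arrays, keeping contention on any single mutex statistically low.
 */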

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

static struct namecache *
cache_alloc(int len, int ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	MPASS(ncp != NULL);
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	*tsp = ncp_ts->nc_time;
	*ticksp = ncp_ts->nc_ticks;
}
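
/*
 * Note that cache_alloc() and cache_free() above use the SMR-aware UMA
 * interfaces.  Freeing via uma_zfree_smr() defers actual reuse of the memory
 * until lockless readers which entered the vfs_smr section before the free
 * have left it, so such readers may still dereference a just-zapped entry;
 * they are expected to detect that case with cache_ncp_canuse().
 */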

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");

#define STATNODE_ULONG(name, varname, descr)					\
	SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define STATNODE_COUNTER(name, varname, descr)					\
	static COUNTER_U64_DEFINE_EARLY(varname);				\
	SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
	    descr);
STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
STATNODE_ULONG(count, numcache, "Number of cache entries");
STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits");
STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(posszaps, numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(negzaps, numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
/* These count for vn_getcwd(), too. */
STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");

/*
 * Debug or developer statistics.
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache debugging");
#define DEBUGNODE_ULONG(name, varname, descr)					\
	SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define DEBUGNODE_COUNTER(name, varname, descr)					\
	static COUNTER_U64_DEFINE_EARLY(varname);				\
	SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
	    descr);
DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
    "Number of successful removals after relocking");
static long zap_bucket_fail;
DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
static long zap_bucket_fail2;
DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
static long cache_lock_vnodes_cel_3_failures;
DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");

static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
    char **freebuf, size_t *buflen);
static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, size_t addend);
static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen);
static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *len, size_t addend);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

static inline void
cache_assert_vlp_locked(struct mtx *vlp)
{

	if (vlp != NULL)
		mtx_assert(vlp, MA_OWNED);
}

static inline void
cache_assert_vnode_locked(struct vnode *vp)
{
	struct mtx *vlp;

	vlp = VP2VNODELOCK(vp);
	cache_assert_vlp_locked(vlp);
}

/*
 * TODO: With the value stored we can do better than computing the hash based
 * on the address. The choice of FNV should also be revisited.
 */
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}

static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct mtx *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_OWNED);
}

static void
cache_assert_bucket_unlocked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_NOTOWNED);
}
#else
#define cache_assert_bucket_locked(x) do { } while (0)
#define cache_assert_bucket_unlocked(x) do { } while (0)
#endif

#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_lock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_unlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}
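
/*
 * The helpers above implement the documented lock order: cache_sort_vnodes()
 * sorts two lock pointers so that the lower address comes first (a NULL
 * pointer sorts first and is simply skipped), which lets cache_lock_vnodes()
 * always acquire in address order.  cache_trylock_vnodes() is the
 * opportunistic variant for callers which already hold a lock that comes
 * later in the order (e.g. a bucketlock in cache_zap_locked_bucket()) and
 * therefore must not block on vnodelocks.
 */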

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

static void
cache_recalc_neg_min(u_int val)
{

	neg_min = (ncsize * val) / 100;
}

static int
sysctl_negminpct(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int error;

	val = ncnegminpct;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (val == ncnegminpct)
		return (0);
	if (val < 0 || val > 99)
		return (EINVAL);
	ncnegminpct = val;
	cache_recalc_neg_min(val);
	return (0);
}

SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
    "I", "Negative entry % of namecache capacity above which automatic eviction is allowed");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * Various workloads create plenty of negative entries and barely use them
 * afterwards.  Moreover malicious users can keep performing bogus lookups
 * adding even more entries.  For example "make tinderbox" as of writing this
 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 * negative.
 *
 * As such, a rather aggressive eviction method is needed.  The currently
 * employed method is a placeholder.
 *
 * Entries are split over numneglists separate lists, each of which is further
 * split into hot and cold entries.  Entries get promoted after getting a hit.
 * Eviction happens on addition of new entry.
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache negative entry statistics");

SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
    "Number of negative cache entries");

static COUNTER_U64_DEFINE_EARLY(neg_created);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
    "Number of created negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evicted);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
    "Number of evicted negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
    &neg_evict_skipped_empty,
    "Number of times evicting failed due to lack of entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
    &neg_evict_skipped_missed,
    "Number of times evicting failed due to target entry disappearing");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
    &neg_evict_skipped_contended,
    "Number of times evicting failed due to contention");

SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
    "Number of cache hits (negative)");

static int
sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
{
	int i, out;

	out = 0;
	for (i = 0; i < numneglists; i++)
		out += neglists[i].nl_hotnum;

	return (SYSCTL_OUT(req, &out, sizeof(out)));
}
SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
    "Number of hot negative entries");

static void
cache_neg_init(struct namecache *ncp)
{
	struct negstate *ns;

	ncp->nc_flag |= NCF_NEGATIVE;
	ns = NCP2NEGSTATE(ncp);
	ns->neg_flag = 0;
	ns->neg_hit = 0;
	counter_u64_add(neg_created, 1);
}

#define CACHE_NEG_PROMOTION_THRESH 2

static bool
cache_neg_hit_prep(struct namecache *ncp)
{
	struct negstate *ns;
	u_char n;

	ns = NCP2NEGSTATE(ncp);
	n = atomic_load_char(&ns->neg_hit);
	for (;;) {
		if (n >= CACHE_NEG_PROMOTION_THRESH)
			return (false);
		if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
			break;
	}
	return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
}
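
/*
 * cache_neg_hit_prep() above bumps the per-entry hit counter and reports
 * whether this particular hit is the one crossing CACHE_NEG_PROMOTION_THRESH.
 * Callers react as follows: on "true" they go on to cache_neg_promote() (or
 * cache_neg_promote_cond() on the lockless path), otherwise they merely
 * account the hit with cache_neg_hit_finish().
 */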

/*
 * Nothing to do here but it is provided for completeness as some
 * cache_neg_hit_prep callers may end up returning without even
 * trying to promote.
 */
#define cache_neg_hit_abort(ncp)	do { } while (0)

static void
cache_neg_hit_finish(struct namecache *ncp)
{

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
	counter_u64_add(numneghits, 1);
}

/*
 * Move a negative entry to the hot list.
 */
static void
cache_neg_promote_locked(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	if ((ns->neg_flag & NEG_HOT) == 0) {
		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
		nl->nl_hotnum++;
		ns->neg_flag |= NEG_HOT;
	}
}

/*
 * Move a hot negative entry to the cold list.
 */
static void
cache_neg_demote_locked(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	MPASS(ns->neg_flag & NEG_HOT);
	TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	nl->nl_hotnum--;
	ns->neg_flag &= ~NEG_HOT;
	atomic_store_char(&ns->neg_hit, 0);
}

/*
 * Move a negative entry to the hot list if it matches the lookup.
 *
 * We have to take locks, but they may be contended and in the worst
 * case we may need to go off CPU. We don't want to spin within the
 * smr section and we can't block with it. Exiting the section means
 * the found entry could have been evicted. We are going to look it
 * up again.
 */
static bool
cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
    struct namecache *oncp, uint32_t hash)
{
	struct namecache *ncp;
	struct neglist *nl;
	u_char nc_flag;

	nl = NCP2NEGLIST(oncp);

	mtx_lock(&nl->nl_lock);
	/*
	 * For hash iteration.
	 */
	vfs_smr_enter();

	/*
	 * Avoid all surprises by only succeeding if we got the same entry and
	 * bailing completely otherwise.
	 * XXX There are no provisions to keep the vnode around, meaning we may
	 * end up promoting a negative entry for a *new* vnode and returning
	 * ENOENT on its account. This is the error we want to return anyway
	 * and promotion is harmless.
	 *
	 * In particular at this point there can be a new ncp which matches the
	 * search but hashes to a different neglist.
	 */
	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp == oncp)
			break;
	}

	/*
	 * No match to begin with.
	 */
	if (__predict_false(ncp == NULL)) {
		goto out_abort;
	}

	/*
	 * The newly found entry may be something different...
	 */
	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
		goto out_abort;
	}

	/*
	 * ... and not even negative.
	 */
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_NEGATIVE) == 0) {
		goto out_abort;
	}

	if (__predict_false(!cache_ncp_canuse(ncp))) {
		goto out_abort;
	}

	cache_neg_promote_locked(ncp);
	cache_neg_hit_finish(ncp);
	vfs_smr_exit();
	mtx_unlock(&nl->nl_lock);
	return (true);
out_abort:
	vfs_smr_exit();
	mtx_unlock(&nl->nl_lock);
	return (false);
}

static void
cache_neg_promote(struct namecache *ncp)
{
	struct neglist *nl;

	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	cache_neg_promote_locked(ncp);
	mtx_unlock(&nl->nl_lock);
}

static void
cache_neg_insert(struct namecache *ncp)
{
	struct neglist *nl;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	mtx_unlock(&nl->nl_lock);
	atomic_add_long(&numneg, 1);
}

static void
cache_neg_remove(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	ns = NCP2NEGSTATE(ncp);
	mtx_lock(&nl->nl_lock);
	if ((ns->neg_flag & NEG_HOT) != 0) {
		TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
		nl->nl_hotnum--;
	} else {
		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
	}
	mtx_unlock(&nl->nl_lock);
	atomic_subtract_long(&numneg, 1);
}

static struct neglist *
cache_neg_evict_select_list(void)
{
	struct neglist *nl;
	u_int c;

	c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
	nl = &neglists[c % numneglists];
	if (!mtx_trylock(&nl->nl_evict_lock)) {
		counter_u64_add(neg_evict_skipped_contended, 1);
		return (NULL);
	}
	return (nl);
}

static struct namecache *
cache_neg_evict_select_entry(struct neglist *nl)
{
	struct namecache *ncp, *lncp;
	struct negstate *ns, *lns;
	int i;

	mtx_assert(&nl->nl_evict_lock, MA_OWNED);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	ncp = TAILQ_FIRST(&nl->nl_list);
	if (ncp == NULL)
		return (NULL);
	lncp = ncp;
	lns = NCP2NEGSTATE(lncp);
	for (i = 1; i < 4; i++) {
		ncp = TAILQ_NEXT(ncp, nc_dst);
		if (ncp == NULL)
			break;
		ns = NCP2NEGSTATE(ncp);
		if (ns->neg_hit < lns->neg_hit) {
			lncp = ncp;
			lns = ns;
		}
	}
	return (lncp);
}
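
/*
 * Candidate selection above is deliberately cheap: only the first few entries
 * of the cold list are inspected and the one with the lowest hit count wins,
 * rather than maintaining an exact LRU across the whole list.
 */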

static bool
cache_neg_evict(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *nl;
	struct negstate *ns;
	struct vnode *dvp;
	struct mtx *dvlp;
	struct mtx *blp;
	uint32_t hash;
	u_char nlen;
	bool evicted;

	nl = cache_neg_evict_select_list();
	if (nl == NULL) {
		return (false);
	}

	mtx_lock(&nl->nl_lock);
	ncp = TAILQ_FIRST(&nl->nl_hotlist);
	if (ncp != NULL) {
		cache_neg_demote_locked(ncp);
	}
	ncp = cache_neg_evict_select_entry(nl);
	if (ncp == NULL) {
		counter_u64_add(neg_evict_skipped_empty, 1);
		mtx_unlock(&nl->nl_lock);
		mtx_unlock(&nl->nl_evict_lock);
		return (false);
	}
	ns = NCP2NEGSTATE(ncp);
	nlen = ncp->nc_nlen;
	dvp = ncp->nc_dvp;
	hash = cache_get_hash(ncp->nc_name, nlen, dvp);
	dvlp = VP2VNODELOCK(dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_unlock(&nl->nl_lock);
	mtx_unlock(&nl->nl_evict_lock);
	mtx_lock(dvlp);
	mtx_lock(blp);
	/*
	 * Note that since all locks were dropped above, the entry may be
	 * gone or reallocated to be something else.
	 */
	CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
		if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
		    ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
			break;
	}
	if (ncp2 == NULL) {
		counter_u64_add(neg_evict_skipped_missed, 1);
		ncp = NULL;
		evicted = false;
	} else {
		MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
		MPASS(blp == NCP2BUCKETLOCK(ncp));
		SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(neg_evicted, 1);
		evicted = true;
	}
	mtx_unlock(blp);
	mtx_unlock(dvlp);
	if (ncp != NULL)
		cache_free(ncp);
	return (evicted);
}

/*
 * Maybe evict a negative entry to create more room.
 *
 * The ncnegfactor parameter limits what fraction of the total count
 * negative entries may comprise.  However, if the cache is just
 * warming up this leads to excessive evictions.  As such, ncnegminpct
 * (recomputed to neg_min) dictates whether the above should be
 * applied.
 *
 * Try evicting if the cache is close to full capacity regardless of
 * other considerations.
 */
static bool
cache_neg_evict_cond(u_long lnumcache)
{
	u_long lnumneg;

	if (ncsize - 1000 < lnumcache)
		goto out_evict;
	lnumneg = atomic_load_long(&numneg);
	if (lnumneg < neg_min)
		return (false);
	if (lnumneg * ncnegfactor < lnumcache)
		return (false);
out_evict:
	return (cache_neg_evict());
}
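
/*
 * In other words, with the defaults (ncnegfactor 5, ncnegminpct 3), an
 * illustrative ncsize of 200000 and, say, 100000 cached entries of which
 * 30000 are negative, eviction is attempted: 30000 is above neg_min (6000)
 * and 30000 * 5 exceeds 100000.  With only 2000 negative entries nothing is
 * evicted unless the total count gets within 1000 entries of ncsize.
 */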

/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_vp);
			ncp->nc_vp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_vp);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_neg_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_dvp);
			ncp->nc_dvp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_dvp);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			counter_u64_add(numcachehv, -1);
		}
	}
	atomic_subtract_long(&numcache, 1);
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct mtx *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}
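
/*
 * The "_kl" variants above (presumably "keep (the vnode) locked") rely on the
 * caller already holding the vnode lock for vp.  cache_zap_locked_vnode_kl2()
 * additionally tracks an optional extra vnode lock passed in via *vlpp.  On a
 * "true" return the entry was zapped.  A "false" return means lock ordering
 * forced the routine to transiently drop and reacquire locks (leaving the new
 * extra lock recorded in *vlpp), so the caller must re-lookup the entry before
 * retrying; see the retry_dotdot loop in cache_remove_cnp() below.
 */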

/*
 * If trylocking failed we can get here.  We know enough to take all needed
 * locks in the right order and re-lookup the entry.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct mtx *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_unlocked(ncp);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	mtx_lock(blp);
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_bucket_relock_success, 1);
		return (0);
	}

	mtx_unlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct mtx *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	mtx_unlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct mtx *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			vn_seqc_write_begin(dvp);
			dvp->v_cache_dd = NULL;
			vn_seqc_write_end(dvp);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
		return (1);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (ncp == NULL) {
		mtx_unlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_bucket_fail++;
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
	cache_free(ncp);
	return (1);
out_no_entry:
	counter_u64_add(nummisszap, 1);
	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
	return (0);
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static int __noinline
cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct mtx *dvlp;
	enum vgetstate vs;
	int error, ltype;
	bool whiteout;

	MPASS((cnp->cn_flags & ISDOTDOT) != 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	counter_u64_add(dotdothits, 1);
retry:
	dvlp = VP2VNODELOCK(dvp);
	mtx_lock(dvlp);
	ncp = dvp->v_cache_dd;
	if (ncp == NULL) {
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
		mtx_unlock(dvlp);
		return (0);
	}
	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
		if (ncp->nc_flag & NCF_NEGATIVE)
			*vpp = NULL;
		else
			*vpp = ncp->nc_vp;
	} else
		*vpp = ncp->nc_dvp;
	if (*vpp == NULL)
		goto negative_success;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
	    NCF_DTS && tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		*tsp = ncp_ts->nc_dotdottime;
	}

	MPASS(dvp != *vpp);
	ltype = VOP_ISLOCKED(dvp);
	VOP_UNLOCK(dvp);
	vs = vget_prep(*vpp);
	mtx_unlock(dvlp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	vn_lock(dvp, ltype | LK_RETRY);
	if (VN_IS_DOOMED(dvp)) {
		if (error == 0)
			vput(*vpp);
		*vpp = NULL;
		return (ENOENT);
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			cache_zap_negative_locked_vnode_kl(ncp, dvp);
			mtx_unlock(dvlp);
			cache_free(ncp);
			return (0);
		}
	}

	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_out_ts(ncp, tsp, ticksp);
	if (cache_neg_hit_prep(ncp))
		cache_neg_promote(ncp);
	else
		cache_neg_hit_finish(ncp);
	mtx_unlock(dvlp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it will
 *		be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * Either both tsp and ticks have to be provided or neither of them.
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  vpp will not be modified.  If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
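
/*
 * A sketch of how the return value convention above is typically consumed
 * (hypothetical caller, condensed for illustration; the in-tree consumer is
 * vfs_cache_lookup(), which maps these onto VOP_LOOKUP semantics):
 *
 *	error = cache_lookup(dvp, &vp, cnp, &tsp, &ticks);
 *	switch (error) {
 *	case -1:	// positive hit, vp is referenced and locked
 *	case ENOENT:	// negative hit (or doomed dvp), no vnode returned
 *	case 0:		// miss, fall back to the filesystem lookup
 *	}
 */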
static int __noinline
cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct mtx *blp;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout;

	MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY);

retry:
	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		mtx_unlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	vs = vget_prep(*vpp);
	mtx_unlock(blp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
			if (__predict_false(error != 0)) {
				zap_bucket_fail2++;
				goto retry;
			}
			cache_free(ncp);
			return (0);
		}
	}

	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_out_ts(ncp, tsp, ticksp);
	if (cache_neg_hit_prep(ncp))
		cache_neg_promote(ncp);
	else
		cache_neg_hit_finish(ncp);
	mtx_unlock(blp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}
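
/*
 * The locked variant above is the fallback for cache_lookup() below, which
 * first attempts a completely lockless pass: the hash chain is walked within
 * a vfs_smr section, the result is validated with cache_ncp_canuse() and the
 * vnode is grabbed with vget_prep_smr().  Any hiccup (entry being modified,
 * vnode reference not obtainable, promotion losing a race) abandons the fast
 * path and retries via cache_lookup_fallback() instead of looping in SMR.
 */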
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout, neg_promote;
	u_short nc_flag;

	MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
			return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
	}

	MPASS((cnp->cn_flags & ISDOTDOT) == 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	vfs_smr_enter();

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		vfs_smr_exit();
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	nc_flag = atomic_load_char(&ncp->nc_flag);
	if (nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	if (!cache_ncp_canuse(ncp)) {
		vfs_smr_exit();
		*vpp = NULL;
		goto out_fallback;
	}
	vs = vget_prep_smr(*vpp);
	vfs_smr_exit();
	if (__predict_false(vs == VGET_NONE)) {
		*vpp = NULL;
		goto out_fallback;
	}
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto out_fallback;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			vfs_smr_exit();
			goto out_fallback;
		}
	}

	cache_out_ts(ncp, tsp, ticksp);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	neg_promote = cache_neg_hit_prep(ncp);
	if (__predict_false(!cache_ncp_canuse(ncp))) {
		cache_neg_hit_abort(ncp);
		vfs_smr_exit();
		goto out_fallback;
	}
	if (neg_promote) {
		vfs_smr_exit();
		if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
			goto out_fallback;
	} else {
		cache_neg_hit_finish(ncp);
		vfs_smr_exit();
	}
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
out_fallback:
	return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
}

struct celockstate {
	struct mtx *vlp[3];
	struct mtx *blp[2];
};
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));

static inline void
cache_celockstate_init(struct celockstate *cel)
{

	bzero(cel, sizeof(*cel));
}

static void
cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
    struct vnode *dvp)
{
	struct mtx *vlp1, *vlp2;

	MPASS(cel->vlp[0] == NULL);
	MPASS(cel->vlp[1] == NULL);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL || dvp != NULL);

	vlp1 = VP2VNODELOCK(vp);
	vlp2 = VP2VNODELOCK(dvp);
	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		mtx_lock(vlp1);
		cel->vlp[0] = vlp1;
	}
	mtx_lock(vlp2);
	cel->vlp[1] = vlp2;
}

static void
cache_unlock_vnodes_cel(struct celockstate *cel)
{

	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);

	if (cel->vlp[0] != NULL)
		mtx_unlock(cel->vlp[0]);
	if (cel->vlp[1] != NULL)
		mtx_unlock(cel->vlp[1]);
	if (cel->vlp[2] != NULL)
		mtx_unlock(cel->vlp[2]);
}
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1904 { 1905 struct mtx *vlp; 1906 bool ret; 1907 1908 cache_assert_vlp_locked(cel->vlp[0]); 1909 cache_assert_vlp_locked(cel->vlp[1]); 1910 MPASS(cel->vlp[2] == NULL); 1911 1912 MPASS(vp != NULL); 1913 vlp = VP2VNODELOCK(vp); 1914 1915 ret = true; 1916 if (vlp >= cel->vlp[1]) { 1917 mtx_lock(vlp); 1918 } else { 1919 if (mtx_trylock(vlp)) 1920 goto out; 1921 cache_lock_vnodes_cel_3_failures++; 1922 cache_unlock_vnodes_cel(cel); 1923 if (vlp < cel->vlp[0]) { 1924 mtx_lock(vlp); 1925 mtx_lock(cel->vlp[0]); 1926 mtx_lock(cel->vlp[1]); 1927 } else { 1928 if (cel->vlp[0] != NULL) 1929 mtx_lock(cel->vlp[0]); 1930 mtx_lock(vlp); 1931 mtx_lock(cel->vlp[1]); 1932 } 1933 ret = false; 1934 } 1935 out: 1936 cel->vlp[2] = vlp; 1937 return (ret); 1938 } 1939 1940 static void 1941 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 1942 struct mtx *blp2) 1943 { 1944 1945 MPASS(cel->blp[0] == NULL); 1946 MPASS(cel->blp[1] == NULL); 1947 1948 cache_sort_vnodes(&blp1, &blp2); 1949 1950 if (blp1 != NULL) { 1951 mtx_lock(blp1); 1952 cel->blp[0] = blp1; 1953 } 1954 mtx_lock(blp2); 1955 cel->blp[1] = blp2; 1956 } 1957 1958 static void 1959 cache_unlock_buckets_cel(struct celockstate *cel) 1960 { 1961 1962 if (cel->blp[0] != NULL) 1963 mtx_unlock(cel->blp[0]); 1964 mtx_unlock(cel->blp[1]); 1965 } 1966 1967 /* 1968 * Lock part of the cache affected by the insertion. 1969 * 1970 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1971 * However, insertion can result in removal of an old entry. In this 1972 * case we have an additional vnode and bucketlock pair to lock. 1973 * 1974 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1975 * preserving the locking order (smaller address first). 1976 */ 1977 static void 1978 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1979 uint32_t hash) 1980 { 1981 struct namecache *ncp; 1982 struct mtx *blps[2]; 1983 1984 blps[0] = HASH2BUCKETLOCK(hash); 1985 for (;;) { 1986 blps[1] = NULL; 1987 cache_lock_vnodes_cel(cel, dvp, vp); 1988 if (vp == NULL || vp->v_type != VDIR) 1989 break; 1990 ncp = vp->v_cache_dd; 1991 if (ncp == NULL) 1992 break; 1993 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1994 break; 1995 MPASS(ncp->nc_dvp == vp); 1996 blps[1] = NCP2BUCKETLOCK(ncp); 1997 if (ncp->nc_flag & NCF_NEGATIVE) 1998 break; 1999 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2000 break; 2001 /* 2002 * All vnodes got re-locked. Re-validate the state and if 2003 * nothing changed we are done. Otherwise restart. 
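 * Re-validation is needed because cache_lock_vnodes_cel_3() reporting
 * failure means the previously held locks were dropped and re-acquired,
 * opening a window in which another thread may have replaced or removed
 * the ".." entry hanging off vp.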
2004 */ 2005 if (ncp == vp->v_cache_dd && 2006 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2007 blps[1] == NCP2BUCKETLOCK(ncp) && 2008 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2009 break; 2010 cache_unlock_vnodes_cel(cel); 2011 cel->vlp[0] = NULL; 2012 cel->vlp[1] = NULL; 2013 cel->vlp[2] = NULL; 2014 } 2015 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2016 } 2017 2018 static void 2019 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2020 uint32_t hash) 2021 { 2022 struct namecache *ncp; 2023 struct mtx *blps[2]; 2024 2025 blps[0] = HASH2BUCKETLOCK(hash); 2026 for (;;) { 2027 blps[1] = NULL; 2028 cache_lock_vnodes_cel(cel, dvp, vp); 2029 ncp = dvp->v_cache_dd; 2030 if (ncp == NULL) 2031 break; 2032 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2033 break; 2034 MPASS(ncp->nc_dvp == dvp); 2035 blps[1] = NCP2BUCKETLOCK(ncp); 2036 if (ncp->nc_flag & NCF_NEGATIVE) 2037 break; 2038 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2039 break; 2040 if (ncp == dvp->v_cache_dd && 2041 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2042 blps[1] == NCP2BUCKETLOCK(ncp) && 2043 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2044 break; 2045 cache_unlock_vnodes_cel(cel); 2046 cel->vlp[0] = NULL; 2047 cel->vlp[1] = NULL; 2048 cel->vlp[2] = NULL; 2049 } 2050 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2051 } 2052 2053 static void 2054 cache_enter_unlock(struct celockstate *cel) 2055 { 2056 2057 cache_unlock_buckets_cel(cel); 2058 cache_unlock_vnodes_cel(cel); 2059 } 2060 2061 static void __noinline 2062 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 2063 struct componentname *cnp) 2064 { 2065 struct celockstate cel; 2066 struct namecache *ncp; 2067 uint32_t hash; 2068 int len; 2069 2070 if (dvp->v_cache_dd == NULL) 2071 return; 2072 len = cnp->cn_namelen; 2073 cache_celockstate_init(&cel); 2074 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2075 cache_enter_lock_dd(&cel, dvp, vp, hash); 2076 vn_seqc_write_begin(dvp); 2077 ncp = dvp->v_cache_dd; 2078 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 2079 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 2080 cache_zap_locked(ncp); 2081 } else { 2082 ncp = NULL; 2083 } 2084 dvp->v_cache_dd = NULL; 2085 vn_seqc_write_end(dvp); 2086 cache_enter_unlock(&cel); 2087 if (ncp != NULL) 2088 cache_free(ncp); 2089 } 2090 2091 /* 2092 * Add an entry to the cache. 2093 */ 2094 void 2095 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2096 struct timespec *tsp, struct timespec *dtsp) 2097 { 2098 struct celockstate cel; 2099 struct namecache *ncp, *n2, *ndd; 2100 struct namecache_ts *ncp_ts; 2101 struct nchashhead *ncpp; 2102 uint32_t hash; 2103 int flag; 2104 int len; 2105 u_long lnumcache; 2106 2107 VNPASS(!VN_IS_DOOMED(dvp), dvp); 2108 VNPASS(dvp->v_type != VNON, dvp); 2109 if (vp != NULL) { 2110 VNPASS(!VN_IS_DOOMED(vp), vp); 2111 VNPASS(vp->v_type != VNON, vp); 2112 } 2113 2114 #ifdef DEBUG_CACHE 2115 if (__predict_false(!doingcache)) 2116 return; 2117 #endif 2118 2119 flag = 0; 2120 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2121 if (cnp->cn_namelen == 1) 2122 return; 2123 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 2124 cache_enter_dotdot_prep(dvp, vp, cnp); 2125 flag = NCF_ISDOTDOT; 2126 } 2127 } 2128 2129 /* 2130 * Avoid blowout in namecache entries. 2131 * 2132 * Bugs: 2133 * 1. 
filesystems may end up trying to add an already existing entry 2134 * (for example this can happen after a cache miss during concurrent 2135 * lookup), in which case we will call cache_neg_evict despite not 2136 * adding anything. 2137 * 2. the routine may fail to free anything and no provisions are made 2138 * to make it try harder (see the inside for failure modes) 2139 * 3. it only ever looks at negative entries. 2140 */ 2141 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 2142 if (cache_neg_evict_cond(lnumcache)) { 2143 lnumcache = atomic_load_long(&numcache); 2144 } 2145 if (__predict_false(lnumcache >= ncsize)) { 2146 atomic_subtract_long(&numcache, 1); 2147 counter_u64_add(numdrops, 1); 2148 return; 2149 } 2150 2151 cache_celockstate_init(&cel); 2152 ndd = NULL; 2153 ncp_ts = NULL; 2154 2155 /* 2156 * Calculate the hash key and setup as much of the new 2157 * namecache entry as possible before acquiring the lock. 2158 */ 2159 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 2160 ncp->nc_flag = flag | NCF_WIP; 2161 ncp->nc_vp = vp; 2162 if (vp == NULL) 2163 cache_neg_init(ncp); 2164 ncp->nc_dvp = dvp; 2165 if (tsp != NULL) { 2166 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 2167 ncp_ts->nc_time = *tsp; 2168 ncp_ts->nc_ticks = ticks; 2169 ncp_ts->nc_nc.nc_flag |= NCF_TS; 2170 if (dtsp != NULL) { 2171 ncp_ts->nc_dotdottime = *dtsp; 2172 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 2173 } 2174 } 2175 len = ncp->nc_nlen = cnp->cn_namelen; 2176 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2177 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 2178 ncp->nc_name[len] = '\0'; 2179 cache_enter_lock(&cel, dvp, vp, hash); 2180 2181 /* 2182 * See if this vnode or negative entry is already in the cache 2183 * with this name. This can happen with concurrent lookups of 2184 * the same path name. 2185 */ 2186 ncpp = NCHHASH(hash); 2187 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 2188 if (n2->nc_dvp == dvp && 2189 n2->nc_nlen == cnp->cn_namelen && 2190 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 2191 MPASS(cache_ncp_canuse(n2)); 2192 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 2193 KASSERT(vp == NULL, 2194 ("%s: found entry pointing to a different vnode (%p != %p)", 2195 __func__, NULL, vp)); 2196 else 2197 KASSERT(n2->nc_vp == vp, 2198 ("%s: found entry pointing to a different vnode (%p != %p)", 2199 __func__, n2->nc_vp, vp)); 2200 /* 2201 * Entries are supposed to be immutable unless in the 2202 * process of getting destroyed. Accommodating 2203 * changing timestamps is possible but not worth it. 2204 * This should be harmless in terms of correctness, in 2205 * the worst case resulting in an earlier expiration. 2206 * Alternatively, the found entry can be replaced 2207 * altogether. 2208 */ 2209 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 2210 #if 0 2211 if (tsp != NULL) { 2212 KASSERT((n2->nc_flag & NCF_TS) != 0, 2213 ("no NCF_TS")); 2214 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 2215 n2_ts->nc_time = ncp_ts->nc_time; 2216 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2217 if (dtsp != NULL) { 2218 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2219 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2220 } 2221 } 2222 #endif 2223 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, 2224 vp); 2225 goto out_unlock_free; 2226 } 2227 } 2228 2229 if (flag == NCF_ISDOTDOT) { 2230 /* 2231 * See if we are trying to add .. entry, but some other lookup 2232 * has populated v_cache_dd pointer already. 
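 * Losing that race is harmless: the existing ".." entry serves the same
 * purpose, so the freshly allocated one is simply dropped via the
 * out_unlock_free path below.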
2233 */ 2234 if (dvp->v_cache_dd != NULL) 2235 goto out_unlock_free; 2236 KASSERT(vp == NULL || vp->v_type == VDIR, 2237 ("wrong vnode type %p", vp)); 2238 vn_seqc_write_begin(dvp); 2239 dvp->v_cache_dd = ncp; 2240 vn_seqc_write_end(dvp); 2241 } 2242 2243 if (vp != NULL) { 2244 if (flag != NCF_ISDOTDOT) { 2245 /* 2246 * For this case, the cache entry maps both the 2247 * directory name in it and the name ".." for the 2248 * directory's parent. 2249 */ 2250 vn_seqc_write_begin(vp); 2251 if ((ndd = vp->v_cache_dd) != NULL) { 2252 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2253 cache_zap_locked(ndd); 2254 else 2255 ndd = NULL; 2256 } 2257 vp->v_cache_dd = ncp; 2258 vn_seqc_write_end(vp); 2259 } else if (vp->v_type != VDIR) { 2260 if (vp->v_cache_dd != NULL) { 2261 vn_seqc_write_begin(vp); 2262 vp->v_cache_dd = NULL; 2263 vn_seqc_write_end(vp); 2264 } 2265 } 2266 } 2267 2268 if (flag != NCF_ISDOTDOT) { 2269 if (LIST_EMPTY(&dvp->v_cache_src)) { 2270 vhold(dvp); 2271 counter_u64_add(numcachehv, 1); 2272 } 2273 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2274 } 2275 2276 /* 2277 * If the entry is "negative", we place it into the 2278 * "negative" cache queue, otherwise, we place it into the 2279 * destination vnode's cache entries queue. 2280 */ 2281 if (vp != NULL) { 2282 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2283 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2284 vp); 2285 } else { 2286 if (cnp->cn_flags & ISWHITEOUT) 2287 ncp->nc_flag |= NCF_WHITE; 2288 cache_neg_insert(ncp); 2289 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2290 ncp->nc_name); 2291 } 2292 2293 /* 2294 * Insert the new namecache entry into the appropriate chain 2295 * within the cache entries table. 2296 */ 2297 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2298 2299 atomic_thread_fence_rel(); 2300 /* 2301 * Mark the entry as fully constructed. 2302 * It is immutable past this point until its removal. 
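 * The release fence above is expected to pair with the acquire ordering
 * used by lookups checking cache_ncp_canuse(), so that an observer which
 * sees NCF_WIP cleared also sees the fully initialized entry.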
2303 */ 2304 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2305 2306 cache_enter_unlock(&cel); 2307 if (ndd != NULL) 2308 cache_free(ndd); 2309 return; 2310 out_unlock_free: 2311 cache_enter_unlock(&cel); 2312 atomic_subtract_long(&numcache, 1); 2313 cache_free(ncp); 2314 return; 2315 } 2316 2317 static u_int 2318 cache_roundup_2(u_int val) 2319 { 2320 u_int res; 2321 2322 for (res = 1; res <= val; res <<= 1) 2323 continue; 2324 2325 return (res); 2326 } 2327 2328 static struct nchashhead * 2329 nchinittbl(u_long elements, u_long *hashmask) 2330 { 2331 struct nchashhead *hashtbl; 2332 u_long hashsize, i; 2333 2334 hashsize = cache_roundup_2(elements) / 2; 2335 2336 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2337 for (i = 0; i < hashsize; i++) 2338 CK_SLIST_INIT(&hashtbl[i]); 2339 *hashmask = hashsize - 1; 2340 return (hashtbl); 2341 } 2342 2343 static void 2344 ncfreetbl(struct nchashhead *hashtbl) 2345 { 2346 2347 free(hashtbl, M_VFSCACHE); 2348 } 2349 2350 /* 2351 * Name cache initialization, from vfs_init() when we are booting 2352 */ 2353 static void 2354 nchinit(void *dummy __unused) 2355 { 2356 u_int i; 2357 2358 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2359 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2360 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2361 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2362 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2363 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2364 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2365 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2366 2367 VFS_SMR_ZONE_SET(cache_zone_small); 2368 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2369 VFS_SMR_ZONE_SET(cache_zone_large); 2370 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2371 2372 ncsize = desiredvnodes * ncsizefactor; 2373 cache_recalc_neg_min(ncnegminpct); 2374 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2375 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2376 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2377 ncbuckethash = 7; 2378 if (ncbuckethash > nchash) 2379 ncbuckethash = nchash; 2380 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2381 M_WAITOK | M_ZERO); 2382 for (i = 0; i < numbucketlocks; i++) 2383 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2384 ncvnodehash = ncbuckethash; 2385 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2386 M_WAITOK | M_ZERO); 2387 for (i = 0; i < numvnodelocks; i++) 2388 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2389 2390 for (i = 0; i < numneglists; i++) { 2391 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); 2392 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2393 TAILQ_INIT(&neglists[i].nl_list); 2394 TAILQ_INIT(&neglists[i].nl_hotlist); 2395 } 2396 } 2397 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2398 2399 void 2400 cache_vnode_init(struct vnode *vp) 2401 { 2402 2403 LIST_INIT(&vp->v_cache_src); 2404 TAILQ_INIT(&vp->v_cache_dst); 2405 vp->v_cache_dd = NULL; 2406 cache_prehash(vp); 2407 } 2408 2409 void 2410 cache_changesize(u_long newmaxvnodes) 2411 { 2412 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2413 u_long new_nchash, old_nchash; 2414 struct namecache *ncp; 2415 uint32_t hash; 2416 u_long newncsize; 2417 int i; 2418 2419 newncsize = 
newmaxvnodes * ncsizefactor; 2420 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2421 if (newmaxvnodes < numbucketlocks) 2422 newmaxvnodes = numbucketlocks; 2423 2424 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2425 /* If same hash table size, nothing to do */ 2426 if (nchash == new_nchash) { 2427 ncfreetbl(new_nchashtbl); 2428 return; 2429 } 2430 /* 2431 * Move everything from the old hash table to the new table. 2432 * None of the namecache entries in the table can be removed 2433 * because to do so, they have to be removed from the hash table. 2434 */ 2435 cache_lock_all_vnodes(); 2436 cache_lock_all_buckets(); 2437 old_nchashtbl = nchashtbl; 2438 old_nchash = nchash; 2439 nchashtbl = new_nchashtbl; 2440 nchash = new_nchash; 2441 for (i = 0; i <= old_nchash; i++) { 2442 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2443 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2444 ncp->nc_dvp); 2445 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2446 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2447 } 2448 } 2449 ncsize = newncsize; 2450 cache_recalc_neg_min(ncnegminpct); 2451 cache_unlock_all_buckets(); 2452 cache_unlock_all_vnodes(); 2453 ncfreetbl(old_nchashtbl); 2454 } 2455 2456 /* 2457 * Invalidate all entries from and to a particular vnode. 2458 */ 2459 static void 2460 cache_purge_impl(struct vnode *vp) 2461 { 2462 TAILQ_HEAD(, namecache) ncps; 2463 struct namecache *ncp, *nnp; 2464 struct mtx *vlp, *vlp2; 2465 2466 TAILQ_INIT(&ncps); 2467 vlp = VP2VNODELOCK(vp); 2468 vlp2 = NULL; 2469 mtx_lock(vlp); 2470 retry: 2471 while (!LIST_EMPTY(&vp->v_cache_src)) { 2472 ncp = LIST_FIRST(&vp->v_cache_src); 2473 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2474 goto retry; 2475 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2476 } 2477 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2478 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2479 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2480 goto retry; 2481 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2482 } 2483 ncp = vp->v_cache_dd; 2484 if (ncp != NULL) { 2485 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2486 ("lost dotdot link")); 2487 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2488 goto retry; 2489 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2490 } 2491 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2492 mtx_unlock(vlp); 2493 if (vlp2 != NULL) 2494 mtx_unlock(vlp2); 2495 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2496 cache_free(ncp); 2497 } 2498 } 2499 2500 /* 2501 * Opportunistic check to see if there is anything to do. 2502 */ 2503 static bool 2504 cache_has_entries(struct vnode *vp) 2505 { 2506 2507 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2508 vp->v_cache_dd == NULL) 2509 return (false); 2510 return (true); 2511 } 2512 2513 void 2514 cache_purge(struct vnode *vp) 2515 { 2516 2517 SDT_PROBE1(vfs, namecache, purge, done, vp); 2518 if (!cache_has_entries(vp)) 2519 return; 2520 cache_purge_impl(vp); 2521 } 2522 2523 /* 2524 * Only to be used by vgone. 2525 */ 2526 void 2527 cache_purge_vgone(struct vnode *vp) 2528 { 2529 struct mtx *vlp; 2530 2531 VNPASS(VN_IS_DOOMED(vp), vp); 2532 if (cache_has_entries(vp)) { 2533 cache_purge_impl(vp); 2534 return; 2535 } 2536 2537 /* 2538 * Serialize against a potential thread doing cache_purge. 2539 */ 2540 vlp = VP2VNODELOCK(vp); 2541 mtx_wait_unlocked(vlp); 2542 if (cache_has_entries(vp)) { 2543 cache_purge_impl(vp); 2544 return; 2545 } 2546 return; 2547 } 2548 2549 /* 2550 * Invalidate all negative entries for a particular directory vnode. 
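 * Positive entries hanging off the directory are left intact; the negative
 * ones are unlinked under the vnode lock and freed only after it is
 * dropped.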
2551 */ 2552 void 2553 cache_purge_negative(struct vnode *vp) 2554 { 2555 TAILQ_HEAD(, namecache) ncps; 2556 struct namecache *ncp, *nnp; 2557 struct mtx *vlp; 2558 2559 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2560 if (LIST_EMPTY(&vp->v_cache_src)) 2561 return; 2562 TAILQ_INIT(&ncps); 2563 vlp = VP2VNODELOCK(vp); 2564 mtx_lock(vlp); 2565 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2566 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2567 continue; 2568 cache_zap_negative_locked_vnode_kl(ncp, vp); 2569 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2570 } 2571 mtx_unlock(vlp); 2572 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2573 cache_free(ncp); 2574 } 2575 } 2576 2577 void 2578 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2579 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2580 { 2581 2582 ASSERT_VOP_IN_SEQC(fdvp); 2583 ASSERT_VOP_IN_SEQC(fvp); 2584 ASSERT_VOP_IN_SEQC(tdvp); 2585 if (tvp != NULL) 2586 ASSERT_VOP_IN_SEQC(tvp); 2587 2588 cache_purge(fvp); 2589 if (tvp != NULL) { 2590 cache_purge(tvp); 2591 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2592 ("%s: lingering negative entry", __func__)); 2593 } else { 2594 cache_remove_cnp(tdvp, tcnp); 2595 } 2596 } 2597 2598 /* 2599 * Flush all entries referencing a particular filesystem. 2600 */ 2601 void 2602 cache_purgevfs(struct mount *mp) 2603 { 2604 struct vnode *vp, *mvp; 2605 2606 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2607 /* 2608 * Somewhat wasteful iteration over all vnodes. Would be better to 2609 * support filtering and avoid the interlock to begin with. 2610 */ 2611 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2612 if (!cache_has_entries(vp)) { 2613 VI_UNLOCK(vp); 2614 continue; 2615 } 2616 vholdl(vp); 2617 VI_UNLOCK(vp); 2618 cache_purge(vp); 2619 vdrop(vp); 2620 } 2621 } 2622 2623 /* 2624 * Perform canonical checks and cache lookup and pass on to filesystem 2625 * through the vop_cachedlookup only if needed. 2626 */ 2627 2628 int 2629 vfs_cache_lookup(struct vop_lookup_args *ap) 2630 { 2631 struct vnode *dvp; 2632 int error; 2633 struct vnode **vpp = ap->a_vpp; 2634 struct componentname *cnp = ap->a_cnp; 2635 int flags = cnp->cn_flags; 2636 2637 *vpp = NULL; 2638 dvp = ap->a_dvp; 2639 2640 if (dvp->v_type != VDIR) 2641 return (ENOTDIR); 2642 2643 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2644 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2645 return (EROFS); 2646 2647 error = vn_dir_check_exec(dvp, cnp); 2648 if (error != 0) 2649 return (error); 2650 2651 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2652 if (error == 0) 2653 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2654 if (error == -1) 2655 return (0); 2656 return (error); 2657 } 2658 2659 /* Implementation of the getcwd syscall. 
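 * The heavy lifting is done by vn_getcwd() below: it first attempts the
 * lockless (SMR) resolution and only falls back to the locked variant if
 * that returns a negative error.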
*/ 2660 int 2661 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2662 { 2663 char *buf, *retbuf; 2664 size_t buflen; 2665 int error; 2666 2667 buflen = uap->buflen; 2668 if (__predict_false(buflen < 2)) 2669 return (EINVAL); 2670 if (buflen > MAXPATHLEN) 2671 buflen = MAXPATHLEN; 2672 2673 buf = uma_zalloc(namei_zone, M_WAITOK); 2674 error = vn_getcwd(buf, &retbuf, &buflen); 2675 if (error == 0) 2676 error = copyout(retbuf, uap->buf, buflen); 2677 uma_zfree(namei_zone, buf); 2678 return (error); 2679 } 2680 2681 int 2682 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2683 { 2684 struct pwd *pwd; 2685 int error; 2686 2687 vfs_smr_enter(); 2688 pwd = pwd_get_smr(); 2689 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2690 buflen, 0); 2691 VFS_SMR_ASSERT_NOT_ENTERED(); 2692 if (error < 0) { 2693 pwd = pwd_hold(curthread); 2694 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2695 retbuf, buflen); 2696 pwd_drop(pwd); 2697 } 2698 2699 #ifdef KTRACE 2700 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2701 ktrnamei(*retbuf); 2702 #endif 2703 return (error); 2704 } 2705 2706 static int 2707 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2708 size_t size, int flags, enum uio_seg pathseg) 2709 { 2710 struct nameidata nd; 2711 char *retbuf, *freebuf; 2712 int error; 2713 2714 if (flags != 0) 2715 return (EINVAL); 2716 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2717 pathseg, path, fd, &cap_fstat_rights, td); 2718 if ((error = namei(&nd)) != 0) 2719 return (error); 2720 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2721 if (error == 0) { 2722 error = copyout(retbuf, buf, size); 2723 free(freebuf, M_TEMP); 2724 } 2725 NDFREE(&nd, 0); 2726 return (error); 2727 } 2728 2729 int 2730 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2731 { 2732 2733 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2734 uap->flags, UIO_USERSPACE)); 2735 } 2736 2737 /* 2738 * Retrieve the full filesystem path that corresponds to a vnode from the name 2739 * cache (if available). 2740 */ 2741 int 2742 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2743 { 2744 struct pwd *pwd; 2745 char *buf; 2746 size_t buflen; 2747 int error; 2748 2749 if (__predict_false(vp == NULL)) 2750 return (EINVAL); 2751 2752 buflen = MAXPATHLEN; 2753 buf = malloc(buflen, M_TEMP, M_WAITOK); 2754 vfs_smr_enter(); 2755 pwd = pwd_get_smr(); 2756 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); 2757 VFS_SMR_ASSERT_NOT_ENTERED(); 2758 if (error < 0) { 2759 pwd = pwd_hold(curthread); 2760 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); 2761 pwd_drop(pwd); 2762 } 2763 if (error == 0) 2764 *freebuf = buf; 2765 else 2766 free(buf, M_TEMP); 2767 return (error); 2768 } 2769 2770 /* 2771 * This function is similar to vn_fullpath, but it attempts to look up the 2772 * pathname relative to the global root mount point. This is required for the 2773 * auditing sub-system, as audited pathnames must be absolute, relative to the 2774 * global root mount point. 
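 * The only practical difference from vn_fullpath() is that the reverse walk
 * terminates at rootvnode instead of the process's pwd_rdir, so the result
 * is not affected by chroot.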
2775 */ 2776 int 2777 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2778 { 2779 char *buf; 2780 size_t buflen; 2781 int error; 2782 2783 if (__predict_false(vp == NULL)) 2784 return (EINVAL); 2785 buflen = MAXPATHLEN; 2786 buf = malloc(buflen, M_TEMP, M_WAITOK); 2787 vfs_smr_enter(); 2788 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); 2789 VFS_SMR_ASSERT_NOT_ENTERED(); 2790 if (error < 0) { 2791 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2792 } 2793 if (error == 0) 2794 *freebuf = buf; 2795 else 2796 free(buf, M_TEMP); 2797 return (error); 2798 } 2799 2800 static struct namecache * 2801 vn_dd_from_dst(struct vnode *vp) 2802 { 2803 struct namecache *ncp; 2804 2805 cache_assert_vnode_locked(vp); 2806 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2807 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2808 return (ncp); 2809 } 2810 return (NULL); 2811 } 2812 2813 int 2814 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) 2815 { 2816 struct vnode *dvp; 2817 struct namecache *ncp; 2818 struct mtx *vlp; 2819 int error; 2820 2821 vlp = VP2VNODELOCK(*vp); 2822 mtx_lock(vlp); 2823 ncp = (*vp)->v_cache_dd; 2824 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 2825 KASSERT(ncp == vn_dd_from_dst(*vp), 2826 ("%s: mismatch for dd entry (%p != %p)", __func__, 2827 ncp, vn_dd_from_dst(*vp))); 2828 } else { 2829 ncp = vn_dd_from_dst(*vp); 2830 } 2831 if (ncp != NULL) { 2832 if (*buflen < ncp->nc_nlen) { 2833 mtx_unlock(vlp); 2834 vrele(*vp); 2835 counter_u64_add(numfullpathfail4, 1); 2836 error = ENOMEM; 2837 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2838 vp, NULL); 2839 return (error); 2840 } 2841 *buflen -= ncp->nc_nlen; 2842 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2843 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2844 ncp->nc_name, vp); 2845 dvp = *vp; 2846 *vp = ncp->nc_dvp; 2847 vref(*vp); 2848 mtx_unlock(vlp); 2849 vrele(dvp); 2850 return (0); 2851 } 2852 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2853 2854 mtx_unlock(vlp); 2855 vn_lock(*vp, LK_SHARED | LK_RETRY); 2856 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); 2857 vput(*vp); 2858 if (error) { 2859 counter_u64_add(numfullpathfail2, 1); 2860 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2861 return (error); 2862 } 2863 2864 *vp = dvp; 2865 if (VN_IS_DOOMED(dvp)) { 2866 /* forced unmount */ 2867 vrele(dvp); 2868 error = ENOENT; 2869 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2870 return (error); 2871 } 2872 /* 2873 * *vp has its use count incremented still. 2874 */ 2875 2876 return (0); 2877 } 2878 2879 /* 2880 * Resolve a directory to a pathname. 2881 * 2882 * The name of the directory can always be found in the namecache or fetched 2883 * from the filesystem. There is also guaranteed to be only one parent, meaning 2884 * we can just follow vnodes up until we find the root. 2885 * 2886 * The vnode must be referenced. 
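 * The reference is consumed: it is either handed off while walking towards
 * the root or released before returning. The path is assembled backwards,
 * starting at the end of the provided buffer.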
2887 */ 2888 static int 2889 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2890 size_t *len, size_t addend) 2891 { 2892 #ifdef KDTRACE_HOOKS 2893 struct vnode *startvp = vp; 2894 #endif 2895 struct vnode *vp1; 2896 size_t buflen; 2897 int error; 2898 bool slash_prefixed; 2899 2900 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2901 VNPASS(vp->v_usecount > 0, vp); 2902 2903 buflen = *len; 2904 2905 slash_prefixed = true; 2906 if (addend == 0) { 2907 MPASS(*len >= 2); 2908 buflen--; 2909 buf[buflen] = '\0'; 2910 slash_prefixed = false; 2911 } 2912 2913 error = 0; 2914 2915 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2916 counter_u64_add(numfullpathcalls, 1); 2917 while (vp != rdir && vp != rootvnode) { 2918 /* 2919 * The vp vnode must be already fully constructed, 2920 * since it is either found in namecache or obtained 2921 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2922 * without obtaining the vnode lock. 2923 */ 2924 if ((vp->v_vflag & VV_ROOT) != 0) { 2925 vn_lock(vp, LK_RETRY | LK_SHARED); 2926 2927 /* 2928 * With the vnode locked, check for races with 2929 * unmount, forced or not. Note that we 2930 * already verified that vp is not equal to 2931 * the root vnode, which means that 2932 * mnt_vnodecovered can be NULL only for the 2933 * case of unmount. 2934 */ 2935 if (VN_IS_DOOMED(vp) || 2936 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2937 vp1->v_mountedhere != vp->v_mount) { 2938 vput(vp); 2939 error = ENOENT; 2940 SDT_PROBE3(vfs, namecache, fullpath, return, 2941 error, vp, NULL); 2942 break; 2943 } 2944 2945 vref(vp1); 2946 vput(vp); 2947 vp = vp1; 2948 continue; 2949 } 2950 if (vp->v_type != VDIR) { 2951 vrele(vp); 2952 counter_u64_add(numfullpathfail1, 1); 2953 error = ENOTDIR; 2954 SDT_PROBE3(vfs, namecache, fullpath, return, 2955 error, vp, NULL); 2956 break; 2957 } 2958 error = vn_vptocnp(&vp, buf, &buflen); 2959 if (error) 2960 break; 2961 if (buflen == 0) { 2962 vrele(vp); 2963 error = ENOMEM; 2964 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2965 startvp, NULL); 2966 break; 2967 } 2968 buf[--buflen] = '/'; 2969 slash_prefixed = true; 2970 } 2971 if (error) 2972 return (error); 2973 if (!slash_prefixed) { 2974 if (buflen == 0) { 2975 vrele(vp); 2976 counter_u64_add(numfullpathfail4, 1); 2977 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2978 startvp, NULL); 2979 return (ENOMEM); 2980 } 2981 buf[--buflen] = '/'; 2982 } 2983 counter_u64_add(numfullpathfound, 1); 2984 vrele(vp); 2985 2986 *retbuf = buf + buflen; 2987 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2988 *len -= buflen; 2989 *len += addend; 2990 return (0); 2991 } 2992 2993 /* 2994 * Resolve an arbitrary vnode to a pathname. 
2995 * 2996 * Note 2 caveats: 2997 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2998 * resolve to a different path than the one used to find it 2999 * - namecache is not mandatory, meaning names are not guaranteed to be added 3000 * (in which case resolving fails) 3001 */ 3002 static void __inline 3003 cache_rev_failed_impl(int *reason, int line) 3004 { 3005 3006 *reason = line; 3007 } 3008 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 3009 3010 static int 3011 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 3012 char **retbuf, size_t *buflen, size_t addend) 3013 { 3014 #ifdef KDTRACE_HOOKS 3015 struct vnode *startvp = vp; 3016 #endif 3017 struct vnode *tvp; 3018 struct mount *mp; 3019 struct namecache *ncp; 3020 size_t orig_buflen; 3021 int reason; 3022 int error; 3023 #ifdef KDTRACE_HOOKS 3024 int i; 3025 #endif 3026 seqc_t vp_seqc, tvp_seqc; 3027 u_char nc_flag; 3028 3029 VFS_SMR_ASSERT_ENTERED(); 3030 3031 if (!cache_fast_revlookup) { 3032 vfs_smr_exit(); 3033 return (-1); 3034 } 3035 3036 orig_buflen = *buflen; 3037 3038 if (addend == 0) { 3039 MPASS(*buflen >= 2); 3040 *buflen -= 1; 3041 buf[*buflen] = '\0'; 3042 } 3043 3044 if (vp == rdir || vp == rootvnode) { 3045 if (addend == 0) { 3046 *buflen -= 1; 3047 buf[*buflen] = '/'; 3048 } 3049 goto out_ok; 3050 } 3051 3052 #ifdef KDTRACE_HOOKS 3053 i = 0; 3054 #endif 3055 error = -1; 3056 ncp = NULL; /* for sdt probe down below */ 3057 vp_seqc = vn_seqc_read_any(vp); 3058 if (seqc_in_modify(vp_seqc)) { 3059 cache_rev_failed(&reason); 3060 goto out_abort; 3061 } 3062 3063 for (;;) { 3064 #ifdef KDTRACE_HOOKS 3065 i++; 3066 #endif 3067 if ((vp->v_vflag & VV_ROOT) != 0) { 3068 mp = atomic_load_ptr(&vp->v_mount); 3069 if (mp == NULL) { 3070 cache_rev_failed(&reason); 3071 goto out_abort; 3072 } 3073 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 3074 tvp_seqc = vn_seqc_read_any(tvp); 3075 if (seqc_in_modify(tvp_seqc)) { 3076 cache_rev_failed(&reason); 3077 goto out_abort; 3078 } 3079 if (!vn_seqc_consistent(vp, vp_seqc)) { 3080 cache_rev_failed(&reason); 3081 goto out_abort; 3082 } 3083 vp = tvp; 3084 vp_seqc = tvp_seqc; 3085 continue; 3086 } 3087 ncp = atomic_load_ptr(&vp->v_cache_dd); 3088 if (ncp == NULL) { 3089 cache_rev_failed(&reason); 3090 goto out_abort; 3091 } 3092 nc_flag = atomic_load_char(&ncp->nc_flag); 3093 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3094 cache_rev_failed(&reason); 3095 goto out_abort; 3096 } 3097 if (!cache_ncp_canuse(ncp)) { 3098 cache_rev_failed(&reason); 3099 goto out_abort; 3100 } 3101 if (ncp->nc_nlen >= *buflen) { 3102 cache_rev_failed(&reason); 3103 error = ENOMEM; 3104 goto out_abort; 3105 } 3106 *buflen -= ncp->nc_nlen; 3107 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3108 *buflen -= 1; 3109 buf[*buflen] = '/'; 3110 tvp = ncp->nc_dvp; 3111 tvp_seqc = vn_seqc_read_any(tvp); 3112 if (seqc_in_modify(tvp_seqc)) { 3113 cache_rev_failed(&reason); 3114 goto out_abort; 3115 } 3116 if (!vn_seqc_consistent(vp, vp_seqc)) { 3117 cache_rev_failed(&reason); 3118 goto out_abort; 3119 } 3120 vp = tvp; 3121 vp_seqc = tvp_seqc; 3122 if (vp == rdir || vp == rootvnode) 3123 break; 3124 } 3125 out_ok: 3126 vfs_smr_exit(); 3127 *retbuf = buf + *buflen; 3128 *buflen = orig_buflen - *buflen + addend; 3129 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 3130 return (0); 3131 3132 out_abort: 3133 *buflen = orig_buflen; 3134 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 3135 vfs_smr_exit(); 3136 return (error); 
3137 } 3138 3139 static int 3140 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3141 size_t *buflen) 3142 { 3143 size_t orig_buflen, addend; 3144 int error; 3145 3146 if (*buflen < 2) 3147 return (EINVAL); 3148 3149 orig_buflen = *buflen; 3150 3151 vref(vp); 3152 addend = 0; 3153 if (vp->v_type != VDIR) { 3154 *buflen -= 1; 3155 buf[*buflen] = '\0'; 3156 error = vn_vptocnp(&vp, buf, buflen); 3157 if (error) 3158 return (error); 3159 if (*buflen == 0) { 3160 vrele(vp); 3161 return (ENOMEM); 3162 } 3163 *buflen -= 1; 3164 buf[*buflen] = '/'; 3165 addend = orig_buflen - *buflen; 3166 } 3167 3168 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); 3169 } 3170 3171 /* 3172 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 3173 * 3174 * Since the namecache does not track hardlinks, the caller is expected to first 3175 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 3176 * 3177 * Then we have 2 cases: 3178 * - if the found vnode is a directory, the path can be constructed just by 3179 * following names up the chain 3180 * - otherwise we populate the buffer with the saved name and start resolving 3181 * from the parent 3182 */ 3183 static int 3184 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 3185 size_t *buflen) 3186 { 3187 char *buf, *tmpbuf; 3188 struct pwd *pwd; 3189 struct componentname *cnp; 3190 struct vnode *vp; 3191 size_t addend; 3192 int error; 3193 enum vtype type; 3194 3195 if (*buflen < 2) 3196 return (EINVAL); 3197 if (*buflen > MAXPATHLEN) 3198 *buflen = MAXPATHLEN; 3199 3200 buf = malloc(*buflen, M_TEMP, M_WAITOK); 3201 3202 addend = 0; 3203 vp = ndp->ni_vp; 3204 /* 3205 * Check for VBAD to work around the vp_crossmp bug in lookup(). 3206 * 3207 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 3208 * set to the mount point's root vnode while ni_dvp will be vp_crossmp. 3209 * If the type is VDIR (like in this very case) we can skip looking 3210 * at ni_dvp in the first place. However, since vnodes get passed here 3211 * unlocked the target may transition to doomed state (type == VBAD) 3212 * before we get to evaluate the condition. If this happens, we will 3213 * populate part of the buffer and descend to vn_fullpath_dir with 3214 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 3215 * 3216 * This should be atomic_load(&vp->v_type) but it is illegal to take 3217 * an address of a bit field, even if said field is sized to char. 3218 * Work around the problem by reading the value into a full-sized enum 3219 * and then re-reading it with atomic_load which will still prevent 3220 * the compiler from re-reading down the road. 
3221 */ 3222 type = vp->v_type; 3223 type = atomic_load_int(&type); 3224 if (type == VBAD) { 3225 error = ENOENT; 3226 goto out_bad; 3227 } 3228 if (type != VDIR) { 3229 cnp = &ndp->ni_cnd; 3230 addend = cnp->cn_namelen + 2; 3231 if (*buflen < addend) { 3232 error = ENOMEM; 3233 goto out_bad; 3234 } 3235 *buflen -= addend; 3236 tmpbuf = buf + *buflen; 3237 tmpbuf[0] = '/'; 3238 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 3239 tmpbuf[addend - 1] = '\0'; 3240 vp = ndp->ni_dvp; 3241 } 3242 3243 vfs_smr_enter(); 3244 pwd = pwd_get_smr(); 3245 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3246 addend); 3247 VFS_SMR_ASSERT_NOT_ENTERED(); 3248 if (error < 0) { 3249 pwd = pwd_hold(curthread); 3250 vref(vp); 3251 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3252 addend); 3253 pwd_drop(pwd); 3254 if (error != 0) 3255 goto out_bad; 3256 } 3257 3258 *freebuf = buf; 3259 3260 return (0); 3261 out_bad: 3262 free(buf, M_TEMP); 3263 return (error); 3264 } 3265 3266 struct vnode * 3267 vn_dir_dd_ino(struct vnode *vp) 3268 { 3269 struct namecache *ncp; 3270 struct vnode *ddvp; 3271 struct mtx *vlp; 3272 enum vgetstate vs; 3273 3274 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3275 vlp = VP2VNODELOCK(vp); 3276 mtx_lock(vlp); 3277 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3278 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3279 continue; 3280 ddvp = ncp->nc_dvp; 3281 vs = vget_prep(ddvp); 3282 mtx_unlock(vlp); 3283 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3284 return (NULL); 3285 return (ddvp); 3286 } 3287 mtx_unlock(vlp); 3288 return (NULL); 3289 } 3290 3291 int 3292 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3293 { 3294 struct namecache *ncp; 3295 struct mtx *vlp; 3296 int l; 3297 3298 vlp = VP2VNODELOCK(vp); 3299 mtx_lock(vlp); 3300 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3301 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3302 break; 3303 if (ncp == NULL) { 3304 mtx_unlock(vlp); 3305 return (ENOENT); 3306 } 3307 l = min(ncp->nc_nlen, buflen - 1); 3308 memcpy(buf, ncp->nc_name, l); 3309 mtx_unlock(vlp); 3310 buf[l] = '\0'; 3311 return (0); 3312 } 3313 3314 /* 3315 * This function updates path string to vnode's full global path 3316 * and checks the size of the new path string against the pathlen argument. 3317 * 3318 * Requires a locked, referenced vnode. 3319 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3320 * 3321 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3322 * because it falls back to the ".." lookup if the namecache lookup fails. 3323 */ 3324 int 3325 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3326 u_int pathlen) 3327 { 3328 struct nameidata nd; 3329 struct vnode *vp1; 3330 char *rpath, *fbuf; 3331 int error; 3332 3333 ASSERT_VOP_ELOCKED(vp, __func__); 3334 3335 /* Construct global filesystem path from vp. */ 3336 VOP_UNLOCK(vp); 3337 error = vn_fullpath_global(vp, &rpath, &fbuf); 3338 3339 if (error != 0) { 3340 vrele(vp); 3341 return (error); 3342 } 3343 3344 if (strlen(rpath) >= pathlen) { 3345 vrele(vp); 3346 error = ENAMETOOLONG; 3347 goto out; 3348 } 3349 3350 /* 3351 * Re-lookup the vnode by path to detect a possible rename. 3352 * As a side effect, the vnode is relocked. 3353 * If vnode was renamed, return ENOENT. 
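 * The rename is detected by comparing the vnode resolved from the freshly
 * constructed global path against the one passed in; a mismatch means the
 * name no longer leads to this vnode.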
3354 */ 3355 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3356 UIO_SYSSPACE, path, td); 3357 error = namei(&nd); 3358 if (error != 0) { 3359 vrele(vp); 3360 goto out; 3361 } 3362 NDFREE(&nd, NDF_ONLY_PNBUF); 3363 vp1 = nd.ni_vp; 3364 vrele(vp); 3365 if (vp1 == vp) 3366 strcpy(path, rpath); 3367 else { 3368 vput(vp1); 3369 error = ENOENT; 3370 } 3371 3372 out: 3373 free(fbuf, M_TEMP); 3374 return (error); 3375 } 3376 3377 #ifdef DDB 3378 static void 3379 db_print_vpath(struct vnode *vp) 3380 { 3381 3382 while (vp != NULL) { 3383 db_printf("%p: ", vp); 3384 if (vp == rootvnode) { 3385 db_printf("/"); 3386 vp = NULL; 3387 } else { 3388 if (vp->v_vflag & VV_ROOT) { 3389 db_printf("<mount point>"); 3390 vp = vp->v_mount->mnt_vnodecovered; 3391 } else { 3392 struct namecache *ncp; 3393 char *ncn; 3394 int i; 3395 3396 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3397 if (ncp != NULL) { 3398 ncn = ncp->nc_name; 3399 for (i = 0; i < ncp->nc_nlen; i++) 3400 db_printf("%c", *ncn++); 3401 vp = ncp->nc_dvp; 3402 } else { 3403 vp = NULL; 3404 } 3405 } 3406 } 3407 db_printf("\n"); 3408 } 3409 3410 return; 3411 } 3412 3413 DB_SHOW_COMMAND(vpath, db_show_vpath) 3414 { 3415 struct vnode *vp; 3416 3417 if (!have_addr) { 3418 db_printf("usage: show vpath <struct vnode *>\n"); 3419 return; 3420 } 3421 3422 vp = (struct vnode *)addr; 3423 db_print_vpath(vp); 3424 } 3425 3426 #endif 3427 3428 static bool __read_frequently cache_fast_lookup = true; 3429 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3430 &cache_fast_lookup, 0, ""); 3431 3432 #define CACHE_FPL_FAILED -2020 3433 3434 static void 3435 cache_fpl_cleanup_cnp(struct componentname *cnp) 3436 { 3437 3438 uma_zfree(namei_zone, cnp->cn_pnbuf); 3439 #ifdef DIAGNOSTIC 3440 cnp->cn_pnbuf = NULL; 3441 cnp->cn_nameptr = NULL; 3442 #endif 3443 } 3444 3445 static void 3446 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3447 { 3448 struct componentname *cnp; 3449 3450 cnp = &ndp->ni_cnd; 3451 while (*(cnp->cn_nameptr) == '/') { 3452 cnp->cn_nameptr++; 3453 ndp->ni_pathlen--; 3454 } 3455 3456 *dpp = ndp->ni_rootdir; 3457 } 3458 3459 /* 3460 * Components of nameidata (or objects it can point to) which may 3461 * need restoring in case fast path lookup fails. 
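 * A snapshot is taken with cache_fpl_checkpoint() before the lockless walk
 * starts consuming the path and is put back with cache_fpl_restore() when
 * the lookup is handed over to the regular (locked) code.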
3462 */ 3463 struct nameidata_saved { 3464 long cn_namelen; 3465 char *cn_nameptr; 3466 size_t ni_pathlen; 3467 int cn_flags; 3468 }; 3469 3470 struct cache_fpl { 3471 struct nameidata *ndp; 3472 struct componentname *cnp; 3473 struct pwd *pwd; 3474 struct vnode *dvp; 3475 struct vnode *tvp; 3476 seqc_t dvp_seqc; 3477 seqc_t tvp_seqc; 3478 struct nameidata_saved snd; 3479 int line; 3480 enum cache_fpl_status status:8; 3481 bool in_smr; 3482 bool fsearch; 3483 }; 3484 3485 static void 3486 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3487 { 3488 3489 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3490 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3491 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3492 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3493 } 3494 3495 static void 3496 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3497 { 3498 3499 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3500 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3501 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3502 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3503 } 3504 3505 #ifdef INVARIANTS 3506 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3507 struct cache_fpl *_fpl = (fpl); \ 3508 MPASS(_fpl->in_smr == true); \ 3509 VFS_SMR_ASSERT_ENTERED(); \ 3510 }) 3511 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3512 struct cache_fpl *_fpl = (fpl); \ 3513 MPASS(_fpl->in_smr == false); \ 3514 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3515 }) 3516 #else 3517 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3518 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3519 #endif 3520 3521 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3522 struct cache_fpl *_fpl = (fpl); \ 3523 vfs_smr_enter(); \ 3524 _fpl->in_smr = true; \ 3525 }) 3526 3527 #define cache_fpl_smr_enter(fpl) ({ \ 3528 struct cache_fpl *_fpl = (fpl); \ 3529 MPASS(_fpl->in_smr == false); \ 3530 vfs_smr_enter(); \ 3531 _fpl->in_smr = true; \ 3532 }) 3533 3534 #define cache_fpl_smr_exit(fpl) ({ \ 3535 struct cache_fpl *_fpl = (fpl); \ 3536 MPASS(_fpl->in_smr == true); \ 3537 vfs_smr_exit(); \ 3538 _fpl->in_smr = false; \ 3539 }) 3540 3541 static int 3542 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3543 { 3544 3545 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3546 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3547 ("%s: converting to abort from %d at %d, set at %d\n", 3548 __func__, fpl->status, line, fpl->line)); 3549 } 3550 fpl->status = CACHE_FPL_STATUS_ABORTED; 3551 fpl->line = line; 3552 return (CACHE_FPL_FAILED); 3553 } 3554 3555 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3556 3557 static int 3558 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3559 { 3560 3561 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3562 ("%s: setting to partial at %d, but already set to %d at %d\n", 3563 __func__, line, fpl->status, fpl->line)); 3564 cache_fpl_smr_assert_entered(fpl); 3565 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3566 fpl->line = line; 3567 return (CACHE_FPL_FAILED); 3568 } 3569 3570 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3571 3572 static int 3573 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3574 { 3575 3576 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3577 ("%s: setting to handled at %d, but already set to %d at %d\n", 3578 __func__, line, fpl->status, fpl->line)); 3579 cache_fpl_smr_assert_not_entered(fpl); 3580 MPASS(error != CACHE_FPL_FAILED); 3581 fpl->status = CACHE_FPL_STATUS_HANDLED; 3582 fpl->line = line; 
3583 return (error); 3584 } 3585 3586 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3587 3588 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3589 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ 3590 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3591 3592 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3593 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3594 3595 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3596 "supported and internal flags overlap"); 3597 3598 static bool 3599 cache_fpl_islastcn(struct nameidata *ndp) 3600 { 3601 3602 return (*ndp->ni_next == 0); 3603 } 3604 3605 static bool 3606 cache_fpl_isdotdot(struct componentname *cnp) 3607 { 3608 3609 if (cnp->cn_namelen == 2 && 3610 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3611 return (true); 3612 return (false); 3613 } 3614 3615 static bool 3616 cache_can_fplookup(struct cache_fpl *fpl) 3617 { 3618 struct nameidata *ndp; 3619 struct componentname *cnp; 3620 struct thread *td; 3621 3622 ndp = fpl->ndp; 3623 cnp = fpl->cnp; 3624 td = cnp->cn_thread; 3625 3626 if (!cache_fast_lookup) { 3627 cache_fpl_aborted(fpl); 3628 return (false); 3629 } 3630 #ifdef MAC 3631 if (mac_vnode_check_lookup_enabled()) { 3632 cache_fpl_aborted(fpl); 3633 return (false); 3634 } 3635 #endif 3636 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3637 cache_fpl_aborted(fpl); 3638 return (false); 3639 } 3640 if (IN_CAPABILITY_MODE(td)) { 3641 cache_fpl_aborted(fpl); 3642 return (false); 3643 } 3644 if (AUDITING_TD(td)) { 3645 cache_fpl_aborted(fpl); 3646 return (false); 3647 } 3648 if (ndp->ni_startdir != NULL) { 3649 cache_fpl_aborted(fpl); 3650 return (false); 3651 } 3652 return (true); 3653 } 3654 3655 static int 3656 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 3657 { 3658 struct nameidata *ndp; 3659 int error; 3660 bool fsearch; 3661 3662 ndp = fpl->ndp; 3663 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); 3664 if (__predict_false(error != 0)) { 3665 cache_fpl_smr_exit(fpl); 3666 return (cache_fpl_aborted(fpl)); 3667 } 3668 fpl->fsearch = fsearch; 3669 return (0); 3670 } 3671 3672 static bool 3673 cache_fplookup_vnode_supported(struct vnode *vp) 3674 { 3675 3676 return (vp->v_type != VLNK); 3677 } 3678 3679 static int __noinline 3680 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3681 uint32_t hash) 3682 { 3683 struct componentname *cnp; 3684 struct vnode *dvp; 3685 3686 cnp = fpl->cnp; 3687 dvp = fpl->dvp; 3688 3689 cache_fpl_smr_exit(fpl); 3690 if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) 3691 return (cache_fpl_handled(fpl, ENOENT)); 3692 else 3693 return (cache_fpl_aborted(fpl)); 3694 } 3695 3696 /* 3697 * The target vnode is not supported, prepare for the slow path to take over. 
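 * Preparation amounts to referencing the pwd and the directory vnode,
 * re-validating the latter against its sequence counter, restoring the
 * saved nameidata state and re-adding the flags the fast path strips
 * (MAKEENTRY, ISLASTCN and ISDOTDOT as applicable).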
3698 */ 3699 static int __noinline 3700 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3701 { 3702 struct nameidata *ndp; 3703 struct componentname *cnp; 3704 enum vgetstate dvs; 3705 struct vnode *dvp; 3706 struct pwd *pwd; 3707 seqc_t dvp_seqc; 3708 3709 ndp = fpl->ndp; 3710 cnp = fpl->cnp; 3711 pwd = fpl->pwd; 3712 dvp = fpl->dvp; 3713 dvp_seqc = fpl->dvp_seqc; 3714 3715 if (!pwd_hold_smr(pwd)) { 3716 cache_fpl_smr_exit(fpl); 3717 return (cache_fpl_aborted(fpl)); 3718 } 3719 3720 dvs = vget_prep_smr(dvp); 3721 cache_fpl_smr_exit(fpl); 3722 if (__predict_false(dvs == VGET_NONE)) { 3723 pwd_drop(pwd); 3724 return (cache_fpl_aborted(fpl)); 3725 } 3726 3727 vget_finish_ref(dvp, dvs); 3728 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3729 vrele(dvp); 3730 pwd_drop(pwd); 3731 return (cache_fpl_aborted(fpl)); 3732 } 3733 3734 cache_fpl_restore(fpl, &fpl->snd); 3735 3736 ndp->ni_startdir = dvp; 3737 cnp->cn_flags |= MAKEENTRY; 3738 if (cache_fpl_islastcn(ndp)) 3739 cnp->cn_flags |= ISLASTCN; 3740 if (cache_fpl_isdotdot(cnp)) 3741 cnp->cn_flags |= ISDOTDOT; 3742 3743 return (0); 3744 } 3745 3746 static int 3747 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3748 { 3749 struct componentname *cnp; 3750 struct vnode *tvp; 3751 seqc_t tvp_seqc; 3752 int error, lkflags; 3753 3754 cnp = fpl->cnp; 3755 tvp = fpl->tvp; 3756 tvp_seqc = fpl->tvp_seqc; 3757 3758 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3759 lkflags = LK_SHARED; 3760 if ((cnp->cn_flags & LOCKSHARED) == 0) 3761 lkflags = LK_EXCLUSIVE; 3762 error = vget_finish(tvp, lkflags, tvs); 3763 if (__predict_false(error != 0)) { 3764 return (cache_fpl_aborted(fpl)); 3765 } 3766 } else { 3767 vget_finish_ref(tvp, tvs); 3768 } 3769 3770 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3771 if ((cnp->cn_flags & LOCKLEAF) != 0) 3772 vput(tvp); 3773 else 3774 vrele(tvp); 3775 return (cache_fpl_aborted(fpl)); 3776 } 3777 3778 return (cache_fpl_handled(fpl, 0)); 3779 } 3780 3781 /* 3782 * They want to possibly modify the state of the namecache. 3783 * 3784 * Don't try to match the API contract, just leave. 3785 * TODO: this leaves scalability on the table 3786 */ 3787 static int 3788 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3789 { 3790 struct componentname *cnp; 3791 3792 cnp = fpl->cnp; 3793 MPASS(cnp->cn_nameiop != LOOKUP); 3794 return (cache_fpl_partial(fpl)); 3795 } 3796 3797 static int __noinline 3798 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3799 { 3800 struct componentname *cnp; 3801 enum vgetstate dvs, tvs; 3802 struct vnode *dvp, *tvp; 3803 seqc_t dvp_seqc; 3804 int error; 3805 3806 cnp = fpl->cnp; 3807 dvp = fpl->dvp; 3808 dvp_seqc = fpl->dvp_seqc; 3809 tvp = fpl->tvp; 3810 3811 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3812 3813 /* 3814 * This is less efficient than it can be for simplicity. 
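 * Both vnodes are prepared for referencing while still inside the SMR
 * section and the sequence counters are re-checked only afterwards; any
 * mismatch aborts to the slow path instead of attempting recovery.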
3815 */ 3816 dvs = vget_prep_smr(dvp); 3817 if (__predict_false(dvs == VGET_NONE)) { 3818 return (cache_fpl_aborted(fpl)); 3819 } 3820 tvs = vget_prep_smr(tvp); 3821 if (__predict_false(tvs == VGET_NONE)) { 3822 cache_fpl_smr_exit(fpl); 3823 vget_abort(dvp, dvs); 3824 return (cache_fpl_aborted(fpl)); 3825 } 3826 3827 cache_fpl_smr_exit(fpl); 3828 3829 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3830 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3831 if (__predict_false(error != 0)) { 3832 vget_abort(tvp, tvs); 3833 return (cache_fpl_aborted(fpl)); 3834 } 3835 } else { 3836 vget_finish_ref(dvp, dvs); 3837 } 3838 3839 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3840 vget_abort(tvp, tvs); 3841 if ((cnp->cn_flags & LOCKPARENT) != 0) 3842 vput(dvp); 3843 else 3844 vrele(dvp); 3845 return (cache_fpl_aborted(fpl)); 3846 } 3847 3848 error = cache_fplookup_final_child(fpl, tvs); 3849 if (__predict_false(error != 0)) { 3850 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3851 if ((cnp->cn_flags & LOCKPARENT) != 0) 3852 vput(dvp); 3853 else 3854 vrele(dvp); 3855 return (error); 3856 } 3857 3858 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3859 return (0); 3860 } 3861 3862 static int 3863 cache_fplookup_final(struct cache_fpl *fpl) 3864 { 3865 struct componentname *cnp; 3866 enum vgetstate tvs; 3867 struct vnode *dvp, *tvp; 3868 seqc_t dvp_seqc; 3869 3870 cnp = fpl->cnp; 3871 dvp = fpl->dvp; 3872 dvp_seqc = fpl->dvp_seqc; 3873 tvp = fpl->tvp; 3874 3875 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3876 3877 if (cnp->cn_nameiop != LOOKUP) { 3878 return (cache_fplookup_final_modifying(fpl)); 3879 } 3880 3881 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3882 return (cache_fplookup_final_withparent(fpl)); 3883 3884 tvs = vget_prep_smr(tvp); 3885 if (__predict_false(tvs == VGET_NONE)) { 3886 return (cache_fpl_partial(fpl)); 3887 } 3888 3889 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3890 cache_fpl_smr_exit(fpl); 3891 vget_abort(tvp, tvs); 3892 return (cache_fpl_aborted(fpl)); 3893 } 3894 3895 cache_fpl_smr_exit(fpl); 3896 return (cache_fplookup_final_child(fpl, tvs)); 3897 } 3898 3899 static int __noinline 3900 cache_fplookup_dot(struct cache_fpl *fpl) 3901 { 3902 struct vnode *dvp; 3903 3904 dvp = fpl->dvp; 3905 3906 fpl->tvp = dvp; 3907 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3908 if (seqc_in_modify(fpl->tvp_seqc)) { 3909 return (cache_fpl_aborted(fpl)); 3910 } 3911 3912 counter_u64_add(dothits, 1); 3913 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3914 3915 return (0); 3916 } 3917 3918 static int __noinline 3919 cache_fplookup_dotdot(struct cache_fpl *fpl) 3920 { 3921 struct nameidata *ndp; 3922 struct componentname *cnp; 3923 struct namecache *ncp; 3924 struct vnode *dvp; 3925 struct prison *pr; 3926 u_char nc_flag; 3927 3928 ndp = fpl->ndp; 3929 cnp = fpl->cnp; 3930 dvp = fpl->dvp; 3931 3932 /* 3933 * XXX this is racy the same way regular lookup is 3934 */ 3935 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3936 pr = pr->pr_parent) 3937 if (dvp == pr->pr_root) 3938 break; 3939 3940 if (dvp == ndp->ni_rootdir || 3941 dvp == ndp->ni_topdir || 3942 dvp == rootvnode || 3943 pr != NULL) { 3944 fpl->tvp = dvp; 3945 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3946 if (seqc_in_modify(fpl->tvp_seqc)) { 3947 return (cache_fpl_aborted(fpl)); 3948 } 3949 return (0); 3950 } 3951 3952 if ((dvp->v_vflag & VV_ROOT) != 0) { 3953 /* 3954 * TODO 3955 * The opposite of climb mount is needed here. 
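 * That is, resolving ".." at the root of a mounted filesystem requires
 * descending to the vnode covered by the mount and continuing from there;
 * until that is implemented such lookups are handed to the slow path.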
3956 */ 3957 return (cache_fpl_aborted(fpl)); 3958 } 3959 3960 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3961 if (ncp == NULL) { 3962 return (cache_fpl_aborted(fpl)); 3963 } 3964 3965 nc_flag = atomic_load_char(&ncp->nc_flag); 3966 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3967 if ((nc_flag & NCF_NEGATIVE) != 0) 3968 return (cache_fpl_aborted(fpl)); 3969 fpl->tvp = ncp->nc_vp; 3970 } else { 3971 fpl->tvp = ncp->nc_dvp; 3972 } 3973 3974 if (__predict_false(!cache_ncp_canuse(ncp))) { 3975 return (cache_fpl_aborted(fpl)); 3976 } 3977 3978 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 3979 if (seqc_in_modify(fpl->tvp_seqc)) { 3980 return (cache_fpl_partial(fpl)); 3981 } 3982 3983 counter_u64_add(dotdothits, 1); 3984 return (0); 3985 } 3986 3987 static int __noinline 3988 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) 3989 { 3990 u_char nc_flag; 3991 bool neg_promote; 3992 3993 nc_flag = atomic_load_char(&ncp->nc_flag); 3994 MPASS((nc_flag & NCF_NEGATIVE) != 0); 3995 /* 3996 * If they want to create an entry we need to replace this one. 3997 */ 3998 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 3999 /* 4000 * TODO 4001 * This should call something similar to 4002 * cache_fplookup_final_modifying. 4003 */ 4004 return (cache_fpl_partial(fpl)); 4005 } 4006 neg_promote = cache_neg_hit_prep(ncp); 4007 if (__predict_false(!cache_ncp_canuse(ncp))) { 4008 cache_neg_hit_abort(ncp); 4009 return (cache_fpl_partial(fpl)); 4010 } 4011 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 4012 cache_neg_hit_abort(ncp); 4013 return (cache_fpl_partial(fpl)); 4014 } 4015 if (neg_promote) { 4016 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 4017 } 4018 cache_neg_hit_finish(ncp); 4019 cache_fpl_smr_exit(fpl); 4020 return (cache_fpl_handled(fpl, ENOENT)); 4021 } 4022 4023 static int 4024 cache_fplookup_next(struct cache_fpl *fpl) 4025 { 4026 struct componentname *cnp; 4027 struct namecache *ncp; 4028 struct vnode *dvp, *tvp; 4029 u_char nc_flag; 4030 uint32_t hash; 4031 4032 cnp = fpl->cnp; 4033 dvp = fpl->dvp; 4034 4035 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 4036 return (cache_fplookup_dot(fpl)); 4037 } 4038 4039 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 4040 4041 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 4042 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 4043 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 4044 break; 4045 } 4046 4047 /* 4048 * If there is no entry we have to punt to the slow path to perform 4049 * actual lookup. Should there be nothing with this name a negative 4050 * entry will be created. 
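 * The fallback re-enables MAKEENTRY (see cache_fplookup_partial_setup()),
 * so whichever way the lookup resolves, the result is expected to get
 * cached for the next traversal.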
	 */
	if (__predict_false(ncp == NULL)) {
		return (cache_fpl_partial(fpl));
	}

	tvp = atomic_load_ptr(&ncp->nc_vp);
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_NEGATIVE) != 0) {
		return (cache_fplookup_neg(fpl, ncp, hash));
	}

	if (__predict_false(!cache_ncp_canuse(ncp))) {
		return (cache_fpl_partial(fpl));
	}

	fpl->tvp = tvp;
	fpl->tvp_seqc = vn_seqc_read_any(tvp);
	if (seqc_in_modify(fpl->tvp_seqc)) {
		return (cache_fpl_partial(fpl));
	}

	if (!cache_fplookup_vnode_supported(tvp)) {
		return (cache_fpl_partial(fpl));
	}

	counter_u64_add(numposhits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
	return (0);
}

static bool
cache_fplookup_mp_supported(struct mount *mp)
{

	if (mp == NULL)
		return (false);
	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
		return (false);
	return (true);
}

/*
 * Walk up the mount stack (if any).
 *
 * Correctness is provided in the following ways:
 * - all vnodes are protected from freeing with SMR
 * - struct mount objects are type stable, making them always safe to access
 * - stability of the particular mount is provided by busying it
 * - relationship between the vnode which is mounted on and the mount is
 *   verified with the vnode sequence counter after busying
 * - association between root vnode of the mount and the mount is protected
 *   by busy
 *
 * From that point on we can read the sequence counter of the root vnode
 * and get the next mount on the stack (if any) using the same protection.
 *
 * By the end of a successful walk we are guaranteed the reached state was
 * indeed present at least at some point, which matches the guarantee of the
 * regular lookup.
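 *
 * In outline, one iteration of the loop below does roughly the following
 * (simplified, error handling omitted):
 *
 *	vp_seqc = vn_seqc_read_any(vp);
 *	mp = atomic_load_ptr(&vp->v_mountedhere);
 *	vfs_op_thread_enter_crit(mp);		// busy the mount
 *	if (!vn_seqc_consistent(vp, vp_seqc))	// vp -> mp link still valid?
 *		bail;
 *	vp = atomic_load_ptr(&mp->mnt_rootvnode);	// stable while busied
 *	// ... and repeat should there be another mount stacked on top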
 */
static int __noinline
cache_fplookup_climb_mount(struct cache_fpl *fpl)
{
	struct mount *mp, *prev_mp;
	struct vnode *vp;
	seqc_t vp_seqc;

	vp = fpl->tvp;
	vp_seqc = fpl->tvp_seqc;

	VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
	mp = atomic_load_ptr(&vp->v_mountedhere);
	if (mp == NULL)
		return (0);

	prev_mp = NULL;
	for (;;) {
		if (!vfs_op_thread_enter_crit(mp)) {
			if (prev_mp != NULL)
				vfs_op_thread_exit_crit(prev_mp);
			return (cache_fpl_partial(fpl));
		}
		if (prev_mp != NULL)
			vfs_op_thread_exit_crit(prev_mp);
		if (!vn_seqc_consistent(vp, vp_seqc)) {
			vfs_op_thread_exit_crit(mp);
			return (cache_fpl_partial(fpl));
		}
		if (!cache_fplookup_mp_supported(mp)) {
			vfs_op_thread_exit_crit(mp);
			return (cache_fpl_partial(fpl));
		}
		vp = atomic_load_ptr(&mp->mnt_rootvnode);
		if (vp == NULL || VN_IS_DOOMED(vp)) {
			vfs_op_thread_exit_crit(mp);
			return (cache_fpl_partial(fpl));
		}
		vp_seqc = vn_seqc_read_any(vp);
		if (seqc_in_modify(vp_seqc)) {
			vfs_op_thread_exit_crit(mp);
			return (cache_fpl_partial(fpl));
		}
		prev_mp = mp;
		mp = atomic_load_ptr(&vp->v_mountedhere);
		if (mp == NULL)
			break;
	}

	vfs_op_thread_exit_crit(prev_mp);
	fpl->tvp = vp;
	fpl->tvp_seqc = vp_seqc;
	return (0);
}

static bool
cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
{
	struct mount *mp;
	struct vnode *vp;

	vp = fpl->tvp;

	/*
	 * Hack: while this is a union, the pointer tends to be NULL, so save
	 * on a branch.
	 */
	mp = atomic_load_ptr(&vp->v_mountedhere);
	if (mp == NULL)
		return (false);
	if (vp->v_type == VDIR)
		return (true);
	return (false);
}

/*
 * Parse the path.
 *
 * The code was originally copy-pasted from the regular lookup and, despite
 * clean-ups, leaves performance on the table. Any modifications here must
 * take into account that in case of fallback the resulting nameidata state
 * has to be compatible with the original.
 */
static int
cache_fplookup_parse(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	char *cp;

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	/*
	 * Search a new directory.
	 *
	 * The last component of the filename is left accessible via
	 * cnp->cn_nameptr for callers that need the name. Callers needing
	 * the name set the SAVENAME flag. When done, they assume
	 * responsibility for freeing the pathname buffer.
	 */
	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
		continue;
	cnp->cn_namelen = cp - cnp->cn_nameptr;
	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled(fpl, ENAMETOOLONG));
	}
	ndp->ni_pathlen -= cnp->cn_namelen;
	KASSERT(ndp->ni_pathlen <= PATH_MAX,
	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
	ndp->ni_next = cp;

	/*
	 * Replace multiple slashes by a single slash and trailing slashes
	 * by a null. This must be done before VOP_LOOKUP() because some
	 * fs's don't know about trailing slashes. Remember if there were
	 * trailing slashes to handle symlinks, existing non-directories
	 * and non-existing files that won't be directories specially later.
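	 *
	 * For example "a//b" ends up treated as "a/b", while a trailing
	 * slash as in "a/b/" runs the loop into the terminating nul and
	 * makes the fast path punt to the slow path (see the TODO below).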
	 */
	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
		cp++;
		ndp->ni_pathlen--;
		if (*cp == '\0') {
			/*
			 * TODO
			 * Regular lookup performs the following:
			 * *ndp->ni_next = '\0';
			 * cnp->cn_flags |= TRAILINGSLASH;
			 *
			 * Which is problematic since it modifies data read
			 * from userspace. Then if the fast path lookup were
			 * to abort we would have to either restore it or
			 * convey the flag. Since this is a corner case just
			 * ignore it for simplicity.
			 */
			return (cache_fpl_partial(fpl));
		}
	}
	ndp->ni_next = cp;

	/*
	 * Check for degenerate name (e.g. / or "")
	 * which is a way of talking about a directory,
	 * e.g. like "/." or ".".
	 *
	 * TODO
	 * Another corner case handled by the regular lookup
	 */
	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
		return (cache_fpl_partial(fpl));
	}
	return (0);
}

static void
cache_fplookup_parse_advance(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cnp->cn_nameptr = ndp->ni_next;
	while (*cnp->cn_nameptr == '/') {
		cnp->cn_nameptr++;
		ndp->ni_pathlen--;
	}
}

/*
 * See the API contract for VOP_FPLOOKUP_VEXEC.
 */
static int __noinline
cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
{
	struct componentname *cnp;
	struct vnode *dvp;
	seqc_t dvp_seqc;

	cnp = fpl->cnp;
	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;

	/*
	 * Hack: they may be looking up foo/bar, where foo is a
	 * regular file. In such a case we need to return ENOTDIR,
	 * but we may happen to get here with a different error.
	 */
	if (dvp->v_type != VDIR) {
		/*
		 * The check here is predominantly to catch
		 * EOPNOTSUPP from dead_vnodeops. If the vnode
		 * gets doomed past this point it is going to
		 * fail seqc verification.
		 */
		if (VN_IS_DOOMED(dvp)) {
			return (cache_fpl_aborted(fpl));
		}
		error = ENOTDIR;
	}

	/*
	 * Hack: handle O_SEARCH.
	 *
	 * Open Group Base Specifications Issue 7, 2018 edition states:
	 * If the access mode of the open file description associated with the
	 * file descriptor is not O_SEARCH, the function shall check whether
	 * directory searches are permitted using the current permissions of
	 * the directory underlying the file descriptor. If the access mode is
	 * O_SEARCH, the function shall not perform the check.
	 *
	 * Regular lookup tests for the NOEXECCHECK flag for every path
	 * component to decide whether to do the permission check. However,
	 * since most lookups never have the flag (and when they do it is only
	 * present for the first path component), lockless lookup only acts on
	 * it if there is a permission problem. Here the flag is represented
	 * with a boolean so that we don't have to clear it on the way out.
	 *
	 * For simplicity this always aborts.
	 * TODO: check if this is the first lookup and ignore the permission
	 * problem. Note the flag has to survive fallback (if it happens to be
	 * performed).
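	 *
	 * For example, a lookup relative to a directory descriptor opened
	 * with O_SEARCH must not fail merely because the caller lacks
	 * execute permission on that directory. With the current handling
	 * such a lookup simply aborts to the regular lookup, which consults
	 * NOEXECCHECK and skips the check as required.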
	 */
	if (fpl->fsearch) {
		return (cache_fpl_aborted(fpl));
	}

	switch (error) {
	case EAGAIN:
		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_partial(fpl);
		}
		break;
	default:
		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_smr_exit(fpl);
			cache_fpl_handled(fpl, error);
		}
		break;
	}
	return (error);
}

static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct mount *mp;
	int error;

	error = CACHE_FPL_FAILED;
	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cache_fpl_checkpoint(fpl, &fpl->snd);

	fpl->dvp = dvp;
	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
	if (seqc_in_modify(fpl->dvp_seqc)) {
		cache_fpl_aborted(fpl);
		goto out;
	}
	mp = atomic_load_ptr(&fpl->dvp->v_mount);
	if (!cache_fplookup_mp_supported(mp)) {
		cache_fpl_aborted(fpl);
		goto out;
	}

	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

	for (;;) {
		error = cache_fplookup_parse(fpl);
		if (__predict_false(error != 0)) {
			break;
		}

		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
		if (__predict_false(error != 0)) {
			error = cache_fplookup_failed_vexec(fpl, error);
			break;
		}

		if (__predict_false(cache_fpl_isdotdot(cnp))) {
			error = cache_fplookup_dotdot(fpl);
			if (__predict_false(error != 0)) {
				break;
			}
		} else {
			error = cache_fplookup_next(fpl);
			if (__predict_false(error != 0)) {
				break;
			}

			VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

			if (cache_fplookup_need_climb_mount(fpl)) {
				error = cache_fplookup_climb_mount(fpl);
				if (__predict_false(error != 0)) {
					break;
				}
			}
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		if (cache_fpl_islastcn(ndp)) {
			error = cache_fplookup_final(fpl);
			break;
		}

		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
			break;
		}

		fpl->dvp = fpl->tvp;
		fpl->dvp_seqc = fpl->tvp_seqc;

		cache_fplookup_parse_advance(fpl);
		cache_fpl_checkpoint(fpl, &fpl->snd);
	}
out:
	switch (fpl->status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		cache_fpl_smr_assert_entered(fpl);
		return (cache_fplookup_partial_setup(fpl));
	case CACHE_FPL_STATUS_ABORTED:
		if (fpl->in_smr)
			cache_fpl_smr_exit(fpl);
		return (CACHE_FPL_FAILED);
	case CACHE_FPL_STATUS_HANDLED:
		MPASS(error != CACHE_FPL_FAILED);
		cache_fpl_smr_assert_not_entered(fpl);
		if (__predict_false(error != 0)) {
			ndp->ni_dvp = NULL;
			ndp->ni_vp = NULL;
			cache_fpl_cleanup_cnp(cnp);
			return (error);
		}
		ndp->ni_dvp = fpl->dvp;
		ndp->ni_vp = fpl->tvp;
		if (cnp->cn_flags & SAVENAME)
			cnp->cn_flags |= HASBUF;
		else
			cache_fpl_cleanup_cnp(cnp);
		return (error);
	}
}

/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the
 * criteria outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 *	vn_lock(current);
 *	for (;;) {
 *		next = find();
 *		vn_lock(next);
 *		vn_unlock(current);
 *		current = next;
 *		if (last)
 *			break;
 *	}
 *	return (current);
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
 * we made the jump. This includes things like permissions, mount points etc.
 * Counter modification is provided by enclosing relevant places in
 * vn_seqc_write_begin()/end() calls.
 *
 * Thus this translates to:
 *
 *	vfs_smr_enter();
 *	dvp_seqc = seqc_read_any(dvp);
 *	if (seqc_in_modify(dvp_seqc))		// someone is altering the vnode
 *		abort();
 *	for (;;) {
 *		tvp = find();
 *		tvp_seqc = seqc_read_any(tvp);
 *		if (seqc_in_modify(tvp_seqc))	// someone is altering the target vnode
 *			abort();
 *		if (!seqc_consistent(dvp, dvp_seqc))	// someone is altering the vnode
 *			abort();
 *		dvp = tvp;		// we know nothing of importance has changed
 *		dvp_seqc = tvp_seqc;	// store the counter for the tvp iteration
 *		if (last)
 *			break;
 *	}
 *	vget();				// secure the vnode
 *	if (!seqc_consistent(tvp, tvp_seqc))	// final check
 *		abort();
 *	// at this point we know nothing has changed for any parent<->child pair
 *	// as they were crossed during the lookup, meaning we matched the guarantee
 *	// of the locked variant
 *	return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection, which they must never exit
 * - EAGAIN can be returned to denote that checking could not be performed; it
 *   is always valid to return it
 * - if the sequence counter has not changed the result must be valid
 * - if the sequence counter has changed, both false positives and false
 *   negatives are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning that, absent other
 *   means, it should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 */
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	struct nameidata_saved orig;
	int error;

	MPASS(ndp->ni_lcf == 0);

	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.ndp = ndp;
	fpl.cnp = &ndp->ni_cnd;
	MPASS(curthread == fpl.cnp->cn_thread);

	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
		MPASS(fpl.cnp->cn_nameiop != LOOKUP);

	if (!cache_can_fplookup(&fpl)) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		*status = fpl.status;
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint(&fpl, &orig);

	cache_fpl_smr_enter_initial(&fpl);
	fpl.fsearch = false;
	pwd = pwd_get_smr();
	fpl.pwd = pwd;
	ndp->ni_rootdir = pwd->pwd_rdir;
	ndp->ni_topdir = pwd->pwd_jdir;

	cnp = fpl.cnp;
	cnp->cn_nameptr = cnp->cn_pnbuf;
	if (cnp->cn_pnbuf[0] == '/') {
		cache_fpl_handle_root(ndp, &dvp);
	} else {
		if (ndp->ni_dirfd == AT_FDCWD) {
			dvp = pwd->pwd_cdir;
		} else {
			error = cache_fplookup_dirfd(&fpl, &dvp);
			if (__predict_false(error != 0)) {
				goto out;
			}
		}
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);

	error = cache_fplookup_impl(dvp, &fpl);
out:
	cache_fpl_smr_assert_not_entered(&fpl);
	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);

	*status = fpl.status;
	switch (fpl.status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_HANDLED:
		SDT_PROBE3(vfs, namei, lookup, return, error,
		    (error == 0 ? ndp->ni_vp : NULL), true);
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		*pwdp = fpl.pwd;
		/*
		 * Status restored by cache_fplookup_partial_setup.
		 */
		break;
	case CACHE_FPL_STATUS_ABORTED:
		cache_fpl_restore(&fpl, &orig);
		break;
	}
	return (error);
}
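
/*
 * Illustrative sketch (comment only, not compiled): the rough shape of a
 * filesystem's VOP_FPLOOKUP_VEXEC implementation consistent with the
 * contract above, assuming the per-vnode data is vfs_smr protected and its
 * permission fields can be read without locks. The "xxfs"/"xxnode" names
 * are made up for the example; vaccess_vexec_smr is the helper mentioned
 * in the contract.
 *
 *	static int
 *	xxfs_fplookup_vexec(struct vop_fplookup_vexec_args *ap)
 *	{
 *		struct vnode *vp = ap->a_vp;
 *		struct xxnode *xp;
 *
 *		xp = atomic_load_ptr(&vp->v_data);
 *		if (__predict_false(xp == NULL))
 *			return (EAGAIN);	// always a valid answer
 *		return (vaccess_vexec_smr(xp->x_mode, xp->x_uid, xp->x_gid,
 *		    ap->a_cred));
 *	}
 *
 * The filesystem additionally sets MNTK_FPLOOKUP in mnt_kern_flag at mount
 * time to opt in.
 */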