/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ddb.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/capsicum.h> 46 #include <sys/counter.h> 47 #include <sys/filedesc.h> 48 #include <sys/fnv_hash.h> 49 #include <sys/kernel.h> 50 #include <sys/ktr.h> 51 #include <sys/lock.h> 52 #include <sys/malloc.h> 53 #include <sys/fcntl.h> 54 #include <sys/jail.h> 55 #include <sys/mount.h> 56 #include <sys/namei.h> 57 #include <sys/proc.h> 58 #include <sys/seqc.h> 59 #include <sys/sdt.h> 60 #include <sys/smr.h> 61 #include <sys/smp.h> 62 #include <sys/syscallsubr.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysproto.h> 65 #include <sys/vnode.h> 66 #include <ck_queue.h> 67 #ifdef KTRACE 68 #include <sys/ktrace.h> 69 #endif 70 71 #include <sys/capsicum.h> 72 73 #include <security/audit/audit.h> 74 #include <security/mac/mac_framework.h> 75 76 #ifdef DDB 77 #include <ddb/ddb.h> 78 #endif 79 80 #include <vm/uma.h> 81 82 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 83 "Name cache"); 84 85 SDT_PROVIDER_DECLARE(vfs); 86 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 87 "struct vnode *"); 88 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *", 89 "struct vnode *"); 90 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 91 "char *"); 92 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *", 93 "const char *"); 94 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *", 95 "struct namecache *", "int", "int"); 96 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 97 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 98 "char *", "struct vnode *"); 99 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 100 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 101 "struct vnode *", "char *"); 102 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 103 "struct vnode *"); 104 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 105 "struct vnode *", "char *"); 106 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 107 "char *"); 108 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", 109 "struct componentname *"); 110 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", 111 "struct componentname *"); 112 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 113 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int"); 114 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 115 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 116 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 117 "struct vnode *"); 118 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 119 "char *"); 120 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *", 121 "char *"); 122 123 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); 124 SDT_PROBE_DECLARE(vfs, namei, lookup, entry); 125 SDT_PROBE_DECLARE(vfs, namei, lookup, return); 126 127 /* 128 * This structure describes the elements in the cache of recent 129 * names looked up by namei. 
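 *
 * As an illustrative sketch only (this is not a structure used by the code),
 * an entry can be thought of as the mapping:
 *
 *	(nc_dvp, nc_name) -> nc_vp		positive entry
 *	(nc_dvp, nc_name) -> "does not exist"	negative entry
 *
 * with nc_name being nc_nlen bytes long and the negative case marked with
 * NCF_NEGATIVE and described by struct negstate below.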
 */
struct negstate {
	u_char neg_flag;
	u_char neg_hit;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
		struct negstate nu_neg;/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct	timespec nc_time;	/* timespec provided by fs */
	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	int	nc_pad;
	struct namecache nc_nc;
};

TAILQ_HEAD(cache_freebatch, namecache);

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned.  Since others
 * may be in the same spot suffer a little bit and enforce the
 * alignment for everyone.  Note this is a nop for 64-bit platforms.
 */
#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)

/*
 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
 * 4.4 BSD codebase.  Later on struct namecache was tweaked to become
 * smaller and the value was bumped to retain the total size, but it
 * was never re-evaluated for suitability.  A simple test counting
 * lengths during package building shows that the value of 45 covers
 * about 86% of all added entries, reaching 99% at 65.
 *
 * Regardless of the above, use of dedicated zones instead of malloc may be
 * inducing additional waste.  This may be hard to address as said zones are
 * tied to VFS SMR.  Even if retaining them, the current split should be
 * re-evaluated.
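 *
 * For orientation, a back-of-the-envelope computation of the zone sizes,
 * assuming a typical LP64 ABI (8-byte pointers, 8-byte time_t, NAME_MAX of
 * 255); the exact numbers are not relied upon anywhere and only the
 * _Static_asserts below are authoritative:
 *
 *	offsetof(struct namecache, nc_name) = 16 + 16 + 8 + 8 + 8 + 1 + 1 = 58
 *	CACHE_ZONE_SMALL_SIZE    = 58 + 45 + 1       = 104	(13 * 8)
 *	CACHE_ZONE_LARGE_SIZE    = 58 + 255 + 1 + 6  = 320	(40 * 8)
 *	CACHE_ZONE_SMALL_TS_SIZE = 40 + 104          = 144	(18 * 8)
 *	CACHE_ZONE_LARGE_TS_SIZE = 40 + 320          = 360	(45 * 8)
 *
 * where 40 is offsetof(struct namecache_ts, nc_nc) and every result is a
 * multiple of CACHE_ZONE_ALIGNMENT + 1 (8), as the asserts demand.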
192 */ 193 #ifdef __LP64__ 194 #define CACHE_PATH_CUTOFF 45 195 #define CACHE_LARGE_PAD 6 196 #else 197 #define CACHE_PATH_CUTOFF 41 198 #define CACHE_LARGE_PAD 2 199 #endif 200 201 #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1) 202 #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE) 203 #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD) 204 #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE) 205 206 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 207 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 208 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 209 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 210 211 #define nc_vp n_un.nu_vp 212 #define nc_neg n_un.nu_neg 213 214 /* 215 * Flags in namecache.nc_flag 216 */ 217 #define NCF_WHITE 0x01 218 #define NCF_ISDOTDOT 0x02 219 #define NCF_TS 0x04 220 #define NCF_DTS 0x08 221 #define NCF_DVDROP 0x10 222 #define NCF_NEGATIVE 0x20 223 #define NCF_INVALID 0x40 224 #define NCF_WIP 0x80 225 226 /* 227 * Flags in negstate.neg_flag 228 */ 229 #define NEG_HOT 0x01 230 231 static bool cache_neg_evict_cond(u_long lnumcache); 232 233 /* 234 * Mark an entry as invalid. 235 * 236 * This is called before it starts getting deconstructed. 237 */ 238 static void 239 cache_ncp_invalidate(struct namecache *ncp) 240 { 241 242 KASSERT((ncp->nc_flag & NCF_INVALID) == 0, 243 ("%s: entry %p already invalid", __func__, ncp)); 244 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); 245 atomic_thread_fence_rel(); 246 } 247 248 /* 249 * Check whether the entry can be safely used. 250 * 251 * All places which elide locks are supposed to call this after they are 252 * done with reading from an entry. 253 */ 254 #define cache_ncp_canuse(ncp) ({ \ 255 struct namecache *_ncp = (ncp); \ 256 u_char _nc_flag; \ 257 \ 258 atomic_thread_fence_acq(); \ 259 _nc_flag = atomic_load_char(&_ncp->nc_flag); \ 260 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \ 261 }) 262 263 /* 264 * Name caching works as follows: 265 * 266 * Names found by directory scans are retained in a cache 267 * for future reference. It is managed LRU, so frequently 268 * used names will hang around. Cache is indexed by hash value 269 * obtained from (dvp, name) where dvp refers to the directory 270 * containing name. 271 * 272 * If it is a "negative" entry, (i.e. for a name that is known NOT to 273 * exist) the vnode pointer will be NULL. 274 * 275 * Upon reaching the last segment of a path, if the reference 276 * is for DELETE, or NOCACHE is set (rewrite), and the 277 * name is located in the cache, it will be dropped. 278 * 279 * These locks are used (in the order in which they can be taken): 280 * NAME TYPE ROLE 281 * vnodelock mtx vnode lists and v_cache_dd field protection 282 * bucketlock mtx for access to given set of hash buckets 283 * neglist mtx negative entry LRU management 284 * 285 * It is legal to take multiple vnodelock and bucketlock locks. The locking 286 * order is lower address first. Both are recursive. 287 * 288 * "." lookups are lockless. 289 * 290 * ".." and vnode -> name lookups require vnodelock. 291 * 292 * name -> vnode lookup requires the relevant bucketlock to be held for reading. 
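 *
 * As a rough sketch (see cache_lookup_fallback() below for the real code), a
 * locked name -> vnode lookup boils down to:
 *
 *	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 *	mtx_lock(HASH2BUCKETLOCK(hash));
 *	walk NCHHASH(hash) comparing nc_dvp, nc_nlen and nc_name
 *	mtx_unlock(HASH2BUCKETLOCK(hash));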
 *
 * Insertions and removals of entries require the involved vnodes and
 * bucketlocks to be locked to provide safe operation against other threads
 * modifying the cache.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach that state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case, this could
 * deadlock.  As such, this is resolved by trylocking and, if that fails,
 * unlocking the first vnode lock, taking everything in order and revalidating
 * the state.
 */

VFS_SMR_DECLARE;

static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache parameters");

static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
    "Total namecache capacity");

u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");

static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");

/*
 * Negative entry % of namecache capacity above which automatic eviction is allowed.
 *
 * Check cache_neg_evict_cond for details.
 */
static u_int ncnegminpct = 3;

static u_int __read_mostly	neg_min; /* the above recomputed against ncsize */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
    "Negative entry count above which automatic eviction is allowed");

/*
 * Structures associated with name caching.
339 */ 340 #define NCHHASH(hash) \ 341 (&nchashtbl[(hash) & nchash]) 342 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 343 static u_long __read_mostly nchash; /* size of hash table */ 344 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 345 "Size of namecache hash table"); 346 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 347 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 348 349 struct nchstats nchstats; /* cache effectiveness statistics */ 350 351 static bool __read_frequently cache_fast_revlookup = true; 352 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW, 353 &cache_fast_revlookup, 0, ""); 354 355 static u_int __exclusive_cache_line neg_cycle; 356 357 #define ncneghash 3 358 #define numneglists (ncneghash + 1) 359 360 struct neglist { 361 struct mtx nl_evict_lock; 362 struct mtx nl_lock __aligned(CACHE_LINE_SIZE); 363 TAILQ_HEAD(, namecache) nl_list; 364 TAILQ_HEAD(, namecache) nl_hotlist; 365 u_long nl_hotnum; 366 } __aligned(CACHE_LINE_SIZE); 367 368 static struct neglist neglists[numneglists]; 369 370 static inline struct neglist * 371 NCP2NEGLIST(struct namecache *ncp) 372 { 373 374 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 375 } 376 377 static inline struct negstate * 378 NCP2NEGSTATE(struct namecache *ncp) 379 { 380 381 MPASS(ncp->nc_flag & NCF_NEGATIVE); 382 return (&ncp->nc_neg); 383 } 384 385 #define numbucketlocks (ncbuckethash + 1) 386 static u_int __read_mostly ncbuckethash; 387 static struct mtx_padalign __read_mostly *bucketlocks; 388 #define HASH2BUCKETLOCK(hash) \ 389 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)])) 390 391 #define numvnodelocks (ncvnodehash + 1) 392 static u_int __read_mostly ncvnodehash; 393 static struct mtx __read_mostly *vnodelocks; 394 static inline struct mtx * 395 VP2VNODELOCK(struct vnode *vp) 396 { 397 398 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 399 } 400 401 static void 402 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 403 { 404 struct namecache_ts *ncp_ts; 405 406 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 407 (tsp == NULL && ticksp == NULL), 408 ("No NCF_TS")); 409 410 if (tsp == NULL) 411 return; 412 413 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 414 *tsp = ncp_ts->nc_time; 415 *ticksp = ncp_ts->nc_ticks; 416 } 417 418 #ifdef DEBUG_CACHE 419 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 420 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 421 "VFS namecache enabled"); 422 #endif 423 424 /* Export size information to userland */ 425 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 426 sizeof(struct namecache), "sizeof(struct namecache)"); 427 428 /* 429 * The new name cache statistics 430 */ 431 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 432 "Name cache statistics"); 433 434 #define STATNODE_ULONG(name, varname, descr) \ 435 SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 436 #define STATNODE_COUNTER(name, varname, descr) \ 437 static COUNTER_U64_DEFINE_EARLY(varname); \ 438 SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \ 439 descr); 440 STATNODE_ULONG(neg, numneg, "Number of negative cache entries"); 441 STATNODE_ULONG(count, numcache, "Number of cache entries"); 442 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes 
held"); 443 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit"); 444 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits"); 445 STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits"); 446 STATNODE_COUNTER(miss, nummiss, "Number of cache misses"); 447 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache"); 448 STATNODE_COUNTER(posszaps, numposzaps, 449 "Number of cache hits (positive) we do not want to cache"); 450 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)"); 451 STATNODE_COUNTER(negzaps, numnegzaps, 452 "Number of cache hits (negative) we do not want to cache"); 453 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)"); 454 /* These count for vn_getcwd(), too. */ 455 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls"); 456 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 457 STATNODE_COUNTER(fullpathfail2, numfullpathfail2, 458 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 459 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 460 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls"); 461 462 /* 463 * Debug or developer statistics. 464 */ 465 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 466 "Name cache debugging"); 467 #define DEBUGNODE_ULONG(name, varname, descr) \ 468 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 469 #define DEBUGNODE_COUNTER(name, varname, descr) \ 470 static COUNTER_U64_DEFINE_EARLY(varname); \ 471 SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \ 472 descr); 473 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success, 474 "Number of successful removals after relocking"); 475 static long zap_bucket_fail; 476 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, ""); 477 static long zap_bucket_fail2; 478 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, ""); 479 static long cache_lock_vnodes_cel_3_failures; 480 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures, 481 "Number of times 3-way vnode locking failed"); 482 483 static void cache_zap_locked(struct namecache *ncp); 484 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, 485 char **freebuf, size_t *buflen); 486 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 487 char **retbuf, size_t *buflen, size_t addend); 488 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 489 char **retbuf, size_t *buflen); 490 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 491 char **retbuf, size_t *len, size_t addend); 492 493 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 494 495 static inline void 496 cache_assert_vlp_locked(struct mtx *vlp) 497 { 498 499 if (vlp != NULL) 500 mtx_assert(vlp, MA_OWNED); 501 } 502 503 static inline void 504 cache_assert_vnode_locked(struct vnode *vp) 505 { 506 struct mtx *vlp; 507 508 vlp = VP2VNODELOCK(vp); 509 cache_assert_vlp_locked(vlp); 510 } 511 512 /* 513 * Directory vnodes with entries are held for two reasons: 514 * 1. make them less of a target for reclamation in vnlru 515 * 2. suffer smaller performance penalty in locked lookup as requeieing is avoided 516 * 517 * Note this preferably would not be done and it's a hold over from. 
It will be 518 * feasible to eliminate altogether if all filesystems start supporting 519 * lockless lookup. 520 */ 521 static void 522 cache_hold_vnode(struct vnode *vp) 523 { 524 525 cache_assert_vnode_locked(vp); 526 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); 527 vhold(vp); 528 counter_u64_add(numcachehv, 1); 529 } 530 531 static void 532 cache_drop_vnode(struct vnode *vp) 533 { 534 535 /* 536 * Called after all locks are dropped, meaning we can't assert 537 * on the state of v_cache_src. 538 */ 539 vdrop(vp); 540 counter_u64_add(numcachehv, -1); 541 } 542 543 /* 544 * UMA zones. 545 */ 546 static uma_zone_t __read_mostly cache_zone_small; 547 static uma_zone_t __read_mostly cache_zone_small_ts; 548 static uma_zone_t __read_mostly cache_zone_large; 549 static uma_zone_t __read_mostly cache_zone_large_ts; 550 551 static struct namecache * 552 cache_alloc_uma(int len, bool ts) 553 { 554 struct namecache_ts *ncp_ts; 555 struct namecache *ncp; 556 557 if (__predict_false(ts)) { 558 if (len <= CACHE_PATH_CUTOFF) 559 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); 560 else 561 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); 562 ncp = &ncp_ts->nc_nc; 563 } else { 564 if (len <= CACHE_PATH_CUTOFF) 565 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); 566 else 567 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); 568 } 569 return (ncp); 570 } 571 572 static void 573 cache_free_uma(struct namecache *ncp) 574 { 575 struct namecache_ts *ncp_ts; 576 577 if (__predict_false(ncp->nc_flag & NCF_TS)) { 578 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 579 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 580 uma_zfree_smr(cache_zone_small_ts, ncp_ts); 581 else 582 uma_zfree_smr(cache_zone_large_ts, ncp_ts); 583 } else { 584 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 585 uma_zfree_smr(cache_zone_small, ncp); 586 else 587 uma_zfree_smr(cache_zone_large, ncp); 588 } 589 } 590 591 static struct namecache * 592 cache_alloc(int len, bool ts) 593 { 594 u_long lnumcache; 595 596 /* 597 * Avoid blowout in namecache entries. 598 * 599 * Bugs: 600 * 1. filesystems may end up trying to add an already existing entry 601 * (for example this can happen after a cache miss during concurrent 602 * lookup), in which case we will call cache_neg_evict despite not 603 * adding anything. 604 * 2. the routine may fail to free anything and no provisions are made 605 * to make it try harder (see the inside for failure modes) 606 * 3. it only ever looks at negative entries. 
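	 *
	 * As an illustration of bug 1 (a sketch, not a new requirement):
	 *
	 *	thread A: miss on "foo" -> cache_alloc() -> numcache++
	 *	thread B: miss on "foo" -> cache_alloc() -> numcache++
	 *	thread A: cache_enter_time() links the entry
	 *	thread B: cache_enter_time() finds the duplicate, frees its copy
	 *
	 * Both threads may have run the eviction logic below even though only
	 * one entry was added.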
607 */ 608 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 609 if (cache_neg_evict_cond(lnumcache)) { 610 lnumcache = atomic_load_long(&numcache); 611 } 612 if (__predict_false(lnumcache >= ncsize)) { 613 atomic_subtract_long(&numcache, 1); 614 counter_u64_add(numdrops, 1); 615 return (NULL); 616 } 617 return (cache_alloc_uma(len, ts)); 618 } 619 620 static void 621 cache_free(struct namecache *ncp) 622 { 623 624 MPASS(ncp != NULL); 625 if ((ncp->nc_flag & NCF_DVDROP) != 0) { 626 cache_drop_vnode(ncp->nc_dvp); 627 } 628 cache_free_uma(ncp); 629 atomic_subtract_long(&numcache, 1); 630 } 631 632 static void 633 cache_free_batch(struct cache_freebatch *batch) 634 { 635 struct namecache *ncp, *nnp; 636 int i; 637 638 i = 0; 639 if (TAILQ_EMPTY(batch)) 640 goto out; 641 TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) { 642 if ((ncp->nc_flag & NCF_DVDROP) != 0) { 643 cache_drop_vnode(ncp->nc_dvp); 644 } 645 cache_free_uma(ncp); 646 i++; 647 } 648 atomic_subtract_long(&numcache, i); 649 out: 650 SDT_PROBE1(vfs, namecache, purge, batch, i); 651 } 652 653 /* 654 * TODO: With the value stored we can do better than computing the hash based 655 * on the address. The choice of FNV should also be revisited. 656 */ 657 static void 658 cache_prehash(struct vnode *vp) 659 { 660 661 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); 662 } 663 664 static uint32_t 665 cache_get_hash(char *name, u_char len, struct vnode *dvp) 666 { 667 668 return (fnv_32_buf(name, len, dvp->v_nchash)); 669 } 670 671 static inline struct nchashhead * 672 NCP2BUCKET(struct namecache *ncp) 673 { 674 uint32_t hash; 675 676 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 677 return (NCHHASH(hash)); 678 } 679 680 static inline struct mtx * 681 NCP2BUCKETLOCK(struct namecache *ncp) 682 { 683 uint32_t hash; 684 685 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 686 return (HASH2BUCKETLOCK(hash)); 687 } 688 689 #ifdef INVARIANTS 690 static void 691 cache_assert_bucket_locked(struct namecache *ncp) 692 { 693 struct mtx *blp; 694 695 blp = NCP2BUCKETLOCK(ncp); 696 mtx_assert(blp, MA_OWNED); 697 } 698 699 static void 700 cache_assert_bucket_unlocked(struct namecache *ncp) 701 { 702 struct mtx *blp; 703 704 blp = NCP2BUCKETLOCK(ncp); 705 mtx_assert(blp, MA_NOTOWNED); 706 } 707 #else 708 #define cache_assert_bucket_locked(x) do { } while (0) 709 #define cache_assert_bucket_unlocked(x) do { } while (0) 710 #endif 711 712 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 713 static void 714 _cache_sort_vnodes(void **p1, void **p2) 715 { 716 void *tmp; 717 718 MPASS(*p1 != NULL || *p2 != NULL); 719 720 if (*p1 > *p2) { 721 tmp = *p2; 722 *p2 = *p1; 723 *p1 = tmp; 724 } 725 } 726 727 static void 728 cache_lock_all_buckets(void) 729 { 730 u_int i; 731 732 for (i = 0; i < numbucketlocks; i++) 733 mtx_lock(&bucketlocks[i]); 734 } 735 736 static void 737 cache_unlock_all_buckets(void) 738 { 739 u_int i; 740 741 for (i = 0; i < numbucketlocks; i++) 742 mtx_unlock(&bucketlocks[i]); 743 } 744 745 static void 746 cache_lock_all_vnodes(void) 747 { 748 u_int i; 749 750 for (i = 0; i < numvnodelocks; i++) 751 mtx_lock(&vnodelocks[i]); 752 } 753 754 static void 755 cache_unlock_all_vnodes(void) 756 { 757 u_int i; 758 759 for (i = 0; i < numvnodelocks; i++) 760 mtx_unlock(&vnodelocks[i]); 761 } 762 763 static int 764 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 765 { 766 767 cache_sort_vnodes(&vlp1, &vlp2); 768 769 if (vlp1 != NULL) { 770 if (!mtx_trylock(vlp1)) 771 return 
(EAGAIN); 772 } 773 if (!mtx_trylock(vlp2)) { 774 if (vlp1 != NULL) 775 mtx_unlock(vlp1); 776 return (EAGAIN); 777 } 778 779 return (0); 780 } 781 782 static void 783 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 784 { 785 786 MPASS(vlp1 != NULL || vlp2 != NULL); 787 MPASS(vlp1 <= vlp2); 788 789 if (vlp1 != NULL) 790 mtx_lock(vlp1); 791 if (vlp2 != NULL) 792 mtx_lock(vlp2); 793 } 794 795 static void 796 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 797 { 798 799 MPASS(vlp1 != NULL || vlp2 != NULL); 800 801 if (vlp1 != NULL) 802 mtx_unlock(vlp1); 803 if (vlp2 != NULL) 804 mtx_unlock(vlp2); 805 } 806 807 static int 808 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 809 { 810 struct nchstats snap; 811 812 if (req->oldptr == NULL) 813 return (SYSCTL_OUT(req, 0, sizeof(snap))); 814 815 snap = nchstats; 816 snap.ncs_goodhits = counter_u64_fetch(numposhits); 817 snap.ncs_neghits = counter_u64_fetch(numneghits); 818 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 819 counter_u64_fetch(numnegzaps); 820 snap.ncs_miss = counter_u64_fetch(nummisszap) + 821 counter_u64_fetch(nummiss); 822 823 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 824 } 825 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 826 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 827 "VFS cache effectiveness statistics"); 828 829 static void 830 cache_recalc_neg_min(u_int val) 831 { 832 833 neg_min = (ncsize * val) / 100; 834 } 835 836 static int 837 sysctl_negminpct(SYSCTL_HANDLER_ARGS) 838 { 839 u_int val; 840 int error; 841 842 val = ncnegminpct; 843 error = sysctl_handle_int(oidp, &val, 0, req); 844 if (error != 0 || req->newptr == NULL) 845 return (error); 846 847 if (val == ncnegminpct) 848 return (0); 849 if (val < 0 || val > 99) 850 return (EINVAL); 851 ncnegminpct = val; 852 cache_recalc_neg_min(val); 853 return (0); 854 } 855 856 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct, 857 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct, 858 "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed"); 859 860 #ifdef DIAGNOSTIC 861 /* 862 * Grab an atomic snapshot of the name cache hash chain lengths 863 */ 864 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 865 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 866 "hash table stats"); 867 868 static int 869 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 870 { 871 struct nchashhead *ncpp; 872 struct namecache *ncp; 873 int i, error, n_nchash, *cntbuf; 874 875 retry: 876 n_nchash = nchash + 1; /* nchash is max index, not count */ 877 if (req->oldptr == NULL) 878 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 879 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 880 cache_lock_all_buckets(); 881 if (n_nchash != nchash + 1) { 882 cache_unlock_all_buckets(); 883 free(cntbuf, M_TEMP); 884 goto retry; 885 } 886 /* Scan hash tables counting entries */ 887 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 888 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) 889 cntbuf[i]++; 890 cache_unlock_all_buckets(); 891 for (error = 0, i = 0; i < n_nchash; i++) 892 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 893 break; 894 free(cntbuf, M_TEMP); 895 return (error); 896 } 897 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 898 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 899 "nchash chain lengths"); 900 901 static int 902 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 903 { 904 int error; 905 struct nchashhead *ncpp; 906 struct namecache *ncp; 907 
int n_nchash; 908 int count, maxlength, used, pct; 909 910 if (!req->oldptr) 911 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 912 913 cache_lock_all_buckets(); 914 n_nchash = nchash + 1; /* nchash is max index, not count */ 915 used = 0; 916 maxlength = 0; 917 918 /* Scan hash tables for applicable entries */ 919 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 920 count = 0; 921 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { 922 count++; 923 } 924 if (count) 925 used++; 926 if (maxlength < count) 927 maxlength = count; 928 } 929 n_nchash = nchash + 1; 930 cache_unlock_all_buckets(); 931 pct = (used * 100) / (n_nchash / 100); 932 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 933 if (error) 934 return (error); 935 error = SYSCTL_OUT(req, &used, sizeof(used)); 936 if (error) 937 return (error); 938 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 939 if (error) 940 return (error); 941 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 942 if (error) 943 return (error); 944 return (0); 945 } 946 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 947 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 948 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 949 #endif 950 951 /* 952 * Negative entries management 953 * 954 * Various workloads create plenty of negative entries and barely use them 955 * afterwards. Moreover malicious users can keep performing bogus lookups 956 * adding even more entries. For example "make tinderbox" as of writing this 957 * comment ends up with 2.6M namecache entries in total, 1.2M of which are 958 * negative. 959 * 960 * As such, a rather aggressive eviction method is needed. The currently 961 * employed method is a placeholder. 962 * 963 * Entries are split over numneglists separate lists, each of which is further 964 * split into hot and cold entries. Entries get promoted after getting a hit. 965 * Eviction happens on addition of new entry. 
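 *
 * A minimal sketch of the bookkeeping, using the helpers defined earlier
 * (pointer bits select one of the numneglists lists):
 *
 *	nl = NCP2NEGLIST(ncp);		(hashes the entry address)
 *	cold:	nl->nl_list,    entered by cache_neg_insert()
 *	hot:	nl->nl_hotlist, entered once neg_hit reaches
 *		CACHE_NEG_PROMOTION_THRESH (see cache_neg_hit_prep() below)
 *
 * Eviction only ever picks a victim from the cold list and demotes at most
 * one hot entry per attempt, see cache_neg_evict() below.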
966 */ 967 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 968 "Name cache negative entry statistics"); 969 970 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0, 971 "Number of negative cache entries"); 972 973 static COUNTER_U64_DEFINE_EARLY(neg_created); 974 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created, 975 "Number of created negative entries"); 976 977 static COUNTER_U64_DEFINE_EARLY(neg_evicted); 978 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted, 979 "Number of evicted negative entries"); 980 981 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty); 982 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD, 983 &neg_evict_skipped_empty, 984 "Number of times evicting failed due to lack of entries"); 985 986 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed); 987 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD, 988 &neg_evict_skipped_missed, 989 "Number of times evicting failed due to target entry disappearing"); 990 991 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended); 992 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD, 993 &neg_evict_skipped_contended, 994 "Number of times evicting failed due to contention"); 995 996 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits, 997 "Number of cache hits (negative)"); 998 999 static int 1000 sysctl_neg_hot(SYSCTL_HANDLER_ARGS) 1001 { 1002 int i, out; 1003 1004 out = 0; 1005 for (i = 0; i < numneglists; i++) 1006 out += neglists[i].nl_hotnum; 1007 1008 return (SYSCTL_OUT(req, &out, sizeof(out))); 1009 } 1010 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD | 1011 CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I", 1012 "Number of hot negative entries"); 1013 1014 static void 1015 cache_neg_init(struct namecache *ncp) 1016 { 1017 struct negstate *ns; 1018 1019 ncp->nc_flag |= NCF_NEGATIVE; 1020 ns = NCP2NEGSTATE(ncp); 1021 ns->neg_flag = 0; 1022 ns->neg_hit = 0; 1023 counter_u64_add(neg_created, 1); 1024 } 1025 1026 #define CACHE_NEG_PROMOTION_THRESH 2 1027 1028 static bool 1029 cache_neg_hit_prep(struct namecache *ncp) 1030 { 1031 struct negstate *ns; 1032 u_char n; 1033 1034 ns = NCP2NEGSTATE(ncp); 1035 n = atomic_load_char(&ns->neg_hit); 1036 for (;;) { 1037 if (n >= CACHE_NEG_PROMOTION_THRESH) 1038 return (false); 1039 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1)) 1040 break; 1041 } 1042 return (n + 1 == CACHE_NEG_PROMOTION_THRESH); 1043 } 1044 1045 /* 1046 * Nothing to do here but it is provided for completeness as some 1047 * cache_neg_hit_prep callers may end up returning without even 1048 * trying to promote. 1049 */ 1050 #define cache_neg_hit_abort(ncp) do { } while (0) 1051 1052 static void 1053 cache_neg_hit_finish(struct namecache *ncp) 1054 { 1055 1056 SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name); 1057 counter_u64_add(numneghits, 1); 1058 } 1059 1060 /* 1061 * Move a negative entry to the hot list. 
1062 */ 1063 static void 1064 cache_neg_promote_locked(struct namecache *ncp) 1065 { 1066 struct neglist *nl; 1067 struct negstate *ns; 1068 1069 ns = NCP2NEGSTATE(ncp); 1070 nl = NCP2NEGLIST(ncp); 1071 mtx_assert(&nl->nl_lock, MA_OWNED); 1072 if ((ns->neg_flag & NEG_HOT) == 0) { 1073 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 1074 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst); 1075 nl->nl_hotnum++; 1076 ns->neg_flag |= NEG_HOT; 1077 } 1078 } 1079 1080 /* 1081 * Move a hot negative entry to the cold list. 1082 */ 1083 static void 1084 cache_neg_demote_locked(struct namecache *ncp) 1085 { 1086 struct neglist *nl; 1087 struct negstate *ns; 1088 1089 ns = NCP2NEGSTATE(ncp); 1090 nl = NCP2NEGLIST(ncp); 1091 mtx_assert(&nl->nl_lock, MA_OWNED); 1092 MPASS(ns->neg_flag & NEG_HOT); 1093 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 1094 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 1095 nl->nl_hotnum--; 1096 ns->neg_flag &= ~NEG_HOT; 1097 atomic_store_char(&ns->neg_hit, 0); 1098 } 1099 1100 /* 1101 * Move a negative entry to the hot list if it matches the lookup. 1102 * 1103 * We have to take locks, but they may be contended and in the worst 1104 * case we may need to go off CPU. We don't want to spin within the 1105 * smr section and we can't block with it. Exiting the section means 1106 * the found entry could have been evicted. We are going to look it 1107 * up again. 1108 */ 1109 static bool 1110 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp, 1111 struct namecache *oncp, uint32_t hash) 1112 { 1113 struct namecache *ncp; 1114 struct neglist *nl; 1115 u_char nc_flag; 1116 1117 nl = NCP2NEGLIST(oncp); 1118 1119 mtx_lock(&nl->nl_lock); 1120 /* 1121 * For hash iteration. 1122 */ 1123 vfs_smr_enter(); 1124 1125 /* 1126 * Avoid all surprises by only succeeding if we got the same entry and 1127 * bailing completely otherwise. 1128 * XXX There are no provisions to keep the vnode around, meaning we may 1129 * end up promoting a negative entry for a *new* vnode and returning 1130 * ENOENT on its account. This is the error we want to return anyway 1131 * and promotion is harmless. 1132 * 1133 * In particular at this point there can be a new ncp which matches the 1134 * search but hashes to a different neglist. 1135 */ 1136 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1137 if (ncp == oncp) 1138 break; 1139 } 1140 1141 /* 1142 * No match to begin with. 1143 */ 1144 if (__predict_false(ncp == NULL)) { 1145 goto out_abort; 1146 } 1147 1148 /* 1149 * The newly found entry may be something different... 1150 */ 1151 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1152 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 1153 goto out_abort; 1154 } 1155 1156 /* 1157 * ... and not even negative. 
1158 */ 1159 nc_flag = atomic_load_char(&ncp->nc_flag); 1160 if ((nc_flag & NCF_NEGATIVE) == 0) { 1161 goto out_abort; 1162 } 1163 1164 if (!cache_ncp_canuse(ncp)) { 1165 goto out_abort; 1166 } 1167 1168 cache_neg_promote_locked(ncp); 1169 cache_neg_hit_finish(ncp); 1170 vfs_smr_exit(); 1171 mtx_unlock(&nl->nl_lock); 1172 return (true); 1173 out_abort: 1174 vfs_smr_exit(); 1175 mtx_unlock(&nl->nl_lock); 1176 return (false); 1177 } 1178 1179 static void 1180 cache_neg_promote(struct namecache *ncp) 1181 { 1182 struct neglist *nl; 1183 1184 nl = NCP2NEGLIST(ncp); 1185 mtx_lock(&nl->nl_lock); 1186 cache_neg_promote_locked(ncp); 1187 mtx_unlock(&nl->nl_lock); 1188 } 1189 1190 static void 1191 cache_neg_insert(struct namecache *ncp) 1192 { 1193 struct neglist *nl; 1194 1195 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1196 cache_assert_bucket_locked(ncp); 1197 nl = NCP2NEGLIST(ncp); 1198 mtx_lock(&nl->nl_lock); 1199 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 1200 mtx_unlock(&nl->nl_lock); 1201 atomic_add_long(&numneg, 1); 1202 } 1203 1204 static void 1205 cache_neg_remove(struct namecache *ncp) 1206 { 1207 struct neglist *nl; 1208 struct negstate *ns; 1209 1210 cache_assert_bucket_locked(ncp); 1211 nl = NCP2NEGLIST(ncp); 1212 ns = NCP2NEGSTATE(ncp); 1213 mtx_lock(&nl->nl_lock); 1214 if ((ns->neg_flag & NEG_HOT) != 0) { 1215 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 1216 nl->nl_hotnum--; 1217 } else { 1218 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 1219 } 1220 mtx_unlock(&nl->nl_lock); 1221 atomic_subtract_long(&numneg, 1); 1222 } 1223 1224 static struct neglist * 1225 cache_neg_evict_select_list(void) 1226 { 1227 struct neglist *nl; 1228 u_int c; 1229 1230 c = atomic_fetchadd_int(&neg_cycle, 1) + 1; 1231 nl = &neglists[c % numneglists]; 1232 if (!mtx_trylock(&nl->nl_evict_lock)) { 1233 counter_u64_add(neg_evict_skipped_contended, 1); 1234 return (NULL); 1235 } 1236 return (nl); 1237 } 1238 1239 static struct namecache * 1240 cache_neg_evict_select_entry(struct neglist *nl) 1241 { 1242 struct namecache *ncp, *lncp; 1243 struct negstate *ns, *lns; 1244 int i; 1245 1246 mtx_assert(&nl->nl_evict_lock, MA_OWNED); 1247 mtx_assert(&nl->nl_lock, MA_OWNED); 1248 ncp = TAILQ_FIRST(&nl->nl_list); 1249 if (ncp == NULL) 1250 return (NULL); 1251 lncp = ncp; 1252 lns = NCP2NEGSTATE(lncp); 1253 for (i = 1; i < 4; i++) { 1254 ncp = TAILQ_NEXT(ncp, nc_dst); 1255 if (ncp == NULL) 1256 break; 1257 ns = NCP2NEGSTATE(ncp); 1258 if (ns->neg_hit < lns->neg_hit) { 1259 lncp = ncp; 1260 lns = ns; 1261 } 1262 } 1263 return (lncp); 1264 } 1265 1266 static bool 1267 cache_neg_evict(void) 1268 { 1269 struct namecache *ncp, *ncp2; 1270 struct neglist *nl; 1271 struct vnode *dvp; 1272 struct mtx *dvlp; 1273 struct mtx *blp; 1274 uint32_t hash; 1275 u_char nlen; 1276 bool evicted; 1277 1278 nl = cache_neg_evict_select_list(); 1279 if (nl == NULL) { 1280 return (false); 1281 } 1282 1283 mtx_lock(&nl->nl_lock); 1284 ncp = TAILQ_FIRST(&nl->nl_hotlist); 1285 if (ncp != NULL) { 1286 cache_neg_demote_locked(ncp); 1287 } 1288 ncp = cache_neg_evict_select_entry(nl); 1289 if (ncp == NULL) { 1290 counter_u64_add(neg_evict_skipped_empty, 1); 1291 mtx_unlock(&nl->nl_lock); 1292 mtx_unlock(&nl->nl_evict_lock); 1293 return (false); 1294 } 1295 nlen = ncp->nc_nlen; 1296 dvp = ncp->nc_dvp; 1297 hash = cache_get_hash(ncp->nc_name, nlen, dvp); 1298 dvlp = VP2VNODELOCK(dvp); 1299 blp = HASH2BUCKETLOCK(hash); 1300 mtx_unlock(&nl->nl_lock); 1301 mtx_unlock(&nl->nl_evict_lock); 1302 mtx_lock(dvlp); 1303 mtx_lock(blp); 1304 /* 1305 * Note that since all 
locks were dropped above, the entry may be
	 * gone or reallocated to be something else.
	 */
	CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
		if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
		    ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
			break;
	}
	if (ncp2 == NULL) {
		counter_u64_add(neg_evict_skipped_missed, 1);
		ncp = NULL;
		evicted = false;
	} else {
		MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
		MPASS(blp == NCP2BUCKETLOCK(ncp));
		SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(neg_evicted, 1);
		evicted = true;
	}
	mtx_unlock(blp);
	mtx_unlock(dvlp);
	if (ncp != NULL)
		cache_free(ncp);
	return (evicted);
}

/*
 * Maybe evict a negative entry to create more room.
 *
 * The ncnegfactor parameter limits what fraction of the total count
 * can consist of negative entries.  However, if the cache is just
 * warming up this leads to excessive evictions.  As such, ncnegminpct
 * (recomputed to neg_min) dictates whether the above should be
 * applied.
 *
 * Try evicting if the cache is close to full capacity regardless of
 * other considerations.
 */
static bool
cache_neg_evict_cond(u_long lnumcache)
{
	u_long lnumneg;

	if (ncsize - 1000 < lnumcache)
		goto out_evict;
	lnumneg = atomic_load_long(&numneg);
	if (lnumneg < neg_min)
		return (false);
	if (lnumneg * ncnegfactor < lnumcache)
		return (false);
out_evict:
	return (cache_neg_evict());
}

/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
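 *
 *   Callers are expected to hold the relevant bucketlock and the vnode locks
 *   for both nc_dvp and (for positive entries) nc_vp.  A sketch of the usual
 *   shape, as seen for example in cache_neg_evict() above:
 *
 *	mtx_lock(dvlp);
 *	mtx_lock(blp);
 *	cache_zap_locked(ncp);
 *	mtx_unlock(blp);
 *	mtx_unlock(dvlp);
 *	cache_free(ncp);	(only after the locks are dropped)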
1366 */ 1367 static void 1368 cache_zap_locked(struct namecache *ncp) 1369 { 1370 struct nchashhead *ncpp; 1371 1372 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1373 cache_assert_vnode_locked(ncp->nc_vp); 1374 cache_assert_vnode_locked(ncp->nc_dvp); 1375 cache_assert_bucket_locked(ncp); 1376 1377 cache_ncp_invalidate(ncp); 1378 1379 ncpp = NCP2BUCKET(ncp); 1380 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); 1381 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 1382 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 1383 ncp->nc_name, ncp->nc_vp); 1384 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 1385 if (ncp == ncp->nc_vp->v_cache_dd) { 1386 vn_seqc_write_begin_unheld(ncp->nc_vp); 1387 ncp->nc_vp->v_cache_dd = NULL; 1388 vn_seqc_write_end(ncp->nc_vp); 1389 } 1390 } else { 1391 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 1392 ncp->nc_name); 1393 cache_neg_remove(ncp); 1394 } 1395 if (ncp->nc_flag & NCF_ISDOTDOT) { 1396 if (ncp == ncp->nc_dvp->v_cache_dd) { 1397 vn_seqc_write_begin_unheld(ncp->nc_dvp); 1398 ncp->nc_dvp->v_cache_dd = NULL; 1399 vn_seqc_write_end(ncp->nc_dvp); 1400 } 1401 } else { 1402 LIST_REMOVE(ncp, nc_src); 1403 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 1404 ncp->nc_flag |= NCF_DVDROP; 1405 } 1406 } 1407 } 1408 1409 static void 1410 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 1411 { 1412 struct mtx *blp; 1413 1414 MPASS(ncp->nc_dvp == vp); 1415 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1416 cache_assert_vnode_locked(vp); 1417 1418 blp = NCP2BUCKETLOCK(ncp); 1419 mtx_lock(blp); 1420 cache_zap_locked(ncp); 1421 mtx_unlock(blp); 1422 } 1423 1424 static bool 1425 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 1426 struct mtx **vlpp) 1427 { 1428 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1429 struct mtx *blp; 1430 1431 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1432 cache_assert_vnode_locked(vp); 1433 1434 if (ncp->nc_flag & NCF_NEGATIVE) { 1435 if (*vlpp != NULL) { 1436 mtx_unlock(*vlpp); 1437 *vlpp = NULL; 1438 } 1439 cache_zap_negative_locked_vnode_kl(ncp, vp); 1440 return (true); 1441 } 1442 1443 pvlp = VP2VNODELOCK(vp); 1444 blp = NCP2BUCKETLOCK(ncp); 1445 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1446 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1447 1448 if (*vlpp == vlp1 || *vlpp == vlp2) { 1449 to_unlock = *vlpp; 1450 *vlpp = NULL; 1451 } else { 1452 if (*vlpp != NULL) { 1453 mtx_unlock(*vlpp); 1454 *vlpp = NULL; 1455 } 1456 cache_sort_vnodes(&vlp1, &vlp2); 1457 if (vlp1 == pvlp) { 1458 mtx_lock(vlp2); 1459 to_unlock = vlp2; 1460 } else { 1461 if (!mtx_trylock(vlp1)) 1462 goto out_relock; 1463 to_unlock = vlp1; 1464 } 1465 } 1466 mtx_lock(blp); 1467 cache_zap_locked(ncp); 1468 mtx_unlock(blp); 1469 if (to_unlock != NULL) 1470 mtx_unlock(to_unlock); 1471 return (true); 1472 1473 out_relock: 1474 mtx_unlock(vlp2); 1475 mtx_lock(vlp1); 1476 mtx_lock(vlp2); 1477 MPASS(*vlpp == NULL); 1478 *vlpp = vlp1; 1479 return (false); 1480 } 1481 1482 /* 1483 * If trylocking failed we can get here. We know enough to take all needed locks 1484 * in the right order and re-lookup the entry. 
1485 */ 1486 static int 1487 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1488 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1489 struct mtx *blp) 1490 { 1491 struct namecache *rncp; 1492 1493 cache_assert_bucket_unlocked(ncp); 1494 1495 cache_sort_vnodes(&dvlp, &vlp); 1496 cache_lock_vnodes(dvlp, vlp); 1497 mtx_lock(blp); 1498 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1499 if (rncp == ncp && rncp->nc_dvp == dvp && 1500 rncp->nc_nlen == cnp->cn_namelen && 1501 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1502 break; 1503 } 1504 if (rncp != NULL) { 1505 cache_zap_locked(rncp); 1506 mtx_unlock(blp); 1507 cache_unlock_vnodes(dvlp, vlp); 1508 counter_u64_add(zap_bucket_relock_success, 1); 1509 return (0); 1510 } 1511 1512 mtx_unlock(blp); 1513 cache_unlock_vnodes(dvlp, vlp); 1514 return (EAGAIN); 1515 } 1516 1517 static int __noinline 1518 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp, 1519 uint32_t hash, struct mtx *blp) 1520 { 1521 struct mtx *dvlp, *vlp; 1522 struct vnode *dvp; 1523 1524 cache_assert_bucket_locked(ncp); 1525 1526 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1527 vlp = NULL; 1528 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1529 vlp = VP2VNODELOCK(ncp->nc_vp); 1530 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1531 cache_zap_locked(ncp); 1532 mtx_unlock(blp); 1533 cache_unlock_vnodes(dvlp, vlp); 1534 return (0); 1535 } 1536 1537 dvp = ncp->nc_dvp; 1538 mtx_unlock(blp); 1539 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1540 } 1541 1542 static __noinline int 1543 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) 1544 { 1545 struct namecache *ncp; 1546 struct mtx *blp; 1547 struct mtx *dvlp, *dvlp2; 1548 uint32_t hash; 1549 int error; 1550 1551 if (cnp->cn_namelen == 2 && 1552 cnp->cn_nameptr[0] == '.' 
&& cnp->cn_nameptr[1] == '.') { 1553 dvlp = VP2VNODELOCK(dvp); 1554 dvlp2 = NULL; 1555 mtx_lock(dvlp); 1556 retry_dotdot: 1557 ncp = dvp->v_cache_dd; 1558 if (ncp == NULL) { 1559 mtx_unlock(dvlp); 1560 if (dvlp2 != NULL) 1561 mtx_unlock(dvlp2); 1562 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1563 return (0); 1564 } 1565 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1566 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) 1567 goto retry_dotdot; 1568 MPASS(dvp->v_cache_dd == NULL); 1569 mtx_unlock(dvlp); 1570 if (dvlp2 != NULL) 1571 mtx_unlock(dvlp2); 1572 cache_free(ncp); 1573 } else { 1574 vn_seqc_write_begin(dvp); 1575 dvp->v_cache_dd = NULL; 1576 vn_seqc_write_end(dvp); 1577 mtx_unlock(dvlp); 1578 if (dvlp2 != NULL) 1579 mtx_unlock(dvlp2); 1580 } 1581 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1582 return (1); 1583 } 1584 1585 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1586 blp = HASH2BUCKETLOCK(hash); 1587 retry: 1588 if (CK_SLIST_EMPTY(NCHHASH(hash))) 1589 goto out_no_entry; 1590 1591 mtx_lock(blp); 1592 1593 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1594 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1595 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1596 break; 1597 } 1598 1599 if (ncp == NULL) { 1600 mtx_unlock(blp); 1601 goto out_no_entry; 1602 } 1603 1604 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1605 if (__predict_false(error != 0)) { 1606 zap_bucket_fail++; 1607 goto retry; 1608 } 1609 counter_u64_add(numposzaps, 1); 1610 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1611 cache_free(ncp); 1612 return (1); 1613 out_no_entry: 1614 counter_u64_add(nummisszap, 1); 1615 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1616 return (0); 1617 } 1618 1619 static int __noinline 1620 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1621 struct timespec *tsp, int *ticksp) 1622 { 1623 int ltype; 1624 1625 *vpp = dvp; 1626 counter_u64_add(dothits, 1); 1627 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1628 if (tsp != NULL) 1629 timespecclear(tsp); 1630 if (ticksp != NULL) 1631 *ticksp = ticks; 1632 vrefact(*vpp); 1633 /* 1634 * When we lookup "." we still can be asked to lock it 1635 * differently... 
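	 * For example, the caller may hold dvp (and thus ".", the same
	 * vnode) with a shared lock while cn_lkflags requests LK_EXCLUSIVE,
	 * in which case the lock is upgraded below rather than acquired
	 * recursively.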
1636 */ 1637 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1638 if (ltype != VOP_ISLOCKED(*vpp)) { 1639 if (ltype == LK_EXCLUSIVE) { 1640 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1641 if (VN_IS_DOOMED((*vpp))) { 1642 /* forced unmount */ 1643 vrele(*vpp); 1644 *vpp = NULL; 1645 return (ENOENT); 1646 } 1647 } else 1648 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1649 } 1650 return (-1); 1651 } 1652 1653 static int __noinline 1654 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1655 struct timespec *tsp, int *ticksp) 1656 { 1657 struct namecache_ts *ncp_ts; 1658 struct namecache *ncp; 1659 struct mtx *dvlp; 1660 enum vgetstate vs; 1661 int error, ltype; 1662 bool whiteout; 1663 1664 MPASS((cnp->cn_flags & ISDOTDOT) != 0); 1665 1666 if ((cnp->cn_flags & MAKEENTRY) == 0) { 1667 cache_remove_cnp(dvp, cnp); 1668 return (0); 1669 } 1670 1671 counter_u64_add(dotdothits, 1); 1672 retry: 1673 dvlp = VP2VNODELOCK(dvp); 1674 mtx_lock(dvlp); 1675 ncp = dvp->v_cache_dd; 1676 if (ncp == NULL) { 1677 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL); 1678 mtx_unlock(dvlp); 1679 return (0); 1680 } 1681 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1682 if (ncp->nc_flag & NCF_NEGATIVE) 1683 *vpp = NULL; 1684 else 1685 *vpp = ncp->nc_vp; 1686 } else 1687 *vpp = ncp->nc_dvp; 1688 if (*vpp == NULL) 1689 goto negative_success; 1690 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); 1691 cache_out_ts(ncp, tsp, ticksp); 1692 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1693 NCF_DTS && tsp != NULL) { 1694 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1695 *tsp = ncp_ts->nc_dotdottime; 1696 } 1697 1698 MPASS(dvp != *vpp); 1699 ltype = VOP_ISLOCKED(dvp); 1700 VOP_UNLOCK(dvp); 1701 vs = vget_prep(*vpp); 1702 mtx_unlock(dvlp); 1703 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1704 vn_lock(dvp, ltype | LK_RETRY); 1705 if (VN_IS_DOOMED(dvp)) { 1706 if (error == 0) 1707 vput(*vpp); 1708 *vpp = NULL; 1709 return (ENOENT); 1710 } 1711 if (error) { 1712 *vpp = NULL; 1713 goto retry; 1714 } 1715 return (-1); 1716 negative_success: 1717 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1718 if (cnp->cn_flags & ISLASTCN) { 1719 counter_u64_add(numnegzaps, 1); 1720 cache_zap_negative_locked_vnode_kl(ncp, dvp); 1721 mtx_unlock(dvlp); 1722 cache_free(ncp); 1723 return (0); 1724 } 1725 } 1726 1727 whiteout = (ncp->nc_flag & NCF_WHITE); 1728 cache_out_ts(ncp, tsp, ticksp); 1729 if (cache_neg_hit_prep(ncp)) 1730 cache_neg_promote(ncp); 1731 else 1732 cache_neg_hit_finish(ncp); 1733 mtx_unlock(dvlp); 1734 if (whiteout) 1735 cnp->cn_flags |= ISWHITEOUT; 1736 return (ENOENT); 1737 } 1738 1739 /** 1740 * Lookup a name in the name cache 1741 * 1742 * # Arguments 1743 * 1744 * - dvp: Parent directory in which to search. 1745 * - vpp: Return argument. Will contain desired vnode on cache hit. 1746 * - cnp: Parameters of the name search. The most interesting bits of 1747 * the cn_flags field have the following meanings: 1748 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1749 * it up. 1750 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1751 * - tsp: Return storage for cache timestamp. On a successful (positive 1752 * or negative) lookup, tsp will be filled with any timespec that 1753 * was stored when this cache entry was created. However, it will 1754 * be clear for "." entries. 1755 * - ticks: Return storage for alternate cache timestamp. 
On a successful 1756 * (positive or negative) lookup, it will contain the ticks value 1757 * that was current when the cache entry was created, unless cnp 1758 * was ".". 1759 * 1760 * Either both tsp and ticks have to be provided or neither of them. 1761 * 1762 * # Returns 1763 * 1764 * - -1: A positive cache hit. vpp will contain the desired vnode. 1765 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1766 * to a forced unmount. vpp will not be modified. If the entry 1767 * is a whiteout, then the ISWHITEOUT flag will be set in 1768 * cnp->cn_flags. 1769 * - 0: A cache miss. vpp will not be modified. 1770 * 1771 * # Locking 1772 * 1773 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1774 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1775 * lock is not recursively acquired. 1776 */ 1777 static int __noinline 1778 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1779 struct timespec *tsp, int *ticksp) 1780 { 1781 struct namecache *ncp; 1782 struct mtx *blp; 1783 uint32_t hash; 1784 enum vgetstate vs; 1785 int error; 1786 bool whiteout; 1787 1788 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 1789 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0); 1790 1791 retry: 1792 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1793 blp = HASH2BUCKETLOCK(hash); 1794 mtx_lock(blp); 1795 1796 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1797 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1798 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1799 break; 1800 } 1801 1802 if (__predict_false(ncp == NULL)) { 1803 mtx_unlock(blp); 1804 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1805 NULL); 1806 counter_u64_add(nummiss, 1); 1807 return (0); 1808 } 1809 1810 if (ncp->nc_flag & NCF_NEGATIVE) 1811 goto negative_success; 1812 1813 counter_u64_add(numposhits, 1); 1814 *vpp = ncp->nc_vp; 1815 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1816 cache_out_ts(ncp, tsp, ticksp); 1817 MPASS(dvp != *vpp); 1818 vs = vget_prep(*vpp); 1819 mtx_unlock(blp); 1820 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1821 if (error) { 1822 *vpp = NULL; 1823 goto retry; 1824 } 1825 return (-1); 1826 negative_success: 1827 /* 1828 * We don't get here with regular lookup apart from corner cases. 
1829 */ 1830 if (__predict_true(cnp->cn_nameiop == CREATE)) { 1831 if (cnp->cn_flags & ISLASTCN) { 1832 counter_u64_add(numnegzaps, 1); 1833 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1834 if (__predict_false(error != 0)) { 1835 zap_bucket_fail2++; 1836 goto retry; 1837 } 1838 cache_free(ncp); 1839 return (0); 1840 } 1841 } 1842 1843 whiteout = (ncp->nc_flag & NCF_WHITE); 1844 cache_out_ts(ncp, tsp, ticksp); 1845 if (cache_neg_hit_prep(ncp)) 1846 cache_neg_promote(ncp); 1847 else 1848 cache_neg_hit_finish(ncp); 1849 mtx_unlock(blp); 1850 if (whiteout) 1851 cnp->cn_flags |= ISWHITEOUT; 1852 return (ENOENT); 1853 } 1854 1855 int 1856 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1857 struct timespec *tsp, int *ticksp) 1858 { 1859 struct namecache *ncp; 1860 uint32_t hash; 1861 enum vgetstate vs; 1862 int error; 1863 bool whiteout, neg_promote; 1864 u_short nc_flag; 1865 1866 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); 1867 1868 #ifdef DEBUG_CACHE 1869 if (__predict_false(!doingcache)) { 1870 cnp->cn_flags &= ~MAKEENTRY; 1871 return (0); 1872 } 1873 #endif 1874 1875 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1876 if (cnp->cn_namelen == 1) 1877 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1878 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') 1879 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); 1880 } 1881 1882 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 1883 1884 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { 1885 cache_remove_cnp(dvp, cnp); 1886 return (0); 1887 } 1888 1889 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1890 vfs_smr_enter(); 1891 1892 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1893 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1894 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1895 break; 1896 } 1897 1898 if (__predict_false(ncp == NULL)) { 1899 vfs_smr_exit(); 1900 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1901 NULL); 1902 counter_u64_add(nummiss, 1); 1903 return (0); 1904 } 1905 1906 nc_flag = atomic_load_char(&ncp->nc_flag); 1907 if (nc_flag & NCF_NEGATIVE) 1908 goto negative_success; 1909 1910 counter_u64_add(numposhits, 1); 1911 *vpp = ncp->nc_vp; 1912 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1913 cache_out_ts(ncp, tsp, ticksp); 1914 MPASS(dvp != *vpp); 1915 if (!cache_ncp_canuse(ncp)) { 1916 vfs_smr_exit(); 1917 *vpp = NULL; 1918 goto out_fallback; 1919 } 1920 vs = vget_prep_smr(*vpp); 1921 vfs_smr_exit(); 1922 if (__predict_false(vs == VGET_NONE)) { 1923 *vpp = NULL; 1924 goto out_fallback; 1925 } 1926 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1927 if (error) { 1928 *vpp = NULL; 1929 goto out_fallback; 1930 } 1931 return (-1); 1932 negative_success: 1933 if (cnp->cn_nameiop == CREATE) { 1934 if (cnp->cn_flags & ISLASTCN) { 1935 vfs_smr_exit(); 1936 goto out_fallback; 1937 } 1938 } 1939 1940 cache_out_ts(ncp, tsp, ticksp); 1941 whiteout = (ncp->nc_flag & NCF_WHITE); 1942 neg_promote = cache_neg_hit_prep(ncp); 1943 if (!cache_ncp_canuse(ncp)) { 1944 cache_neg_hit_abort(ncp); 1945 vfs_smr_exit(); 1946 goto out_fallback; 1947 } 1948 if (neg_promote) { 1949 vfs_smr_exit(); 1950 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) 1951 goto out_fallback; 1952 } else { 1953 cache_neg_hit_finish(ncp); 1954 vfs_smr_exit(); 1955 } 1956 if (whiteout) 1957 cnp->cn_flags |= ISWHITEOUT; 1958 return (ENOENT); 1959 out_fallback: 1960 return (cache_lookup_fallback(dvp, vpp, cnp, 
tsp, ticksp)); 1961 } 1962 1963 struct celockstate { 1964 struct mtx *vlp[3]; 1965 struct mtx *blp[2]; 1966 }; 1967 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1968 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1969 1970 static inline void 1971 cache_celockstate_init(struct celockstate *cel) 1972 { 1973 1974 bzero(cel, sizeof(*cel)); 1975 } 1976 1977 static void 1978 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1979 struct vnode *dvp) 1980 { 1981 struct mtx *vlp1, *vlp2; 1982 1983 MPASS(cel->vlp[0] == NULL); 1984 MPASS(cel->vlp[1] == NULL); 1985 MPASS(cel->vlp[2] == NULL); 1986 1987 MPASS(vp != NULL || dvp != NULL); 1988 1989 vlp1 = VP2VNODELOCK(vp); 1990 vlp2 = VP2VNODELOCK(dvp); 1991 cache_sort_vnodes(&vlp1, &vlp2); 1992 1993 if (vlp1 != NULL) { 1994 mtx_lock(vlp1); 1995 cel->vlp[0] = vlp1; 1996 } 1997 mtx_lock(vlp2); 1998 cel->vlp[1] = vlp2; 1999 } 2000 2001 static void 2002 cache_unlock_vnodes_cel(struct celockstate *cel) 2003 { 2004 2005 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 2006 2007 if (cel->vlp[0] != NULL) 2008 mtx_unlock(cel->vlp[0]); 2009 if (cel->vlp[1] != NULL) 2010 mtx_unlock(cel->vlp[1]); 2011 if (cel->vlp[2] != NULL) 2012 mtx_unlock(cel->vlp[2]); 2013 } 2014 2015 static bool 2016 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 2017 { 2018 struct mtx *vlp; 2019 bool ret; 2020 2021 cache_assert_vlp_locked(cel->vlp[0]); 2022 cache_assert_vlp_locked(cel->vlp[1]); 2023 MPASS(cel->vlp[2] == NULL); 2024 2025 MPASS(vp != NULL); 2026 vlp = VP2VNODELOCK(vp); 2027 2028 ret = true; 2029 if (vlp >= cel->vlp[1]) { 2030 mtx_lock(vlp); 2031 } else { 2032 if (mtx_trylock(vlp)) 2033 goto out; 2034 cache_lock_vnodes_cel_3_failures++; 2035 cache_unlock_vnodes_cel(cel); 2036 if (vlp < cel->vlp[0]) { 2037 mtx_lock(vlp); 2038 mtx_lock(cel->vlp[0]); 2039 mtx_lock(cel->vlp[1]); 2040 } else { 2041 if (cel->vlp[0] != NULL) 2042 mtx_lock(cel->vlp[0]); 2043 mtx_lock(vlp); 2044 mtx_lock(cel->vlp[1]); 2045 } 2046 ret = false; 2047 } 2048 out: 2049 cel->vlp[2] = vlp; 2050 return (ret); 2051 } 2052 2053 static void 2054 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 2055 struct mtx *blp2) 2056 { 2057 2058 MPASS(cel->blp[0] == NULL); 2059 MPASS(cel->blp[1] == NULL); 2060 2061 cache_sort_vnodes(&blp1, &blp2); 2062 2063 if (blp1 != NULL) { 2064 mtx_lock(blp1); 2065 cel->blp[0] = blp1; 2066 } 2067 mtx_lock(blp2); 2068 cel->blp[1] = blp2; 2069 } 2070 2071 static void 2072 cache_unlock_buckets_cel(struct celockstate *cel) 2073 { 2074 2075 if (cel->blp[0] != NULL) 2076 mtx_unlock(cel->blp[0]); 2077 mtx_unlock(cel->blp[1]); 2078 } 2079 2080 /* 2081 * Lock part of the cache affected by the insertion. 2082 * 2083 * This means vnodelocks for dvp, vp and the relevant bucketlock. 2084 * However, insertion can result in removal of an old entry. In this 2085 * case we have an additional vnode and bucketlock pair to lock. 2086 * 2087 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 2088 * preserving the locking order (smaller address first). 
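 *
 * Worst-case illustration (a sketch derived from cache_enter_lock() below,
 * not an additional requirement): inserting a name under dvp where vp is a
 * directory which already has a ".." entry pointing at a third vnode needs
 *
 *	vnode locks:	VP2VNODELOCK(dvp), VP2VNODELOCK(vp),
 *			VP2VNODELOCK(vp->v_cache_dd->nc_vp)
 *	bucket locks:	HASH2BUCKETLOCK(hash), NCP2BUCKETLOCK(vp->v_cache_dd)
 *
 * with each group sorted by address before locking.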
2089 */ 2090 static void 2091 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2092 uint32_t hash) 2093 { 2094 struct namecache *ncp; 2095 struct mtx *blps[2]; 2096 2097 blps[0] = HASH2BUCKETLOCK(hash); 2098 for (;;) { 2099 blps[1] = NULL; 2100 cache_lock_vnodes_cel(cel, dvp, vp); 2101 if (vp == NULL || vp->v_type != VDIR) 2102 break; 2103 ncp = vp->v_cache_dd; 2104 if (ncp == NULL) 2105 break; 2106 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2107 break; 2108 MPASS(ncp->nc_dvp == vp); 2109 blps[1] = NCP2BUCKETLOCK(ncp); 2110 if (ncp->nc_flag & NCF_NEGATIVE) 2111 break; 2112 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2113 break; 2114 /* 2115 * All vnodes got re-locked. Re-validate the state and if 2116 * nothing changed we are done. Otherwise restart. 2117 */ 2118 if (ncp == vp->v_cache_dd && 2119 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2120 blps[1] == NCP2BUCKETLOCK(ncp) && 2121 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2122 break; 2123 cache_unlock_vnodes_cel(cel); 2124 cel->vlp[0] = NULL; 2125 cel->vlp[1] = NULL; 2126 cel->vlp[2] = NULL; 2127 } 2128 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2129 } 2130 2131 static void 2132 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2133 uint32_t hash) 2134 { 2135 struct namecache *ncp; 2136 struct mtx *blps[2]; 2137 2138 blps[0] = HASH2BUCKETLOCK(hash); 2139 for (;;) { 2140 blps[1] = NULL; 2141 cache_lock_vnodes_cel(cel, dvp, vp); 2142 ncp = dvp->v_cache_dd; 2143 if (ncp == NULL) 2144 break; 2145 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2146 break; 2147 MPASS(ncp->nc_dvp == dvp); 2148 blps[1] = NCP2BUCKETLOCK(ncp); 2149 if (ncp->nc_flag & NCF_NEGATIVE) 2150 break; 2151 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2152 break; 2153 if (ncp == dvp->v_cache_dd && 2154 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2155 blps[1] == NCP2BUCKETLOCK(ncp) && 2156 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2157 break; 2158 cache_unlock_vnodes_cel(cel); 2159 cel->vlp[0] = NULL; 2160 cel->vlp[1] = NULL; 2161 cel->vlp[2] = NULL; 2162 } 2163 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2164 } 2165 2166 static void 2167 cache_enter_unlock(struct celockstate *cel) 2168 { 2169 2170 cache_unlock_buckets_cel(cel); 2171 cache_unlock_vnodes_cel(cel); 2172 } 2173 2174 static void __noinline 2175 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 2176 struct componentname *cnp) 2177 { 2178 struct celockstate cel; 2179 struct namecache *ncp; 2180 uint32_t hash; 2181 int len; 2182 2183 if (dvp->v_cache_dd == NULL) 2184 return; 2185 len = cnp->cn_namelen; 2186 cache_celockstate_init(&cel); 2187 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2188 cache_enter_lock_dd(&cel, dvp, vp, hash); 2189 vn_seqc_write_begin(dvp); 2190 ncp = dvp->v_cache_dd; 2191 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 2192 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 2193 cache_zap_locked(ncp); 2194 } else { 2195 ncp = NULL; 2196 } 2197 dvp->v_cache_dd = NULL; 2198 vn_seqc_write_end(dvp); 2199 cache_enter_unlock(&cel); 2200 if (ncp != NULL) 2201 cache_free(ncp); 2202 } 2203 2204 /* 2205 * Add an entry to the cache. 
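 *
 * Illustrative use only (not part of the original comment): a filesystem
 * lookup routine which resolved a name under dvp would typically do
 *
 *	if (cnp->cn_flags & MAKEENTRY)
 *		cache_enter(dvp, *vpp, cnp);
 *
 * where cache_enter() is the wrapper passing NULL timestamps; callers which
 * have timestamps to record call cache_enter_time() directly.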
2206 */ 2207 void 2208 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2209 struct timespec *tsp, struct timespec *dtsp) 2210 { 2211 struct celockstate cel; 2212 struct namecache *ncp, *n2, *ndd; 2213 struct namecache_ts *ncp_ts; 2214 struct nchashhead *ncpp; 2215 uint32_t hash; 2216 int flag; 2217 int len; 2218 2219 VNPASS(dvp != vp, dvp); 2220 VNPASS(!VN_IS_DOOMED(dvp), dvp); 2221 VNPASS(dvp->v_type != VNON, dvp); 2222 if (vp != NULL) { 2223 VNPASS(!VN_IS_DOOMED(vp), vp); 2224 VNPASS(vp->v_type != VNON, vp); 2225 } 2226 2227 #ifdef DEBUG_CACHE 2228 if (__predict_false(!doingcache)) 2229 return; 2230 #endif 2231 2232 flag = 0; 2233 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2234 if (cnp->cn_namelen == 1) 2235 return; 2236 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 2237 cache_enter_dotdot_prep(dvp, vp, cnp); 2238 flag = NCF_ISDOTDOT; 2239 } 2240 } 2241 2242 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 2243 if (ncp == NULL) 2244 return; 2245 2246 cache_celockstate_init(&cel); 2247 ndd = NULL; 2248 ncp_ts = NULL; 2249 2250 /* 2251 * Calculate the hash key and setup as much of the new 2252 * namecache entry as possible before acquiring the lock. 2253 */ 2254 ncp->nc_flag = flag | NCF_WIP; 2255 ncp->nc_vp = vp; 2256 if (vp == NULL) 2257 cache_neg_init(ncp); 2258 ncp->nc_dvp = dvp; 2259 if (tsp != NULL) { 2260 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 2261 ncp_ts->nc_time = *tsp; 2262 ncp_ts->nc_ticks = ticks; 2263 ncp_ts->nc_nc.nc_flag |= NCF_TS; 2264 if (dtsp != NULL) { 2265 ncp_ts->nc_dotdottime = *dtsp; 2266 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 2267 } 2268 } 2269 len = ncp->nc_nlen = cnp->cn_namelen; 2270 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2271 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 2272 ncp->nc_name[len] = '\0'; 2273 cache_enter_lock(&cel, dvp, vp, hash); 2274 2275 /* 2276 * See if this vnode or negative entry is already in the cache 2277 * with this name. This can happen with concurrent lookups of 2278 * the same path name. 2279 */ 2280 ncpp = NCHHASH(hash); 2281 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 2282 if (n2->nc_dvp == dvp && 2283 n2->nc_nlen == cnp->cn_namelen && 2284 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 2285 MPASS(cache_ncp_canuse(n2)); 2286 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 2287 KASSERT(vp == NULL, 2288 ("%s: found entry pointing to a different vnode (%p != %p)", 2289 __func__, NULL, vp)); 2290 else 2291 KASSERT(n2->nc_vp == vp, 2292 ("%s: found entry pointing to a different vnode (%p != %p)", 2293 __func__, n2->nc_vp, vp)); 2294 /* 2295 * Entries are supposed to be immutable unless in the 2296 * process of getting destroyed. Accommodating for 2297 * changing timestamps is possible but not worth it. 2298 * This should be harmless in terms of correctness, in 2299 * the worst case resulting in an earlier expiration. 2300 * Alternatively, the found entry can be replaced 2301 * altogether. 
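 *
 * In all of the above cases the pre-existing entry is kept and the newly
 * allocated one is discarded (see out_unlock_free below).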
2302 */ 2303 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 2304 #if 0 2305 if (tsp != NULL) { 2306 KASSERT((n2->nc_flag & NCF_TS) != 0, 2307 ("no NCF_TS")); 2308 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 2309 n2_ts->nc_time = ncp_ts->nc_time; 2310 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2311 if (dtsp != NULL) { 2312 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2313 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2314 } 2315 } 2316 #endif 2317 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, 2318 vp); 2319 goto out_unlock_free; 2320 } 2321 } 2322 2323 if (flag == NCF_ISDOTDOT) { 2324 /* 2325 * See if we are trying to add .. entry, but some other lookup 2326 * has populated v_cache_dd pointer already. 2327 */ 2328 if (dvp->v_cache_dd != NULL) 2329 goto out_unlock_free; 2330 KASSERT(vp == NULL || vp->v_type == VDIR, 2331 ("wrong vnode type %p", vp)); 2332 vn_seqc_write_begin(dvp); 2333 dvp->v_cache_dd = ncp; 2334 vn_seqc_write_end(dvp); 2335 } 2336 2337 if (vp != NULL) { 2338 if (flag != NCF_ISDOTDOT) { 2339 /* 2340 * For this case, the cache entry maps both the 2341 * directory name in it and the name ".." for the 2342 * directory's parent. 2343 */ 2344 vn_seqc_write_begin(vp); 2345 if ((ndd = vp->v_cache_dd) != NULL) { 2346 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2347 cache_zap_locked(ndd); 2348 else 2349 ndd = NULL; 2350 } 2351 vp->v_cache_dd = ncp; 2352 vn_seqc_write_end(vp); 2353 } else if (vp->v_type != VDIR) { 2354 if (vp->v_cache_dd != NULL) { 2355 vn_seqc_write_begin(vp); 2356 vp->v_cache_dd = NULL; 2357 vn_seqc_write_end(vp); 2358 } 2359 } 2360 } 2361 2362 if (flag != NCF_ISDOTDOT) { 2363 if (LIST_EMPTY(&dvp->v_cache_src)) { 2364 cache_hold_vnode(dvp); 2365 } 2366 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2367 } 2368 2369 /* 2370 * If the entry is "negative", we place it into the 2371 * "negative" cache queue, otherwise, we place it into the 2372 * destination vnode's cache entries queue. 2373 */ 2374 if (vp != NULL) { 2375 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2376 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2377 vp); 2378 } else { 2379 if (cnp->cn_flags & ISWHITEOUT) 2380 ncp->nc_flag |= NCF_WHITE; 2381 cache_neg_insert(ncp); 2382 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2383 ncp->nc_name); 2384 } 2385 2386 /* 2387 * Insert the new namecache entry into the appropriate chain 2388 * within the cache entries table. 2389 */ 2390 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2391 2392 atomic_thread_fence_rel(); 2393 /* 2394 * Mark the entry as fully constructed. 2395 * It is immutable past this point until its removal. 
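 *
 * The release fence above pairs with the acquire in cache_ncp_canuse():
 * lockless (SMR) readers either still observe NCF_WIP and ignore the entry,
 * or observe all of its fields fully initialized.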
2396 */ 2397 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2398 2399 cache_enter_unlock(&cel); 2400 if (ndd != NULL) 2401 cache_free(ndd); 2402 return; 2403 out_unlock_free: 2404 cache_enter_unlock(&cel); 2405 cache_free(ncp); 2406 return; 2407 } 2408 2409 static u_int 2410 cache_roundup_2(u_int val) 2411 { 2412 u_int res; 2413 2414 for (res = 1; res <= val; res <<= 1) 2415 continue; 2416 2417 return (res); 2418 } 2419 2420 static struct nchashhead * 2421 nchinittbl(u_long elements, u_long *hashmask) 2422 { 2423 struct nchashhead *hashtbl; 2424 u_long hashsize, i; 2425 2426 hashsize = cache_roundup_2(elements) / 2; 2427 2428 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2429 for (i = 0; i < hashsize; i++) 2430 CK_SLIST_INIT(&hashtbl[i]); 2431 *hashmask = hashsize - 1; 2432 return (hashtbl); 2433 } 2434 2435 static void 2436 ncfreetbl(struct nchashhead *hashtbl) 2437 { 2438 2439 free(hashtbl, M_VFSCACHE); 2440 } 2441 2442 /* 2443 * Name cache initialization, from vfs_init() when we are booting 2444 */ 2445 static void 2446 nchinit(void *dummy __unused) 2447 { 2448 u_int i; 2449 2450 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2451 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2452 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2453 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2454 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2455 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2456 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2457 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2458 2459 VFS_SMR_ZONE_SET(cache_zone_small); 2460 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2461 VFS_SMR_ZONE_SET(cache_zone_large); 2462 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2463 2464 ncsize = desiredvnodes * ncsizefactor; 2465 cache_recalc_neg_min(ncnegminpct); 2466 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2467 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2468 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2469 ncbuckethash = 7; 2470 if (ncbuckethash > nchash) 2471 ncbuckethash = nchash; 2472 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2473 M_WAITOK | M_ZERO); 2474 for (i = 0; i < numbucketlocks; i++) 2475 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2476 ncvnodehash = ncbuckethash; 2477 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2478 M_WAITOK | M_ZERO); 2479 for (i = 0; i < numvnodelocks; i++) 2480 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2481 2482 for (i = 0; i < numneglists; i++) { 2483 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); 2484 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2485 TAILQ_INIT(&neglists[i].nl_list); 2486 TAILQ_INIT(&neglists[i].nl_hotlist); 2487 } 2488 } 2489 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2490 2491 void 2492 cache_vnode_init(struct vnode *vp) 2493 { 2494 2495 LIST_INIT(&vp->v_cache_src); 2496 TAILQ_INIT(&vp->v_cache_dst); 2497 vp->v_cache_dd = NULL; 2498 cache_prehash(vp); 2499 } 2500 2501 void 2502 cache_changesize(u_long newmaxvnodes) 2503 { 2504 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2505 u_long new_nchash, old_nchash; 2506 struct namecache *ncp; 2507 uint32_t hash; 2508 u_long newncsize; 2509 int i; 2510 2511 newncsize = newmaxvnodes * ncsizefactor; 2512 
newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2513 if (newmaxvnodes < numbucketlocks) 2514 newmaxvnodes = numbucketlocks; 2515 2516 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2517 /* If same hash table size, nothing to do */ 2518 if (nchash == new_nchash) { 2519 ncfreetbl(new_nchashtbl); 2520 return; 2521 } 2522 /* 2523 * Move everything from the old hash table to the new table. 2524 * None of the namecache entries in the table can be removed 2525 * because to do so, they have to be removed from the hash table. 2526 */ 2527 cache_lock_all_vnodes(); 2528 cache_lock_all_buckets(); 2529 old_nchashtbl = nchashtbl; 2530 old_nchash = nchash; 2531 nchashtbl = new_nchashtbl; 2532 nchash = new_nchash; 2533 for (i = 0; i <= old_nchash; i++) { 2534 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2535 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2536 ncp->nc_dvp); 2537 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2538 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2539 } 2540 } 2541 ncsize = newncsize; 2542 cache_recalc_neg_min(ncnegminpct); 2543 cache_unlock_all_buckets(); 2544 cache_unlock_all_vnodes(); 2545 ncfreetbl(old_nchashtbl); 2546 } 2547 2548 /* 2549 * Invalidate all entries from and to a particular vnode. 2550 */ 2551 static void 2552 cache_purge_impl(struct vnode *vp) 2553 { 2554 struct cache_freebatch batch; 2555 struct namecache *ncp; 2556 struct mtx *vlp, *vlp2; 2557 2558 TAILQ_INIT(&batch); 2559 vlp = VP2VNODELOCK(vp); 2560 vlp2 = NULL; 2561 mtx_lock(vlp); 2562 retry: 2563 while (!LIST_EMPTY(&vp->v_cache_src)) { 2564 ncp = LIST_FIRST(&vp->v_cache_src); 2565 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2566 goto retry; 2567 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2568 } 2569 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2570 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2571 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2572 goto retry; 2573 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2574 } 2575 ncp = vp->v_cache_dd; 2576 if (ncp != NULL) { 2577 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2578 ("lost dotdot link")); 2579 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2580 goto retry; 2581 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2582 } 2583 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2584 mtx_unlock(vlp); 2585 if (vlp2 != NULL) 2586 mtx_unlock(vlp2); 2587 cache_free_batch(&batch); 2588 } 2589 2590 /* 2591 * Opportunistic check to see if there is anything to do. 2592 */ 2593 static bool 2594 cache_has_entries(struct vnode *vp) 2595 { 2596 2597 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2598 vp->v_cache_dd == NULL) 2599 return (false); 2600 return (true); 2601 } 2602 2603 void 2604 cache_purge(struct vnode *vp) 2605 { 2606 2607 SDT_PROBE1(vfs, namecache, purge, done, vp); 2608 if (!cache_has_entries(vp)) 2609 return; 2610 cache_purge_impl(vp); 2611 } 2612 2613 /* 2614 * Only to be used by vgone. 2615 */ 2616 void 2617 cache_purge_vgone(struct vnode *vp) 2618 { 2619 struct mtx *vlp; 2620 2621 VNPASS(VN_IS_DOOMED(vp), vp); 2622 if (cache_has_entries(vp)) { 2623 cache_purge_impl(vp); 2624 return; 2625 } 2626 2627 /* 2628 * Serialize against a potential thread doing cache_purge. 2629 */ 2630 vlp = VP2VNODELOCK(vp); 2631 mtx_wait_unlocked(vlp); 2632 if (cache_has_entries(vp)) { 2633 cache_purge_impl(vp); 2634 return; 2635 } 2636 return; 2637 } 2638 2639 /* 2640 * Invalidate all negative entries for a particular directory vnode. 
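 *
 * Only entries hanging off the directory's v_cache_src list are considered
 * and positive entries found there are left alone.  The removed entries are
 * freed in one batch after the vnode lock is dropped.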
2641 */ 2642 void 2643 cache_purge_negative(struct vnode *vp) 2644 { 2645 struct cache_freebatch batch; 2646 struct namecache *ncp, *nnp; 2647 struct mtx *vlp; 2648 2649 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2650 if (LIST_EMPTY(&vp->v_cache_src)) 2651 return; 2652 TAILQ_INIT(&batch); 2653 vlp = VP2VNODELOCK(vp); 2654 mtx_lock(vlp); 2655 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2656 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2657 continue; 2658 cache_zap_negative_locked_vnode_kl(ncp, vp); 2659 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2660 } 2661 mtx_unlock(vlp); 2662 cache_free_batch(&batch); 2663 } 2664 2665 /* 2666 * Entry points for modifying VOP operations. 2667 */ 2668 void 2669 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2670 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2671 { 2672 2673 ASSERT_VOP_IN_SEQC(fdvp); 2674 ASSERT_VOP_IN_SEQC(fvp); 2675 ASSERT_VOP_IN_SEQC(tdvp); 2676 if (tvp != NULL) 2677 ASSERT_VOP_IN_SEQC(tvp); 2678 2679 cache_purge(fvp); 2680 if (tvp != NULL) { 2681 cache_purge(tvp); 2682 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2683 ("%s: lingering negative entry", __func__)); 2684 } else { 2685 cache_remove_cnp(tdvp, tcnp); 2686 } 2687 } 2688 2689 void 2690 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) 2691 { 2692 2693 ASSERT_VOP_IN_SEQC(dvp); 2694 ASSERT_VOP_IN_SEQC(vp); 2695 cache_purge(vp); 2696 } 2697 2698 #ifdef INVARIANTS 2699 /* 2700 * Validate that if an entry exists it matches. 2701 */ 2702 void 2703 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) 2704 { 2705 struct namecache *ncp; 2706 struct mtx *blp; 2707 uint32_t hash; 2708 2709 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2710 if (CK_SLIST_EMPTY(NCHHASH(hash))) 2711 return; 2712 blp = HASH2BUCKETLOCK(hash); 2713 mtx_lock(blp); 2714 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 2715 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 2716 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) { 2717 if (ncp->nc_vp != vp) 2718 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n", 2719 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp, 2720 ncp->nc_vp); 2721 } 2722 } 2723 mtx_unlock(blp); 2724 } 2725 #endif 2726 2727 /* 2728 * Flush all entries referencing a particular filesystem. 2729 */ 2730 void 2731 cache_purgevfs(struct mount *mp) 2732 { 2733 struct vnode *vp, *mvp; 2734 2735 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2736 /* 2737 * Somewhat wasteful iteration over all vnodes. Would be better to 2738 * support filtering and avoid the interlock to begin with. 2739 */ 2740 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2741 if (!cache_has_entries(vp)) { 2742 VI_UNLOCK(vp); 2743 continue; 2744 } 2745 vholdl(vp); 2746 VI_UNLOCK(vp); 2747 cache_purge(vp); 2748 vdrop(vp); 2749 } 2750 } 2751 2752 /* 2753 * Perform canonical checks and cache lookup and pass on to filesystem 2754 * through the vop_cachedlookup only if needed. 
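 *
 * Illustrative wiring (a sketch, not mandated here): a filesystem opts in by
 * pointing its lookup vop at this routine and supplying the real work via
 * vop_cachedlookup in its vop vector, e.g.
 *
 *	.vop_lookup =		vfs_cache_lookup,
 *	.vop_cachedlookup =	xxx_lookup,	(xxx_lookup being a placeholder)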
2755 */ 2756 2757 int 2758 vfs_cache_lookup(struct vop_lookup_args *ap) 2759 { 2760 struct vnode *dvp; 2761 int error; 2762 struct vnode **vpp = ap->a_vpp; 2763 struct componentname *cnp = ap->a_cnp; 2764 int flags = cnp->cn_flags; 2765 2766 *vpp = NULL; 2767 dvp = ap->a_dvp; 2768 2769 if (dvp->v_type != VDIR) 2770 return (ENOTDIR); 2771 2772 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2773 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2774 return (EROFS); 2775 2776 error = vn_dir_check_exec(dvp, cnp); 2777 if (error != 0) 2778 return (error); 2779 2780 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2781 if (error == 0) 2782 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2783 if (error == -1) 2784 return (0); 2785 return (error); 2786 } 2787 2788 /* Implementation of the getcwd syscall. */ 2789 int 2790 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2791 { 2792 char *buf, *retbuf; 2793 size_t buflen; 2794 int error; 2795 2796 buflen = uap->buflen; 2797 if (__predict_false(buflen < 2)) 2798 return (EINVAL); 2799 if (buflen > MAXPATHLEN) 2800 buflen = MAXPATHLEN; 2801 2802 buf = uma_zalloc(namei_zone, M_WAITOK); 2803 error = vn_getcwd(buf, &retbuf, &buflen); 2804 if (error == 0) 2805 error = copyout(retbuf, uap->buf, buflen); 2806 uma_zfree(namei_zone, buf); 2807 return (error); 2808 } 2809 2810 int 2811 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2812 { 2813 struct pwd *pwd; 2814 int error; 2815 2816 vfs_smr_enter(); 2817 pwd = pwd_get_smr(); 2818 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2819 buflen, 0); 2820 VFS_SMR_ASSERT_NOT_ENTERED(); 2821 if (error < 0) { 2822 pwd = pwd_hold(curthread); 2823 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2824 retbuf, buflen); 2825 pwd_drop(pwd); 2826 } 2827 2828 #ifdef KTRACE 2829 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2830 ktrnamei(*retbuf); 2831 #endif 2832 return (error); 2833 } 2834 2835 static int 2836 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2837 size_t size, int flags, enum uio_seg pathseg) 2838 { 2839 struct nameidata nd; 2840 char *retbuf, *freebuf; 2841 int error; 2842 2843 if (flags != 0) 2844 return (EINVAL); 2845 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2846 pathseg, path, fd, &cap_fstat_rights, td); 2847 if ((error = namei(&nd)) != 0) 2848 return (error); 2849 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2850 if (error == 0) { 2851 error = copyout(retbuf, buf, size); 2852 free(freebuf, M_TEMP); 2853 } 2854 NDFREE(&nd, 0); 2855 return (error); 2856 } 2857 2858 int 2859 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2860 { 2861 2862 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2863 uap->flags, UIO_USERSPACE)); 2864 } 2865 2866 /* 2867 * Retrieve the full filesystem path that correspond to a vnode from the name 2868 * cache (if available) 2869 */ 2870 int 2871 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2872 { 2873 struct pwd *pwd; 2874 char *buf; 2875 size_t buflen; 2876 int error; 2877 2878 if (__predict_false(vp == NULL)) 2879 return (EINVAL); 2880 2881 buflen = MAXPATHLEN; 2882 buf = malloc(buflen, M_TEMP, M_WAITOK); 2883 vfs_smr_enter(); 2884 pwd = pwd_get_smr(); 2885 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); 2886 VFS_SMR_ASSERT_NOT_ENTERED(); 2887 if (error < 0) { 2888 pwd = pwd_hold(curthread); 2889 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, 
&buflen); 2890 pwd_drop(pwd); 2891 } 2892 if (error == 0) 2893 *freebuf = buf; 2894 else 2895 free(buf, M_TEMP); 2896 return (error); 2897 } 2898 2899 /* 2900 * This function is similar to vn_fullpath, but it attempts to lookup the 2901 * pathname relative to the global root mount point. This is required for the 2902 * auditing sub-system, as audited pathnames must be absolute, relative to the 2903 * global root mount point. 2904 */ 2905 int 2906 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2907 { 2908 char *buf; 2909 size_t buflen; 2910 int error; 2911 2912 if (__predict_false(vp == NULL)) 2913 return (EINVAL); 2914 buflen = MAXPATHLEN; 2915 buf = malloc(buflen, M_TEMP, M_WAITOK); 2916 vfs_smr_enter(); 2917 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); 2918 VFS_SMR_ASSERT_NOT_ENTERED(); 2919 if (error < 0) { 2920 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2921 } 2922 if (error == 0) 2923 *freebuf = buf; 2924 else 2925 free(buf, M_TEMP); 2926 return (error); 2927 } 2928 2929 static struct namecache * 2930 vn_dd_from_dst(struct vnode *vp) 2931 { 2932 struct namecache *ncp; 2933 2934 cache_assert_vnode_locked(vp); 2935 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2936 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2937 return (ncp); 2938 } 2939 return (NULL); 2940 } 2941 2942 int 2943 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) 2944 { 2945 struct vnode *dvp; 2946 struct namecache *ncp; 2947 struct mtx *vlp; 2948 int error; 2949 2950 vlp = VP2VNODELOCK(*vp); 2951 mtx_lock(vlp); 2952 ncp = (*vp)->v_cache_dd; 2953 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 2954 KASSERT(ncp == vn_dd_from_dst(*vp), 2955 ("%s: mismatch for dd entry (%p != %p)", __func__, 2956 ncp, vn_dd_from_dst(*vp))); 2957 } else { 2958 ncp = vn_dd_from_dst(*vp); 2959 } 2960 if (ncp != NULL) { 2961 if (*buflen < ncp->nc_nlen) { 2962 mtx_unlock(vlp); 2963 vrele(*vp); 2964 counter_u64_add(numfullpathfail4, 1); 2965 error = ENOMEM; 2966 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2967 vp, NULL); 2968 return (error); 2969 } 2970 *buflen -= ncp->nc_nlen; 2971 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2972 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2973 ncp->nc_name, vp); 2974 dvp = *vp; 2975 *vp = ncp->nc_dvp; 2976 vref(*vp); 2977 mtx_unlock(vlp); 2978 vrele(dvp); 2979 return (0); 2980 } 2981 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2982 2983 mtx_unlock(vlp); 2984 vn_lock(*vp, LK_SHARED | LK_RETRY); 2985 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); 2986 vput(*vp); 2987 if (error) { 2988 counter_u64_add(numfullpathfail2, 1); 2989 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2990 return (error); 2991 } 2992 2993 *vp = dvp; 2994 if (VN_IS_DOOMED(dvp)) { 2995 /* forced unmount */ 2996 vrele(dvp); 2997 error = ENOENT; 2998 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2999 return (error); 3000 } 3001 /* 3002 * *vp has its use count incremented still. 3003 */ 3004 3005 return (0); 3006 } 3007 3008 /* 3009 * Resolve a directory to a pathname. 3010 * 3011 * The name of the directory can always be found in the namecache or fetched 3012 * from the filesystem. There is also guaranteed to be only one parent, meaning 3013 * we can just follow vnodes up until we find the root. 3014 * 3015 * The vnode must be referenced. 
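 *
 * The path is assembled backwards, from the end of the caller-supplied
 * buffer towards its beginning; on success *retbuf points into buf at the
 * first character of the result.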
3016 */ 3017 static int 3018 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3019 size_t *len, size_t addend) 3020 { 3021 #ifdef KDTRACE_HOOKS 3022 struct vnode *startvp = vp; 3023 #endif 3024 struct vnode *vp1; 3025 size_t buflen; 3026 int error; 3027 bool slash_prefixed; 3028 3029 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 3030 VNPASS(vp->v_usecount > 0, vp); 3031 3032 buflen = *len; 3033 3034 slash_prefixed = true; 3035 if (addend == 0) { 3036 MPASS(*len >= 2); 3037 buflen--; 3038 buf[buflen] = '\0'; 3039 slash_prefixed = false; 3040 } 3041 3042 error = 0; 3043 3044 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 3045 counter_u64_add(numfullpathcalls, 1); 3046 while (vp != rdir && vp != rootvnode) { 3047 /* 3048 * The vp vnode must be already fully constructed, 3049 * since it is either found in namecache or obtained 3050 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 3051 * without obtaining the vnode lock. 3052 */ 3053 if ((vp->v_vflag & VV_ROOT) != 0) { 3054 vn_lock(vp, LK_RETRY | LK_SHARED); 3055 3056 /* 3057 * With the vnode locked, check for races with 3058 * unmount, forced or not. Note that we 3059 * already verified that vp is not equal to 3060 * the root vnode, which means that 3061 * mnt_vnodecovered can be NULL only for the 3062 * case of unmount. 3063 */ 3064 if (VN_IS_DOOMED(vp) || 3065 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 3066 vp1->v_mountedhere != vp->v_mount) { 3067 vput(vp); 3068 error = ENOENT; 3069 SDT_PROBE3(vfs, namecache, fullpath, return, 3070 error, vp, NULL); 3071 break; 3072 } 3073 3074 vref(vp1); 3075 vput(vp); 3076 vp = vp1; 3077 continue; 3078 } 3079 if (vp->v_type != VDIR) { 3080 vrele(vp); 3081 counter_u64_add(numfullpathfail1, 1); 3082 error = ENOTDIR; 3083 SDT_PROBE3(vfs, namecache, fullpath, return, 3084 error, vp, NULL); 3085 break; 3086 } 3087 error = vn_vptocnp(&vp, buf, &buflen); 3088 if (error) 3089 break; 3090 if (buflen == 0) { 3091 vrele(vp); 3092 error = ENOMEM; 3093 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3094 startvp, NULL); 3095 break; 3096 } 3097 buf[--buflen] = '/'; 3098 slash_prefixed = true; 3099 } 3100 if (error) 3101 return (error); 3102 if (!slash_prefixed) { 3103 if (buflen == 0) { 3104 vrele(vp); 3105 counter_u64_add(numfullpathfail4, 1); 3106 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 3107 startvp, NULL); 3108 return (ENOMEM); 3109 } 3110 buf[--buflen] = '/'; 3111 } 3112 counter_u64_add(numfullpathfound, 1); 3113 vrele(vp); 3114 3115 *retbuf = buf + buflen; 3116 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 3117 *len -= buflen; 3118 *len += addend; 3119 return (0); 3120 } 3121 3122 /* 3123 * Resolve an arbitrary vnode to a pathname. 
3124 * 3125 * Note 2 caveats: 3126 * - hardlinks are not tracked, thus if the vnode is not a directory this can 3127 * resolve to a different path than the one used to find it 3128 * - namecache is not mandatory, meaning names are not guaranteed to be added 3129 * (in which case resolving fails) 3130 */ 3131 static void __inline 3132 cache_rev_failed_impl(int *reason, int line) 3133 { 3134 3135 *reason = line; 3136 } 3137 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 3138 3139 static int 3140 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 3141 char **retbuf, size_t *buflen, size_t addend) 3142 { 3143 #ifdef KDTRACE_HOOKS 3144 struct vnode *startvp = vp; 3145 #endif 3146 struct vnode *tvp; 3147 struct mount *mp; 3148 struct namecache *ncp; 3149 size_t orig_buflen; 3150 int reason; 3151 int error; 3152 #ifdef KDTRACE_HOOKS 3153 int i; 3154 #endif 3155 seqc_t vp_seqc, tvp_seqc; 3156 u_char nc_flag; 3157 3158 VFS_SMR_ASSERT_ENTERED(); 3159 3160 if (!cache_fast_revlookup) { 3161 vfs_smr_exit(); 3162 return (-1); 3163 } 3164 3165 orig_buflen = *buflen; 3166 3167 if (addend == 0) { 3168 MPASS(*buflen >= 2); 3169 *buflen -= 1; 3170 buf[*buflen] = '\0'; 3171 } 3172 3173 if (vp == rdir || vp == rootvnode) { 3174 if (addend == 0) { 3175 *buflen -= 1; 3176 buf[*buflen] = '/'; 3177 } 3178 goto out_ok; 3179 } 3180 3181 #ifdef KDTRACE_HOOKS 3182 i = 0; 3183 #endif 3184 error = -1; 3185 ncp = NULL; /* for sdt probe down below */ 3186 vp_seqc = vn_seqc_read_any(vp); 3187 if (seqc_in_modify(vp_seqc)) { 3188 cache_rev_failed(&reason); 3189 goto out_abort; 3190 } 3191 3192 for (;;) { 3193 #ifdef KDTRACE_HOOKS 3194 i++; 3195 #endif 3196 if ((vp->v_vflag & VV_ROOT) != 0) { 3197 mp = atomic_load_ptr(&vp->v_mount); 3198 if (mp == NULL) { 3199 cache_rev_failed(&reason); 3200 goto out_abort; 3201 } 3202 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 3203 tvp_seqc = vn_seqc_read_any(tvp); 3204 if (seqc_in_modify(tvp_seqc)) { 3205 cache_rev_failed(&reason); 3206 goto out_abort; 3207 } 3208 if (!vn_seqc_consistent(vp, vp_seqc)) { 3209 cache_rev_failed(&reason); 3210 goto out_abort; 3211 } 3212 vp = tvp; 3213 vp_seqc = tvp_seqc; 3214 continue; 3215 } 3216 ncp = atomic_load_ptr(&vp->v_cache_dd); 3217 if (ncp == NULL) { 3218 cache_rev_failed(&reason); 3219 goto out_abort; 3220 } 3221 nc_flag = atomic_load_char(&ncp->nc_flag); 3222 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3223 cache_rev_failed(&reason); 3224 goto out_abort; 3225 } 3226 if (!cache_ncp_canuse(ncp)) { 3227 cache_rev_failed(&reason); 3228 goto out_abort; 3229 } 3230 if (ncp->nc_nlen >= *buflen) { 3231 cache_rev_failed(&reason); 3232 error = ENOMEM; 3233 goto out_abort; 3234 } 3235 *buflen -= ncp->nc_nlen; 3236 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3237 *buflen -= 1; 3238 buf[*buflen] = '/'; 3239 tvp = ncp->nc_dvp; 3240 tvp_seqc = vn_seqc_read_any(tvp); 3241 if (seqc_in_modify(tvp_seqc)) { 3242 cache_rev_failed(&reason); 3243 goto out_abort; 3244 } 3245 if (!vn_seqc_consistent(vp, vp_seqc)) { 3246 cache_rev_failed(&reason); 3247 goto out_abort; 3248 } 3249 vp = tvp; 3250 vp_seqc = tvp_seqc; 3251 if (vp == rdir || vp == rootvnode) 3252 break; 3253 } 3254 out_ok: 3255 vfs_smr_exit(); 3256 *retbuf = buf + *buflen; 3257 *buflen = orig_buflen - *buflen + addend; 3258 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 3259 return (0); 3260 3261 out_abort: 3262 *buflen = orig_buflen; 3263 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 3264 vfs_smr_exit(); 3265 return (error); 
3266 } 3267 3268 static int 3269 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3270 size_t *buflen) 3271 { 3272 size_t orig_buflen, addend; 3273 int error; 3274 3275 if (*buflen < 2) 3276 return (EINVAL); 3277 3278 orig_buflen = *buflen; 3279 3280 vref(vp); 3281 addend = 0; 3282 if (vp->v_type != VDIR) { 3283 *buflen -= 1; 3284 buf[*buflen] = '\0'; 3285 error = vn_vptocnp(&vp, buf, buflen); 3286 if (error) 3287 return (error); 3288 if (*buflen == 0) { 3289 vrele(vp); 3290 return (ENOMEM); 3291 } 3292 *buflen -= 1; 3293 buf[*buflen] = '/'; 3294 addend = orig_buflen - *buflen; 3295 } 3296 3297 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); 3298 } 3299 3300 /* 3301 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 3302 * 3303 * Since the namecache does not track hardlinks, the caller is expected to first 3304 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 3305 * 3306 * Then we have 2 cases: 3307 * - if the found vnode is a directory, the path can be constructed just by 3308 * following names up the chain 3309 * - otherwise we populate the buffer with the saved name and start resolving 3310 * from the parent 3311 */ 3312 static int 3313 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 3314 size_t *buflen) 3315 { 3316 char *buf, *tmpbuf; 3317 struct pwd *pwd; 3318 struct componentname *cnp; 3319 struct vnode *vp; 3320 size_t addend; 3321 int error; 3322 enum vtype type; 3323 3324 if (*buflen < 2) 3325 return (EINVAL); 3326 if (*buflen > MAXPATHLEN) 3327 *buflen = MAXPATHLEN; 3328 3329 buf = malloc(*buflen, M_TEMP, M_WAITOK); 3330 3331 addend = 0; 3332 vp = ndp->ni_vp; 3333 /* 3334 * Check for VBAD to work around the vp_crossmp bug in lookup(). 3335 * 3336 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 3337 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 3338 * If the type is VDIR (like in this very case) we can skip looking 3339 * at ni_dvp in the first place. However, since vnodes get passed here 3340 * unlocked the target may transition to doomed state (type == VBAD) 3341 * before we get to evaluate the condition. If this happens, we will 3342 * populate part of the buffer and descend to vn_fullpath_dir with 3343 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 3344 * 3345 * This should be atomic_load(&vp->v_type) but it is illegal to take 3346 * an address of a bit field, even if said field is sized to char. 3347 * Work around the problem by reading the value into a full-sized enum 3348 * and then re-reading it with atomic_load which will still prevent 3349 * the compiler from re-reading down the road. 
3350 */ 3351 type = vp->v_type; 3352 type = atomic_load_int(&type); 3353 if (type == VBAD) { 3354 error = ENOENT; 3355 goto out_bad; 3356 } 3357 if (type != VDIR) { 3358 cnp = &ndp->ni_cnd; 3359 addend = cnp->cn_namelen + 2; 3360 if (*buflen < addend) { 3361 error = ENOMEM; 3362 goto out_bad; 3363 } 3364 *buflen -= addend; 3365 tmpbuf = buf + *buflen; 3366 tmpbuf[0] = '/'; 3367 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 3368 tmpbuf[addend - 1] = '\0'; 3369 vp = ndp->ni_dvp; 3370 } 3371 3372 vfs_smr_enter(); 3373 pwd = pwd_get_smr(); 3374 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3375 addend); 3376 VFS_SMR_ASSERT_NOT_ENTERED(); 3377 if (error < 0) { 3378 pwd = pwd_hold(curthread); 3379 vref(vp); 3380 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3381 addend); 3382 pwd_drop(pwd); 3383 if (error != 0) 3384 goto out_bad; 3385 } 3386 3387 *freebuf = buf; 3388 3389 return (0); 3390 out_bad: 3391 free(buf, M_TEMP); 3392 return (error); 3393 } 3394 3395 struct vnode * 3396 vn_dir_dd_ino(struct vnode *vp) 3397 { 3398 struct namecache *ncp; 3399 struct vnode *ddvp; 3400 struct mtx *vlp; 3401 enum vgetstate vs; 3402 3403 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3404 vlp = VP2VNODELOCK(vp); 3405 mtx_lock(vlp); 3406 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3407 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3408 continue; 3409 ddvp = ncp->nc_dvp; 3410 vs = vget_prep(ddvp); 3411 mtx_unlock(vlp); 3412 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3413 return (NULL); 3414 return (ddvp); 3415 } 3416 mtx_unlock(vlp); 3417 return (NULL); 3418 } 3419 3420 int 3421 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3422 { 3423 struct namecache *ncp; 3424 struct mtx *vlp; 3425 int l; 3426 3427 vlp = VP2VNODELOCK(vp); 3428 mtx_lock(vlp); 3429 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3430 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3431 break; 3432 if (ncp == NULL) { 3433 mtx_unlock(vlp); 3434 return (ENOENT); 3435 } 3436 l = min(ncp->nc_nlen, buflen - 1); 3437 memcpy(buf, ncp->nc_name, l); 3438 mtx_unlock(vlp); 3439 buf[l] = '\0'; 3440 return (0); 3441 } 3442 3443 /* 3444 * This function updates path string to vnode's full global path 3445 * and checks the size of the new path string against the pathlen argument. 3446 * 3447 * Requires a locked, referenced vnode. 3448 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3449 * 3450 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3451 * because it falls back to the ".." lookup if the namecache lookup fails. 3452 */ 3453 int 3454 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3455 u_int pathlen) 3456 { 3457 struct nameidata nd; 3458 struct vnode *vp1; 3459 char *rpath, *fbuf; 3460 int error; 3461 3462 ASSERT_VOP_ELOCKED(vp, __func__); 3463 3464 /* Construct global filesystem path from vp. */ 3465 VOP_UNLOCK(vp); 3466 error = vn_fullpath_global(vp, &rpath, &fbuf); 3467 3468 if (error != 0) { 3469 vrele(vp); 3470 return (error); 3471 } 3472 3473 if (strlen(rpath) >= pathlen) { 3474 vrele(vp); 3475 error = ENAMETOOLONG; 3476 goto out; 3477 } 3478 3479 /* 3480 * Re-lookup the vnode by path to detect a possible rename. 3481 * As a side effect, the vnode is relocked. 3482 * If vnode was renamed, return ENOENT. 
3483 */ 3484 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3485 UIO_SYSSPACE, path, td); 3486 error = namei(&nd); 3487 if (error != 0) { 3488 vrele(vp); 3489 goto out; 3490 } 3491 NDFREE(&nd, NDF_ONLY_PNBUF); 3492 vp1 = nd.ni_vp; 3493 vrele(vp); 3494 if (vp1 == vp) 3495 strcpy(path, rpath); 3496 else { 3497 vput(vp1); 3498 error = ENOENT; 3499 } 3500 3501 out: 3502 free(fbuf, M_TEMP); 3503 return (error); 3504 } 3505 3506 #ifdef DDB 3507 static void 3508 db_print_vpath(struct vnode *vp) 3509 { 3510 3511 while (vp != NULL) { 3512 db_printf("%p: ", vp); 3513 if (vp == rootvnode) { 3514 db_printf("/"); 3515 vp = NULL; 3516 } else { 3517 if (vp->v_vflag & VV_ROOT) { 3518 db_printf("<mount point>"); 3519 vp = vp->v_mount->mnt_vnodecovered; 3520 } else { 3521 struct namecache *ncp; 3522 char *ncn; 3523 int i; 3524 3525 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3526 if (ncp != NULL) { 3527 ncn = ncp->nc_name; 3528 for (i = 0; i < ncp->nc_nlen; i++) 3529 db_printf("%c", *ncn++); 3530 vp = ncp->nc_dvp; 3531 } else { 3532 vp = NULL; 3533 } 3534 } 3535 } 3536 db_printf("\n"); 3537 } 3538 3539 return; 3540 } 3541 3542 DB_SHOW_COMMAND(vpath, db_show_vpath) 3543 { 3544 struct vnode *vp; 3545 3546 if (!have_addr) { 3547 db_printf("usage: show vpath <struct vnode *>\n"); 3548 return; 3549 } 3550 3551 vp = (struct vnode *)addr; 3552 db_print_vpath(vp); 3553 } 3554 3555 #endif 3556 3557 static bool __read_frequently cache_fast_lookup = true; 3558 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3559 &cache_fast_lookup, 0, ""); 3560 3561 #define CACHE_FPL_FAILED -2020 3562 3563 static void 3564 cache_fpl_cleanup_cnp(struct componentname *cnp) 3565 { 3566 3567 uma_zfree(namei_zone, cnp->cn_pnbuf); 3568 #ifdef DIAGNOSTIC 3569 cnp->cn_pnbuf = NULL; 3570 cnp->cn_nameptr = NULL; 3571 #endif 3572 } 3573 3574 static void 3575 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3576 { 3577 struct componentname *cnp; 3578 3579 cnp = &ndp->ni_cnd; 3580 while (*(cnp->cn_nameptr) == '/') { 3581 cnp->cn_nameptr++; 3582 ndp->ni_pathlen--; 3583 } 3584 3585 *dpp = ndp->ni_rootdir; 3586 } 3587 3588 /* 3589 * Components of nameidata (or objects it can point to) which may 3590 * need restoring in case fast path lookup fails. 
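 *
 * cache_fpl_checkpoint() records these fields and cache_fpl_restore_partial()
 * or cache_fpl_restore_abort() puts them back before the regular lookup
 * takes over.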
3591 */ 3592 struct nameidata_saved { 3593 long cn_namelen; 3594 char *cn_nameptr; 3595 size_t ni_pathlen; 3596 int cn_flags; 3597 }; 3598 3599 struct cache_fpl { 3600 struct nameidata *ndp; 3601 struct componentname *cnp; 3602 struct pwd *pwd; 3603 struct vnode *dvp; 3604 struct vnode *tvp; 3605 seqc_t dvp_seqc; 3606 seqc_t tvp_seqc; 3607 struct nameidata_saved snd; 3608 int line; 3609 enum cache_fpl_status status:8; 3610 bool in_smr; 3611 bool fsearch; 3612 }; 3613 3614 static void 3615 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3616 { 3617 3618 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3619 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3620 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3621 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3622 } 3623 3624 static void 3625 cache_fpl_restore_partial(struct cache_fpl *fpl, struct nameidata_saved *snd) 3626 { 3627 3628 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3629 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3630 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3631 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3632 } 3633 3634 static void 3635 cache_fpl_restore_abort(struct cache_fpl *fpl, struct nameidata_saved *snd) 3636 { 3637 3638 cache_fpl_restore_partial(fpl, snd); 3639 /* 3640 * It is 0 on entry by API contract. 3641 */ 3642 fpl->ndp->ni_resflags = 0; 3643 } 3644 3645 #ifdef INVARIANTS 3646 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3647 struct cache_fpl *_fpl = (fpl); \ 3648 MPASS(_fpl->in_smr == true); \ 3649 VFS_SMR_ASSERT_ENTERED(); \ 3650 }) 3651 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3652 struct cache_fpl *_fpl = (fpl); \ 3653 MPASS(_fpl->in_smr == false); \ 3654 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3655 }) 3656 #else 3657 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3658 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3659 #endif 3660 3661 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3662 struct cache_fpl *_fpl = (fpl); \ 3663 vfs_smr_enter(); \ 3664 _fpl->in_smr = true; \ 3665 }) 3666 3667 #define cache_fpl_smr_enter(fpl) ({ \ 3668 struct cache_fpl *_fpl = (fpl); \ 3669 MPASS(_fpl->in_smr == false); \ 3670 vfs_smr_enter(); \ 3671 _fpl->in_smr = true; \ 3672 }) 3673 3674 #define cache_fpl_smr_exit(fpl) ({ \ 3675 struct cache_fpl *_fpl = (fpl); \ 3676 MPASS(_fpl->in_smr == true); \ 3677 vfs_smr_exit(); \ 3678 _fpl->in_smr = false; \ 3679 }) 3680 3681 static int 3682 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3683 { 3684 3685 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3686 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3687 ("%s: converting to abort from %d at %d, set at %d\n", 3688 __func__, fpl->status, line, fpl->line)); 3689 } 3690 fpl->status = CACHE_FPL_STATUS_ABORTED; 3691 fpl->line = line; 3692 return (CACHE_FPL_FAILED); 3693 } 3694 3695 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3696 3697 static int 3698 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3699 { 3700 3701 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3702 ("%s: setting to partial at %d, but already set to %d at %d\n", 3703 __func__, line, fpl->status, fpl->line)); 3704 cache_fpl_smr_assert_entered(fpl); 3705 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3706 fpl->line = line; 3707 return (CACHE_FPL_FAILED); 3708 } 3709 3710 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3711 3712 static int 3713 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3714 { 3715 3716 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3717 
("%s: setting to handled at %d, but already set to %d at %d\n", 3718 __func__, line, fpl->status, fpl->line)); 3719 cache_fpl_smr_assert_not_entered(fpl); 3720 MPASS(error != CACHE_FPL_FAILED); 3721 fpl->status = CACHE_FPL_STATUS_HANDLED; 3722 fpl->line = line; 3723 return (error); 3724 } 3725 3726 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3727 3728 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3729 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ 3730 FOLLOW | LOCKSHARED | SAVENAME | SAVESTART | WILLBEDIR | ISOPEN | \ 3731 NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3732 3733 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3734 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3735 3736 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3737 "supported and internal flags overlap"); 3738 3739 static bool 3740 cache_fpl_islastcn(struct nameidata *ndp) 3741 { 3742 3743 return (*ndp->ni_next == 0); 3744 } 3745 3746 static bool 3747 cache_fpl_isdotdot(struct componentname *cnp) 3748 { 3749 3750 if (cnp->cn_namelen == 2 && 3751 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3752 return (true); 3753 return (false); 3754 } 3755 3756 static bool 3757 cache_can_fplookup(struct cache_fpl *fpl) 3758 { 3759 struct nameidata *ndp; 3760 struct componentname *cnp; 3761 struct thread *td; 3762 3763 ndp = fpl->ndp; 3764 cnp = fpl->cnp; 3765 td = cnp->cn_thread; 3766 3767 if (!cache_fast_lookup) { 3768 cache_fpl_aborted(fpl); 3769 return (false); 3770 } 3771 #ifdef MAC 3772 if (mac_vnode_check_lookup_enabled()) { 3773 cache_fpl_aborted(fpl); 3774 return (false); 3775 } 3776 #endif 3777 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3778 cache_fpl_aborted(fpl); 3779 return (false); 3780 } 3781 if (IN_CAPABILITY_MODE(td)) { 3782 cache_fpl_aborted(fpl); 3783 return (false); 3784 } 3785 if (AUDITING_TD(td)) { 3786 cache_fpl_aborted(fpl); 3787 return (false); 3788 } 3789 if (ndp->ni_startdir != NULL) { 3790 cache_fpl_aborted(fpl); 3791 return (false); 3792 } 3793 return (true); 3794 } 3795 3796 static int 3797 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 3798 { 3799 struct nameidata *ndp; 3800 int error; 3801 bool fsearch; 3802 3803 ndp = fpl->ndp; 3804 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); 3805 if (__predict_false(error != 0)) { 3806 cache_fpl_smr_exit(fpl); 3807 return (cache_fpl_aborted(fpl)); 3808 } 3809 fpl->fsearch = fsearch; 3810 return (0); 3811 } 3812 3813 static bool 3814 cache_fplookup_vnode_supported(struct vnode *vp) 3815 { 3816 3817 return (vp->v_type != VLNK); 3818 } 3819 3820 static int __noinline 3821 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3822 uint32_t hash) 3823 { 3824 struct componentname *cnp; 3825 struct vnode *dvp; 3826 3827 cnp = fpl->cnp; 3828 dvp = fpl->dvp; 3829 3830 cache_fpl_smr_exit(fpl); 3831 if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) 3832 return (cache_fpl_handled(fpl, ENOENT)); 3833 else 3834 return (cache_fpl_aborted(fpl)); 3835 } 3836 3837 /* 3838 * The target vnode is not supported, prepare for the slow path to take over. 
3839 */ 3840 static int __noinline 3841 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3842 { 3843 struct nameidata *ndp; 3844 struct componentname *cnp; 3845 enum vgetstate dvs; 3846 struct vnode *dvp; 3847 struct pwd *pwd; 3848 seqc_t dvp_seqc; 3849 3850 ndp = fpl->ndp; 3851 cnp = fpl->cnp; 3852 pwd = fpl->pwd; 3853 dvp = fpl->dvp; 3854 dvp_seqc = fpl->dvp_seqc; 3855 3856 if (!pwd_hold_smr(pwd)) { 3857 cache_fpl_smr_exit(fpl); 3858 return (cache_fpl_aborted(fpl)); 3859 } 3860 3861 dvs = vget_prep_smr(dvp); 3862 cache_fpl_smr_exit(fpl); 3863 if (__predict_false(dvs == VGET_NONE)) { 3864 pwd_drop(pwd); 3865 return (cache_fpl_aborted(fpl)); 3866 } 3867 3868 vget_finish_ref(dvp, dvs); 3869 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3870 vrele(dvp); 3871 pwd_drop(pwd); 3872 return (cache_fpl_aborted(fpl)); 3873 } 3874 3875 cache_fpl_restore_partial(fpl, &fpl->snd); 3876 3877 ndp->ni_startdir = dvp; 3878 cnp->cn_flags |= MAKEENTRY; 3879 if (cache_fpl_islastcn(ndp)) 3880 cnp->cn_flags |= ISLASTCN; 3881 if (cache_fpl_isdotdot(cnp)) 3882 cnp->cn_flags |= ISDOTDOT; 3883 3884 return (0); 3885 } 3886 3887 static int 3888 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3889 { 3890 struct componentname *cnp; 3891 struct vnode *tvp; 3892 seqc_t tvp_seqc; 3893 int error, lkflags; 3894 3895 cnp = fpl->cnp; 3896 tvp = fpl->tvp; 3897 tvp_seqc = fpl->tvp_seqc; 3898 3899 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3900 lkflags = LK_SHARED; 3901 if ((cnp->cn_flags & LOCKSHARED) == 0) 3902 lkflags = LK_EXCLUSIVE; 3903 error = vget_finish(tvp, lkflags, tvs); 3904 if (__predict_false(error != 0)) { 3905 return (cache_fpl_aborted(fpl)); 3906 } 3907 } else { 3908 vget_finish_ref(tvp, tvs); 3909 } 3910 3911 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3912 if ((cnp->cn_flags & LOCKLEAF) != 0) 3913 vput(tvp); 3914 else 3915 vrele(tvp); 3916 return (cache_fpl_aborted(fpl)); 3917 } 3918 3919 return (cache_fpl_handled(fpl, 0)); 3920 } 3921 3922 /* 3923 * They want to possibly modify the state of the namecache. 3924 * 3925 * Don't try to match the API contract, just leave. 3926 * TODO: this leaves scalability on the table 3927 */ 3928 static int 3929 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3930 { 3931 struct componentname *cnp; 3932 3933 cnp = fpl->cnp; 3934 MPASS(cnp->cn_nameiop != LOOKUP); 3935 return (cache_fpl_partial(fpl)); 3936 } 3937 3938 static int __noinline 3939 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3940 { 3941 struct componentname *cnp; 3942 enum vgetstate dvs, tvs; 3943 struct vnode *dvp, *tvp; 3944 seqc_t dvp_seqc; 3945 int error; 3946 3947 cnp = fpl->cnp; 3948 dvp = fpl->dvp; 3949 dvp_seqc = fpl->dvp_seqc; 3950 tvp = fpl->tvp; 3951 3952 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3953 3954 /* 3955 * This is less efficient than it can be for simplicity. 
3956 */ 3957 dvs = vget_prep_smr(dvp); 3958 if (__predict_false(dvs == VGET_NONE)) { 3959 return (cache_fpl_aborted(fpl)); 3960 } 3961 tvs = vget_prep_smr(tvp); 3962 if (__predict_false(tvs == VGET_NONE)) { 3963 cache_fpl_smr_exit(fpl); 3964 vget_abort(dvp, dvs); 3965 return (cache_fpl_aborted(fpl)); 3966 } 3967 3968 cache_fpl_smr_exit(fpl); 3969 3970 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3971 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3972 if (__predict_false(error != 0)) { 3973 vget_abort(tvp, tvs); 3974 return (cache_fpl_aborted(fpl)); 3975 } 3976 } else { 3977 vget_finish_ref(dvp, dvs); 3978 } 3979 3980 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3981 vget_abort(tvp, tvs); 3982 if ((cnp->cn_flags & LOCKPARENT) != 0) 3983 vput(dvp); 3984 else 3985 vrele(dvp); 3986 return (cache_fpl_aborted(fpl)); 3987 } 3988 3989 error = cache_fplookup_final_child(fpl, tvs); 3990 if (__predict_false(error != 0)) { 3991 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3992 if ((cnp->cn_flags & LOCKPARENT) != 0) 3993 vput(dvp); 3994 else 3995 vrele(dvp); 3996 return (error); 3997 } 3998 3999 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 4000 return (0); 4001 } 4002 4003 static int 4004 cache_fplookup_final(struct cache_fpl *fpl) 4005 { 4006 struct componentname *cnp; 4007 enum vgetstate tvs; 4008 struct vnode *dvp, *tvp; 4009 seqc_t dvp_seqc; 4010 4011 cnp = fpl->cnp; 4012 dvp = fpl->dvp; 4013 dvp_seqc = fpl->dvp_seqc; 4014 tvp = fpl->tvp; 4015 4016 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 4017 4018 if (cnp->cn_nameiop != LOOKUP) { 4019 return (cache_fplookup_final_modifying(fpl)); 4020 } 4021 4022 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 4023 return (cache_fplookup_final_withparent(fpl)); 4024 4025 tvs = vget_prep_smr(tvp); 4026 if (__predict_false(tvs == VGET_NONE)) { 4027 return (cache_fpl_partial(fpl)); 4028 } 4029 4030 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4031 cache_fpl_smr_exit(fpl); 4032 vget_abort(tvp, tvs); 4033 return (cache_fpl_aborted(fpl)); 4034 } 4035 4036 cache_fpl_smr_exit(fpl); 4037 return (cache_fplookup_final_child(fpl, tvs)); 4038 } 4039 4040 static int __noinline 4041 cache_fplookup_dot(struct cache_fpl *fpl) 4042 { 4043 struct vnode *dvp; 4044 4045 dvp = fpl->dvp; 4046 4047 fpl->tvp = dvp; 4048 fpl->tvp_seqc = vn_seqc_read_any(dvp); 4049 if (seqc_in_modify(fpl->tvp_seqc)) { 4050 return (cache_fpl_aborted(fpl)); 4051 } 4052 4053 counter_u64_add(dothits, 1); 4054 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 4055 4056 return (0); 4057 } 4058 4059 static int __noinline 4060 cache_fplookup_dotdot(struct cache_fpl *fpl) 4061 { 4062 struct nameidata *ndp; 4063 struct componentname *cnp; 4064 struct namecache *ncp; 4065 struct vnode *dvp; 4066 struct prison *pr; 4067 u_char nc_flag; 4068 4069 ndp = fpl->ndp; 4070 cnp = fpl->cnp; 4071 dvp = fpl->dvp; 4072 4073 /* 4074 * XXX this is racy the same way regular lookup is 4075 */ 4076 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 4077 pr = pr->pr_parent) 4078 if (dvp == pr->pr_root) 4079 break; 4080 4081 if (dvp == ndp->ni_rootdir || 4082 dvp == ndp->ni_topdir || 4083 dvp == rootvnode || 4084 pr != NULL) { 4085 fpl->tvp = dvp; 4086 fpl->tvp_seqc = vn_seqc_read_any(dvp); 4087 if (seqc_in_modify(fpl->tvp_seqc)) { 4088 return (cache_fpl_aborted(fpl)); 4089 } 4090 return (0); 4091 } 4092 4093 if ((dvp->v_vflag & VV_ROOT) != 0) { 4094 /* 4095 * TODO 4096 * The opposite of climb mount is needed here. 
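 *
 * That is, when dvp is the root of a mounted filesystem, ".." has to step
 * down onto the vnode covered by the mount (the reverse of
 * cache_fplookup_climb_mount()), which the lockless walk does not implement
 * yet, hence the abort to the regular lookup below.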
4097 */ 4098 return (cache_fpl_aborted(fpl)); 4099 } 4100 4101 ncp = atomic_load_ptr(&dvp->v_cache_dd); 4102 if (ncp == NULL) { 4103 return (cache_fpl_aborted(fpl)); 4104 } 4105 4106 nc_flag = atomic_load_char(&ncp->nc_flag); 4107 if ((nc_flag & NCF_ISDOTDOT) != 0) { 4108 if ((nc_flag & NCF_NEGATIVE) != 0) 4109 return (cache_fpl_aborted(fpl)); 4110 fpl->tvp = ncp->nc_vp; 4111 } else { 4112 fpl->tvp = ncp->nc_dvp; 4113 } 4114 4115 if (!cache_ncp_canuse(ncp)) { 4116 return (cache_fpl_aborted(fpl)); 4117 } 4118 4119 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 4120 if (seqc_in_modify(fpl->tvp_seqc)) { 4121 return (cache_fpl_partial(fpl)); 4122 } 4123 4124 counter_u64_add(dotdothits, 1); 4125 return (0); 4126 } 4127 4128 static int __noinline 4129 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) 4130 { 4131 u_char nc_flag; 4132 bool neg_promote; 4133 4134 nc_flag = atomic_load_char(&ncp->nc_flag); 4135 MPASS((nc_flag & NCF_NEGATIVE) != 0); 4136 /* 4137 * If they want to create an entry we need to replace this one. 4138 */ 4139 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 4140 /* 4141 * TODO 4142 * This should call something similar to 4143 * cache_fplookup_final_modifying. 4144 */ 4145 return (cache_fpl_partial(fpl)); 4146 } 4147 neg_promote = cache_neg_hit_prep(ncp); 4148 if (!cache_ncp_canuse(ncp)) { 4149 cache_neg_hit_abort(ncp); 4150 return (cache_fpl_partial(fpl)); 4151 } 4152 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 4153 cache_neg_hit_abort(ncp); 4154 return (cache_fpl_partial(fpl)); 4155 } 4156 if (neg_promote) { 4157 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 4158 } 4159 cache_neg_hit_finish(ncp); 4160 cache_fpl_smr_exit(fpl); 4161 return (cache_fpl_handled(fpl, ENOENT)); 4162 } 4163 4164 static int 4165 cache_fplookup_next(struct cache_fpl *fpl) 4166 { 4167 struct componentname *cnp; 4168 struct namecache *ncp; 4169 struct vnode *dvp, *tvp; 4170 u_char nc_flag; 4171 uint32_t hash; 4172 4173 cnp = fpl->cnp; 4174 dvp = fpl->dvp; 4175 4176 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 4177 return (cache_fplookup_dot(fpl)); 4178 } 4179 4180 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 4181 4182 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 4183 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 4184 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 4185 break; 4186 } 4187 4188 /* 4189 * If there is no entry we have to punt to the slow path to perform 4190 * actual lookup. Should there be nothing with this name a negative 4191 * entry will be created. 
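 *
 * The punt goes through cache_fpl_partial(); the regular lookup then runs
 * with MAKEENTRY set (see cache_fplookup_partial_setup() above), so the
 * result gets cached for future fast-path attempts.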
	 */
	if (__predict_false(ncp == NULL)) {
		return (cache_fpl_partial(fpl));
	}

	tvp = atomic_load_ptr(&ncp->nc_vp);
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_NEGATIVE) != 0) {
		return (cache_fplookup_neg(fpl, ncp, hash));
	}

	if (!cache_ncp_canuse(ncp)) {
		return (cache_fpl_partial(fpl));
	}

	fpl->tvp = tvp;
	fpl->tvp_seqc = vn_seqc_read_any(tvp);
	if (seqc_in_modify(fpl->tvp_seqc)) {
		return (cache_fpl_partial(fpl));
	}

	if (!cache_fplookup_vnode_supported(tvp)) {
		return (cache_fpl_partial(fpl));
	}

	counter_u64_add(numposhits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
	return (0);
}

static bool
cache_fplookup_mp_supported(struct mount *mp)
{

	if (mp == NULL)
		return (false);
	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
		return (false);
	return (true);
}

/*
 * Walk up the mount stack (if any).
 *
 * Correctness is provided in the following ways:
 * - all vnodes are protected from freeing with SMR
 * - struct mount objects are type stable making them always safe to access
 * - stability of the particular mount is provided by busying it
 * - relationship between the vnode which is mounted on and the mount is
 *   verified with the vnode sequence counter after busying
 * - association between root vnode of the mount and the mount is protected
 *   by busy
 *
 * From that point on we can read the sequence counter of the root vnode
 * and get the next mount on the stack (if any) using the same protection.
 *
 * By the end of a successful walk we are guaranteed the reached state was
 * indeed present at some point, which matches the guarantee of the regular
 * lookup.
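 *
 * For example, with one filesystem mounted on /mnt and another one stacked
 * on top of the same directory, a lookup of /mnt/foo enters here with tvp
 * being the covered directory and leaves with tvp being the root vnode of
 * the topmost mount. Informally, as an illustrative sketch of the loop
 * below (in the pseudo-code style used elsewhere in this file):
 *
 *	while ((mp = vp->v_mountedhere) != NULL) {
 *		busy(mp);			// vfs_op_thread_enter_crit
 *		if (!seqc_consistent(vp, vp_seqc))
 *			bail;			// vnode<->mount link changed
 *		vp = mp->mnt_rootvnode;
 *		vp_seqc = seqc_read_any(vp);	// bail if being modified
 *	}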
 */
static int __noinline
cache_fplookup_climb_mount(struct cache_fpl *fpl)
{
	struct mount *mp, *prev_mp;
	struct mount_pcpu *mpcpu, *prev_mpcpu;
	struct vnode *vp;
	seqc_t vp_seqc;

	vp = fpl->tvp;
	vp_seqc = fpl->tvp_seqc;

	VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
	mp = atomic_load_ptr(&vp->v_mountedhere);
	if (mp == NULL)
		return (0);

	prev_mp = NULL;
	for (;;) {
		if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
			if (prev_mp != NULL)
				vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
			return (cache_fpl_partial(fpl));
		}
		if (prev_mp != NULL)
			vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
		if (!vn_seqc_consistent(vp, vp_seqc)) {
			vfs_op_thread_exit_crit(mp, mpcpu);
			return (cache_fpl_partial(fpl));
		}
		if (!cache_fplookup_mp_supported(mp)) {
			vfs_op_thread_exit_crit(mp, mpcpu);
			return (cache_fpl_partial(fpl));
		}
		vp = atomic_load_ptr(&mp->mnt_rootvnode);
		if (vp == NULL || VN_IS_DOOMED(vp)) {
			vfs_op_thread_exit_crit(mp, mpcpu);
			return (cache_fpl_partial(fpl));
		}
		vp_seqc = vn_seqc_read_any(vp);
		if (seqc_in_modify(vp_seqc)) {
			vfs_op_thread_exit_crit(mp, mpcpu);
			return (cache_fpl_partial(fpl));
		}
		prev_mp = mp;
		prev_mpcpu = mpcpu;
		mp = atomic_load_ptr(&vp->v_mountedhere);
		if (mp == NULL)
			break;
	}

	vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
	fpl->tvp = vp;
	fpl->tvp_seqc = vp_seqc;
	return (0);
}

static bool
cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
{
	struct mount *mp;
	struct vnode *vp;

	vp = fpl->tvp;

	/*
	 * Hack: while this is a union, the pointer tends to be NULL so save on
	 * a branch.
	 */
	mp = atomic_load_ptr(&vp->v_mountedhere);
	if (mp == NULL)
		return (false);
	if (vp->v_type == VDIR)
		return (true);
	return (false);
}

/*
 * Parse the path.
 *
 * The code was originally copy-pasted from regular lookup and despite
 * cleanups leaves performance on the table. Any modifications here
 * must take into account that in case of fallback the resulting
 * nameidata state has to be compatible with the original.
 */
static int
cache_fplookup_parse(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	char *cp;

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	/*
	 * Search a new directory.
	 *
	 * The last component of the filename is left accessible via
	 * cnp->cn_nameptr for callers that need the name. Callers needing
	 * the name set the SAVENAME flag. When done, they assume
	 * responsibility for freeing the pathname buffer.
	 */
	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
		continue;
	cnp->cn_namelen = cp - cnp->cn_nameptr;
	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled(fpl, ENAMETOOLONG));
	}
	ndp->ni_pathlen -= cnp->cn_namelen;
	KASSERT(ndp->ni_pathlen <= PATH_MAX,
	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
	ndp->ni_next = cp;

	/*
	 * Replace multiple slashes by a single slash and trailing slashes
	 * by a null. This must be done before VOP_LOOKUP() because some
	 * fs's don't know about trailing slashes.
	 * Remember if there were trailing slashes to handle symlinks,
	 * existing non-directories and non-existing files that won't be
	 * directories specially later.
	 */
	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
		cp++;
		ndp->ni_pathlen--;
		if (*cp == '\0') {
			/*
			 * TODO
			 * Regular lookup performs the following:
			 * *ndp->ni_next = '\0';
			 * cnp->cn_flags |= TRAILINGSLASH;
			 *
			 * This is problematic since it modifies data read
			 * from userspace. If the fast path lookup were then
			 * to abort, we would have to either restore it or
			 * convey the flag. Since this is a corner case just
			 * ignore it for simplicity.
			 */
			return (cache_fpl_partial(fpl));
		}
	}
	ndp->ni_next = cp;

	/*
	 * Check for degenerate name (e.g. / or "")
	 * which is a way of talking about a directory,
	 * e.g. like "/." or ".".
	 *
	 * TODO
	 * Another corner case handled by the regular lookup
	 */
	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
		return (cache_fpl_partial(fpl));
	}
	return (0);
}

static void
cache_fplookup_parse_advance(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cnp->cn_nameptr = ndp->ni_next;
	while (*cnp->cn_nameptr == '/') {
		cnp->cn_nameptr++;
		ndp->ni_pathlen--;
	}
}

/*
 * See the API contract for VOP_FPLOOKUP_VEXEC.
 */
static int __noinline
cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
{
	struct vnode *dvp;
	seqc_t dvp_seqc;

	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;

	/*
	 * Hack: they may be looking up foo/bar, where foo is a
	 * regular file. In such a case we need to return ENOTDIR,
	 * but we may happen to get here with a different error.
	 */
	if (dvp->v_type != VDIR) {
		/*
		 * The check here is predominantly to catch
		 * EOPNOTSUPP from dead_vnodeops. If the vnode
		 * gets doomed past this point it is going to
		 * fail seqc verification.
		 */
		if (VN_IS_DOOMED(dvp)) {
			return (cache_fpl_aborted(fpl));
		}
		error = ENOTDIR;
	}

	/*
	 * Hack: handle O_SEARCH.
	 *
	 * Open Group Base Specifications Issue 7, 2018 edition states:
	 * If the access mode of the open file description associated with the
	 * file descriptor is not O_SEARCH, the function shall check whether
	 * directory searches are permitted using the current permissions of
	 * the directory underlying the file descriptor. If the access mode is
	 * O_SEARCH, the function shall not perform the check.
	 *
	 * Regular lookup tests for the NOEXECCHECK flag for every path
	 * component to decide whether to do the permission check. However,
	 * since most lookups never have the flag (and when they do it is only
	 * present for the first path component), lockless lookup only acts on
	 * it if there is a permission problem. Here the flag is represented
	 * with a boolean so that we don't have to clear it on the way out.
	 *
	 * For simplicity this always aborts.
	 * TODO: check if this is the first lookup and ignore the permission
	 * problem. Note the flag has to survive fallback (if it happens to be
	 * performed).
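	 *
	 * For illustration only (made-up names), the case being catered to
	 * looks roughly like this from userspace:
	 *
	 *	dfd = open("/some/dir", O_SEARCH);
	 *	... search permission on /some/dir is later taken away ...
	 *	fd = openat(dfd, "file", O_RDONLY);
	 *
	 * The openat() is still expected to succeed since the search check
	 * on the directory underlying dfd must not be redone; per the above,
	 * lockless lookup handles this by aborting to the regular lookup.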
	 */
	if (fpl->fsearch) {
		return (cache_fpl_aborted(fpl));
	}

	switch (error) {
	case EAGAIN:
		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_partial(fpl);
		}
		break;
	default:
		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_smr_exit(fpl);
			cache_fpl_handled(fpl, error);
		}
		break;
	}
	return (error);
}

static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct mount *mp;
	int error;

	error = CACHE_FPL_FAILED;
	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cache_fpl_checkpoint(fpl, &fpl->snd);

	fpl->dvp = dvp;
	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
	if (seqc_in_modify(fpl->dvp_seqc)) {
		cache_fpl_aborted(fpl);
		goto out;
	}
	mp = atomic_load_ptr(&fpl->dvp->v_mount);
	if (!cache_fplookup_mp_supported(mp)) {
		cache_fpl_aborted(fpl);
		goto out;
	}

	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

	for (;;) {
		error = cache_fplookup_parse(fpl);
		if (__predict_false(error != 0)) {
			break;
		}

		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
		if (__predict_false(error != 0)) {
			error = cache_fplookup_failed_vexec(fpl, error);
			break;
		}

		if (__predict_false(cache_fpl_isdotdot(cnp))) {
			error = cache_fplookup_dotdot(fpl);
			if (__predict_false(error != 0)) {
				break;
			}
		} else {
			error = cache_fplookup_next(fpl);
			if (__predict_false(error != 0)) {
				break;
			}

			VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

			if (cache_fplookup_need_climb_mount(fpl)) {
				error = cache_fplookup_climb_mount(fpl);
				if (__predict_false(error != 0)) {
					break;
				}
			}
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		if (cache_fpl_islastcn(ndp)) {
			error = cache_fplookup_final(fpl);
			break;
		}

		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
			break;
		}

		fpl->dvp = fpl->tvp;
		fpl->dvp_seqc = fpl->tvp_seqc;

		cache_fplookup_parse_advance(fpl);
		cache_fpl_checkpoint(fpl, &fpl->snd);
	}
out:
	switch (fpl->status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		cache_fpl_smr_assert_entered(fpl);
		return (cache_fplookup_partial_setup(fpl));
	case CACHE_FPL_STATUS_ABORTED:
		if (fpl->in_smr)
			cache_fpl_smr_exit(fpl);
		return (CACHE_FPL_FAILED);
	case CACHE_FPL_STATUS_HANDLED:
		MPASS(error != CACHE_FPL_FAILED);
		cache_fpl_smr_assert_not_entered(fpl);
		/*
		 * A common error is ENOENT.
		 */
		if (error != 0) {
			ndp->ni_dvp = NULL;
			ndp->ni_vp = NULL;
			cache_fpl_cleanup_cnp(cnp);
			return (error);
		}
		ndp->ni_dvp = fpl->dvp;
		ndp->ni_vp = fpl->tvp;
		if (cnp->cn_flags & SAVENAME)
			cnp->cn_flags |= HASBUF;
		else
			cache_fpl_cleanup_cnp(cnp);
		return (error);
	}
	__assert_unreachable();
}

/*
 * Fast path lookup protected with SMR and sequence counters.
4617 * 4618 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 4619 * 4620 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria 4621 * outlined below. 4622 * 4623 * Traditional vnode lookup conceptually looks like this: 4624 * 4625 * vn_lock(current); 4626 * for (;;) { 4627 * next = find(); 4628 * vn_lock(next); 4629 * vn_unlock(current); 4630 * current = next; 4631 * if (last) 4632 * break; 4633 * } 4634 * return (current); 4635 * 4636 * Each jump to the next vnode is safe memory-wise and atomic with respect to 4637 * any modifications thanks to holding respective locks. 4638 * 4639 * The same guarantee can be provided with a combination of safe memory 4640 * reclamation and sequence counters instead. If all operations which affect 4641 * the relationship between the current vnode and the one we are looking for 4642 * also modify the counter, we can verify whether all the conditions held as 4643 * we made the jump. This includes things like permissions, mount points etc. 4644 * Counter modification is provided by enclosing relevant places in 4645 * vn_seqc_write_begin()/end() calls. 4646 * 4647 * Thus this translates to: 4648 * 4649 * vfs_smr_enter(); 4650 * dvp_seqc = seqc_read_any(dvp); 4651 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode 4652 * abort(); 4653 * for (;;) { 4654 * tvp = find(); 4655 * tvp_seqc = seqc_read_any(tvp); 4656 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode 4657 * abort(); 4658 * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode 4659 * abort(); 4660 * dvp = tvp; // we know nothing of importance has changed 4661 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration 4662 * if (last) 4663 * break; 4664 * } 4665 * vget(); // secure the vnode 4666 * if (!seqc_consistent(tvp, tvp_seqc) // final check 4667 * abort(); 4668 * // at this point we know nothing has changed for any parent<->child pair 4669 * // as they were crossed during the lookup, meaning we matched the guarantee 4670 * // of the locked variant 4671 * return (tvp); 4672 * 4673 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: 4674 * - they are called while within vfs_smr protection which they must never exit 4675 * - EAGAIN can be returned to denote checking could not be performed, it is 4676 * always valid to return it 4677 * - if the sequence counter has not changed the result must be valid 4678 * - if the sequence counter has changed both false positives and false negatives 4679 * are permitted (since the result will be rejected later) 4680 * - for simple cases of unix permission checks vaccess_vexec_smr can be used 4681 * 4682 * Caveats to watch out for: 4683 * - vnodes are passed unlocked and unreferenced with nothing stopping 4684 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised 4685 * to use atomic_load_ptr to fetch it. 
 * - the aforementioned object can also get freed, meaning absent other means it
 *   should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 */
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	struct nameidata_saved orig;
	int error;

	MPASS(ndp->ni_lcf == 0);

	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.ndp = ndp;
	fpl.cnp = &ndp->ni_cnd;
	MPASS(curthread == fpl.cnp->cn_thread);

	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
		MPASS(fpl.cnp->cn_nameiop != LOOKUP);

	if (!cache_can_fplookup(&fpl)) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		*status = fpl.status;
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint(&fpl, &orig);

	cache_fpl_smr_enter_initial(&fpl);
	fpl.fsearch = false;
	pwd = pwd_get_smr();
	fpl.pwd = pwd;
	ndp->ni_rootdir = pwd->pwd_rdir;
	ndp->ni_topdir = pwd->pwd_jdir;

	cnp = fpl.cnp;
	cnp->cn_nameptr = cnp->cn_pnbuf;
	if (cnp->cn_pnbuf[0] == '/') {
		cache_fpl_handle_root(ndp, &dvp);
		ndp->ni_resflags |= NIRES_ABS;
	} else {
		if (ndp->ni_dirfd == AT_FDCWD) {
			dvp = pwd->pwd_cdir;
		} else {
			error = cache_fplookup_dirfd(&fpl, &dvp);
			if (__predict_false(error != 0)) {
				goto out;
			}
		}
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);

	error = cache_fplookup_impl(dvp, &fpl);
out:
	cache_fpl_smr_assert_not_entered(&fpl);
	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);

	*status = fpl.status;
	switch (fpl.status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_HANDLED:
		SDT_PROBE3(vfs, namei, lookup, return, error,
		    (error == 0 ? ndp->ni_vp : NULL), true);
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		*pwdp = fpl.pwd;
		/*
		 * Status restored by cache_fplookup_partial_setup.
		 */
		break;
	case CACHE_FPL_STATUS_ABORTED:
		cache_fpl_restore_abort(&fpl, &orig);
		break;
	}
	return (error);
}
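
/*
 * Illustrative sketch (not compiled): what a minimal VOP_FPLOOKUP_VEXEC
 * routine for an opting-in filesystem might look like under the contract
 * documented above. "foofs" and "struct foonode" with its fn_mode/fn_uid/
 * fn_gid fields are made-up names; a real filesystem uses its own in-memory
 * node and, if it needs checks beyond plain unix permissions (e.g. ACLs),
 * can always return EAGAIN to defer to the locked lookup. The filesystem
 * would additionally set MNTK_FPLOOKUP on its mount point.
 *
 *	static int
 *	foofs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct vnode *vp;
 *		struct foonode *fnp;
 *
 *		vp = v->a_vp;
 *		fnp = atomic_load_ptr(&vp->v_data);
 *		if (__predict_false(fnp == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(fnp->fn_mode, fnp->fn_uid,
 *		    fnp->fn_gid, v->a_cred));
 *	}
 */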