1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ddb.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/capsicum.h> 46 #include <sys/counter.h> 47 #include <sys/filedesc.h> 48 #include <sys/fnv_hash.h> 49 #include <sys/kernel.h> 50 #include <sys/ktr.h> 51 #include <sys/lock.h> 52 #include <sys/malloc.h> 53 #include <sys/fcntl.h> 54 #include <sys/jail.h> 55 #include <sys/mount.h> 56 #include <sys/namei.h> 57 #include <sys/proc.h> 58 #include <sys/seqc.h> 59 #include <sys/sdt.h> 60 #include <sys/smr.h> 61 #include <sys/smp.h> 62 #include <sys/syscallsubr.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysproto.h> 65 #include <sys/vnode.h> 66 #include <ck_queue.h> 67 #ifdef KTRACE 68 #include <sys/ktrace.h> 69 #endif 70 71 #include <sys/capsicum.h> 72 73 #include <security/audit/audit.h> 74 #include <security/mac/mac_framework.h> 75 76 #ifdef DDB 77 #include <ddb/ddb.h> 78 #endif 79 80 #include <vm/uma.h> 81 82 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 83 "Name cache"); 84 85 SDT_PROVIDER_DECLARE(vfs); 86 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 87 "struct vnode *"); 88 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *", 89 "struct vnode *"); 90 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 91 "char *"); 92 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *", 93 "const char *"); 94 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *", 95 "struct namecache *", "int", "int"); 96 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 97 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 98 "char *", "struct vnode *"); 99 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 100 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 101 "struct vnode *", "char *"); 102 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 103 "struct vnode *"); 104 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 105 "struct vnode *", "char *"); 106 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 107 "char *"); 108 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", 109 "struct componentname *"); 110 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", 111 "struct componentname *"); 112 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 113 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int"); 114 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 115 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 116 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 117 "struct vnode *"); 118 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 119 "char *"); 120 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *", 121 "char *"); 122 123 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); 124 SDT_PROBE_DECLARE(vfs, namei, lookup, entry); 125 SDT_PROBE_DECLARE(vfs, namei, lookup, return); 126 127 /* 128 * This structure describes the elements in the cache of recent 129 * names looked up by namei. 
130 */ 131 struct negstate { 132 u_char neg_flag; 133 u_char neg_hit; 134 }; 135 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), 136 "the state must fit in a union with a pointer without growing it"); 137 138 struct namecache { 139 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 140 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 141 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */ 142 struct vnode *nc_dvp; /* vnode of parent of name */ 143 union { 144 struct vnode *nu_vp; /* vnode the name refers to */ 145 struct negstate nu_neg;/* negative entry state */ 146 } n_un; 147 u_char nc_flag; /* flag bits */ 148 u_char nc_nlen; /* length of name */ 149 char nc_name[0]; /* segment name + nul */ 150 }; 151 152 /* 153 * struct namecache_ts repeats struct namecache layout up to the 154 * nc_nlen member. 155 * struct namecache_ts is used in place of struct namecache when time(s) need 156 * to be stored. The nc_dotdottime field is used when a cache entry is mapping 157 * both a non-dotdot directory name plus dotdot for the directory's 158 * parent. 159 * 160 * See below for alignment requirement. 161 */ 162 struct namecache_ts { 163 struct timespec nc_time; /* timespec provided by fs */ 164 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 165 int nc_ticks; /* ticks value when entry was added */ 166 int nc_pad; 167 struct namecache nc_nc; 168 }; 169 170 TAILQ_HEAD(cache_freebatch, namecache); 171 172 /* 173 * At least mips n32 performs 64-bit accesses to timespec as found 174 * in namecache_ts and requires them to be aligned. Since others 175 * may be in the same spot suffer a little bit and enforce the 176 * alignment for everyone. Note this is a nop for 64-bit platforms. 177 */ 178 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) 179 180 /* 181 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the 182 * 4.4 BSD codebase. Later on struct namecache was tweaked to become 183 * smaller and the value was bumped to retain the total size, but it 184 * was never re-evaluated for suitability. A simple test counting 185 * lengths during package building shows that the value of 45 covers 186 * about 86% of all added entries, reaching 99% at 65. 187 * 188 * Regardless of the above, use of dedicated zones instead of malloc may be 189 * inducing additional waste. This may be hard to address as said zones are 190 * tied to VFS SMR. Even if retaining them, the current split should be 191 * re-evaluated. 
192 */ 193 #ifdef __LP64__ 194 #define CACHE_PATH_CUTOFF 45 195 #define CACHE_LARGE_PAD 6 196 #else 197 #define CACHE_PATH_CUTOFF 41 198 #define CACHE_LARGE_PAD 2 199 #endif 200 201 #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1) 202 #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE) 203 #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD) 204 #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE) 205 206 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 207 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 208 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 209 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 210 211 #define nc_vp n_un.nu_vp 212 #define nc_neg n_un.nu_neg 213 214 /* 215 * Flags in namecache.nc_flag 216 */ 217 #define NCF_WHITE 0x01 218 #define NCF_ISDOTDOT 0x02 219 #define NCF_TS 0x04 220 #define NCF_DTS 0x08 221 #define NCF_DVDROP 0x10 222 #define NCF_NEGATIVE 0x20 223 #define NCF_INVALID 0x40 224 #define NCF_WIP 0x80 225 226 /* 227 * Flags in negstate.neg_flag 228 */ 229 #define NEG_HOT 0x01 230 231 static bool cache_neg_evict_cond(u_long lnumcache); 232 233 /* 234 * Mark an entry as invalid. 235 * 236 * This is called before it starts getting deconstructed. 237 */ 238 static void 239 cache_ncp_invalidate(struct namecache *ncp) 240 { 241 242 KASSERT((ncp->nc_flag & NCF_INVALID) == 0, 243 ("%s: entry %p already invalid", __func__, ncp)); 244 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); 245 atomic_thread_fence_rel(); 246 } 247 248 /* 249 * Check whether the entry can be safely used. 250 * 251 * All places which elide locks are supposed to call this after they are 252 * done with reading from an entry. 253 */ 254 #define cache_ncp_canuse(ncp) ({ \ 255 struct namecache *_ncp = (ncp); \ 256 u_char _nc_flag; \ 257 \ 258 atomic_thread_fence_acq(); \ 259 _nc_flag = atomic_load_char(&_ncp->nc_flag); \ 260 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \ 261 }) 262 263 /* 264 * Name caching works as follows: 265 * 266 * Names found by directory scans are retained in a cache 267 * for future reference. It is managed LRU, so frequently 268 * used names will hang around. Cache is indexed by hash value 269 * obtained from (dvp, name) where dvp refers to the directory 270 * containing name. 271 * 272 * If it is a "negative" entry, (i.e. for a name that is known NOT to 273 * exist) the vnode pointer will be NULL. 274 * 275 * Upon reaching the last segment of a path, if the reference 276 * is for DELETE, or NOCACHE is set (rewrite), and the 277 * name is located in the cache, it will be dropped. 278 * 279 * These locks are used (in the order in which they can be taken): 280 * NAME TYPE ROLE 281 * vnodelock mtx vnode lists and v_cache_dd field protection 282 * bucketlock mtx for access to given set of hash buckets 283 * neglist mtx negative entry LRU management 284 * 285 * It is legal to take multiple vnodelock and bucketlock locks. The locking 286 * order is lower address first. Both are recursive. 287 * 288 * "." lookups are lockless. 289 * 290 * ".." and vnode -> name lookups require vnodelock. 291 * 292 * name -> vnode lookup requires the relevant bucketlock to be held for reading. 
293 * 294 * Insertions and removals of entries require involved vnodes and bucketlocks 295 * to be locked to provide safe operation against other threads modifying the 296 * cache. 297 * 298 * Some lookups result in removal of the found entry (e.g. getting rid of a 299 * negative entry with the intent to create a positive one), which poses a 300 * problem when multiple threads reach the state. Similarly, two different 301 * threads can purge two different vnodes and try to remove the same name. 302 * 303 * If the already held vnode lock is lower than the second required lock, we 304 * can just take the other lock. However, in the opposite case, this could 305 * deadlock. As such, this is resolved by trylocking and if that fails unlocking 306 * the first node, locking everything in order and revalidating the state. 307 */ 308 309 VFS_SMR_DECLARE; 310 311 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 312 "Name cache parameters"); 313 314 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ 315 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0, 316 "Total namecache capacity"); 317 318 u_int ncsizefactor = 2; 319 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0, 320 "Size factor for namecache"); 321 322 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ 323 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0, 324 "Ratio of negative namecache entries"); 325 326 /* 327 * Negative entry % of namecache capacity above which automatic eviction is allowed. 328 * 329 * Check cache_neg_evict_cond for details. 330 */ 331 static u_int ncnegminpct = 3; 332 333 static u_int __read_mostly neg_min; /* the above recomputed against ncsize */ 334 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0, 335 "Negative entry count above which automatic eviction is allowed"); 336 337 /* 338 * Structures associated with name caching. 
339 */ 340 #define NCHHASH(hash) \ 341 (&nchashtbl[(hash) & nchash]) 342 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 343 static u_long __read_mostly nchash; /* size of hash table */ 344 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 345 "Size of namecache hash table"); 346 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 347 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 348 349 struct nchstats nchstats; /* cache effectiveness statistics */ 350 351 static bool __read_frequently cache_fast_revlookup = true; 352 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW, 353 &cache_fast_revlookup, 0, ""); 354 355 static u_int __exclusive_cache_line neg_cycle; 356 357 #define ncneghash 3 358 #define numneglists (ncneghash + 1) 359 360 struct neglist { 361 struct mtx nl_evict_lock; 362 struct mtx nl_lock __aligned(CACHE_LINE_SIZE); 363 TAILQ_HEAD(, namecache) nl_list; 364 TAILQ_HEAD(, namecache) nl_hotlist; 365 u_long nl_hotnum; 366 } __aligned(CACHE_LINE_SIZE); 367 368 static struct neglist neglists[numneglists]; 369 370 static inline struct neglist * 371 NCP2NEGLIST(struct namecache *ncp) 372 { 373 374 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 375 } 376 377 static inline struct negstate * 378 NCP2NEGSTATE(struct namecache *ncp) 379 { 380 381 MPASS(ncp->nc_flag & NCF_NEGATIVE); 382 return (&ncp->nc_neg); 383 } 384 385 #define numbucketlocks (ncbuckethash + 1) 386 static u_int __read_mostly ncbuckethash; 387 static struct mtx_padalign __read_mostly *bucketlocks; 388 #define HASH2BUCKETLOCK(hash) \ 389 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)])) 390 391 #define numvnodelocks (ncvnodehash + 1) 392 static u_int __read_mostly ncvnodehash; 393 static struct mtx __read_mostly *vnodelocks; 394 static inline struct mtx * 395 VP2VNODELOCK(struct vnode *vp) 396 { 397 398 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 399 } 400 401 static void 402 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 403 { 404 struct namecache_ts *ncp_ts; 405 406 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 407 (tsp == NULL && ticksp == NULL), 408 ("No NCF_TS")); 409 410 if (tsp == NULL) 411 return; 412 413 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 414 *tsp = ncp_ts->nc_time; 415 *ticksp = ncp_ts->nc_ticks; 416 } 417 418 #ifdef DEBUG_CACHE 419 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 420 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 421 "VFS namecache enabled"); 422 #endif 423 424 /* Export size information to userland */ 425 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 426 sizeof(struct namecache), "sizeof(struct namecache)"); 427 428 /* 429 * The new name cache statistics 430 */ 431 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 432 "Name cache statistics"); 433 434 #define STATNODE_ULONG(name, varname, descr) \ 435 SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 436 #define STATNODE_COUNTER(name, varname, descr) \ 437 static COUNTER_U64_DEFINE_EARLY(varname); \ 438 SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \ 439 descr); 440 STATNODE_ULONG(neg, numneg, "Number of negative cache entries"); 441 STATNODE_ULONG(count, numcache, "Number of cache entries"); 442 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes 
held"); 443 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit"); 444 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits"); 445 STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits"); 446 STATNODE_COUNTER(miss, nummiss, "Number of cache misses"); 447 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache"); 448 STATNODE_COUNTER(posszaps, numposzaps, 449 "Number of cache hits (positive) we do not want to cache"); 450 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)"); 451 STATNODE_COUNTER(negzaps, numnegzaps, 452 "Number of cache hits (negative) we do not want to cache"); 453 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)"); 454 /* These count for vn_getcwd(), too. */ 455 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls"); 456 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 457 STATNODE_COUNTER(fullpathfail2, numfullpathfail2, 458 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 459 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 460 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls"); 461 462 /* 463 * Debug or developer statistics. 464 */ 465 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 466 "Name cache debugging"); 467 #define DEBUGNODE_ULONG(name, varname, descr) \ 468 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 469 #define DEBUGNODE_COUNTER(name, varname, descr) \ 470 static COUNTER_U64_DEFINE_EARLY(varname); \ 471 SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \ 472 descr); 473 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success, 474 "Number of successful removals after relocking"); 475 static long zap_bucket_fail; 476 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, ""); 477 static long zap_bucket_fail2; 478 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, ""); 479 static long cache_lock_vnodes_cel_3_failures; 480 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures, 481 "Number of times 3-way vnode locking failed"); 482 483 static void cache_zap_locked(struct namecache *ncp); 484 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, 485 char **freebuf, size_t *buflen); 486 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 487 char **retbuf, size_t *buflen, size_t addend); 488 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 489 char **retbuf, size_t *buflen); 490 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 491 char **retbuf, size_t *len, size_t addend); 492 493 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 494 495 static inline void 496 cache_assert_vlp_locked(struct mtx *vlp) 497 { 498 499 if (vlp != NULL) 500 mtx_assert(vlp, MA_OWNED); 501 } 502 503 static inline void 504 cache_assert_vnode_locked(struct vnode *vp) 505 { 506 struct mtx *vlp; 507 508 vlp = VP2VNODELOCK(vp); 509 cache_assert_vlp_locked(vlp); 510 } 511 512 /* 513 * Directory vnodes with entries are held for two reasons: 514 * 1. make them less of a target for reclamation in vnlru 515 * 2. suffer smaller performance penalty in locked lookup as requeieing is avoided 516 * 517 * Note this preferably would not be done and it's a hold over from. 
It will be 518 * feasible to eliminate altogether if all filesystems start supporting 519 * lockless lookup. 520 */ 521 static void 522 cache_hold_vnode(struct vnode *vp) 523 { 524 525 cache_assert_vnode_locked(vp); 526 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); 527 vhold(vp); 528 counter_u64_add(numcachehv, 1); 529 } 530 531 static void 532 cache_drop_vnode(struct vnode *vp) 533 { 534 535 /* 536 * Called after all locks are dropped, meaning we can't assert 537 * on the state of v_cache_src. 538 */ 539 vdrop(vp); 540 counter_u64_add(numcachehv, -1); 541 } 542 543 /* 544 * UMA zones. 545 */ 546 static uma_zone_t __read_mostly cache_zone_small; 547 static uma_zone_t __read_mostly cache_zone_small_ts; 548 static uma_zone_t __read_mostly cache_zone_large; 549 static uma_zone_t __read_mostly cache_zone_large_ts; 550 551 static struct namecache * 552 cache_alloc_uma(int len, bool ts) 553 { 554 struct namecache_ts *ncp_ts; 555 struct namecache *ncp; 556 557 if (__predict_false(ts)) { 558 if (len <= CACHE_PATH_CUTOFF) 559 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); 560 else 561 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); 562 ncp = &ncp_ts->nc_nc; 563 } else { 564 if (len <= CACHE_PATH_CUTOFF) 565 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); 566 else 567 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); 568 } 569 return (ncp); 570 } 571 572 static void 573 cache_free_uma(struct namecache *ncp) 574 { 575 struct namecache_ts *ncp_ts; 576 577 if (__predict_false(ncp->nc_flag & NCF_TS)) { 578 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 579 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 580 uma_zfree_smr(cache_zone_small_ts, ncp_ts); 581 else 582 uma_zfree_smr(cache_zone_large_ts, ncp_ts); 583 } else { 584 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 585 uma_zfree_smr(cache_zone_small, ncp); 586 else 587 uma_zfree_smr(cache_zone_large, ncp); 588 } 589 } 590 591 static struct namecache * 592 cache_alloc(int len, bool ts) 593 { 594 u_long lnumcache; 595 596 /* 597 * Avoid blowout in namecache entries. 598 * 599 * Bugs: 600 * 1. filesystems may end up trying to add an already existing entry 601 * (for example this can happen after a cache miss during concurrent 602 * lookup), in which case we will call cache_neg_evict despite not 603 * adding anything. 604 * 2. the routine may fail to free anything and no provisions are made 605 * to make it try harder (see the inside for failure modes) 606 * 3. it only ever looks at negative entries. 
607 */ 608 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 609 if (cache_neg_evict_cond(lnumcache)) { 610 lnumcache = atomic_load_long(&numcache); 611 } 612 if (__predict_false(lnumcache >= ncsize)) { 613 atomic_subtract_long(&numcache, 1); 614 counter_u64_add(numdrops, 1); 615 return (NULL); 616 } 617 return (cache_alloc_uma(len, ts)); 618 } 619 620 static void 621 cache_free(struct namecache *ncp) 622 { 623 624 MPASS(ncp != NULL); 625 if ((ncp->nc_flag & NCF_DVDROP) != 0) { 626 cache_drop_vnode(ncp->nc_dvp); 627 } 628 cache_free_uma(ncp); 629 atomic_subtract_long(&numcache, 1); 630 } 631 632 static void 633 cache_free_batch(struct cache_freebatch *batch) 634 { 635 struct namecache *ncp, *nnp; 636 int i; 637 638 i = 0; 639 if (TAILQ_EMPTY(batch)) 640 goto out; 641 TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) { 642 if ((ncp->nc_flag & NCF_DVDROP) != 0) { 643 cache_drop_vnode(ncp->nc_dvp); 644 } 645 cache_free_uma(ncp); 646 i++; 647 } 648 atomic_subtract_long(&numcache, i); 649 out: 650 SDT_PROBE1(vfs, namecache, purge, batch, i); 651 } 652 653 /* 654 * TODO: With the value stored we can do better than computing the hash based 655 * on the address. The choice of FNV should also be revisited. 656 */ 657 static void 658 cache_prehash(struct vnode *vp) 659 { 660 661 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); 662 } 663 664 static uint32_t 665 cache_get_hash(char *name, u_char len, struct vnode *dvp) 666 { 667 668 return (fnv_32_buf(name, len, dvp->v_nchash)); 669 } 670 671 static inline struct nchashhead * 672 NCP2BUCKET(struct namecache *ncp) 673 { 674 uint32_t hash; 675 676 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 677 return (NCHHASH(hash)); 678 } 679 680 static inline struct mtx * 681 NCP2BUCKETLOCK(struct namecache *ncp) 682 { 683 uint32_t hash; 684 685 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 686 return (HASH2BUCKETLOCK(hash)); 687 } 688 689 #ifdef INVARIANTS 690 static void 691 cache_assert_bucket_locked(struct namecache *ncp) 692 { 693 struct mtx *blp; 694 695 blp = NCP2BUCKETLOCK(ncp); 696 mtx_assert(blp, MA_OWNED); 697 } 698 699 static void 700 cache_assert_bucket_unlocked(struct namecache *ncp) 701 { 702 struct mtx *blp; 703 704 blp = NCP2BUCKETLOCK(ncp); 705 mtx_assert(blp, MA_NOTOWNED); 706 } 707 #else 708 #define cache_assert_bucket_locked(x) do { } while (0) 709 #define cache_assert_bucket_unlocked(x) do { } while (0) 710 #endif 711 712 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 713 static void 714 _cache_sort_vnodes(void **p1, void **p2) 715 { 716 void *tmp; 717 718 MPASS(*p1 != NULL || *p2 != NULL); 719 720 if (*p1 > *p2) { 721 tmp = *p2; 722 *p2 = *p1; 723 *p1 = tmp; 724 } 725 } 726 727 static void 728 cache_lock_all_buckets(void) 729 { 730 u_int i; 731 732 for (i = 0; i < numbucketlocks; i++) 733 mtx_lock(&bucketlocks[i]); 734 } 735 736 static void 737 cache_unlock_all_buckets(void) 738 { 739 u_int i; 740 741 for (i = 0; i < numbucketlocks; i++) 742 mtx_unlock(&bucketlocks[i]); 743 } 744 745 static void 746 cache_lock_all_vnodes(void) 747 { 748 u_int i; 749 750 for (i = 0; i < numvnodelocks; i++) 751 mtx_lock(&vnodelocks[i]); 752 } 753 754 static void 755 cache_unlock_all_vnodes(void) 756 { 757 u_int i; 758 759 for (i = 0; i < numvnodelocks; i++) 760 mtx_unlock(&vnodelocks[i]); 761 } 762 763 static int 764 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 765 { 766 767 cache_sort_vnodes(&vlp1, &vlp2); 768 769 if (vlp1 != NULL) { 770 if (!mtx_trylock(vlp1)) 771 return 
(EAGAIN); 772 } 773 if (!mtx_trylock(vlp2)) { 774 if (vlp1 != NULL) 775 mtx_unlock(vlp1); 776 return (EAGAIN); 777 } 778 779 return (0); 780 } 781 782 static void 783 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 784 { 785 786 MPASS(vlp1 != NULL || vlp2 != NULL); 787 MPASS(vlp1 <= vlp2); 788 789 if (vlp1 != NULL) 790 mtx_lock(vlp1); 791 if (vlp2 != NULL) 792 mtx_lock(vlp2); 793 } 794 795 static void 796 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 797 { 798 799 MPASS(vlp1 != NULL || vlp2 != NULL); 800 801 if (vlp1 != NULL) 802 mtx_unlock(vlp1); 803 if (vlp2 != NULL) 804 mtx_unlock(vlp2); 805 } 806 807 static int 808 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 809 { 810 struct nchstats snap; 811 812 if (req->oldptr == NULL) 813 return (SYSCTL_OUT(req, 0, sizeof(snap))); 814 815 snap = nchstats; 816 snap.ncs_goodhits = counter_u64_fetch(numposhits); 817 snap.ncs_neghits = counter_u64_fetch(numneghits); 818 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 819 counter_u64_fetch(numnegzaps); 820 snap.ncs_miss = counter_u64_fetch(nummisszap) + 821 counter_u64_fetch(nummiss); 822 823 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 824 } 825 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 826 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 827 "VFS cache effectiveness statistics"); 828 829 static void 830 cache_recalc_neg_min(u_int val) 831 { 832 833 neg_min = (ncsize * val) / 100; 834 } 835 836 static int 837 sysctl_negminpct(SYSCTL_HANDLER_ARGS) 838 { 839 u_int val; 840 int error; 841 842 val = ncnegminpct; 843 error = sysctl_handle_int(oidp, &val, 0, req); 844 if (error != 0 || req->newptr == NULL) 845 return (error); 846 847 if (val == ncnegminpct) 848 return (0); 849 if (val < 0 || val > 99) 850 return (EINVAL); 851 ncnegminpct = val; 852 cache_recalc_neg_min(val); 853 return (0); 854 } 855 856 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct, 857 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct, 858 "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed"); 859 860 #ifdef DIAGNOSTIC 861 /* 862 * Grab an atomic snapshot of the name cache hash chain lengths 863 */ 864 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 865 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 866 "hash table stats"); 867 868 static int 869 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 870 { 871 struct nchashhead *ncpp; 872 struct namecache *ncp; 873 int i, error, n_nchash, *cntbuf; 874 875 retry: 876 n_nchash = nchash + 1; /* nchash is max index, not count */ 877 if (req->oldptr == NULL) 878 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 879 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 880 cache_lock_all_buckets(); 881 if (n_nchash != nchash + 1) { 882 cache_unlock_all_buckets(); 883 free(cntbuf, M_TEMP); 884 goto retry; 885 } 886 /* Scan hash tables counting entries */ 887 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 888 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) 889 cntbuf[i]++; 890 cache_unlock_all_buckets(); 891 for (error = 0, i = 0; i < n_nchash; i++) 892 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 893 break; 894 free(cntbuf, M_TEMP); 895 return (error); 896 } 897 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 898 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 899 "nchash chain lengths"); 900 901 static int 902 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 903 { 904 int error; 905 struct nchashhead *ncpp; 906 struct namecache *ncp; 907 
int n_nchash; 908 int count, maxlength, used, pct; 909 910 if (!req->oldptr) 911 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 912 913 cache_lock_all_buckets(); 914 n_nchash = nchash + 1; /* nchash is max index, not count */ 915 used = 0; 916 maxlength = 0; 917 918 /* Scan hash tables for applicable entries */ 919 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 920 count = 0; 921 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { 922 count++; 923 } 924 if (count) 925 used++; 926 if (maxlength < count) 927 maxlength = count; 928 } 929 n_nchash = nchash + 1; 930 cache_unlock_all_buckets(); 931 pct = (used * 100) / (n_nchash / 100); 932 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 933 if (error) 934 return (error); 935 error = SYSCTL_OUT(req, &used, sizeof(used)); 936 if (error) 937 return (error); 938 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 939 if (error) 940 return (error); 941 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 942 if (error) 943 return (error); 944 return (0); 945 } 946 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 947 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 948 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 949 #endif 950 951 /* 952 * Negative entries management 953 * 954 * Various workloads create plenty of negative entries and barely use them 955 * afterwards. Moreover malicious users can keep performing bogus lookups 956 * adding even more entries. For example "make tinderbox" as of writing this 957 * comment ends up with 2.6M namecache entries in total, 1.2M of which are 958 * negative. 959 * 960 * As such, a rather aggressive eviction method is needed. The currently 961 * employed method is a placeholder. 962 * 963 * Entries are split over numneglists separate lists, each of which is further 964 * split into hot and cold entries. Entries get promoted after getting a hit. 965 * Eviction happens on addition of new entry. 
966 */ 967 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 968 "Name cache negative entry statistics"); 969 970 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0, 971 "Number of negative cache entries"); 972 973 static COUNTER_U64_DEFINE_EARLY(neg_created); 974 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created, 975 "Number of created negative entries"); 976 977 static COUNTER_U64_DEFINE_EARLY(neg_evicted); 978 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted, 979 "Number of evicted negative entries"); 980 981 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty); 982 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD, 983 &neg_evict_skipped_empty, 984 "Number of times evicting failed due to lack of entries"); 985 986 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed); 987 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD, 988 &neg_evict_skipped_missed, 989 "Number of times evicting failed due to target entry disappearing"); 990 991 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended); 992 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD, 993 &neg_evict_skipped_contended, 994 "Number of times evicting failed due to contention"); 995 996 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits, 997 "Number of cache hits (negative)"); 998 999 static int 1000 sysctl_neg_hot(SYSCTL_HANDLER_ARGS) 1001 { 1002 int i, out; 1003 1004 out = 0; 1005 for (i = 0; i < numneglists; i++) 1006 out += neglists[i].nl_hotnum; 1007 1008 return (SYSCTL_OUT(req, &out, sizeof(out))); 1009 } 1010 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD | 1011 CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I", 1012 "Number of hot negative entries"); 1013 1014 static void 1015 cache_neg_init(struct namecache *ncp) 1016 { 1017 struct negstate *ns; 1018 1019 ncp->nc_flag |= NCF_NEGATIVE; 1020 ns = NCP2NEGSTATE(ncp); 1021 ns->neg_flag = 0; 1022 ns->neg_hit = 0; 1023 counter_u64_add(neg_created, 1); 1024 } 1025 1026 #define CACHE_NEG_PROMOTION_THRESH 2 1027 1028 static bool 1029 cache_neg_hit_prep(struct namecache *ncp) 1030 { 1031 struct negstate *ns; 1032 u_char n; 1033 1034 ns = NCP2NEGSTATE(ncp); 1035 n = atomic_load_char(&ns->neg_hit); 1036 for (;;) { 1037 if (n >= CACHE_NEG_PROMOTION_THRESH) 1038 return (false); 1039 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1)) 1040 break; 1041 } 1042 return (n + 1 == CACHE_NEG_PROMOTION_THRESH); 1043 } 1044 1045 /* 1046 * Nothing to do here but it is provided for completeness as some 1047 * cache_neg_hit_prep callers may end up returning without even 1048 * trying to promote. 1049 */ 1050 #define cache_neg_hit_abort(ncp) do { } while (0) 1051 1052 static void 1053 cache_neg_hit_finish(struct namecache *ncp) 1054 { 1055 1056 SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name); 1057 counter_u64_add(numneghits, 1); 1058 } 1059 1060 /* 1061 * Move a negative entry to the hot list. 
1062 */ 1063 static void 1064 cache_neg_promote_locked(struct namecache *ncp) 1065 { 1066 struct neglist *nl; 1067 struct negstate *ns; 1068 1069 ns = NCP2NEGSTATE(ncp); 1070 nl = NCP2NEGLIST(ncp); 1071 mtx_assert(&nl->nl_lock, MA_OWNED); 1072 if ((ns->neg_flag & NEG_HOT) == 0) { 1073 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 1074 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst); 1075 nl->nl_hotnum++; 1076 ns->neg_flag |= NEG_HOT; 1077 } 1078 } 1079 1080 /* 1081 * Move a hot negative entry to the cold list. 1082 */ 1083 static void 1084 cache_neg_demote_locked(struct namecache *ncp) 1085 { 1086 struct neglist *nl; 1087 struct negstate *ns; 1088 1089 ns = NCP2NEGSTATE(ncp); 1090 nl = NCP2NEGLIST(ncp); 1091 mtx_assert(&nl->nl_lock, MA_OWNED); 1092 MPASS(ns->neg_flag & NEG_HOT); 1093 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 1094 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 1095 nl->nl_hotnum--; 1096 ns->neg_flag &= ~NEG_HOT; 1097 atomic_store_char(&ns->neg_hit, 0); 1098 } 1099 1100 /* 1101 * Move a negative entry to the hot list if it matches the lookup. 1102 * 1103 * We have to take locks, but they may be contended and in the worst 1104 * case we may need to go off CPU. We don't want to spin within the 1105 * smr section and we can't block with it. Exiting the section means 1106 * the found entry could have been evicted. We are going to look it 1107 * up again. 1108 */ 1109 static bool 1110 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp, 1111 struct namecache *oncp, uint32_t hash) 1112 { 1113 struct namecache *ncp; 1114 struct neglist *nl; 1115 u_char nc_flag; 1116 1117 nl = NCP2NEGLIST(oncp); 1118 1119 mtx_lock(&nl->nl_lock); 1120 /* 1121 * For hash iteration. 1122 */ 1123 vfs_smr_enter(); 1124 1125 /* 1126 * Avoid all surprises by only succeeding if we got the same entry and 1127 * bailing completely otherwise. 1128 * XXX There are no provisions to keep the vnode around, meaning we may 1129 * end up promoting a negative entry for a *new* vnode and returning 1130 * ENOENT on its account. This is the error we want to return anyway 1131 * and promotion is harmless. 1132 * 1133 * In particular at this point there can be a new ncp which matches the 1134 * search but hashes to a different neglist. 1135 */ 1136 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1137 if (ncp == oncp) 1138 break; 1139 } 1140 1141 /* 1142 * No match to begin with. 1143 */ 1144 if (__predict_false(ncp == NULL)) { 1145 goto out_abort; 1146 } 1147 1148 /* 1149 * The newly found entry may be something different... 1150 */ 1151 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1152 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 1153 goto out_abort; 1154 } 1155 1156 /* 1157 * ... and not even negative. 
1158 */ 1159 nc_flag = atomic_load_char(&ncp->nc_flag); 1160 if ((nc_flag & NCF_NEGATIVE) == 0) { 1161 goto out_abort; 1162 } 1163 1164 if (!cache_ncp_canuse(ncp)) { 1165 goto out_abort; 1166 } 1167 1168 cache_neg_promote_locked(ncp); 1169 cache_neg_hit_finish(ncp); 1170 vfs_smr_exit(); 1171 mtx_unlock(&nl->nl_lock); 1172 return (true); 1173 out_abort: 1174 vfs_smr_exit(); 1175 mtx_unlock(&nl->nl_lock); 1176 return (false); 1177 } 1178 1179 static void 1180 cache_neg_promote(struct namecache *ncp) 1181 { 1182 struct neglist *nl; 1183 1184 nl = NCP2NEGLIST(ncp); 1185 mtx_lock(&nl->nl_lock); 1186 cache_neg_promote_locked(ncp); 1187 mtx_unlock(&nl->nl_lock); 1188 } 1189 1190 static void 1191 cache_neg_insert(struct namecache *ncp) 1192 { 1193 struct neglist *nl; 1194 1195 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1196 cache_assert_bucket_locked(ncp); 1197 nl = NCP2NEGLIST(ncp); 1198 mtx_lock(&nl->nl_lock); 1199 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 1200 mtx_unlock(&nl->nl_lock); 1201 atomic_add_long(&numneg, 1); 1202 } 1203 1204 static void 1205 cache_neg_remove(struct namecache *ncp) 1206 { 1207 struct neglist *nl; 1208 struct negstate *ns; 1209 1210 cache_assert_bucket_locked(ncp); 1211 nl = NCP2NEGLIST(ncp); 1212 ns = NCP2NEGSTATE(ncp); 1213 mtx_lock(&nl->nl_lock); 1214 if ((ns->neg_flag & NEG_HOT) != 0) { 1215 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 1216 nl->nl_hotnum--; 1217 } else { 1218 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 1219 } 1220 mtx_unlock(&nl->nl_lock); 1221 atomic_subtract_long(&numneg, 1); 1222 } 1223 1224 static struct neglist * 1225 cache_neg_evict_select_list(void) 1226 { 1227 struct neglist *nl; 1228 u_int c; 1229 1230 c = atomic_fetchadd_int(&neg_cycle, 1) + 1; 1231 nl = &neglists[c % numneglists]; 1232 if (!mtx_trylock(&nl->nl_evict_lock)) { 1233 counter_u64_add(neg_evict_skipped_contended, 1); 1234 return (NULL); 1235 } 1236 return (nl); 1237 } 1238 1239 static struct namecache * 1240 cache_neg_evict_select_entry(struct neglist *nl) 1241 { 1242 struct namecache *ncp, *lncp; 1243 struct negstate *ns, *lns; 1244 int i; 1245 1246 mtx_assert(&nl->nl_evict_lock, MA_OWNED); 1247 mtx_assert(&nl->nl_lock, MA_OWNED); 1248 ncp = TAILQ_FIRST(&nl->nl_list); 1249 if (ncp == NULL) 1250 return (NULL); 1251 lncp = ncp; 1252 lns = NCP2NEGSTATE(lncp); 1253 for (i = 1; i < 4; i++) { 1254 ncp = TAILQ_NEXT(ncp, nc_dst); 1255 if (ncp == NULL) 1256 break; 1257 ns = NCP2NEGSTATE(ncp); 1258 if (ns->neg_hit < lns->neg_hit) { 1259 lncp = ncp; 1260 lns = ns; 1261 } 1262 } 1263 return (lncp); 1264 } 1265 1266 static bool 1267 cache_neg_evict(void) 1268 { 1269 struct namecache *ncp, *ncp2; 1270 struct neglist *nl; 1271 struct vnode *dvp; 1272 struct mtx *dvlp; 1273 struct mtx *blp; 1274 uint32_t hash; 1275 u_char nlen; 1276 bool evicted; 1277 1278 nl = cache_neg_evict_select_list(); 1279 if (nl == NULL) { 1280 return (false); 1281 } 1282 1283 mtx_lock(&nl->nl_lock); 1284 ncp = TAILQ_FIRST(&nl->nl_hotlist); 1285 if (ncp != NULL) { 1286 cache_neg_demote_locked(ncp); 1287 } 1288 ncp = cache_neg_evict_select_entry(nl); 1289 if (ncp == NULL) { 1290 counter_u64_add(neg_evict_skipped_empty, 1); 1291 mtx_unlock(&nl->nl_lock); 1292 mtx_unlock(&nl->nl_evict_lock); 1293 return (false); 1294 } 1295 nlen = ncp->nc_nlen; 1296 dvp = ncp->nc_dvp; 1297 hash = cache_get_hash(ncp->nc_name, nlen, dvp); 1298 dvlp = VP2VNODELOCK(dvp); 1299 blp = HASH2BUCKETLOCK(hash); 1300 mtx_unlock(&nl->nl_lock); 1301 mtx_unlock(&nl->nl_evict_lock); 1302 mtx_lock(dvlp); 1303 mtx_lock(blp); 1304 /* 1305 * Note that since all 
locks were dropped above, the entry may be 1306 * gone or reallocated to be something else. 1307 */ 1308 CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) { 1309 if (ncp2 == ncp && ncp2->nc_dvp == dvp && 1310 ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0) 1311 break; 1312 } 1313 if (ncp2 == NULL) { 1314 counter_u64_add(neg_evict_skipped_missed, 1); 1315 ncp = NULL; 1316 evicted = false; 1317 } else { 1318 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp)); 1319 MPASS(blp == NCP2BUCKETLOCK(ncp)); 1320 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp, 1321 ncp->nc_name); 1322 cache_zap_locked(ncp); 1323 counter_u64_add(neg_evicted, 1); 1324 evicted = true; 1325 } 1326 mtx_unlock(blp); 1327 mtx_unlock(dvlp); 1328 if (ncp != NULL) 1329 cache_free(ncp); 1330 return (evicted); 1331 } 1332 1333 /* 1334 * Maybe evict a negative entry to create more room. 1335 * 1336 * The ncnegfactor parameter limits what fraction of the total count 1337 * can comprise of negative entries. However, if the cache is just 1338 * warming up this leads to excessive evictions. As such, ncnegminpct 1339 * (recomputed to neg_min) dictates whether the above should be 1340 * applied. 1341 * 1342 * Try evicting if the cache is close to full capacity regardless of 1343 * other considerations. 1344 */ 1345 static bool 1346 cache_neg_evict_cond(u_long lnumcache) 1347 { 1348 u_long lnumneg; 1349 1350 if (ncsize - 1000 < lnumcache) 1351 goto out_evict; 1352 lnumneg = atomic_load_long(&numneg); 1353 if (lnumneg < neg_min) 1354 return (false); 1355 if (lnumneg * ncnegfactor < lnumcache) 1356 return (false); 1357 out_evict: 1358 return (cache_neg_evict()); 1359 } 1360 1361 /* 1362 * cache_zap_locked(): 1363 * 1364 * Removes a namecache entry from cache, whether it contains an actual 1365 * pointer to a vnode or if it is just a negative cache entry. 
1366 */ 1367 static void 1368 cache_zap_locked(struct namecache *ncp) 1369 { 1370 struct nchashhead *ncpp; 1371 1372 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1373 cache_assert_vnode_locked(ncp->nc_vp); 1374 cache_assert_vnode_locked(ncp->nc_dvp); 1375 cache_assert_bucket_locked(ncp); 1376 1377 cache_ncp_invalidate(ncp); 1378 1379 ncpp = NCP2BUCKET(ncp); 1380 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); 1381 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 1382 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 1383 ncp->nc_name, ncp->nc_vp); 1384 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 1385 if (ncp == ncp->nc_vp->v_cache_dd) { 1386 vn_seqc_write_begin_unheld(ncp->nc_vp); 1387 ncp->nc_vp->v_cache_dd = NULL; 1388 vn_seqc_write_end(ncp->nc_vp); 1389 } 1390 } else { 1391 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 1392 ncp->nc_name); 1393 cache_neg_remove(ncp); 1394 } 1395 if (ncp->nc_flag & NCF_ISDOTDOT) { 1396 if (ncp == ncp->nc_dvp->v_cache_dd) { 1397 vn_seqc_write_begin_unheld(ncp->nc_dvp); 1398 ncp->nc_dvp->v_cache_dd = NULL; 1399 vn_seqc_write_end(ncp->nc_dvp); 1400 } 1401 } else { 1402 LIST_REMOVE(ncp, nc_src); 1403 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 1404 ncp->nc_flag |= NCF_DVDROP; 1405 } 1406 } 1407 } 1408 1409 static void 1410 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 1411 { 1412 struct mtx *blp; 1413 1414 MPASS(ncp->nc_dvp == vp); 1415 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1416 cache_assert_vnode_locked(vp); 1417 1418 blp = NCP2BUCKETLOCK(ncp); 1419 mtx_lock(blp); 1420 cache_zap_locked(ncp); 1421 mtx_unlock(blp); 1422 } 1423 1424 static bool 1425 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 1426 struct mtx **vlpp) 1427 { 1428 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1429 struct mtx *blp; 1430 1431 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1432 cache_assert_vnode_locked(vp); 1433 1434 if (ncp->nc_flag & NCF_NEGATIVE) { 1435 if (*vlpp != NULL) { 1436 mtx_unlock(*vlpp); 1437 *vlpp = NULL; 1438 } 1439 cache_zap_negative_locked_vnode_kl(ncp, vp); 1440 return (true); 1441 } 1442 1443 pvlp = VP2VNODELOCK(vp); 1444 blp = NCP2BUCKETLOCK(ncp); 1445 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1446 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1447 1448 if (*vlpp == vlp1 || *vlpp == vlp2) { 1449 to_unlock = *vlpp; 1450 *vlpp = NULL; 1451 } else { 1452 if (*vlpp != NULL) { 1453 mtx_unlock(*vlpp); 1454 *vlpp = NULL; 1455 } 1456 cache_sort_vnodes(&vlp1, &vlp2); 1457 if (vlp1 == pvlp) { 1458 mtx_lock(vlp2); 1459 to_unlock = vlp2; 1460 } else { 1461 if (!mtx_trylock(vlp1)) 1462 goto out_relock; 1463 to_unlock = vlp1; 1464 } 1465 } 1466 mtx_lock(blp); 1467 cache_zap_locked(ncp); 1468 mtx_unlock(blp); 1469 if (to_unlock != NULL) 1470 mtx_unlock(to_unlock); 1471 return (true); 1472 1473 out_relock: 1474 mtx_unlock(vlp2); 1475 mtx_lock(vlp1); 1476 mtx_lock(vlp2); 1477 MPASS(*vlpp == NULL); 1478 *vlpp = vlp1; 1479 return (false); 1480 } 1481 1482 /* 1483 * If trylocking failed we can get here. We know enough to take all needed locks 1484 * in the right order and re-lookup the entry. 
1485 */ 1486 static int 1487 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1488 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1489 struct mtx *blp) 1490 { 1491 struct namecache *rncp; 1492 1493 cache_assert_bucket_unlocked(ncp); 1494 1495 cache_sort_vnodes(&dvlp, &vlp); 1496 cache_lock_vnodes(dvlp, vlp); 1497 mtx_lock(blp); 1498 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1499 if (rncp == ncp && rncp->nc_dvp == dvp && 1500 rncp->nc_nlen == cnp->cn_namelen && 1501 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1502 break; 1503 } 1504 if (rncp != NULL) { 1505 cache_zap_locked(rncp); 1506 mtx_unlock(blp); 1507 cache_unlock_vnodes(dvlp, vlp); 1508 counter_u64_add(zap_bucket_relock_success, 1); 1509 return (0); 1510 } 1511 1512 mtx_unlock(blp); 1513 cache_unlock_vnodes(dvlp, vlp); 1514 return (EAGAIN); 1515 } 1516 1517 static int __noinline 1518 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp, 1519 uint32_t hash, struct mtx *blp) 1520 { 1521 struct mtx *dvlp, *vlp; 1522 struct vnode *dvp; 1523 1524 cache_assert_bucket_locked(ncp); 1525 1526 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1527 vlp = NULL; 1528 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1529 vlp = VP2VNODELOCK(ncp->nc_vp); 1530 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1531 cache_zap_locked(ncp); 1532 mtx_unlock(blp); 1533 cache_unlock_vnodes(dvlp, vlp); 1534 return (0); 1535 } 1536 1537 dvp = ncp->nc_dvp; 1538 mtx_unlock(blp); 1539 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1540 } 1541 1542 static __noinline int 1543 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) 1544 { 1545 struct namecache *ncp; 1546 struct mtx *blp; 1547 struct mtx *dvlp, *dvlp2; 1548 uint32_t hash; 1549 int error; 1550 1551 if (cnp->cn_namelen == 2 && 1552 cnp->cn_nameptr[0] == '.' 
&& cnp->cn_nameptr[1] == '.') { 1553 dvlp = VP2VNODELOCK(dvp); 1554 dvlp2 = NULL; 1555 mtx_lock(dvlp); 1556 retry_dotdot: 1557 ncp = dvp->v_cache_dd; 1558 if (ncp == NULL) { 1559 mtx_unlock(dvlp); 1560 if (dvlp2 != NULL) 1561 mtx_unlock(dvlp2); 1562 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1563 return (0); 1564 } 1565 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1566 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) 1567 goto retry_dotdot; 1568 MPASS(dvp->v_cache_dd == NULL); 1569 mtx_unlock(dvlp); 1570 if (dvlp2 != NULL) 1571 mtx_unlock(dvlp2); 1572 cache_free(ncp); 1573 } else { 1574 vn_seqc_write_begin(dvp); 1575 dvp->v_cache_dd = NULL; 1576 vn_seqc_write_end(dvp); 1577 mtx_unlock(dvlp); 1578 if (dvlp2 != NULL) 1579 mtx_unlock(dvlp2); 1580 } 1581 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1582 return (1); 1583 } 1584 1585 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1586 blp = HASH2BUCKETLOCK(hash); 1587 retry: 1588 if (CK_SLIST_EMPTY(NCHHASH(hash))) 1589 goto out_no_entry; 1590 1591 mtx_lock(blp); 1592 1593 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1594 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1595 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1596 break; 1597 } 1598 1599 if (ncp == NULL) { 1600 mtx_unlock(blp); 1601 goto out_no_entry; 1602 } 1603 1604 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1605 if (__predict_false(error != 0)) { 1606 zap_bucket_fail++; 1607 goto retry; 1608 } 1609 counter_u64_add(numposzaps, 1); 1610 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1611 cache_free(ncp); 1612 return (1); 1613 out_no_entry: 1614 counter_u64_add(nummisszap, 1); 1615 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1616 return (0); 1617 } 1618 1619 static int __noinline 1620 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1621 struct timespec *tsp, int *ticksp) 1622 { 1623 int ltype; 1624 1625 *vpp = dvp; 1626 counter_u64_add(dothits, 1); 1627 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1628 if (tsp != NULL) 1629 timespecclear(tsp); 1630 if (ticksp != NULL) 1631 *ticksp = ticks; 1632 vrefact(*vpp); 1633 /* 1634 * When we lookup "." we still can be asked to lock it 1635 * differently... 
1636 */ 1637 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1638 if (ltype != VOP_ISLOCKED(*vpp)) { 1639 if (ltype == LK_EXCLUSIVE) { 1640 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1641 if (VN_IS_DOOMED((*vpp))) { 1642 /* forced unmount */ 1643 vrele(*vpp); 1644 *vpp = NULL; 1645 return (ENOENT); 1646 } 1647 } else 1648 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1649 } 1650 return (-1); 1651 } 1652 1653 static int __noinline 1654 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1655 struct timespec *tsp, int *ticksp) 1656 { 1657 struct namecache_ts *ncp_ts; 1658 struct namecache *ncp; 1659 struct mtx *dvlp; 1660 enum vgetstate vs; 1661 int error, ltype; 1662 bool whiteout; 1663 1664 MPASS((cnp->cn_flags & ISDOTDOT) != 0); 1665 1666 if ((cnp->cn_flags & MAKEENTRY) == 0) { 1667 cache_remove_cnp(dvp, cnp); 1668 return (0); 1669 } 1670 1671 counter_u64_add(dotdothits, 1); 1672 retry: 1673 dvlp = VP2VNODELOCK(dvp); 1674 mtx_lock(dvlp); 1675 ncp = dvp->v_cache_dd; 1676 if (ncp == NULL) { 1677 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL); 1678 mtx_unlock(dvlp); 1679 return (0); 1680 } 1681 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1682 if (ncp->nc_flag & NCF_NEGATIVE) 1683 *vpp = NULL; 1684 else 1685 *vpp = ncp->nc_vp; 1686 } else 1687 *vpp = ncp->nc_dvp; 1688 if (*vpp == NULL) 1689 goto negative_success; 1690 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); 1691 cache_out_ts(ncp, tsp, ticksp); 1692 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1693 NCF_DTS && tsp != NULL) { 1694 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1695 *tsp = ncp_ts->nc_dotdottime; 1696 } 1697 1698 MPASS(dvp != *vpp); 1699 ltype = VOP_ISLOCKED(dvp); 1700 VOP_UNLOCK(dvp); 1701 vs = vget_prep(*vpp); 1702 mtx_unlock(dvlp); 1703 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1704 vn_lock(dvp, ltype | LK_RETRY); 1705 if (VN_IS_DOOMED(dvp)) { 1706 if (error == 0) 1707 vput(*vpp); 1708 *vpp = NULL; 1709 return (ENOENT); 1710 } 1711 if (error) { 1712 *vpp = NULL; 1713 goto retry; 1714 } 1715 return (-1); 1716 negative_success: 1717 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1718 if (cnp->cn_flags & ISLASTCN) { 1719 counter_u64_add(numnegzaps, 1); 1720 cache_zap_negative_locked_vnode_kl(ncp, dvp); 1721 mtx_unlock(dvlp); 1722 cache_free(ncp); 1723 return (0); 1724 } 1725 } 1726 1727 whiteout = (ncp->nc_flag & NCF_WHITE); 1728 cache_out_ts(ncp, tsp, ticksp); 1729 if (cache_neg_hit_prep(ncp)) 1730 cache_neg_promote(ncp); 1731 else 1732 cache_neg_hit_finish(ncp); 1733 mtx_unlock(dvlp); 1734 if (whiteout) 1735 cnp->cn_flags |= ISWHITEOUT; 1736 return (ENOENT); 1737 } 1738 1739 /** 1740 * Lookup a name in the name cache 1741 * 1742 * # Arguments 1743 * 1744 * - dvp: Parent directory in which to search. 1745 * - vpp: Return argument. Will contain desired vnode on cache hit. 1746 * - cnp: Parameters of the name search. The most interesting bits of 1747 * the cn_flags field have the following meanings: 1748 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1749 * it up. 1750 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1751 * - tsp: Return storage for cache timestamp. On a successful (positive 1752 * or negative) lookup, tsp will be filled with any timespec that 1753 * was stored when this cache entry was created. However, it will 1754 * be clear for "." entries. 1755 * - ticks: Return storage for alternate cache timestamp. 
On a successful 1756 * (positive or negative) lookup, it will contain the ticks value 1757 * that was current when the cache entry was created, unless cnp 1758 * was ".". 1759 * 1760 * Either both tsp and ticks have to be provided or neither of them. 1761 * 1762 * # Returns 1763 * 1764 * - -1: A positive cache hit. vpp will contain the desired vnode. 1765 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1766 * to a forced unmount. vpp will not be modified. If the entry 1767 * is a whiteout, then the ISWHITEOUT flag will be set in 1768 * cnp->cn_flags. 1769 * - 0: A cache miss. vpp will not be modified. 1770 * 1771 * # Locking 1772 * 1773 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1774 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1775 * lock is not recursively acquired. 1776 */ 1777 static int __noinline 1778 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1779 struct timespec *tsp, int *ticksp) 1780 { 1781 struct namecache *ncp; 1782 struct mtx *blp; 1783 uint32_t hash; 1784 enum vgetstate vs; 1785 int error; 1786 bool whiteout; 1787 1788 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 1789 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0); 1790 1791 retry: 1792 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1793 blp = HASH2BUCKETLOCK(hash); 1794 mtx_lock(blp); 1795 1796 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1797 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1798 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1799 break; 1800 } 1801 1802 if (__predict_false(ncp == NULL)) { 1803 mtx_unlock(blp); 1804 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1805 NULL); 1806 counter_u64_add(nummiss, 1); 1807 return (0); 1808 } 1809 1810 if (ncp->nc_flag & NCF_NEGATIVE) 1811 goto negative_success; 1812 1813 counter_u64_add(numposhits, 1); 1814 *vpp = ncp->nc_vp; 1815 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1816 cache_out_ts(ncp, tsp, ticksp); 1817 MPASS(dvp != *vpp); 1818 vs = vget_prep(*vpp); 1819 mtx_unlock(blp); 1820 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1821 if (error) { 1822 *vpp = NULL; 1823 goto retry; 1824 } 1825 return (-1); 1826 negative_success: 1827 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1828 if (cnp->cn_flags & ISLASTCN) { 1829 counter_u64_add(numnegzaps, 1); 1830 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1831 if (__predict_false(error != 0)) { 1832 zap_bucket_fail2++; 1833 goto retry; 1834 } 1835 cache_free(ncp); 1836 return (0); 1837 } 1838 } 1839 1840 whiteout = (ncp->nc_flag & NCF_WHITE); 1841 cache_out_ts(ncp, tsp, ticksp); 1842 if (cache_neg_hit_prep(ncp)) 1843 cache_neg_promote(ncp); 1844 else 1845 cache_neg_hit_finish(ncp); 1846 mtx_unlock(blp); 1847 if (whiteout) 1848 cnp->cn_flags |= ISWHITEOUT; 1849 return (ENOENT); 1850 } 1851 1852 int 1853 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1854 struct timespec *tsp, int *ticksp) 1855 { 1856 struct namecache *ncp; 1857 uint32_t hash; 1858 enum vgetstate vs; 1859 int error; 1860 bool whiteout, neg_promote; 1861 u_short nc_flag; 1862 1863 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); 1864 1865 #ifdef DEBUG_CACHE 1866 if (__predict_false(!doingcache)) { 1867 cnp->cn_flags &= ~MAKEENTRY; 1868 return (0); 1869 } 1870 #endif 1871 1872 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1873 if (cnp->cn_namelen == 1) 1874 return 
(cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1875 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') 1876 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); 1877 } 1878 1879 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 1880 1881 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { 1882 cache_remove_cnp(dvp, cnp); 1883 return (0); 1884 } 1885 1886 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1887 vfs_smr_enter(); 1888 1889 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1890 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1891 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1892 break; 1893 } 1894 1895 if (__predict_false(ncp == NULL)) { 1896 vfs_smr_exit(); 1897 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1898 NULL); 1899 counter_u64_add(nummiss, 1); 1900 return (0); 1901 } 1902 1903 nc_flag = atomic_load_char(&ncp->nc_flag); 1904 if (nc_flag & NCF_NEGATIVE) 1905 goto negative_success; 1906 1907 counter_u64_add(numposhits, 1); 1908 *vpp = ncp->nc_vp; 1909 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1910 cache_out_ts(ncp, tsp, ticksp); 1911 MPASS(dvp != *vpp); 1912 if (!cache_ncp_canuse(ncp)) { 1913 vfs_smr_exit(); 1914 *vpp = NULL; 1915 goto out_fallback; 1916 } 1917 vs = vget_prep_smr(*vpp); 1918 vfs_smr_exit(); 1919 if (__predict_false(vs == VGET_NONE)) { 1920 *vpp = NULL; 1921 goto out_fallback; 1922 } 1923 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1924 if (error) { 1925 *vpp = NULL; 1926 goto out_fallback; 1927 } 1928 return (-1); 1929 negative_success: 1930 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1931 if (cnp->cn_flags & ISLASTCN) { 1932 vfs_smr_exit(); 1933 goto out_fallback; 1934 } 1935 } 1936 1937 cache_out_ts(ncp, tsp, ticksp); 1938 whiteout = (ncp->nc_flag & NCF_WHITE); 1939 neg_promote = cache_neg_hit_prep(ncp); 1940 if (!cache_ncp_canuse(ncp)) { 1941 cache_neg_hit_abort(ncp); 1942 vfs_smr_exit(); 1943 goto out_fallback; 1944 } 1945 if (neg_promote) { 1946 vfs_smr_exit(); 1947 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) 1948 goto out_fallback; 1949 } else { 1950 cache_neg_hit_finish(ncp); 1951 vfs_smr_exit(); 1952 } 1953 if (whiteout) 1954 cnp->cn_flags |= ISWHITEOUT; 1955 return (ENOENT); 1956 out_fallback: 1957 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp)); 1958 } 1959 1960 struct celockstate { 1961 struct mtx *vlp[3]; 1962 struct mtx *blp[2]; 1963 }; 1964 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1965 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1966 1967 static inline void 1968 cache_celockstate_init(struct celockstate *cel) 1969 { 1970 1971 bzero(cel, sizeof(*cel)); 1972 } 1973 1974 static void 1975 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1976 struct vnode *dvp) 1977 { 1978 struct mtx *vlp1, *vlp2; 1979 1980 MPASS(cel->vlp[0] == NULL); 1981 MPASS(cel->vlp[1] == NULL); 1982 MPASS(cel->vlp[2] == NULL); 1983 1984 MPASS(vp != NULL || dvp != NULL); 1985 1986 vlp1 = VP2VNODELOCK(vp); 1987 vlp2 = VP2VNODELOCK(dvp); 1988 cache_sort_vnodes(&vlp1, &vlp2); 1989 1990 if (vlp1 != NULL) { 1991 mtx_lock(vlp1); 1992 cel->vlp[0] = vlp1; 1993 } 1994 mtx_lock(vlp2); 1995 cel->vlp[1] = vlp2; 1996 } 1997 1998 static void 1999 cache_unlock_vnodes_cel(struct celockstate *cel) 2000 { 2001 2002 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 2003 2004 if (cel->vlp[0] != NULL) 2005 mtx_unlock(cel->vlp[0]); 2006 if (cel->vlp[1] != NULL) 2007 mtx_unlock(cel->vlp[1]); 2008 if (cel->vlp[2] != NULL) 2009 
mtx_unlock(cel->vlp[2]); 2010 } 2011 2012 static bool 2013 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 2014 { 2015 struct mtx *vlp; 2016 bool ret; 2017 2018 cache_assert_vlp_locked(cel->vlp[0]); 2019 cache_assert_vlp_locked(cel->vlp[1]); 2020 MPASS(cel->vlp[2] == NULL); 2021 2022 MPASS(vp != NULL); 2023 vlp = VP2VNODELOCK(vp); 2024 2025 ret = true; 2026 if (vlp >= cel->vlp[1]) { 2027 mtx_lock(vlp); 2028 } else { 2029 if (mtx_trylock(vlp)) 2030 goto out; 2031 cache_lock_vnodes_cel_3_failures++; 2032 cache_unlock_vnodes_cel(cel); 2033 if (vlp < cel->vlp[0]) { 2034 mtx_lock(vlp); 2035 mtx_lock(cel->vlp[0]); 2036 mtx_lock(cel->vlp[1]); 2037 } else { 2038 if (cel->vlp[0] != NULL) 2039 mtx_lock(cel->vlp[0]); 2040 mtx_lock(vlp); 2041 mtx_lock(cel->vlp[1]); 2042 } 2043 ret = false; 2044 } 2045 out: 2046 cel->vlp[2] = vlp; 2047 return (ret); 2048 } 2049 2050 static void 2051 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 2052 struct mtx *blp2) 2053 { 2054 2055 MPASS(cel->blp[0] == NULL); 2056 MPASS(cel->blp[1] == NULL); 2057 2058 cache_sort_vnodes(&blp1, &blp2); 2059 2060 if (blp1 != NULL) { 2061 mtx_lock(blp1); 2062 cel->blp[0] = blp1; 2063 } 2064 mtx_lock(blp2); 2065 cel->blp[1] = blp2; 2066 } 2067 2068 static void 2069 cache_unlock_buckets_cel(struct celockstate *cel) 2070 { 2071 2072 if (cel->blp[0] != NULL) 2073 mtx_unlock(cel->blp[0]); 2074 mtx_unlock(cel->blp[1]); 2075 } 2076 2077 /* 2078 * Lock part of the cache affected by the insertion. 2079 * 2080 * This means vnodelocks for dvp, vp and the relevant bucketlock. 2081 * However, insertion can result in removal of an old entry. In this 2082 * case we have an additional vnode and bucketlock pair to lock. 2083 * 2084 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 2085 * preserving the locking order (smaller address first). 2086 */ 2087 static void 2088 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2089 uint32_t hash) 2090 { 2091 struct namecache *ncp; 2092 struct mtx *blps[2]; 2093 2094 blps[0] = HASH2BUCKETLOCK(hash); 2095 for (;;) { 2096 blps[1] = NULL; 2097 cache_lock_vnodes_cel(cel, dvp, vp); 2098 if (vp == NULL || vp->v_type != VDIR) 2099 break; 2100 ncp = vp->v_cache_dd; 2101 if (ncp == NULL) 2102 break; 2103 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2104 break; 2105 MPASS(ncp->nc_dvp == vp); 2106 blps[1] = NCP2BUCKETLOCK(ncp); 2107 if (ncp->nc_flag & NCF_NEGATIVE) 2108 break; 2109 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2110 break; 2111 /* 2112 * All vnodes got re-locked. Re-validate the state and if 2113 * nothing changed we are done. Otherwise restart. 
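 * The restart is needed because cache_lock_vnodes_cel_3() may have had
 * to drop every previously held vnode lock in order to take the third
 * one in the right order, at which point v_cache_dd, its bucket lock or
 * the vnode lock of the target may have changed under us.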
2114 */ 2115 if (ncp == vp->v_cache_dd && 2116 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2117 blps[1] == NCP2BUCKETLOCK(ncp) && 2118 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2119 break; 2120 cache_unlock_vnodes_cel(cel); 2121 cel->vlp[0] = NULL; 2122 cel->vlp[1] = NULL; 2123 cel->vlp[2] = NULL; 2124 } 2125 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2126 } 2127 2128 static void 2129 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2130 uint32_t hash) 2131 { 2132 struct namecache *ncp; 2133 struct mtx *blps[2]; 2134 2135 blps[0] = HASH2BUCKETLOCK(hash); 2136 for (;;) { 2137 blps[1] = NULL; 2138 cache_lock_vnodes_cel(cel, dvp, vp); 2139 ncp = dvp->v_cache_dd; 2140 if (ncp == NULL) 2141 break; 2142 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2143 break; 2144 MPASS(ncp->nc_dvp == dvp); 2145 blps[1] = NCP2BUCKETLOCK(ncp); 2146 if (ncp->nc_flag & NCF_NEGATIVE) 2147 break; 2148 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2149 break; 2150 if (ncp == dvp->v_cache_dd && 2151 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2152 blps[1] == NCP2BUCKETLOCK(ncp) && 2153 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2154 break; 2155 cache_unlock_vnodes_cel(cel); 2156 cel->vlp[0] = NULL; 2157 cel->vlp[1] = NULL; 2158 cel->vlp[2] = NULL; 2159 } 2160 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2161 } 2162 2163 static void 2164 cache_enter_unlock(struct celockstate *cel) 2165 { 2166 2167 cache_unlock_buckets_cel(cel); 2168 cache_unlock_vnodes_cel(cel); 2169 } 2170 2171 static void __noinline 2172 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 2173 struct componentname *cnp) 2174 { 2175 struct celockstate cel; 2176 struct namecache *ncp; 2177 uint32_t hash; 2178 int len; 2179 2180 if (dvp->v_cache_dd == NULL) 2181 return; 2182 len = cnp->cn_namelen; 2183 cache_celockstate_init(&cel); 2184 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2185 cache_enter_lock_dd(&cel, dvp, vp, hash); 2186 vn_seqc_write_begin(dvp); 2187 ncp = dvp->v_cache_dd; 2188 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 2189 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 2190 cache_zap_locked(ncp); 2191 } else { 2192 ncp = NULL; 2193 } 2194 dvp->v_cache_dd = NULL; 2195 vn_seqc_write_end(dvp); 2196 cache_enter_unlock(&cel); 2197 if (ncp != NULL) 2198 cache_free(ncp); 2199 } 2200 2201 /* 2202 * Add an entry to the cache. 
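 *
 * dvp is the directory the name was looked up in, vp is the vnode the name
 * resolves to (or NULL to record a negative entry) and cnp carries the name
 * itself.  tsp and dtsp optionally supply timestamps for filesystems which
 * expire their entries.
 *
 * A minimal usage sketch from a hypothetical filesystem lookup routine
 * (the cache_enter() wrapper in sys/vnode.h just passes NULL timestamps):
 *
 *	if ((cnp->cn_flags & MAKEENTRY) != 0)
 *		cache_enter(dvp, *vpp, cnp);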
2203 */ 2204 void 2205 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2206 struct timespec *tsp, struct timespec *dtsp) 2207 { 2208 struct celockstate cel; 2209 struct namecache *ncp, *n2, *ndd; 2210 struct namecache_ts *ncp_ts; 2211 struct nchashhead *ncpp; 2212 uint32_t hash; 2213 int flag; 2214 int len; 2215 2216 VNPASS(dvp != vp, dvp); 2217 VNPASS(!VN_IS_DOOMED(dvp), dvp); 2218 VNPASS(dvp->v_type != VNON, dvp); 2219 if (vp != NULL) { 2220 VNPASS(!VN_IS_DOOMED(vp), vp); 2221 VNPASS(vp->v_type != VNON, vp); 2222 } 2223 2224 #ifdef DEBUG_CACHE 2225 if (__predict_false(!doingcache)) 2226 return; 2227 #endif 2228 2229 flag = 0; 2230 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2231 if (cnp->cn_namelen == 1) 2232 return; 2233 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 2234 cache_enter_dotdot_prep(dvp, vp, cnp); 2235 flag = NCF_ISDOTDOT; 2236 } 2237 } 2238 2239 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 2240 if (ncp == NULL) 2241 return; 2242 2243 cache_celockstate_init(&cel); 2244 ndd = NULL; 2245 ncp_ts = NULL; 2246 2247 /* 2248 * Calculate the hash key and setup as much of the new 2249 * namecache entry as possible before acquiring the lock. 2250 */ 2251 ncp->nc_flag = flag | NCF_WIP; 2252 ncp->nc_vp = vp; 2253 if (vp == NULL) 2254 cache_neg_init(ncp); 2255 ncp->nc_dvp = dvp; 2256 if (tsp != NULL) { 2257 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 2258 ncp_ts->nc_time = *tsp; 2259 ncp_ts->nc_ticks = ticks; 2260 ncp_ts->nc_nc.nc_flag |= NCF_TS; 2261 if (dtsp != NULL) { 2262 ncp_ts->nc_dotdottime = *dtsp; 2263 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 2264 } 2265 } 2266 len = ncp->nc_nlen = cnp->cn_namelen; 2267 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2268 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 2269 ncp->nc_name[len] = '\0'; 2270 cache_enter_lock(&cel, dvp, vp, hash); 2271 2272 /* 2273 * See if this vnode or negative entry is already in the cache 2274 * with this name. This can happen with concurrent lookups of 2275 * the same path name. 2276 */ 2277 ncpp = NCHHASH(hash); 2278 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 2279 if (n2->nc_dvp == dvp && 2280 n2->nc_nlen == cnp->cn_namelen && 2281 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 2282 MPASS(cache_ncp_canuse(n2)); 2283 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 2284 KASSERT(vp == NULL, 2285 ("%s: found entry pointing to a different vnode (%p != %p)", 2286 __func__, NULL, vp)); 2287 else 2288 KASSERT(n2->nc_vp == vp, 2289 ("%s: found entry pointing to a different vnode (%p != %p)", 2290 __func__, n2->nc_vp, vp)); 2291 /* 2292 * Entries are supposed to be immutable unless in the 2293 * process of getting destroyed. Accommodating for 2294 * changing timestamps is possible but not worth it. 2295 * This should be harmless in terms of correctness, in 2296 * the worst case resulting in an earlier expiration. 2297 * Alternatively, the found entry can be replaced 2298 * altogether. 
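 * The assertion below merely checks that the pre-existing entry was
 * created with the same timestamp policy (NCF_TS/NCF_DTS) as the entry
 * we are trying to insert.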
2299 */ 2300 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 2301 #if 0 2302 if (tsp != NULL) { 2303 KASSERT((n2->nc_flag & NCF_TS) != 0, 2304 ("no NCF_TS")); 2305 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 2306 n2_ts->nc_time = ncp_ts->nc_time; 2307 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2308 if (dtsp != NULL) { 2309 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2310 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2311 } 2312 } 2313 #endif 2314 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, 2315 vp); 2316 goto out_unlock_free; 2317 } 2318 } 2319 2320 if (flag == NCF_ISDOTDOT) { 2321 /* 2322 * See if we are trying to add .. entry, but some other lookup 2323 * has populated v_cache_dd pointer already. 2324 */ 2325 if (dvp->v_cache_dd != NULL) 2326 goto out_unlock_free; 2327 KASSERT(vp == NULL || vp->v_type == VDIR, 2328 ("wrong vnode type %p", vp)); 2329 vn_seqc_write_begin(dvp); 2330 dvp->v_cache_dd = ncp; 2331 vn_seqc_write_end(dvp); 2332 } 2333 2334 if (vp != NULL) { 2335 if (flag != NCF_ISDOTDOT) { 2336 /* 2337 * For this case, the cache entry maps both the 2338 * directory name in it and the name ".." for the 2339 * directory's parent. 2340 */ 2341 vn_seqc_write_begin(vp); 2342 if ((ndd = vp->v_cache_dd) != NULL) { 2343 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2344 cache_zap_locked(ndd); 2345 else 2346 ndd = NULL; 2347 } 2348 vp->v_cache_dd = ncp; 2349 vn_seqc_write_end(vp); 2350 } else if (vp->v_type != VDIR) { 2351 if (vp->v_cache_dd != NULL) { 2352 vn_seqc_write_begin(vp); 2353 vp->v_cache_dd = NULL; 2354 vn_seqc_write_end(vp); 2355 } 2356 } 2357 } 2358 2359 if (flag != NCF_ISDOTDOT) { 2360 if (LIST_EMPTY(&dvp->v_cache_src)) { 2361 cache_hold_vnode(dvp); 2362 } 2363 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2364 } 2365 2366 /* 2367 * If the entry is "negative", we place it into the 2368 * "negative" cache queue, otherwise, we place it into the 2369 * destination vnode's cache entries queue. 2370 */ 2371 if (vp != NULL) { 2372 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2373 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2374 vp); 2375 } else { 2376 if (cnp->cn_flags & ISWHITEOUT) 2377 ncp->nc_flag |= NCF_WHITE; 2378 cache_neg_insert(ncp); 2379 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2380 ncp->nc_name); 2381 } 2382 2383 /* 2384 * Insert the new namecache entry into the appropriate chain 2385 * within the cache entries table. 2386 */ 2387 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2388 2389 atomic_thread_fence_rel(); 2390 /* 2391 * Mark the entry as fully constructed. 2392 * It is immutable past this point until its removal. 
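 * The release fence above ensures the fully initialized entry is
 * visible to lockless (SMR) readers before NCF_WIP is cleared; see
 * cache_ncp_canuse().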
2393 */ 2394 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2395 2396 cache_enter_unlock(&cel); 2397 if (ndd != NULL) 2398 cache_free(ndd); 2399 return; 2400 out_unlock_free: 2401 cache_enter_unlock(&cel); 2402 cache_free(ncp); 2403 return; 2404 } 2405 2406 static u_int 2407 cache_roundup_2(u_int val) 2408 { 2409 u_int res; 2410 2411 for (res = 1; res <= val; res <<= 1) 2412 continue; 2413 2414 return (res); 2415 } 2416 2417 static struct nchashhead * 2418 nchinittbl(u_long elements, u_long *hashmask) 2419 { 2420 struct nchashhead *hashtbl; 2421 u_long hashsize, i; 2422 2423 hashsize = cache_roundup_2(elements) / 2; 2424 2425 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2426 for (i = 0; i < hashsize; i++) 2427 CK_SLIST_INIT(&hashtbl[i]); 2428 *hashmask = hashsize - 1; 2429 return (hashtbl); 2430 } 2431 2432 static void 2433 ncfreetbl(struct nchashhead *hashtbl) 2434 { 2435 2436 free(hashtbl, M_VFSCACHE); 2437 } 2438 2439 /* 2440 * Name cache initialization, from vfs_init() when we are booting 2441 */ 2442 static void 2443 nchinit(void *dummy __unused) 2444 { 2445 u_int i; 2446 2447 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2448 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2449 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2450 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2451 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2452 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2453 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2454 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2455 2456 VFS_SMR_ZONE_SET(cache_zone_small); 2457 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2458 VFS_SMR_ZONE_SET(cache_zone_large); 2459 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2460 2461 ncsize = desiredvnodes * ncsizefactor; 2462 cache_recalc_neg_min(ncnegminpct); 2463 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2464 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2465 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2466 ncbuckethash = 7; 2467 if (ncbuckethash > nchash) 2468 ncbuckethash = nchash; 2469 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2470 M_WAITOK | M_ZERO); 2471 for (i = 0; i < numbucketlocks; i++) 2472 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2473 ncvnodehash = ncbuckethash; 2474 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2475 M_WAITOK | M_ZERO); 2476 for (i = 0; i < numvnodelocks; i++) 2477 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2478 2479 for (i = 0; i < numneglists; i++) { 2480 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); 2481 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2482 TAILQ_INIT(&neglists[i].nl_list); 2483 TAILQ_INIT(&neglists[i].nl_hotlist); 2484 } 2485 } 2486 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2487 2488 void 2489 cache_vnode_init(struct vnode *vp) 2490 { 2491 2492 LIST_INIT(&vp->v_cache_src); 2493 TAILQ_INIT(&vp->v_cache_dst); 2494 vp->v_cache_dd = NULL; 2495 cache_prehash(vp); 2496 } 2497 2498 void 2499 cache_changesize(u_long newmaxvnodes) 2500 { 2501 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2502 u_long new_nchash, old_nchash; 2503 struct namecache *ncp; 2504 uint32_t hash; 2505 u_long newncsize; 2506 int i; 2507 2508 newncsize = newmaxvnodes * ncsizefactor; 2509 
newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2510 if (newmaxvnodes < numbucketlocks) 2511 newmaxvnodes = numbucketlocks; 2512 2513 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2514 /* If same hash table size, nothing to do */ 2515 if (nchash == new_nchash) { 2516 ncfreetbl(new_nchashtbl); 2517 return; 2518 } 2519 /* 2520 * Move everything from the old hash table to the new table. 2521 * None of the namecache entries in the table can be removed 2522 * because to do so, they have to be removed from the hash table. 2523 */ 2524 cache_lock_all_vnodes(); 2525 cache_lock_all_buckets(); 2526 old_nchashtbl = nchashtbl; 2527 old_nchash = nchash; 2528 nchashtbl = new_nchashtbl; 2529 nchash = new_nchash; 2530 for (i = 0; i <= old_nchash; i++) { 2531 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2532 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2533 ncp->nc_dvp); 2534 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2535 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2536 } 2537 } 2538 ncsize = newncsize; 2539 cache_recalc_neg_min(ncnegminpct); 2540 cache_unlock_all_buckets(); 2541 cache_unlock_all_vnodes(); 2542 ncfreetbl(old_nchashtbl); 2543 } 2544 2545 /* 2546 * Invalidate all entries from and to a particular vnode. 2547 */ 2548 static void 2549 cache_purge_impl(struct vnode *vp) 2550 { 2551 struct cache_freebatch batch; 2552 struct namecache *ncp; 2553 struct mtx *vlp, *vlp2; 2554 2555 TAILQ_INIT(&batch); 2556 vlp = VP2VNODELOCK(vp); 2557 vlp2 = NULL; 2558 mtx_lock(vlp); 2559 retry: 2560 while (!LIST_EMPTY(&vp->v_cache_src)) { 2561 ncp = LIST_FIRST(&vp->v_cache_src); 2562 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2563 goto retry; 2564 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2565 } 2566 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2567 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2568 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2569 goto retry; 2570 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2571 } 2572 ncp = vp->v_cache_dd; 2573 if (ncp != NULL) { 2574 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2575 ("lost dotdot link")); 2576 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2577 goto retry; 2578 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2579 } 2580 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2581 mtx_unlock(vlp); 2582 if (vlp2 != NULL) 2583 mtx_unlock(vlp2); 2584 cache_free_batch(&batch); 2585 } 2586 2587 /* 2588 * Opportunistic check to see if there is anything to do. 2589 */ 2590 static bool 2591 cache_has_entries(struct vnode *vp) 2592 { 2593 2594 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2595 vp->v_cache_dd == NULL) 2596 return (false); 2597 return (true); 2598 } 2599 2600 void 2601 cache_purge(struct vnode *vp) 2602 { 2603 2604 SDT_PROBE1(vfs, namecache, purge, done, vp); 2605 if (!cache_has_entries(vp)) 2606 return; 2607 cache_purge_impl(vp); 2608 } 2609 2610 /* 2611 * Only to be used by vgone. 2612 */ 2613 void 2614 cache_purge_vgone(struct vnode *vp) 2615 { 2616 struct mtx *vlp; 2617 2618 VNPASS(VN_IS_DOOMED(vp), vp); 2619 if (cache_has_entries(vp)) { 2620 cache_purge_impl(vp); 2621 return; 2622 } 2623 2624 /* 2625 * Serialize against a potential thread doing cache_purge. 2626 */ 2627 vlp = VP2VNODELOCK(vp); 2628 mtx_wait_unlocked(vlp); 2629 if (cache_has_entries(vp)) { 2630 cache_purge_impl(vp); 2631 return; 2632 } 2633 return; 2634 } 2635 2636 /* 2637 * Invalidate all negative entries for a particular directory vnode. 
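 *
 * Negative entries hang off the directory's v_cache_src list together
 * with the positive ones, so walk the list and only zap entries marked
 * NCF_NEGATIVE.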
2638 */ 2639 void 2640 cache_purge_negative(struct vnode *vp) 2641 { 2642 struct cache_freebatch batch; 2643 struct namecache *ncp, *nnp; 2644 struct mtx *vlp; 2645 2646 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2647 if (LIST_EMPTY(&vp->v_cache_src)) 2648 return; 2649 TAILQ_INIT(&batch); 2650 vlp = VP2VNODELOCK(vp); 2651 mtx_lock(vlp); 2652 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2653 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2654 continue; 2655 cache_zap_negative_locked_vnode_kl(ncp, vp); 2656 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2657 } 2658 mtx_unlock(vlp); 2659 cache_free_batch(&batch); 2660 } 2661 2662 void 2663 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2664 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2665 { 2666 2667 ASSERT_VOP_IN_SEQC(fdvp); 2668 ASSERT_VOP_IN_SEQC(fvp); 2669 ASSERT_VOP_IN_SEQC(tdvp); 2670 if (tvp != NULL) 2671 ASSERT_VOP_IN_SEQC(tvp); 2672 2673 cache_purge(fvp); 2674 if (tvp != NULL) { 2675 cache_purge(tvp); 2676 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2677 ("%s: lingering negative entry", __func__)); 2678 } else { 2679 cache_remove_cnp(tdvp, tcnp); 2680 } 2681 } 2682 2683 #ifdef INVARIANTS 2684 /* 2685 * Validate that if an entry exists it matches. 2686 */ 2687 void 2688 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) 2689 { 2690 struct namecache *ncp; 2691 struct mtx *blp; 2692 uint32_t hash; 2693 2694 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2695 if (CK_SLIST_EMPTY(NCHHASH(hash))) 2696 return; 2697 blp = HASH2BUCKETLOCK(hash); 2698 mtx_lock(blp); 2699 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 2700 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 2701 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) { 2702 if (ncp->nc_vp != vp) 2703 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n", 2704 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp, 2705 ncp->nc_vp); 2706 } 2707 } 2708 mtx_unlock(blp); 2709 } 2710 #endif 2711 2712 /* 2713 * Flush all entries referencing a particular filesystem. 2714 */ 2715 void 2716 cache_purgevfs(struct mount *mp) 2717 { 2718 struct vnode *vp, *mvp; 2719 2720 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2721 /* 2722 * Somewhat wasteful iteration over all vnodes. Would be better to 2723 * support filtering and avoid the interlock to begin with. 2724 */ 2725 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2726 if (!cache_has_entries(vp)) { 2727 VI_UNLOCK(vp); 2728 continue; 2729 } 2730 vholdl(vp); 2731 VI_UNLOCK(vp); 2732 cache_purge(vp); 2733 vdrop(vp); 2734 } 2735 } 2736 2737 /* 2738 * Perform canonical checks and cache lookup and pass on to filesystem 2739 * through the vop_cachedlookup only if needed. 
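 *
 * A minimal sketch of how a filesystem is expected to hook this up
 * (hypothetical "foofs"; UFS among others follows the same pattern):
 *
 *	struct vop_vector foofs_vnodeops = {
 *		.vop_default		= &default_vnodeops,
 *		.vop_lookup		= vfs_cache_lookup,
 *		.vop_cachedlookup	= foofs_lookup,
 *		...
 *	};
 *
 * With that in place the name cache is consulted first and
 * VOP_CACHEDLOOKUP() (here foofs_lookup()) only runs on a cache miss.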
2740 */ 2741 2742 int 2743 vfs_cache_lookup(struct vop_lookup_args *ap) 2744 { 2745 struct vnode *dvp; 2746 int error; 2747 struct vnode **vpp = ap->a_vpp; 2748 struct componentname *cnp = ap->a_cnp; 2749 int flags = cnp->cn_flags; 2750 2751 *vpp = NULL; 2752 dvp = ap->a_dvp; 2753 2754 if (dvp->v_type != VDIR) 2755 return (ENOTDIR); 2756 2757 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2758 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2759 return (EROFS); 2760 2761 error = vn_dir_check_exec(dvp, cnp); 2762 if (error != 0) 2763 return (error); 2764 2765 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2766 if (error == 0) 2767 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2768 if (error == -1) 2769 return (0); 2770 return (error); 2771 } 2772 2773 /* Implementation of the getcwd syscall. */ 2774 int 2775 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2776 { 2777 char *buf, *retbuf; 2778 size_t buflen; 2779 int error; 2780 2781 buflen = uap->buflen; 2782 if (__predict_false(buflen < 2)) 2783 return (EINVAL); 2784 if (buflen > MAXPATHLEN) 2785 buflen = MAXPATHLEN; 2786 2787 buf = uma_zalloc(namei_zone, M_WAITOK); 2788 error = vn_getcwd(buf, &retbuf, &buflen); 2789 if (error == 0) 2790 error = copyout(retbuf, uap->buf, buflen); 2791 uma_zfree(namei_zone, buf); 2792 return (error); 2793 } 2794 2795 int 2796 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2797 { 2798 struct pwd *pwd; 2799 int error; 2800 2801 vfs_smr_enter(); 2802 pwd = pwd_get_smr(); 2803 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2804 buflen, 0); 2805 VFS_SMR_ASSERT_NOT_ENTERED(); 2806 if (error < 0) { 2807 pwd = pwd_hold(curthread); 2808 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2809 retbuf, buflen); 2810 pwd_drop(pwd); 2811 } 2812 2813 #ifdef KTRACE 2814 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2815 ktrnamei(*retbuf); 2816 #endif 2817 return (error); 2818 } 2819 2820 static int 2821 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2822 size_t size, int flags, enum uio_seg pathseg) 2823 { 2824 struct nameidata nd; 2825 char *retbuf, *freebuf; 2826 int error; 2827 2828 if (flags != 0) 2829 return (EINVAL); 2830 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2831 pathseg, path, fd, &cap_fstat_rights, td); 2832 if ((error = namei(&nd)) != 0) 2833 return (error); 2834 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2835 if (error == 0) { 2836 error = copyout(retbuf, buf, size); 2837 free(freebuf, M_TEMP); 2838 } 2839 NDFREE(&nd, 0); 2840 return (error); 2841 } 2842 2843 int 2844 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2845 { 2846 2847 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2848 uap->flags, UIO_USERSPACE)); 2849 } 2850 2851 /* 2852 * Retrieve the full filesystem path that correspond to a vnode from the name 2853 * cache (if available) 2854 */ 2855 int 2856 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2857 { 2858 struct pwd *pwd; 2859 char *buf; 2860 size_t buflen; 2861 int error; 2862 2863 if (__predict_false(vp == NULL)) 2864 return (EINVAL); 2865 2866 buflen = MAXPATHLEN; 2867 buf = malloc(buflen, M_TEMP, M_WAITOK); 2868 vfs_smr_enter(); 2869 pwd = pwd_get_smr(); 2870 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); 2871 VFS_SMR_ASSERT_NOT_ENTERED(); 2872 if (error < 0) { 2873 pwd = pwd_hold(curthread); 2874 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, 
&buflen); 2875 pwd_drop(pwd); 2876 } 2877 if (error == 0) 2878 *freebuf = buf; 2879 else 2880 free(buf, M_TEMP); 2881 return (error); 2882 } 2883 2884 /* 2885 * This function is similar to vn_fullpath, but it attempts to lookup the 2886 * pathname relative to the global root mount point. This is required for the 2887 * auditing sub-system, as audited pathnames must be absolute, relative to the 2888 * global root mount point. 2889 */ 2890 int 2891 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2892 { 2893 char *buf; 2894 size_t buflen; 2895 int error; 2896 2897 if (__predict_false(vp == NULL)) 2898 return (EINVAL); 2899 buflen = MAXPATHLEN; 2900 buf = malloc(buflen, M_TEMP, M_WAITOK); 2901 vfs_smr_enter(); 2902 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); 2903 VFS_SMR_ASSERT_NOT_ENTERED(); 2904 if (error < 0) { 2905 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2906 } 2907 if (error == 0) 2908 *freebuf = buf; 2909 else 2910 free(buf, M_TEMP); 2911 return (error); 2912 } 2913 2914 static struct namecache * 2915 vn_dd_from_dst(struct vnode *vp) 2916 { 2917 struct namecache *ncp; 2918 2919 cache_assert_vnode_locked(vp); 2920 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2921 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2922 return (ncp); 2923 } 2924 return (NULL); 2925 } 2926 2927 int 2928 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) 2929 { 2930 struct vnode *dvp; 2931 struct namecache *ncp; 2932 struct mtx *vlp; 2933 int error; 2934 2935 vlp = VP2VNODELOCK(*vp); 2936 mtx_lock(vlp); 2937 ncp = (*vp)->v_cache_dd; 2938 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 2939 KASSERT(ncp == vn_dd_from_dst(*vp), 2940 ("%s: mismatch for dd entry (%p != %p)", __func__, 2941 ncp, vn_dd_from_dst(*vp))); 2942 } else { 2943 ncp = vn_dd_from_dst(*vp); 2944 } 2945 if (ncp != NULL) { 2946 if (*buflen < ncp->nc_nlen) { 2947 mtx_unlock(vlp); 2948 vrele(*vp); 2949 counter_u64_add(numfullpathfail4, 1); 2950 error = ENOMEM; 2951 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2952 vp, NULL); 2953 return (error); 2954 } 2955 *buflen -= ncp->nc_nlen; 2956 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2957 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2958 ncp->nc_name, vp); 2959 dvp = *vp; 2960 *vp = ncp->nc_dvp; 2961 vref(*vp); 2962 mtx_unlock(vlp); 2963 vrele(dvp); 2964 return (0); 2965 } 2966 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2967 2968 mtx_unlock(vlp); 2969 vn_lock(*vp, LK_SHARED | LK_RETRY); 2970 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); 2971 vput(*vp); 2972 if (error) { 2973 counter_u64_add(numfullpathfail2, 1); 2974 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2975 return (error); 2976 } 2977 2978 *vp = dvp; 2979 if (VN_IS_DOOMED(dvp)) { 2980 /* forced unmount */ 2981 vrele(dvp); 2982 error = ENOENT; 2983 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2984 return (error); 2985 } 2986 /* 2987 * *vp has its use count incremented still. 2988 */ 2989 2990 return (0); 2991 } 2992 2993 /* 2994 * Resolve a directory to a pathname. 2995 * 2996 * The name of the directory can always be found in the namecache or fetched 2997 * from the filesystem. There is also guaranteed to be only one parent, meaning 2998 * we can just follow vnodes up until we find the root. 2999 * 3000 * The vnode must be referenced. 
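 * The reference is always consumed: vp is vrele'd (or vput'd) on every
 * path out of this function.  The path is built backwards, from the end
 * of the buffer towards its beginning, and on success *retbuf points
 * into buf at the start of the resulting string.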
3001 */ 3002 static int 3003 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3004 size_t *len, size_t addend) 3005 { 3006 #ifdef KDTRACE_HOOKS 3007 struct vnode *startvp = vp; 3008 #endif 3009 struct vnode *vp1; 3010 size_t buflen; 3011 int error; 3012 bool slash_prefixed; 3013 3014 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 3015 VNPASS(vp->v_usecount > 0, vp); 3016 3017 buflen = *len; 3018 3019 slash_prefixed = true; 3020 if (addend == 0) { 3021 MPASS(*len >= 2); 3022 buflen--; 3023 buf[buflen] = '\0'; 3024 slash_prefixed = false; 3025 } 3026 3027 error = 0; 3028 3029 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 3030 counter_u64_add(numfullpathcalls, 1); 3031 while (vp != rdir && vp != rootvnode) { 3032 /* 3033 * The vp vnode must be already fully constructed, 3034 * since it is either found in namecache or obtained 3035 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 3036 * without obtaining the vnode lock. 3037 */ 3038 if ((vp->v_vflag & VV_ROOT) != 0) { 3039 vn_lock(vp, LK_RETRY | LK_SHARED); 3040 3041 /* 3042 * With the vnode locked, check for races with 3043 * unmount, forced or not. Note that we 3044 * already verified that vp is not equal to 3045 * the root vnode, which means that 3046 * mnt_vnodecovered can be NULL only for the 3047 * case of unmount. 3048 */ 3049 if (VN_IS_DOOMED(vp) || 3050 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 3051 vp1->v_mountedhere != vp->v_mount) { 3052 vput(vp); 3053 error = ENOENT; 3054 SDT_PROBE3(vfs, namecache, fullpath, return, 3055 error, vp, NULL); 3056 break; 3057 } 3058 3059 vref(vp1); 3060 vput(vp); 3061 vp = vp1; 3062 continue; 3063 } 3064 if (vp->v_type != VDIR) { 3065 vrele(vp); 3066 counter_u64_add(numfullpathfail1, 1); 3067 error = ENOTDIR; 3068 SDT_PROBE3(vfs, namecache, fullpath, return, 3069 error, vp, NULL); 3070 break; 3071 } 3072 error = vn_vptocnp(&vp, buf, &buflen); 3073 if (error) 3074 break; 3075 if (buflen == 0) { 3076 vrele(vp); 3077 error = ENOMEM; 3078 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3079 startvp, NULL); 3080 break; 3081 } 3082 buf[--buflen] = '/'; 3083 slash_prefixed = true; 3084 } 3085 if (error) 3086 return (error); 3087 if (!slash_prefixed) { 3088 if (buflen == 0) { 3089 vrele(vp); 3090 counter_u64_add(numfullpathfail4, 1); 3091 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 3092 startvp, NULL); 3093 return (ENOMEM); 3094 } 3095 buf[--buflen] = '/'; 3096 } 3097 counter_u64_add(numfullpathfound, 1); 3098 vrele(vp); 3099 3100 *retbuf = buf + buflen; 3101 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 3102 *len -= buflen; 3103 *len += addend; 3104 return (0); 3105 } 3106 3107 /* 3108 * Resolve an arbitrary vnode to a pathname. 
3109 * 3110 * Note 2 caveats: 3111 * - hardlinks are not tracked, thus if the vnode is not a directory this can 3112 * resolve to a different path than the one used to find it 3113 * - namecache is not mandatory, meaning names are not guaranteed to be added 3114 * (in which case resolving fails) 3115 */ 3116 static void __inline 3117 cache_rev_failed_impl(int *reason, int line) 3118 { 3119 3120 *reason = line; 3121 } 3122 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 3123 3124 static int 3125 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 3126 char **retbuf, size_t *buflen, size_t addend) 3127 { 3128 #ifdef KDTRACE_HOOKS 3129 struct vnode *startvp = vp; 3130 #endif 3131 struct vnode *tvp; 3132 struct mount *mp; 3133 struct namecache *ncp; 3134 size_t orig_buflen; 3135 int reason; 3136 int error; 3137 #ifdef KDTRACE_HOOKS 3138 int i; 3139 #endif 3140 seqc_t vp_seqc, tvp_seqc; 3141 u_char nc_flag; 3142 3143 VFS_SMR_ASSERT_ENTERED(); 3144 3145 if (!cache_fast_revlookup) { 3146 vfs_smr_exit(); 3147 return (-1); 3148 } 3149 3150 orig_buflen = *buflen; 3151 3152 if (addend == 0) { 3153 MPASS(*buflen >= 2); 3154 *buflen -= 1; 3155 buf[*buflen] = '\0'; 3156 } 3157 3158 if (vp == rdir || vp == rootvnode) { 3159 if (addend == 0) { 3160 *buflen -= 1; 3161 buf[*buflen] = '/'; 3162 } 3163 goto out_ok; 3164 } 3165 3166 #ifdef KDTRACE_HOOKS 3167 i = 0; 3168 #endif 3169 error = -1; 3170 ncp = NULL; /* for sdt probe down below */ 3171 vp_seqc = vn_seqc_read_any(vp); 3172 if (seqc_in_modify(vp_seqc)) { 3173 cache_rev_failed(&reason); 3174 goto out_abort; 3175 } 3176 3177 for (;;) { 3178 #ifdef KDTRACE_HOOKS 3179 i++; 3180 #endif 3181 if ((vp->v_vflag & VV_ROOT) != 0) { 3182 mp = atomic_load_ptr(&vp->v_mount); 3183 if (mp == NULL) { 3184 cache_rev_failed(&reason); 3185 goto out_abort; 3186 } 3187 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 3188 tvp_seqc = vn_seqc_read_any(tvp); 3189 if (seqc_in_modify(tvp_seqc)) { 3190 cache_rev_failed(&reason); 3191 goto out_abort; 3192 } 3193 if (!vn_seqc_consistent(vp, vp_seqc)) { 3194 cache_rev_failed(&reason); 3195 goto out_abort; 3196 } 3197 vp = tvp; 3198 vp_seqc = tvp_seqc; 3199 continue; 3200 } 3201 ncp = atomic_load_ptr(&vp->v_cache_dd); 3202 if (ncp == NULL) { 3203 cache_rev_failed(&reason); 3204 goto out_abort; 3205 } 3206 nc_flag = atomic_load_char(&ncp->nc_flag); 3207 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3208 cache_rev_failed(&reason); 3209 goto out_abort; 3210 } 3211 if (!cache_ncp_canuse(ncp)) { 3212 cache_rev_failed(&reason); 3213 goto out_abort; 3214 } 3215 if (ncp->nc_nlen >= *buflen) { 3216 cache_rev_failed(&reason); 3217 error = ENOMEM; 3218 goto out_abort; 3219 } 3220 *buflen -= ncp->nc_nlen; 3221 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3222 *buflen -= 1; 3223 buf[*buflen] = '/'; 3224 tvp = ncp->nc_dvp; 3225 tvp_seqc = vn_seqc_read_any(tvp); 3226 if (seqc_in_modify(tvp_seqc)) { 3227 cache_rev_failed(&reason); 3228 goto out_abort; 3229 } 3230 if (!vn_seqc_consistent(vp, vp_seqc)) { 3231 cache_rev_failed(&reason); 3232 goto out_abort; 3233 } 3234 vp = tvp; 3235 vp_seqc = tvp_seqc; 3236 if (vp == rdir || vp == rootvnode) 3237 break; 3238 } 3239 out_ok: 3240 vfs_smr_exit(); 3241 *retbuf = buf + *buflen; 3242 *buflen = orig_buflen - *buflen + addend; 3243 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 3244 return (0); 3245 3246 out_abort: 3247 *buflen = orig_buflen; 3248 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 3249 vfs_smr_exit(); 3250 return (error); 
3251 } 3252 3253 static int 3254 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3255 size_t *buflen) 3256 { 3257 size_t orig_buflen, addend; 3258 int error; 3259 3260 if (*buflen < 2) 3261 return (EINVAL); 3262 3263 orig_buflen = *buflen; 3264 3265 vref(vp); 3266 addend = 0; 3267 if (vp->v_type != VDIR) { 3268 *buflen -= 1; 3269 buf[*buflen] = '\0'; 3270 error = vn_vptocnp(&vp, buf, buflen); 3271 if (error) 3272 return (error); 3273 if (*buflen == 0) { 3274 vrele(vp); 3275 return (ENOMEM); 3276 } 3277 *buflen -= 1; 3278 buf[*buflen] = '/'; 3279 addend = orig_buflen - *buflen; 3280 } 3281 3282 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); 3283 } 3284 3285 /* 3286 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 3287 * 3288 * Since the namecache does not track hardlinks, the caller is expected to first 3289 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 3290 * 3291 * Then we have 2 cases: 3292 * - if the found vnode is a directory, the path can be constructed just by 3293 * following names up the chain 3294 * - otherwise we populate the buffer with the saved name and start resolving 3295 * from the parent 3296 */ 3297 static int 3298 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 3299 size_t *buflen) 3300 { 3301 char *buf, *tmpbuf; 3302 struct pwd *pwd; 3303 struct componentname *cnp; 3304 struct vnode *vp; 3305 size_t addend; 3306 int error; 3307 enum vtype type; 3308 3309 if (*buflen < 2) 3310 return (EINVAL); 3311 if (*buflen > MAXPATHLEN) 3312 *buflen = MAXPATHLEN; 3313 3314 buf = malloc(*buflen, M_TEMP, M_WAITOK); 3315 3316 addend = 0; 3317 vp = ndp->ni_vp; 3318 /* 3319 * Check for VBAD to work around the vp_crossmp bug in lookup(). 3320 * 3321 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 3322 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 3323 * If the type is VDIR (like in this very case) we can skip looking 3324 * at ni_dvp in the first place. However, since vnodes get passed here 3325 * unlocked the target may transition to doomed state (type == VBAD) 3326 * before we get to evaluate the condition. If this happens, we will 3327 * populate part of the buffer and descend to vn_fullpath_dir with 3328 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 3329 * 3330 * This should be atomic_load(&vp->v_type) but it is illegal to take 3331 * an address of a bit field, even if said field is sized to char. 3332 * Work around the problem by reading the value into a full-sized enum 3333 * and then re-reading it with atomic_load which will still prevent 3334 * the compiler from re-reading down the road. 
3335 */ 3336 type = vp->v_type; 3337 type = atomic_load_int(&type); 3338 if (type == VBAD) { 3339 error = ENOENT; 3340 goto out_bad; 3341 } 3342 if (type != VDIR) { 3343 cnp = &ndp->ni_cnd; 3344 addend = cnp->cn_namelen + 2; 3345 if (*buflen < addend) { 3346 error = ENOMEM; 3347 goto out_bad; 3348 } 3349 *buflen -= addend; 3350 tmpbuf = buf + *buflen; 3351 tmpbuf[0] = '/'; 3352 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 3353 tmpbuf[addend - 1] = '\0'; 3354 vp = ndp->ni_dvp; 3355 } 3356 3357 vfs_smr_enter(); 3358 pwd = pwd_get_smr(); 3359 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3360 addend); 3361 VFS_SMR_ASSERT_NOT_ENTERED(); 3362 if (error < 0) { 3363 pwd = pwd_hold(curthread); 3364 vref(vp); 3365 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3366 addend); 3367 pwd_drop(pwd); 3368 if (error != 0) 3369 goto out_bad; 3370 } 3371 3372 *freebuf = buf; 3373 3374 return (0); 3375 out_bad: 3376 free(buf, M_TEMP); 3377 return (error); 3378 } 3379 3380 struct vnode * 3381 vn_dir_dd_ino(struct vnode *vp) 3382 { 3383 struct namecache *ncp; 3384 struct vnode *ddvp; 3385 struct mtx *vlp; 3386 enum vgetstate vs; 3387 3388 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3389 vlp = VP2VNODELOCK(vp); 3390 mtx_lock(vlp); 3391 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3392 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3393 continue; 3394 ddvp = ncp->nc_dvp; 3395 vs = vget_prep(ddvp); 3396 mtx_unlock(vlp); 3397 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3398 return (NULL); 3399 return (ddvp); 3400 } 3401 mtx_unlock(vlp); 3402 return (NULL); 3403 } 3404 3405 int 3406 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3407 { 3408 struct namecache *ncp; 3409 struct mtx *vlp; 3410 int l; 3411 3412 vlp = VP2VNODELOCK(vp); 3413 mtx_lock(vlp); 3414 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3415 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3416 break; 3417 if (ncp == NULL) { 3418 mtx_unlock(vlp); 3419 return (ENOENT); 3420 } 3421 l = min(ncp->nc_nlen, buflen - 1); 3422 memcpy(buf, ncp->nc_name, l); 3423 mtx_unlock(vlp); 3424 buf[l] = '\0'; 3425 return (0); 3426 } 3427 3428 /* 3429 * This function updates path string to vnode's full global path 3430 * and checks the size of the new path string against the pathlen argument. 3431 * 3432 * Requires a locked, referenced vnode. 3433 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3434 * 3435 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3436 * because it falls back to the ".." lookup if the namecache lookup fails. 3437 */ 3438 int 3439 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3440 u_int pathlen) 3441 { 3442 struct nameidata nd; 3443 struct vnode *vp1; 3444 char *rpath, *fbuf; 3445 int error; 3446 3447 ASSERT_VOP_ELOCKED(vp, __func__); 3448 3449 /* Construct global filesystem path from vp. */ 3450 VOP_UNLOCK(vp); 3451 error = vn_fullpath_global(vp, &rpath, &fbuf); 3452 3453 if (error != 0) { 3454 vrele(vp); 3455 return (error); 3456 } 3457 3458 if (strlen(rpath) >= pathlen) { 3459 vrele(vp); 3460 error = ENAMETOOLONG; 3461 goto out; 3462 } 3463 3464 /* 3465 * Re-lookup the vnode by path to detect a possible rename. 3466 * As a side effect, the vnode is relocked. 3467 * If vnode was renamed, return ENOENT. 
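 * On success the global path is copied back into the caller-supplied
 * path buffer (hence the strlen() check against pathlen above) and the
 * re-looked-up vnode stays locked courtesy of LOCKLEAF.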
3468 */ 3469 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3470 UIO_SYSSPACE, path, td); 3471 error = namei(&nd); 3472 if (error != 0) { 3473 vrele(vp); 3474 goto out; 3475 } 3476 NDFREE(&nd, NDF_ONLY_PNBUF); 3477 vp1 = nd.ni_vp; 3478 vrele(vp); 3479 if (vp1 == vp) 3480 strcpy(path, rpath); 3481 else { 3482 vput(vp1); 3483 error = ENOENT; 3484 } 3485 3486 out: 3487 free(fbuf, M_TEMP); 3488 return (error); 3489 } 3490 3491 #ifdef DDB 3492 static void 3493 db_print_vpath(struct vnode *vp) 3494 { 3495 3496 while (vp != NULL) { 3497 db_printf("%p: ", vp); 3498 if (vp == rootvnode) { 3499 db_printf("/"); 3500 vp = NULL; 3501 } else { 3502 if (vp->v_vflag & VV_ROOT) { 3503 db_printf("<mount point>"); 3504 vp = vp->v_mount->mnt_vnodecovered; 3505 } else { 3506 struct namecache *ncp; 3507 char *ncn; 3508 int i; 3509 3510 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3511 if (ncp != NULL) { 3512 ncn = ncp->nc_name; 3513 for (i = 0; i < ncp->nc_nlen; i++) 3514 db_printf("%c", *ncn++); 3515 vp = ncp->nc_dvp; 3516 } else { 3517 vp = NULL; 3518 } 3519 } 3520 } 3521 db_printf("\n"); 3522 } 3523 3524 return; 3525 } 3526 3527 DB_SHOW_COMMAND(vpath, db_show_vpath) 3528 { 3529 struct vnode *vp; 3530 3531 if (!have_addr) { 3532 db_printf("usage: show vpath <struct vnode *>\n"); 3533 return; 3534 } 3535 3536 vp = (struct vnode *)addr; 3537 db_print_vpath(vp); 3538 } 3539 3540 #endif 3541 3542 static bool __read_frequently cache_fast_lookup = true; 3543 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3544 &cache_fast_lookup, 0, ""); 3545 3546 #define CACHE_FPL_FAILED -2020 3547 3548 static void 3549 cache_fpl_cleanup_cnp(struct componentname *cnp) 3550 { 3551 3552 uma_zfree(namei_zone, cnp->cn_pnbuf); 3553 #ifdef DIAGNOSTIC 3554 cnp->cn_pnbuf = NULL; 3555 cnp->cn_nameptr = NULL; 3556 #endif 3557 } 3558 3559 static void 3560 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3561 { 3562 struct componentname *cnp; 3563 3564 cnp = &ndp->ni_cnd; 3565 while (*(cnp->cn_nameptr) == '/') { 3566 cnp->cn_nameptr++; 3567 ndp->ni_pathlen--; 3568 } 3569 3570 *dpp = ndp->ni_rootdir; 3571 } 3572 3573 /* 3574 * Components of nameidata (or objects it can point to) which may 3575 * need restoring in case fast path lookup fails. 
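 * Only the componentname pointers, its flags and the remaining path
 * length are saved; the rest is expected to either be left untouched by
 * the fast path or to be recomputed by the regular lookup when it takes
 * over.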
3576 */ 3577 struct nameidata_saved { 3578 long cn_namelen; 3579 char *cn_nameptr; 3580 size_t ni_pathlen; 3581 int cn_flags; 3582 }; 3583 3584 struct cache_fpl { 3585 struct nameidata *ndp; 3586 struct componentname *cnp; 3587 struct pwd *pwd; 3588 struct vnode *dvp; 3589 struct vnode *tvp; 3590 seqc_t dvp_seqc; 3591 seqc_t tvp_seqc; 3592 struct nameidata_saved snd; 3593 int line; 3594 enum cache_fpl_status status:8; 3595 bool in_smr; 3596 bool fsearch; 3597 }; 3598 3599 static void 3600 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3601 { 3602 3603 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3604 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3605 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3606 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3607 } 3608 3609 static void 3610 cache_fpl_restore_partial(struct cache_fpl *fpl, struct nameidata_saved *snd) 3611 { 3612 3613 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3614 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3615 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3616 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3617 } 3618 3619 static void 3620 cache_fpl_restore_abort(struct cache_fpl *fpl, struct nameidata_saved *snd) 3621 { 3622 3623 cache_fpl_restore_partial(fpl, snd); 3624 /* 3625 * It is 0 on entry by API contract. 3626 */ 3627 fpl->ndp->ni_resflags = 0; 3628 } 3629 3630 #ifdef INVARIANTS 3631 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3632 struct cache_fpl *_fpl = (fpl); \ 3633 MPASS(_fpl->in_smr == true); \ 3634 VFS_SMR_ASSERT_ENTERED(); \ 3635 }) 3636 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3637 struct cache_fpl *_fpl = (fpl); \ 3638 MPASS(_fpl->in_smr == false); \ 3639 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3640 }) 3641 #else 3642 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3643 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3644 #endif 3645 3646 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3647 struct cache_fpl *_fpl = (fpl); \ 3648 vfs_smr_enter(); \ 3649 _fpl->in_smr = true; \ 3650 }) 3651 3652 #define cache_fpl_smr_enter(fpl) ({ \ 3653 struct cache_fpl *_fpl = (fpl); \ 3654 MPASS(_fpl->in_smr == false); \ 3655 vfs_smr_enter(); \ 3656 _fpl->in_smr = true; \ 3657 }) 3658 3659 #define cache_fpl_smr_exit(fpl) ({ \ 3660 struct cache_fpl *_fpl = (fpl); \ 3661 MPASS(_fpl->in_smr == true); \ 3662 vfs_smr_exit(); \ 3663 _fpl->in_smr = false; \ 3664 }) 3665 3666 static int 3667 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3668 { 3669 3670 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3671 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3672 ("%s: converting to abort from %d at %d, set at %d\n", 3673 __func__, fpl->status, line, fpl->line)); 3674 } 3675 fpl->status = CACHE_FPL_STATUS_ABORTED; 3676 fpl->line = line; 3677 return (CACHE_FPL_FAILED); 3678 } 3679 3680 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3681 3682 static int 3683 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3684 { 3685 3686 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3687 ("%s: setting to partial at %d, but already set to %d at %d\n", 3688 __func__, line, fpl->status, fpl->line)); 3689 cache_fpl_smr_assert_entered(fpl); 3690 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3691 fpl->line = line; 3692 return (CACHE_FPL_FAILED); 3693 } 3694 3695 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3696 3697 static int 3698 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3699 { 3700 3701 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3702 
("%s: setting to handled at %d, but already set to %d at %d\n", 3703 __func__, line, fpl->status, fpl->line)); 3704 cache_fpl_smr_assert_not_entered(fpl); 3705 MPASS(error != CACHE_FPL_FAILED); 3706 fpl->status = CACHE_FPL_STATUS_HANDLED; 3707 fpl->line = line; 3708 return (error); 3709 } 3710 3711 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3712 3713 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3714 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ 3715 FOLLOW | LOCKSHARED | SAVENAME | SAVESTART | WILLBEDIR | ISOPEN | \ 3716 NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3717 3718 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3719 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3720 3721 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3722 "supported and internal flags overlap"); 3723 3724 static bool 3725 cache_fpl_islastcn(struct nameidata *ndp) 3726 { 3727 3728 return (*ndp->ni_next == 0); 3729 } 3730 3731 static bool 3732 cache_fpl_isdotdot(struct componentname *cnp) 3733 { 3734 3735 if (cnp->cn_namelen == 2 && 3736 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3737 return (true); 3738 return (false); 3739 } 3740 3741 static bool 3742 cache_can_fplookup(struct cache_fpl *fpl) 3743 { 3744 struct nameidata *ndp; 3745 struct componentname *cnp; 3746 struct thread *td; 3747 3748 ndp = fpl->ndp; 3749 cnp = fpl->cnp; 3750 td = cnp->cn_thread; 3751 3752 if (!cache_fast_lookup) { 3753 cache_fpl_aborted(fpl); 3754 return (false); 3755 } 3756 #ifdef MAC 3757 if (mac_vnode_check_lookup_enabled()) { 3758 cache_fpl_aborted(fpl); 3759 return (false); 3760 } 3761 #endif 3762 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3763 cache_fpl_aborted(fpl); 3764 return (false); 3765 } 3766 if (IN_CAPABILITY_MODE(td)) { 3767 cache_fpl_aborted(fpl); 3768 return (false); 3769 } 3770 if (AUDITING_TD(td)) { 3771 cache_fpl_aborted(fpl); 3772 return (false); 3773 } 3774 if (ndp->ni_startdir != NULL) { 3775 cache_fpl_aborted(fpl); 3776 return (false); 3777 } 3778 return (true); 3779 } 3780 3781 static int 3782 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 3783 { 3784 struct nameidata *ndp; 3785 int error; 3786 bool fsearch; 3787 3788 ndp = fpl->ndp; 3789 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); 3790 if (__predict_false(error != 0)) { 3791 cache_fpl_smr_exit(fpl); 3792 return (cache_fpl_aborted(fpl)); 3793 } 3794 fpl->fsearch = fsearch; 3795 return (0); 3796 } 3797 3798 static bool 3799 cache_fplookup_vnode_supported(struct vnode *vp) 3800 { 3801 3802 return (vp->v_type != VLNK); 3803 } 3804 3805 static int __noinline 3806 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3807 uint32_t hash) 3808 { 3809 struct componentname *cnp; 3810 struct vnode *dvp; 3811 3812 cnp = fpl->cnp; 3813 dvp = fpl->dvp; 3814 3815 cache_fpl_smr_exit(fpl); 3816 if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) 3817 return (cache_fpl_handled(fpl, ENOENT)); 3818 else 3819 return (cache_fpl_aborted(fpl)); 3820 } 3821 3822 /* 3823 * The target vnode is not supported, prepare for the slow path to take over. 
3824 */ 3825 static int __noinline 3826 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3827 { 3828 struct nameidata *ndp; 3829 struct componentname *cnp; 3830 enum vgetstate dvs; 3831 struct vnode *dvp; 3832 struct pwd *pwd; 3833 seqc_t dvp_seqc; 3834 3835 ndp = fpl->ndp; 3836 cnp = fpl->cnp; 3837 pwd = fpl->pwd; 3838 dvp = fpl->dvp; 3839 dvp_seqc = fpl->dvp_seqc; 3840 3841 if (!pwd_hold_smr(pwd)) { 3842 cache_fpl_smr_exit(fpl); 3843 return (cache_fpl_aborted(fpl)); 3844 } 3845 3846 dvs = vget_prep_smr(dvp); 3847 cache_fpl_smr_exit(fpl); 3848 if (__predict_false(dvs == VGET_NONE)) { 3849 pwd_drop(pwd); 3850 return (cache_fpl_aborted(fpl)); 3851 } 3852 3853 vget_finish_ref(dvp, dvs); 3854 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3855 vrele(dvp); 3856 pwd_drop(pwd); 3857 return (cache_fpl_aborted(fpl)); 3858 } 3859 3860 cache_fpl_restore_partial(fpl, &fpl->snd); 3861 3862 ndp->ni_startdir = dvp; 3863 cnp->cn_flags |= MAKEENTRY; 3864 if (cache_fpl_islastcn(ndp)) 3865 cnp->cn_flags |= ISLASTCN; 3866 if (cache_fpl_isdotdot(cnp)) 3867 cnp->cn_flags |= ISDOTDOT; 3868 3869 return (0); 3870 } 3871 3872 static int 3873 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3874 { 3875 struct componentname *cnp; 3876 struct vnode *tvp; 3877 seqc_t tvp_seqc; 3878 int error, lkflags; 3879 3880 cnp = fpl->cnp; 3881 tvp = fpl->tvp; 3882 tvp_seqc = fpl->tvp_seqc; 3883 3884 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3885 lkflags = LK_SHARED; 3886 if ((cnp->cn_flags & LOCKSHARED) == 0) 3887 lkflags = LK_EXCLUSIVE; 3888 error = vget_finish(tvp, lkflags, tvs); 3889 if (__predict_false(error != 0)) { 3890 return (cache_fpl_aborted(fpl)); 3891 } 3892 } else { 3893 vget_finish_ref(tvp, tvs); 3894 } 3895 3896 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3897 if ((cnp->cn_flags & LOCKLEAF) != 0) 3898 vput(tvp); 3899 else 3900 vrele(tvp); 3901 return (cache_fpl_aborted(fpl)); 3902 } 3903 3904 return (cache_fpl_handled(fpl, 0)); 3905 } 3906 3907 /* 3908 * They want to possibly modify the state of the namecache. 3909 * 3910 * Don't try to match the API contract, just leave. 3911 * TODO: this leaves scalability on the table 3912 */ 3913 static int 3914 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3915 { 3916 struct componentname *cnp; 3917 3918 cnp = fpl->cnp; 3919 MPASS(cnp->cn_nameiop != LOOKUP); 3920 return (cache_fpl_partial(fpl)); 3921 } 3922 3923 static int __noinline 3924 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3925 { 3926 struct componentname *cnp; 3927 enum vgetstate dvs, tvs; 3928 struct vnode *dvp, *tvp; 3929 seqc_t dvp_seqc; 3930 int error; 3931 3932 cnp = fpl->cnp; 3933 dvp = fpl->dvp; 3934 dvp_seqc = fpl->dvp_seqc; 3935 tvp = fpl->tvp; 3936 3937 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3938 3939 /* 3940 * This is less efficient than it can be for simplicity. 
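 * Both the parent and the child are prepared for vget while still in the
 * SMR section; any failure to lock or revalidate either of them aborts
 * to the regular lookup instead of trying to recover.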
3941 */ 3942 dvs = vget_prep_smr(dvp); 3943 if (__predict_false(dvs == VGET_NONE)) { 3944 return (cache_fpl_aborted(fpl)); 3945 } 3946 tvs = vget_prep_smr(tvp); 3947 if (__predict_false(tvs == VGET_NONE)) { 3948 cache_fpl_smr_exit(fpl); 3949 vget_abort(dvp, dvs); 3950 return (cache_fpl_aborted(fpl)); 3951 } 3952 3953 cache_fpl_smr_exit(fpl); 3954 3955 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3956 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3957 if (__predict_false(error != 0)) { 3958 vget_abort(tvp, tvs); 3959 return (cache_fpl_aborted(fpl)); 3960 } 3961 } else { 3962 vget_finish_ref(dvp, dvs); 3963 } 3964 3965 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3966 vget_abort(tvp, tvs); 3967 if ((cnp->cn_flags & LOCKPARENT) != 0) 3968 vput(dvp); 3969 else 3970 vrele(dvp); 3971 return (cache_fpl_aborted(fpl)); 3972 } 3973 3974 error = cache_fplookup_final_child(fpl, tvs); 3975 if (__predict_false(error != 0)) { 3976 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3977 if ((cnp->cn_flags & LOCKPARENT) != 0) 3978 vput(dvp); 3979 else 3980 vrele(dvp); 3981 return (error); 3982 } 3983 3984 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3985 return (0); 3986 } 3987 3988 static int 3989 cache_fplookup_final(struct cache_fpl *fpl) 3990 { 3991 struct componentname *cnp; 3992 enum vgetstate tvs; 3993 struct vnode *dvp, *tvp; 3994 seqc_t dvp_seqc; 3995 3996 cnp = fpl->cnp; 3997 dvp = fpl->dvp; 3998 dvp_seqc = fpl->dvp_seqc; 3999 tvp = fpl->tvp; 4000 4001 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 4002 4003 if (cnp->cn_nameiop != LOOKUP) { 4004 return (cache_fplookup_final_modifying(fpl)); 4005 } 4006 4007 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 4008 return (cache_fplookup_final_withparent(fpl)); 4009 4010 tvs = vget_prep_smr(tvp); 4011 if (__predict_false(tvs == VGET_NONE)) { 4012 return (cache_fpl_partial(fpl)); 4013 } 4014 4015 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4016 cache_fpl_smr_exit(fpl); 4017 vget_abort(tvp, tvs); 4018 return (cache_fpl_aborted(fpl)); 4019 } 4020 4021 cache_fpl_smr_exit(fpl); 4022 return (cache_fplookup_final_child(fpl, tvs)); 4023 } 4024 4025 static int __noinline 4026 cache_fplookup_dot(struct cache_fpl *fpl) 4027 { 4028 struct vnode *dvp; 4029 4030 dvp = fpl->dvp; 4031 4032 fpl->tvp = dvp; 4033 fpl->tvp_seqc = vn_seqc_read_any(dvp); 4034 if (seqc_in_modify(fpl->tvp_seqc)) { 4035 return (cache_fpl_aborted(fpl)); 4036 } 4037 4038 counter_u64_add(dothits, 1); 4039 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 4040 4041 return (0); 4042 } 4043 4044 static int __noinline 4045 cache_fplookup_dotdot(struct cache_fpl *fpl) 4046 { 4047 struct nameidata *ndp; 4048 struct componentname *cnp; 4049 struct namecache *ncp; 4050 struct vnode *dvp; 4051 struct prison *pr; 4052 u_char nc_flag; 4053 4054 ndp = fpl->ndp; 4055 cnp = fpl->cnp; 4056 dvp = fpl->dvp; 4057 4058 /* 4059 * XXX this is racy the same way regular lookup is 4060 */ 4061 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 4062 pr = pr->pr_parent) 4063 if (dvp == pr->pr_root) 4064 break; 4065 4066 if (dvp == ndp->ni_rootdir || 4067 dvp == ndp->ni_topdir || 4068 dvp == rootvnode || 4069 pr != NULL) { 4070 fpl->tvp = dvp; 4071 fpl->tvp_seqc = vn_seqc_read_any(dvp); 4072 if (seqc_in_modify(fpl->tvp_seqc)) { 4073 return (cache_fpl_aborted(fpl)); 4074 } 4075 return (0); 4076 } 4077 4078 if ((dvp->v_vflag & VV_ROOT) != 0) { 4079 /* 4080 * TODO 4081 * The opposite of climb mount is needed here. 
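 * That is, ".." at the root of a mounted filesystem has to resolve to the
 * vnode covered by the mount (possibly walking down a whole stack of
 * mounts).  Until that is implemented the lockless lookup just bails to
 * the slow path.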
		 */
		return (cache_fpl_aborted(fpl));
	}

	ncp = atomic_load_ptr(&dvp->v_cache_dd);
	if (ncp == NULL) {
		return (cache_fpl_aborted(fpl));
	}

	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_ISDOTDOT) != 0) {
		if ((nc_flag & NCF_NEGATIVE) != 0)
			return (cache_fpl_aborted(fpl));
		fpl->tvp = ncp->nc_vp;
	} else {
		fpl->tvp = ncp->nc_dvp;
	}

	if (!cache_ncp_canuse(ncp)) {
		return (cache_fpl_aborted(fpl));
	}

	fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
	if (seqc_in_modify(fpl->tvp_seqc)) {
		return (cache_fpl_partial(fpl));
	}

	counter_u64_add(dotdothits, 1);
	return (0);
}

static int __noinline
cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
{
	u_char nc_flag;
	bool neg_promote;

	nc_flag = atomic_load_char(&ncp->nc_flag);
	MPASS((nc_flag & NCF_NEGATIVE) != 0);
	/*
	 * If they want to create an entry we need to replace this one.
	 */
	if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
		/*
		 * TODO
		 * This should call something similar to
		 * cache_fplookup_final_modifying.
		 */
		return (cache_fpl_partial(fpl));
	}
	neg_promote = cache_neg_hit_prep(ncp);
	if (!cache_ncp_canuse(ncp)) {
		cache_neg_hit_abort(ncp);
		return (cache_fpl_partial(fpl));
	}
	if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
		cache_neg_hit_abort(ncp);
		return (cache_fpl_partial(fpl));
	}
	if (neg_promote) {
		return (cache_fplookup_negative_promote(fpl, ncp, hash));
	}
	cache_neg_hit_finish(ncp);
	cache_fpl_smr_exit(fpl);
	return (cache_fpl_handled(fpl, ENOENT));
}

static int
cache_fplookup_next(struct cache_fpl *fpl)
{
	struct componentname *cnp;
	struct namecache *ncp;
	struct vnode *dvp, *tvp;
	u_char nc_flag;
	uint32_t hash;

	cnp = fpl->cnp;
	dvp = fpl->dvp;

	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
		return (cache_fplookup_dot(fpl));
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/*
	 * If there is no entry we have to punt to the slow path to perform
	 * actual lookup. Should there be nothing with this name a negative
	 * entry will be created.
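	 *
	 * For reference, the slow path ends up creating such an entry roughly
	 * like this on the filesystem side (a sketch, error handling elided):
	 *
	 *	error = VOP_LOOKUP(dvp, &vp, cnp);	// returns ENOENT
	 *	if ((cnp->cn_flags & MAKEENTRY) != 0)
	 *		cache_enter(dvp, NULL, cnp);	// NULL vp == negative entry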
	 */
	if (__predict_false(ncp == NULL)) {
		return (cache_fpl_partial(fpl));
	}

	tvp = atomic_load_ptr(&ncp->nc_vp);
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_NEGATIVE) != 0) {
		return (cache_fplookup_neg(fpl, ncp, hash));
	}

	if (!cache_ncp_canuse(ncp)) {
		return (cache_fpl_partial(fpl));
	}

	fpl->tvp = tvp;
	fpl->tvp_seqc = vn_seqc_read_any(tvp);
	if (seqc_in_modify(fpl->tvp_seqc)) {
		return (cache_fpl_partial(fpl));
	}

	if (!cache_fplookup_vnode_supported(tvp)) {
		return (cache_fpl_partial(fpl));
	}

	counter_u64_add(numposhits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
	return (0);
}

static bool
cache_fplookup_mp_supported(struct mount *mp)
{

	if (mp == NULL)
		return (false);
	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
		return (false);
	return (true);
}

/*
 * Walk up the mount stack (if any).
 *
 * Correctness is provided in the following ways:
 * - all vnodes are protected from freeing with SMR
 * - struct mount objects are type stable, making them always safe to access
 * - stability of the particular mount is provided by busying it
 * - relationship between the vnode which is mounted on and the mount is
 *   verified with the vnode sequence counter after busying
 * - association between the root vnode of the mount and the mount is protected
 *   by busy
 *
 * From that point on we can read the sequence counter of the root vnode
 * and get the next mount on the stack (if any) using the same protection.
 *
 * By the end of a successful walk we are guaranteed the reached state was
 * indeed present at some point, matching the guarantee of the regular lookup.
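 *
 * Condensed sketch of the loop implemented below (error handling and
 * dropping of the previously busied mount elided):
 *
 *	while ((mp = atomic_load_ptr(&vp->v_mountedhere)) != NULL) {
 *		vfs_op_thread_enter_crit(mp);		// pin the mount
 *		if (!vn_seqc_consistent(vp, vp_seqc))	// still mounted on?
 *			return (cache_fpl_partial(fpl));
 *		vp = atomic_load_ptr(&mp->mnt_rootvnode);
 *		vp_seqc = vn_seqc_read_any(vp);		// root may be mounted on too
 *	}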
 */
static int __noinline
cache_fplookup_climb_mount(struct cache_fpl *fpl)
{
	struct mount *mp, *prev_mp;
	struct vnode *vp;
	seqc_t vp_seqc;

	vp = fpl->tvp;
	vp_seqc = fpl->tvp_seqc;

	VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
	mp = atomic_load_ptr(&vp->v_mountedhere);
	if (mp == NULL)
		return (0);

	prev_mp = NULL;
	for (;;) {
		if (!vfs_op_thread_enter_crit(mp)) {
			if (prev_mp != NULL)
				vfs_op_thread_exit_crit(prev_mp);
			return (cache_fpl_partial(fpl));
		}
		if (prev_mp != NULL)
			vfs_op_thread_exit_crit(prev_mp);
		if (!vn_seqc_consistent(vp, vp_seqc)) {
			vfs_op_thread_exit_crit(mp);
			return (cache_fpl_partial(fpl));
		}
		if (!cache_fplookup_mp_supported(mp)) {
			vfs_op_thread_exit_crit(mp);
			return (cache_fpl_partial(fpl));
		}
		vp = atomic_load_ptr(&mp->mnt_rootvnode);
		if (vp == NULL || VN_IS_DOOMED(vp)) {
			vfs_op_thread_exit_crit(mp);
			return (cache_fpl_partial(fpl));
		}
		vp_seqc = vn_seqc_read_any(vp);
		if (seqc_in_modify(vp_seqc)) {
			vfs_op_thread_exit_crit(mp);
			return (cache_fpl_partial(fpl));
		}
		prev_mp = mp;
		mp = atomic_load_ptr(&vp->v_mountedhere);
		if (mp == NULL)
			break;
	}

	vfs_op_thread_exit_crit(prev_mp);
	fpl->tvp = vp;
	fpl->tvp_seqc = vp_seqc;
	return (0);
}

static bool
cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
{
	struct mount *mp;
	struct vnode *vp;

	vp = fpl->tvp;

	/*
	 * Hack: while this is a union, the pointer tends to be NULL so save on
	 * a branch.
	 */
	mp = atomic_load_ptr(&vp->v_mountedhere);
	if (mp == NULL)
		return (false);
	if (vp->v_type == VDIR)
		return (true);
	return (false);
}

/*
 * Parse the path.
 *
 * The code was originally copy-pasted from the regular lookup and, despite
 * cleanups, still leaves performance on the table. Any modifications here
 * must take into account that in case of fallback the resulting
 * nameidata state has to be compatible with the original.
 */
static int
cache_fplookup_parse(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	char *cp;

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	/*
	 * Search a new directory.
	 *
	 * The last component of the filename is left accessible via
	 * cnp->cn_nameptr for callers that need the name. Callers needing
	 * the name set the SAVENAME flag. When done, they assume
	 * responsibility for freeing the pathname buffer.
	 */
	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
		continue;
	cnp->cn_namelen = cp - cnp->cn_nameptr;
	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled(fpl, ENAMETOOLONG));
	}
	ndp->ni_pathlen -= cnp->cn_namelen;
	KASSERT(ndp->ni_pathlen <= PATH_MAX,
	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
	ndp->ni_next = cp;

	/*
	 * Replace multiple slashes by a single slash and trailing slashes
	 * by a null. This must be done before VOP_LOOKUP() because some
	 * fs's don't know about trailing slashes. Remember if there were
	 * trailing slashes to handle symlinks, existing non-directories
	 * and non-existing files that won't be directories specially later.
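	 *
	 * Illustrative examples of how the loop below treats its input:
	 *	"a//b"	- the duplicate '/' is consumed here, so the next
	 *		  component starts at "b"
	 *	"a/"	- regular lookup would record TRAILINGSLASH; the fast
	 *		  path punts instead (see the TODO below)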
	 */
	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
		cp++;
		ndp->ni_pathlen--;
		if (*cp == '\0') {
			/*
			 * TODO
			 * Regular lookup performs the following:
			 * *ndp->ni_next = '\0';
			 * cnp->cn_flags |= TRAILINGSLASH;
			 *
			 * Which is problematic since it modifies data read
			 * from userspace. Then if fast path lookup was to
			 * abort we would have to either restore it or convey
			 * the flag. Since this is a corner case just ignore
			 * it for simplicity.
			 */
			return (cache_fpl_partial(fpl));
		}
	}
	ndp->ni_next = cp;

	/*
	 * Check for degenerate name (e.g. / or "")
	 * which is a way of talking about a directory,
	 * e.g. like "/." or ".".
	 *
	 * TODO
	 * Another corner case handled by the regular lookup
	 */
	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
		return (cache_fpl_partial(fpl));
	}
	return (0);
}

static void
cache_fplookup_parse_advance(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cnp->cn_nameptr = ndp->ni_next;
	while (*cnp->cn_nameptr == '/') {
		cnp->cn_nameptr++;
		ndp->ni_pathlen--;
	}
}

/*
 * See the API contract for VOP_FPLOOKUP_VEXEC.
 */
static int __noinline
cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
{
	struct vnode *dvp;
	seqc_t dvp_seqc;

	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;

	/*
	 * Hack: they may be looking up foo/bar, where foo is a
	 * regular file. In such a case we need to return ENOTDIR,
	 * but we may happen to get here with a different error.
	 */
	if (dvp->v_type != VDIR) {
		/*
		 * The check here is predominantly to catch
		 * EOPNOTSUPP from dead_vnodeops. If the vnode
		 * gets doomed past this point it is going to
		 * fail seqc verification.
		 */
		if (VN_IS_DOOMED(dvp)) {
			return (cache_fpl_aborted(fpl));
		}
		error = ENOTDIR;
	}

	/*
	 * Hack: handle O_SEARCH.
	 *
	 * Open Group Base Specifications Issue 7, 2018 edition states:
	 * If the access mode of the open file description associated with the
	 * file descriptor is not O_SEARCH, the function shall check whether
	 * directory searches are permitted using the current permissions of
	 * the directory underlying the file descriptor. If the access mode is
	 * O_SEARCH, the function shall not perform the check.
	 *
	 * Regular lookup tests for the NOEXECCHECK flag for every path
	 * component to decide whether to do the permission check. However,
	 * since most lookups never have the flag (and when they do it is only
	 * present for the first path component), lockless lookup only acts on
	 * it if there is a permission problem. Here the flag is represented
	 * with a boolean so that we don't have to clear it on the way out.
	 *
	 * For simplicity this always aborts.
	 * TODO: check if this is the first lookup and ignore the permission
	 * problem. Note the flag has to survive fallback (if it happens to be
	 * performed).
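	 *
	 * Illustrative userspace sequence exercising the O_SEARCH rule
	 * (hypothetical path; assumes the caller lacks execute permission
	 * on the directory itself):
	 *
	 *	dfd = open("/some/dir", O_SEARCH | O_DIRECTORY);
	 *	fstatat(dfd, "file", &sb, 0);	// search permission not checked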
	 */
	if (fpl->fsearch) {
		return (cache_fpl_aborted(fpl));
	}

	switch (error) {
	case EAGAIN:
		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_partial(fpl);
		}
		break;
	default:
		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_smr_exit(fpl);
			cache_fpl_handled(fpl, error);
		}
		break;
	}
	return (error);
}

static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct mount *mp;
	int error;

	error = CACHE_FPL_FAILED;
	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cache_fpl_checkpoint(fpl, &fpl->snd);

	fpl->dvp = dvp;
	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
	if (seqc_in_modify(fpl->dvp_seqc)) {
		cache_fpl_aborted(fpl);
		goto out;
	}
	mp = atomic_load_ptr(&fpl->dvp->v_mount);
	if (!cache_fplookup_mp_supported(mp)) {
		cache_fpl_aborted(fpl);
		goto out;
	}

	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

	for (;;) {
		error = cache_fplookup_parse(fpl);
		if (__predict_false(error != 0)) {
			break;
		}

		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
		if (__predict_false(error != 0)) {
			error = cache_fplookup_failed_vexec(fpl, error);
			break;
		}

		if (__predict_false(cache_fpl_isdotdot(cnp))) {
			error = cache_fplookup_dotdot(fpl);
			if (__predict_false(error != 0)) {
				break;
			}
		} else {
			error = cache_fplookup_next(fpl);
			if (__predict_false(error != 0)) {
				break;
			}

			VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

			if (cache_fplookup_need_climb_mount(fpl)) {
				error = cache_fplookup_climb_mount(fpl);
				if (__predict_false(error != 0)) {
					break;
				}
			}
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		if (cache_fpl_islastcn(ndp)) {
			error = cache_fplookup_final(fpl);
			break;
		}

		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
			break;
		}

		fpl->dvp = fpl->tvp;
		fpl->dvp_seqc = fpl->tvp_seqc;

		cache_fplookup_parse_advance(fpl);
		cache_fpl_checkpoint(fpl, &fpl->snd);
	}
out:
	switch (fpl->status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		cache_fpl_smr_assert_entered(fpl);
		return (cache_fplookup_partial_setup(fpl));
	case CACHE_FPL_STATUS_ABORTED:
		if (fpl->in_smr)
			cache_fpl_smr_exit(fpl);
		return (CACHE_FPL_FAILED);
	case CACHE_FPL_STATUS_HANDLED:
		MPASS(error != CACHE_FPL_FAILED);
		cache_fpl_smr_assert_not_entered(fpl);
		if (__predict_false(error != 0)) {
			ndp->ni_dvp = NULL;
			ndp->ni_vp = NULL;
			cache_fpl_cleanup_cnp(cnp);
			return (error);
		}
		ndp->ni_dvp = fpl->dvp;
		ndp->ni_vp = fpl->tvp;
		if (cnp->cn_flags & SAVENAME)
			cnp->cn_flags |= HASBUF;
		else
			cache_fpl_cleanup_cnp(cnp);
		return (error);
	}
}

/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
 * outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 *	vn_lock(current);
 *	for (;;) {
 *		next = find();
 *		vn_lock(next);
 *		vn_unlock(current);
 *		current = next;
 *		if (last)
 *			break;
 *	}
 *	return (current);
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
 * we made the jump. This includes things like permissions, mount points etc.
 * Counter modification is provided by enclosing relevant places in
 * vn_seqc_write_begin()/end() calls.
 *
 * Thus this translates to:
 *
 *	vfs_smr_enter();
 *	dvp_seqc = seqc_read_any(dvp);
 *	if (seqc_in_modify(dvp_seqc))		// someone is altering the vnode
 *		abort();
 *	for (;;) {
 *		tvp = find();
 *		tvp_seqc = seqc_read_any(tvp);
 *		if (seqc_in_modify(tvp_seqc))	// someone is altering the target vnode
 *			abort();
 *		if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 *			abort();
 *		dvp = tvp;			// we know nothing of importance has changed
 *		dvp_seqc = tvp_seqc;		// store the counter for the tvp iteration
 *		if (last)
 *			break;
 *	}
 *	vget();					// secure the vnode
 *	if (!seqc_consistent(tvp, tvp_seqc))	// final check
 *		abort();
 *	// at this point we know nothing has changed for any parent<->child pair
 *	// as they were crossed during the lookup, meaning we matched the guarantee
 *	// of the locked variant
 *	return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection which they must never exit
 * - EAGAIN can be returned to denote checking could not be performed, it is
 *   always valid to return it
 * - if the sequence counter has not changed the result must be valid
 * - if the sequence counter has changed both false positives and false negatives
 *   are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *   (a sketch follows this comment)
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning absent other means it
 *   should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 */
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	struct nameidata_saved orig;
	int error;

	MPASS(ndp->ni_lcf == 0);

	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.ndp = ndp;
	fpl.cnp = &ndp->ni_cnd;
	MPASS(curthread == fpl.cnp->cn_thread);

	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
		MPASS(fpl.cnp->cn_nameiop != LOOKUP);

	if (!cache_can_fplookup(&fpl)) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		*status = fpl.status;
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint(&fpl, &orig);

	cache_fpl_smr_enter_initial(&fpl);
	fpl.fsearch = false;
	pwd = pwd_get_smr();
	fpl.pwd = pwd;
	ndp->ni_rootdir = pwd->pwd_rdir;
	ndp->ni_topdir = pwd->pwd_jdir;

	cnp = fpl.cnp;
	cnp->cn_nameptr = cnp->cn_pnbuf;
	if (cnp->cn_pnbuf[0] == '/') {
		cache_fpl_handle_root(ndp, &dvp);
		ndp->ni_resflags |= NIRES_ABS;
	} else {
		if (ndp->ni_dirfd == AT_FDCWD) {
			dvp = pwd->pwd_cdir;
		} else {
			error = cache_fplookup_dirfd(&fpl, &dvp);
			if (__predict_false(error != 0)) {
				goto out;
			}
		}
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);

	error = cache_fplookup_impl(dvp, &fpl);
out:
	cache_fpl_smr_assert_not_entered(&fpl);
	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);

	*status = fpl.status;
	switch (fpl.status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_HANDLED:
		SDT_PROBE3(vfs, namei, lookup, return, error,
		    (error == 0 ? ndp->ni_vp : NULL), true);
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		*pwdp = fpl.pwd;
		/*
		 * Status restored by cache_fplookup_partial_setup.
		 */
		break;
	case CACHE_FPL_STATUS_ABORTED:
		cache_fpl_restore_abort(&fpl, &orig);
		break;
	}
	return (error);
}
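/*
 * To make the contract above concrete, here is a minimal sketch of a
 * filesystem-side VOP_FPLOOKUP_VEXEC routine. It is illustrative only:
 * "xxfs", "struct xxinode" and "XXTOI_SMR" are hypothetical stand-ins for
 * a filesystem's own types and its SMR-safe v_data accessor, and the
 * argument structure is assumed to follow the usual vop_*_args pattern.
 *
 *	static int
 *	xxfs_fplookup_vexec(struct vop_fplookup_vexec_args *ap)
 *	{
 *		struct xxinode *ip;
 *
 *		ip = XXTOI_SMR(ap->a_vp);	// v_data may already be NULL
 *		if (__predict_false(ip == NULL))
 *			return (EAGAIN);	// punt, see the contract above
 *		return (vaccess_vexec_smr(ip->i_mode, ip->i_uid, ap->a_cred));
 *	}
 */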
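/*
 * Rough sketch of how a caller (namei in vfs_lookup.c) is expected to consume
 * the result; illustrative only, the real fallback handling lives with the
 * caller:
 *
 *	error = cache_fplookup(ndp, &status, &pwd);
 *	switch (status) {
 *	case CACHE_FPL_STATUS_HANDLED:
 *		return (error);		// lockless lookup fully resolved it
 *	case CACHE_FPL_STATUS_PARTIAL:
 *		// continue the locked lookup from ndp->ni_startdir
 *		break;
 *	default:
 *		// fall back to the regular, locked lookup from scratch
 *		break;
 *	}
 */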