/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#ifdef INVARIANTS
#include <machine/_inttypes.h>
#endif

#include <sys/capsicum.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache");

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
    "char *");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
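 *
 * Each entry is keyed by the (directory vnode, name) pair it was created
 * for and either points at the vnode the name resolves to or records that
 * the name does not exist (a negative entry, see the n_un union below).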
 */
struct negstate {
	u_char neg_flag;
	u_char neg_hit;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
		struct negstate nu_neg;/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored. The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct timespec	nc_time;	/* timespec provided by fs */
	struct timespec	nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	int	nc_pad;
	struct namecache nc_nc;
};

TAILQ_HEAD(cache_freebatch, namecache);

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned. Since others
 * may be in the same spot suffer a little bit and enforce the
 * alignment for everyone. Note this is a nop for 64-bit platforms.
 */
#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)

/*
 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
 * 4.4 BSD codebase. Later on struct namecache was tweaked to become
 * smaller and the value was bumped to retain the total size, but it
 * was never re-evaluated for suitability. A simple test counting
 * lengths during package building shows that the value of 45 covers
 * about 86% of all added entries, reaching 99% at 65.
 *
 * Regardless of the above, use of dedicated zones instead of malloc may be
 * inducing additional waste. This may be hard to address as said zones are
 * tied to VFS SMR. Even if retaining them, the current split should be
 * re-evaluated.
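 *
 * In other words the small zones fit names of up to CACHE_PATH_CUTOFF
 * characters plus the terminating nul; anything longer (up to NAME_MAX)
 * lands in the large zones, see the CACHE_ZONE_* size macros below.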
 */
#ifdef __LP64__
#define	CACHE_PATH_CUTOFF	45
#define	CACHE_LARGE_PAD		6
#else
#define	CACHE_PATH_CUTOFF	41
#define	CACHE_LARGE_PAD		2
#endif

#define CACHE_ZONE_SMALL_SIZE		(offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
#define CACHE_ZONE_LARGE_SIZE		(offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
#define CACHE_ZONE_LARGE_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

#define	nc_vp		n_un.nu_vp
#define	nc_neg		n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define	NEG_HOT		0x01

static bool	cache_neg_evict_cond(u_long lnumcache);

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
#define cache_ncp_canuse(ncp)	({					\
	struct namecache *_ncp = (ncp);					\
	u_char _nc_flag;						\
									\
	atomic_thread_fence_acq();					\
	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);	\
})

/*
 * Like the above but also checks NCF_WHITE.
 */
#define cache_fpl_neg_ncp_canuse(ncp)	({				\
	struct namecache *_ncp = (ncp);					\
	u_char _nc_flag;						\
									\
	atomic_thread_fence_acq();					\
	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0);	\
})

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference. It is managed LRU, so frequently
 * used names will hang around. Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
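 *
 * For example (illustrative only): looking up "lib" under the vnode for
 * /usr computes cache_get_hash("lib", 3, dvp) and walks the matching
 * NCHHASH() chain for an entry with the same nc_dvp, nc_nlen and nc_name;
 * a hit yields either nc_vp or, for a negative entry, ENOENT.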
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	mtx	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * It is legal to take multiple vnodelock and bucketlock locks. The locking
 * order is lower address first. Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be locked to provide safe operation against other threads modifying the
 * cache.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state. Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock. However, in the opposite case, this could
 * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 * the first node, locking everything in order and revalidating the state.
 */

VFS_SMR_DECLARE;

static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache parameters");

static u_int __read_mostly	ncsize;	/* the size as computed on creation or resizing */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
    "Total namecache capacity");

u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");

static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");

/*
 * Negative entry % of namecache capacity above which automatic eviction is allowed.
 *
 * Check cache_neg_evict_cond for details.
 */
static u_int ncnegminpct = 3;

static u_int __read_mostly	neg_min; /* the above recomputed against ncsize */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
    "Negative entry count above which automatic eviction is allowed");

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");

static u_int __exclusive_cache_line neg_cycle;

#define	ncneghash	3
#define	numneglists	(ncneghash + 1)

struct neglist {
	struct mtx		nl_evict_lock;
	struct mtx		nl_lock __aligned(CACHE_LINE_SIZE);
	TAILQ_HEAD(, namecache) nl_list;
	TAILQ_HEAD(, namecache) nl_hotlist;
	u_long			nl_hotnum;
} __aligned(CACHE_LINE_SIZE);

static struct neglist neglists[numneglists];

static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly	ncbuckethash;
static struct mtx_padalign __read_mostly	*bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly	ncvnodehash;
static struct mtx __read_mostly	*vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	*tsp = ncp_ts->nc_time;
	*ticksp = ncp_ts->nc_ticks;
}

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");

#define STATNODE_ULONG(name, varname, descr)					\
	SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define STATNODE_COUNTER(name, varname, descr)					\
	static COUNTER_U64_DEFINE_EARLY(varname);				\
	SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
	    descr);
STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
STATNODE_ULONG(count, numcache, "Number of cache entries");
STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
held"); 458 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit"); 459 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits"); 460 STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits"); 461 STATNODE_COUNTER(miss, nummiss, "Number of cache misses"); 462 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache"); 463 STATNODE_COUNTER(posszaps, numposzaps, 464 "Number of cache hits (positive) we do not want to cache"); 465 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)"); 466 STATNODE_COUNTER(negzaps, numnegzaps, 467 "Number of cache hits (negative) we do not want to cache"); 468 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)"); 469 /* These count for vn_getcwd(), too. */ 470 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls"); 471 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 472 STATNODE_COUNTER(fullpathfail2, numfullpathfail2, 473 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 474 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 475 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls"); 476 477 /* 478 * Debug or developer statistics. 479 */ 480 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 481 "Name cache debugging"); 482 #define DEBUGNODE_ULONG(name, varname, descr) \ 483 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 484 #define DEBUGNODE_COUNTER(name, varname, descr) \ 485 static COUNTER_U64_DEFINE_EARLY(varname); \ 486 SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \ 487 descr); 488 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success, 489 "Number of successful removals after relocking"); 490 static long zap_bucket_fail; 491 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, ""); 492 static long zap_bucket_fail2; 493 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, ""); 494 static long cache_lock_vnodes_cel_3_failures; 495 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures, 496 "Number of times 3-way vnode locking failed"); 497 498 static void cache_zap_locked(struct namecache *ncp); 499 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, 500 char **freebuf, size_t *buflen); 501 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 502 char **retbuf, size_t *buflen, size_t addend); 503 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 504 char **retbuf, size_t *buflen); 505 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 506 char **retbuf, size_t *len, size_t addend); 507 508 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 509 510 static inline void 511 cache_assert_vlp_locked(struct mtx *vlp) 512 { 513 514 if (vlp != NULL) 515 mtx_assert(vlp, MA_OWNED); 516 } 517 518 static inline void 519 cache_assert_vnode_locked(struct vnode *vp) 520 { 521 struct mtx *vlp; 522 523 vlp = VP2VNODELOCK(vp); 524 cache_assert_vlp_locked(vlp); 525 } 526 527 /* 528 * Directory vnodes with entries are held for two reasons: 529 * 1. make them less of a target for reclamation in vnlru 530 * 2. 
 *
 * It will be feasible to stop doing it altogether if all filesystems start
 * supporting lockless lookup.
 */
static void
cache_hold_vnode(struct vnode *vp)
{

	cache_assert_vnode_locked(vp);
	VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
	vhold(vp);
	counter_u64_add(numcachehv, 1);
}

static void
cache_drop_vnode(struct vnode *vp)
{

	/*
	 * Called after all locks are dropped, meaning we can't assert
	 * on the state of v_cache_src.
	 */
	vdrop(vp);
	counter_u64_add(numcachehv, -1);
}

/*
 * UMA zones.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

char *
cache_symlink_alloc(size_t size, int flags)
{

	if (size < CACHE_ZONE_SMALL_SIZE) {
		return (uma_zalloc_smr(cache_zone_small, flags));
	}
	if (size < CACHE_ZONE_LARGE_SIZE) {
		return (uma_zalloc_smr(cache_zone_large, flags));
	}
	return (NULL);
}

void
cache_symlink_free(char *string, size_t size)
{

	MPASS(string != NULL);

	if (size < CACHE_ZONE_SMALL_SIZE) {
		uma_zfree_smr(cache_zone_small, string);
		return;
	}
	if (size < CACHE_ZONE_LARGE_SIZE) {
		uma_zfree_smr(cache_zone_large, string);
		return;
	}
	__assert_unreachable();
}

static struct namecache *
cache_alloc_uma(int len, bool ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free_uma(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static struct namecache *
cache_alloc(int len, bool ts)
{
	u_long lnumcache;

	/*
	 * Avoid blowout in namecache entries.
	 *
	 * Bugs:
	 * 1. filesystems may end up trying to add an already existing entry
	 * (for example this can happen after a cache miss during concurrent
	 * lookup), in which case we will call cache_neg_evict despite not
	 * adding anything.
	 * 2. the routine may fail to free anything and no provisions are made
	 * to make it try harder (see the inside for failure modes)
	 * 3. it only ever looks at negative entries.
	 */
	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
	if (cache_neg_evict_cond(lnumcache)) {
		lnumcache = atomic_load_long(&numcache);
	}
	if (__predict_false(lnumcache >= ncsize)) {
		atomic_subtract_long(&numcache, 1);
		counter_u64_add(numdrops, 1);
		return (NULL);
	}
	return (cache_alloc_uma(len, ts));
}

static void
cache_free(struct namecache *ncp)
{

	MPASS(ncp != NULL);
	if ((ncp->nc_flag & NCF_DVDROP) != 0) {
		cache_drop_vnode(ncp->nc_dvp);
	}
	cache_free_uma(ncp);
	atomic_subtract_long(&numcache, 1);
}

static void
cache_free_batch(struct cache_freebatch *batch)
{
	struct namecache *ncp, *nnp;
	int i;

	i = 0;
	if (TAILQ_EMPTY(batch))
		goto out;
	TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
		if ((ncp->nc_flag & NCF_DVDROP) != 0) {
			cache_drop_vnode(ncp->nc_dvp);
		}
		cache_free_uma(ncp);
		i++;
	}
	atomic_subtract_long(&numcache, i);
out:
	SDT_PROBE1(vfs, namecache, purge, batch, i);
}

/*
 * TODO: With the value stored we can do better than computing the hash based
 * on the address. The choice of FNV should also be revisited.
 */
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}

static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct mtx *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_OWNED);
}

static void
cache_assert_bucket_unlocked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_NOTOWNED);
}
#else
#define cache_assert_bucket_locked(x) do { } while (0)
#define cache_assert_bucket_unlocked(x) do { } while (0)
#endif

#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_lock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_unlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

static void
cache_recalc_neg_min(u_int val)
{

	neg_min = (ncsize * val) / 100;
}

static int
sysctl_negminpct(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int error;

	val = ncnegminpct;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (val == ncnegminpct)
		return (0);
	if (val < 0 || val > 99)
		return (EINVAL);
	ncnegminpct = val;
	cache_recalc_neg_min(val);
	return (0);
}

SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
    "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * Various workloads create plenty of negative entries and barely use them
 * afterwards. Moreover malicious users can keep performing bogus lookups
 * adding even more entries. For example "make tinderbox" as of writing this
 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 * negative.
 *
 * As such, a rather aggressive eviction method is needed. The currently
 * employed method is a placeholder.
 *
 * Entries are split over numneglists separate lists, each of which is further
 * split into hot and cold entries. Entries get promoted after getting a hit.
 * Eviction happens on addition of new entry.
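 *
 * Each entry is assigned to one of numneglists lists based on bits of its
 * address (see NCP2NEGLIST()). A hit bumps the per-entry neg_hit counter and
 * once it reaches CACHE_NEG_PROMOTION_THRESH the entry moves to the hot list
 * of its neglist. An eviction pass demotes one hot entry back to cold and
 * then removes the least-hit entry out of a small sample of the cold list
 * (see cache_neg_evict_select_entry()).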
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache negative entry statistics");

SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
    "Number of negative cache entries");

static COUNTER_U64_DEFINE_EARLY(neg_created);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
    "Number of created negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evicted);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
    "Number of evicted negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
    &neg_evict_skipped_empty,
    "Number of times evicting failed due to lack of entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
    &neg_evict_skipped_missed,
    "Number of times evicting failed due to target entry disappearing");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
    &neg_evict_skipped_contended,
    "Number of times evicting failed due to contention");

SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
    "Number of cache hits (negative)");

static int
sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
{
	int i, out;

	out = 0;
	for (i = 0; i < numneglists; i++)
		out += neglists[i].nl_hotnum;

	return (SYSCTL_OUT(req, &out, sizeof(out)));
}
SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
    "Number of hot negative entries");

static void
cache_neg_init(struct namecache *ncp)
{
	struct negstate *ns;

	ncp->nc_flag |= NCF_NEGATIVE;
	ns = NCP2NEGSTATE(ncp);
	ns->neg_flag = 0;
	ns->neg_hit = 0;
	counter_u64_add(neg_created, 1);
}

#define CACHE_NEG_PROMOTION_THRESH 2

static bool
cache_neg_hit_prep(struct namecache *ncp)
{
	struct negstate *ns;
	u_char n;

	ns = NCP2NEGSTATE(ncp);
	n = atomic_load_char(&ns->neg_hit);
	for (;;) {
		if (n >= CACHE_NEG_PROMOTION_THRESH)
			return (false);
		if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
			break;
	}
	return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
}

/*
 * Nothing to do here but it is provided for completeness as some
 * cache_neg_hit_prep callers may end up returning without even
 * trying to promote.
 */
#define cache_neg_hit_abort(ncp)	do { } while (0)

static void
cache_neg_hit_finish(struct namecache *ncp)
{

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
	counter_u64_add(numneghits, 1);
}

/*
 * Move a negative entry to the hot list.
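 *
 * The matching neglist lock must be held by the caller (asserted below).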
 */
static void
cache_neg_promote_locked(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	if ((ns->neg_flag & NEG_HOT) == 0) {
		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
		nl->nl_hotnum++;
		ns->neg_flag |= NEG_HOT;
	}
}

/*
 * Move a hot negative entry to the cold list.
 */
static void
cache_neg_demote_locked(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	MPASS(ns->neg_flag & NEG_HOT);
	TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	nl->nl_hotnum--;
	ns->neg_flag &= ~NEG_HOT;
	atomic_store_char(&ns->neg_hit, 0);
}

/*
 * Move a negative entry to the hot list if it matches the lookup.
 *
 * We have to take locks, but they may be contended and in the worst
 * case we may need to go off CPU. We don't want to spin within the
 * smr section and we can't block with it. Exiting the section means
 * the found entry could have been evicted. We are going to look it
 * up again.
 */
static bool
cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
    struct namecache *oncp, uint32_t hash)
{
	struct namecache *ncp;
	struct neglist *nl;
	u_char nc_flag;

	nl = NCP2NEGLIST(oncp);

	mtx_lock(&nl->nl_lock);
	/*
	 * For hash iteration.
	 */
	vfs_smr_enter();

	/*
	 * Avoid all surprises by only succeeding if we got the same entry and
	 * bailing completely otherwise.
	 * XXX There are no provisions to keep the vnode around, meaning we may
	 * end up promoting a negative entry for a *new* vnode and returning
	 * ENOENT on its account. This is the error we want to return anyway
	 * and promotion is harmless.
	 *
	 * In particular at this point there can be a new ncp which matches the
	 * search but hashes to a different neglist.
	 */
	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp == oncp)
			break;
	}

	/*
	 * No match to begin with.
	 */
	if (__predict_false(ncp == NULL)) {
		goto out_abort;
	}

	/*
	 * The newly found entry may be something different...
	 */
	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
		goto out_abort;
	}

	/*
	 * ... and not even negative.
	 */
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_NEGATIVE) == 0) {
		goto out_abort;
	}

	if (!cache_ncp_canuse(ncp)) {
		goto out_abort;
	}

	cache_neg_promote_locked(ncp);
	cache_neg_hit_finish(ncp);
	vfs_smr_exit();
	mtx_unlock(&nl->nl_lock);
	return (true);
out_abort:
	vfs_smr_exit();
	mtx_unlock(&nl->nl_lock);
	return (false);
}

static void
cache_neg_promote(struct namecache *ncp)
{
	struct neglist *nl;

	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	cache_neg_promote_locked(ncp);
	mtx_unlock(&nl->nl_lock);
}

static void
cache_neg_insert(struct namecache *ncp)
{
	struct neglist *nl;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	mtx_unlock(&nl->nl_lock);
	atomic_add_long(&numneg, 1);
}

static void
cache_neg_remove(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	ns = NCP2NEGSTATE(ncp);
	mtx_lock(&nl->nl_lock);
	if ((ns->neg_flag & NEG_HOT) != 0) {
		TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
		nl->nl_hotnum--;
	} else {
		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
	}
	mtx_unlock(&nl->nl_lock);
	atomic_subtract_long(&numneg, 1);
}

static struct neglist *
cache_neg_evict_select_list(void)
{
	struct neglist *nl;
	u_int c;

	c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
	nl = &neglists[c % numneglists];
	if (!mtx_trylock(&nl->nl_evict_lock)) {
		counter_u64_add(neg_evict_skipped_contended, 1);
		return (NULL);
	}
	return (nl);
}

static struct namecache *
cache_neg_evict_select_entry(struct neglist *nl)
{
	struct namecache *ncp, *lncp;
	struct negstate *ns, *lns;
	int i;

	mtx_assert(&nl->nl_evict_lock, MA_OWNED);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	ncp = TAILQ_FIRST(&nl->nl_list);
	if (ncp == NULL)
		return (NULL);
	lncp = ncp;
	lns = NCP2NEGSTATE(lncp);
	for (i = 1; i < 4; i++) {
		ncp = TAILQ_NEXT(ncp, nc_dst);
		if (ncp == NULL)
			break;
		ns = NCP2NEGSTATE(ncp);
		if (ns->neg_hit < lns->neg_hit) {
			lncp = ncp;
			lns = ns;
		}
	}
	return (lncp);
}

static bool
cache_neg_evict(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *nl;
	struct vnode *dvp;
	struct mtx *dvlp;
	struct mtx *blp;
	uint32_t hash;
	u_char nlen;
	bool evicted;

	nl = cache_neg_evict_select_list();
	if (nl == NULL) {
		return (false);
	}

	mtx_lock(&nl->nl_lock);
	ncp = TAILQ_FIRST(&nl->nl_hotlist);
	if (ncp != NULL) {
		cache_neg_demote_locked(ncp);
	}
	ncp = cache_neg_evict_select_entry(nl);
	if (ncp == NULL) {
		counter_u64_add(neg_evict_skipped_empty, 1);
		mtx_unlock(&nl->nl_lock);
		mtx_unlock(&nl->nl_evict_lock);
		return (false);
	}
	nlen = ncp->nc_nlen;
	dvp = ncp->nc_dvp;
	hash = cache_get_hash(ncp->nc_name, nlen, dvp);
	dvlp = VP2VNODELOCK(dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_unlock(&nl->nl_lock);
	mtx_unlock(&nl->nl_evict_lock);
	mtx_lock(dvlp);
	mtx_lock(blp);
	/*
	 * Note that since all locks were dropped above, the entry may be
	 * gone or reallocated to be something else.
	 */
	CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
		if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
		    ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
			break;
	}
	if (ncp2 == NULL) {
		counter_u64_add(neg_evict_skipped_missed, 1);
		ncp = NULL;
		evicted = false;
	} else {
		MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
		MPASS(blp == NCP2BUCKETLOCK(ncp));
		SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(neg_evicted, 1);
		evicted = true;
	}
	mtx_unlock(blp);
	mtx_unlock(dvlp);
	if (ncp != NULL)
		cache_free(ncp);
	return (evicted);
}

/*
 * Maybe evict a negative entry to create more room.
 *
 * The ncnegfactor parameter limits what fraction of the total count
 * can consist of negative entries. However, if the cache is just
 * warming up this leads to excessive evictions. As such, ncnegminpct
 * (recomputed to neg_min) dictates whether the above should be
 * applied.
 *
 * Try evicting if the cache is close to full capacity regardless of
 * other considerations.
 */
static bool
cache_neg_evict_cond(u_long lnumcache)
{
	u_long lnumneg;

	if (ncsize - 1000 < lnumcache)
		goto out_evict;
	lnumneg = atomic_load_long(&numneg);
	if (lnumneg < neg_min)
		return (false);
	if (lnumneg * ncnegfactor < lnumcache)
		return (false);
out_evict:
	return (cache_neg_evict());
}

/*
 * cache_zap_locked():
 *
 * Removes a namecache entry from cache, whether it contains an actual
 * pointer to a vnode or if it is just a negative cache entry.
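 *
 * The caller must hold the relevant vnode locks and the bucket lock; the
 * entry is only unlinked here and has to be freed afterwards with
 * cache_free() (or cache_free_batch()).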
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_vp);
			ncp->nc_vp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_vp);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_neg_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_dvp);
			ncp->nc_dvp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_dvp);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
		}
	}
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct mtx *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

/*
 * If trylocking failed we can get here. We know enough to take all needed locks
 * in the right order and re-lookup the entry.
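 *
 * Returns 0 if the entry was found again and removed, or EAGAIN if it
 * disappeared (or changed identity) while the locks were dropped.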
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct mtx *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_unlocked(ncp);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	mtx_lock(blp);
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_bucket_relock_success, 1);
		return (0);
	}

	mtx_unlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct mtx *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	mtx_unlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct mtx *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
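		/*
		 * ".." lookups are served from dvp->v_cache_dd (see
		 * cache_lookup_dotdot()), so its removal is handled here
		 * instead of via the hash chain walk below.
		 */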
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			vn_seqc_write_begin(dvp);
			dvp->v_cache_dd = NULL;
			vn_seqc_write_end(dvp);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
		return (1);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (ncp == NULL) {
		mtx_unlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_bucket_fail++;
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
	cache_free(ncp);
	return (1);
out_no_entry:
	counter_u64_add(nummisszap, 1);
	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
	return (0);
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static int __noinline
cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct mtx *dvlp;
	enum vgetstate vs;
	int error, ltype;
	bool whiteout;

	MPASS((cnp->cn_flags & ISDOTDOT) != 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	counter_u64_add(dotdothits, 1);
retry:
	dvlp = VP2VNODELOCK(dvp);
	mtx_lock(dvlp);
	ncp = dvp->v_cache_dd;
	if (ncp == NULL) {
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
		mtx_unlock(dvlp);
		return (0);
	}
	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
		if (ncp->nc_flag & NCF_NEGATIVE)
			*vpp = NULL;
		else
			*vpp = ncp->nc_vp;
	} else
		*vpp = ncp->nc_dvp;
	if (*vpp == NULL)
		goto negative_success;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
	    NCF_DTS && tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		*tsp = ncp_ts->nc_dotdottime;
	}

	MPASS(dvp != *vpp);
	ltype = VOP_ISLOCKED(dvp);
	VOP_UNLOCK(dvp);
	vs = vget_prep(*vpp);
	mtx_unlock(dvlp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	vn_lock(dvp, ltype | LK_RETRY);
	if (VN_IS_DOOMED(dvp)) {
		if (error == 0)
			vput(*vpp);
		*vpp = NULL;
		return (ENOENT);
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			cache_zap_negative_locked_vnode_kl(ncp, dvp);
			mtx_unlock(dvlp);
			cache_free(ncp);
			return (0);
		}
	}

	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_out_ts(ncp, tsp, ticksp);
	if (cache_neg_hit_prep(ncp))
		cache_neg_promote(ncp);
	else
		cache_neg_hit_finish(ncp);
	mtx_unlock(dvlp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument. Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search. The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp. On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created. However, it will
 *		be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp. On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * Either both tsp and ticks have to be provided or neither of them.
 *
 * # Returns
 *
 * - -1:	A positive cache hit. vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount. vpp will not be modified. If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss. vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up
 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
static int __noinline
cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct mtx *blp;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout;

	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
	MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);

retry:
	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		mtx_unlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	vs = vget_prep(*vpp);
	mtx_unlock(blp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	/*
	 * We don't get here with regular lookup apart from corner cases.
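	 * The lockless variant in cache_lookup() handles the common case and
	 * only bails to this function when it cannot complete the negative
	 * hit (e.g. the entry needs to be zapped for CREATE of the last
	 * component or it could not be safely used under SMR).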
1873 */ 1874 if (__predict_true(cnp->cn_nameiop == CREATE)) { 1875 if (cnp->cn_flags & ISLASTCN) { 1876 counter_u64_add(numnegzaps, 1); 1877 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1878 if (__predict_false(error != 0)) { 1879 zap_bucket_fail2++; 1880 goto retry; 1881 } 1882 cache_free(ncp); 1883 return (0); 1884 } 1885 } 1886 1887 whiteout = (ncp->nc_flag & NCF_WHITE); 1888 cache_out_ts(ncp, tsp, ticksp); 1889 if (cache_neg_hit_prep(ncp)) 1890 cache_neg_promote(ncp); 1891 else 1892 cache_neg_hit_finish(ncp); 1893 mtx_unlock(blp); 1894 if (whiteout) 1895 cnp->cn_flags |= ISWHITEOUT; 1896 return (ENOENT); 1897 } 1898 1899 int 1900 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1901 struct timespec *tsp, int *ticksp) 1902 { 1903 struct namecache *ncp; 1904 uint32_t hash; 1905 enum vgetstate vs; 1906 int error; 1907 bool whiteout, neg_promote; 1908 u_short nc_flag; 1909 1910 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); 1911 1912 #ifdef DEBUG_CACHE 1913 if (__predict_false(!doingcache)) { 1914 cnp->cn_flags &= ~MAKEENTRY; 1915 return (0); 1916 } 1917 #endif 1918 1919 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1920 if (cnp->cn_namelen == 1) 1921 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1922 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') 1923 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); 1924 } 1925 1926 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 1927 1928 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { 1929 cache_remove_cnp(dvp, cnp); 1930 return (0); 1931 } 1932 1933 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1934 vfs_smr_enter(); 1935 1936 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1937 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1938 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1939 break; 1940 } 1941 1942 if (__predict_false(ncp == NULL)) { 1943 vfs_smr_exit(); 1944 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1945 NULL); 1946 counter_u64_add(nummiss, 1); 1947 return (0); 1948 } 1949 1950 nc_flag = atomic_load_char(&ncp->nc_flag); 1951 if (nc_flag & NCF_NEGATIVE) 1952 goto negative_success; 1953 1954 counter_u64_add(numposhits, 1); 1955 *vpp = ncp->nc_vp; 1956 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1957 cache_out_ts(ncp, tsp, ticksp); 1958 MPASS(dvp != *vpp); 1959 if (!cache_ncp_canuse(ncp)) { 1960 vfs_smr_exit(); 1961 *vpp = NULL; 1962 goto out_fallback; 1963 } 1964 vs = vget_prep_smr(*vpp); 1965 vfs_smr_exit(); 1966 if (__predict_false(vs == VGET_NONE)) { 1967 *vpp = NULL; 1968 goto out_fallback; 1969 } 1970 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1971 if (error) { 1972 *vpp = NULL; 1973 goto out_fallback; 1974 } 1975 return (-1); 1976 negative_success: 1977 if (cnp->cn_nameiop == CREATE) { 1978 if (cnp->cn_flags & ISLASTCN) { 1979 vfs_smr_exit(); 1980 goto out_fallback; 1981 } 1982 } 1983 1984 cache_out_ts(ncp, tsp, ticksp); 1985 whiteout = (ncp->nc_flag & NCF_WHITE); 1986 neg_promote = cache_neg_hit_prep(ncp); 1987 if (!cache_ncp_canuse(ncp)) { 1988 cache_neg_hit_abort(ncp); 1989 vfs_smr_exit(); 1990 goto out_fallback; 1991 } 1992 if (neg_promote) { 1993 vfs_smr_exit(); 1994 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) 1995 goto out_fallback; 1996 } else { 1997 cache_neg_hit_finish(ncp); 1998 vfs_smr_exit(); 1999 } 2000 if (whiteout) 2001 cnp->cn_flags |= ISWHITEOUT; 2002 return (ENOENT); 2003 out_fallback: 2004 return (cache_lookup_fallback(dvp, vpp, cnp, 
tsp, ticksp)); 2005 } 2006 2007 struct celockstate { 2008 struct mtx *vlp[3]; 2009 struct mtx *blp[2]; 2010 }; 2011 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 2012 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 2013 2014 static inline void 2015 cache_celockstate_init(struct celockstate *cel) 2016 { 2017 2018 bzero(cel, sizeof(*cel)); 2019 } 2020 2021 static void 2022 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 2023 struct vnode *dvp) 2024 { 2025 struct mtx *vlp1, *vlp2; 2026 2027 MPASS(cel->vlp[0] == NULL); 2028 MPASS(cel->vlp[1] == NULL); 2029 MPASS(cel->vlp[2] == NULL); 2030 2031 MPASS(vp != NULL || dvp != NULL); 2032 2033 vlp1 = VP2VNODELOCK(vp); 2034 vlp2 = VP2VNODELOCK(dvp); 2035 cache_sort_vnodes(&vlp1, &vlp2); 2036 2037 if (vlp1 != NULL) { 2038 mtx_lock(vlp1); 2039 cel->vlp[0] = vlp1; 2040 } 2041 mtx_lock(vlp2); 2042 cel->vlp[1] = vlp2; 2043 } 2044 2045 static void 2046 cache_unlock_vnodes_cel(struct celockstate *cel) 2047 { 2048 2049 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 2050 2051 if (cel->vlp[0] != NULL) 2052 mtx_unlock(cel->vlp[0]); 2053 if (cel->vlp[1] != NULL) 2054 mtx_unlock(cel->vlp[1]); 2055 if (cel->vlp[2] != NULL) 2056 mtx_unlock(cel->vlp[2]); 2057 } 2058 2059 static bool 2060 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 2061 { 2062 struct mtx *vlp; 2063 bool ret; 2064 2065 cache_assert_vlp_locked(cel->vlp[0]); 2066 cache_assert_vlp_locked(cel->vlp[1]); 2067 MPASS(cel->vlp[2] == NULL); 2068 2069 MPASS(vp != NULL); 2070 vlp = VP2VNODELOCK(vp); 2071 2072 ret = true; 2073 if (vlp >= cel->vlp[1]) { 2074 mtx_lock(vlp); 2075 } else { 2076 if (mtx_trylock(vlp)) 2077 goto out; 2078 cache_lock_vnodes_cel_3_failures++; 2079 cache_unlock_vnodes_cel(cel); 2080 if (vlp < cel->vlp[0]) { 2081 mtx_lock(vlp); 2082 mtx_lock(cel->vlp[0]); 2083 mtx_lock(cel->vlp[1]); 2084 } else { 2085 if (cel->vlp[0] != NULL) 2086 mtx_lock(cel->vlp[0]); 2087 mtx_lock(vlp); 2088 mtx_lock(cel->vlp[1]); 2089 } 2090 ret = false; 2091 } 2092 out: 2093 cel->vlp[2] = vlp; 2094 return (ret); 2095 } 2096 2097 static void 2098 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 2099 struct mtx *blp2) 2100 { 2101 2102 MPASS(cel->blp[0] == NULL); 2103 MPASS(cel->blp[1] == NULL); 2104 2105 cache_sort_vnodes(&blp1, &blp2); 2106 2107 if (blp1 != NULL) { 2108 mtx_lock(blp1); 2109 cel->blp[0] = blp1; 2110 } 2111 mtx_lock(blp2); 2112 cel->blp[1] = blp2; 2113 } 2114 2115 static void 2116 cache_unlock_buckets_cel(struct celockstate *cel) 2117 { 2118 2119 if (cel->blp[0] != NULL) 2120 mtx_unlock(cel->blp[0]); 2121 mtx_unlock(cel->blp[1]); 2122 } 2123 2124 /* 2125 * Lock part of the cache affected by the insertion. 2126 * 2127 * This means vnodelocks for dvp, vp and the relevant bucketlock. 2128 * However, insertion can result in removal of an old entry. In this 2129 * case we have an additional vnode and bucketlock pair to lock. 2130 * 2131 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 2132 * preserving the locking order (smaller address first). 
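 *
 * For illustration, when the vnode being entered (vp) is a directory
 * which already has a ".." entry hanging off of it, the worst case set
 * is:
 * - the vnode locks of dvp and vp
 * - the vnode lock of the vnode the old ".." entry points to
 * - the bucket lock for the new entry (derived from the name hash)
 * - the bucket lock for the old ".." entry
 * Vnode locks are taken first and bucket locks second, each group
 * sorted by address.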
2133 */ 2134 static void 2135 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2136 uint32_t hash) 2137 { 2138 struct namecache *ncp; 2139 struct mtx *blps[2]; 2140 2141 blps[0] = HASH2BUCKETLOCK(hash); 2142 for (;;) { 2143 blps[1] = NULL; 2144 cache_lock_vnodes_cel(cel, dvp, vp); 2145 if (vp == NULL || vp->v_type != VDIR) 2146 break; 2147 ncp = vp->v_cache_dd; 2148 if (ncp == NULL) 2149 break; 2150 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2151 break; 2152 MPASS(ncp->nc_dvp == vp); 2153 blps[1] = NCP2BUCKETLOCK(ncp); 2154 if (ncp->nc_flag & NCF_NEGATIVE) 2155 break; 2156 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2157 break; 2158 /* 2159 * All vnodes got re-locked. Re-validate the state and if 2160 * nothing changed we are done. Otherwise restart. 2161 */ 2162 if (ncp == vp->v_cache_dd && 2163 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2164 blps[1] == NCP2BUCKETLOCK(ncp) && 2165 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2166 break; 2167 cache_unlock_vnodes_cel(cel); 2168 cel->vlp[0] = NULL; 2169 cel->vlp[1] = NULL; 2170 cel->vlp[2] = NULL; 2171 } 2172 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2173 } 2174 2175 static void 2176 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2177 uint32_t hash) 2178 { 2179 struct namecache *ncp; 2180 struct mtx *blps[2]; 2181 2182 blps[0] = HASH2BUCKETLOCK(hash); 2183 for (;;) { 2184 blps[1] = NULL; 2185 cache_lock_vnodes_cel(cel, dvp, vp); 2186 ncp = dvp->v_cache_dd; 2187 if (ncp == NULL) 2188 break; 2189 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2190 break; 2191 MPASS(ncp->nc_dvp == dvp); 2192 blps[1] = NCP2BUCKETLOCK(ncp); 2193 if (ncp->nc_flag & NCF_NEGATIVE) 2194 break; 2195 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2196 break; 2197 if (ncp == dvp->v_cache_dd && 2198 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2199 blps[1] == NCP2BUCKETLOCK(ncp) && 2200 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2201 break; 2202 cache_unlock_vnodes_cel(cel); 2203 cel->vlp[0] = NULL; 2204 cel->vlp[1] = NULL; 2205 cel->vlp[2] = NULL; 2206 } 2207 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2208 } 2209 2210 static void 2211 cache_enter_unlock(struct celockstate *cel) 2212 { 2213 2214 cache_unlock_buckets_cel(cel); 2215 cache_unlock_vnodes_cel(cel); 2216 } 2217 2218 static void __noinline 2219 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 2220 struct componentname *cnp) 2221 { 2222 struct celockstate cel; 2223 struct namecache *ncp; 2224 uint32_t hash; 2225 int len; 2226 2227 if (dvp->v_cache_dd == NULL) 2228 return; 2229 len = cnp->cn_namelen; 2230 cache_celockstate_init(&cel); 2231 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2232 cache_enter_lock_dd(&cel, dvp, vp, hash); 2233 vn_seqc_write_begin(dvp); 2234 ncp = dvp->v_cache_dd; 2235 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 2236 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 2237 cache_zap_locked(ncp); 2238 } else { 2239 ncp = NULL; 2240 } 2241 dvp->v_cache_dd = NULL; 2242 vn_seqc_write_end(dvp); 2243 cache_enter_unlock(&cel); 2244 if (ncp != NULL) 2245 cache_free(ncp); 2246 } 2247 2248 /* 2249 * Add an entry to the cache. 
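 *
 * A minimal usage sketch (illustrative only; lookup_on_disk is a
 * hypothetical helper standing in for the filesystem's own directory
 * scan): a lookup routine which resolved cnp in dvp and wants the
 * result cached, including a negative entry on ENOENT, might do:
 *
 *	error = lookup_on_disk(dvp, cnp, &vp);
 *	if (error == 0)
 *		cache_enter_time(dvp, vp, cnp, NULL, NULL);
 *	else if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
 *		cache_enter_time(dvp, NULL, cnp, NULL, NULL);
 *
 * Passing tsp/dtsp as NULL skips timestamp bookkeeping; filesystems
 * which want cache_lookup() to report timestamps pass them here.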
2250 */ 2251 void 2252 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2253 struct timespec *tsp, struct timespec *dtsp) 2254 { 2255 struct celockstate cel; 2256 struct namecache *ncp, *n2, *ndd; 2257 struct namecache_ts *ncp_ts; 2258 struct nchashhead *ncpp; 2259 uint32_t hash; 2260 int flag; 2261 int len; 2262 2263 KASSERT(cnp->cn_namelen <= NAME_MAX, 2264 ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen, 2265 NAME_MAX)); 2266 VNPASS(dvp != vp, dvp); 2267 VNPASS(!VN_IS_DOOMED(dvp), dvp); 2268 VNPASS(dvp->v_type != VNON, dvp); 2269 if (vp != NULL) { 2270 VNPASS(!VN_IS_DOOMED(vp), vp); 2271 VNPASS(vp->v_type != VNON, vp); 2272 } 2273 2274 #ifdef DEBUG_CACHE 2275 if (__predict_false(!doingcache)) 2276 return; 2277 #endif 2278 2279 flag = 0; 2280 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2281 if (cnp->cn_namelen == 1) 2282 return; 2283 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 2284 cache_enter_dotdot_prep(dvp, vp, cnp); 2285 flag = NCF_ISDOTDOT; 2286 } 2287 } 2288 2289 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 2290 if (ncp == NULL) 2291 return; 2292 2293 cache_celockstate_init(&cel); 2294 ndd = NULL; 2295 ncp_ts = NULL; 2296 2297 /* 2298 * Calculate the hash key and setup as much of the new 2299 * namecache entry as possible before acquiring the lock. 2300 */ 2301 ncp->nc_flag = flag | NCF_WIP; 2302 ncp->nc_vp = vp; 2303 if (vp == NULL) 2304 cache_neg_init(ncp); 2305 ncp->nc_dvp = dvp; 2306 if (tsp != NULL) { 2307 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 2308 ncp_ts->nc_time = *tsp; 2309 ncp_ts->nc_ticks = ticks; 2310 ncp_ts->nc_nc.nc_flag |= NCF_TS; 2311 if (dtsp != NULL) { 2312 ncp_ts->nc_dotdottime = *dtsp; 2313 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 2314 } 2315 } 2316 len = ncp->nc_nlen = cnp->cn_namelen; 2317 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2318 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 2319 ncp->nc_name[len] = '\0'; 2320 cache_enter_lock(&cel, dvp, vp, hash); 2321 2322 /* 2323 * See if this vnode or negative entry is already in the cache 2324 * with this name. This can happen with concurrent lookups of 2325 * the same path name. 2326 */ 2327 ncpp = NCHHASH(hash); 2328 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 2329 if (n2->nc_dvp == dvp && 2330 n2->nc_nlen == cnp->cn_namelen && 2331 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 2332 MPASS(cache_ncp_canuse(n2)); 2333 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 2334 KASSERT(vp == NULL, 2335 ("%s: found entry pointing to a different vnode (%p != %p)", 2336 __func__, NULL, vp)); 2337 else 2338 KASSERT(n2->nc_vp == vp, 2339 ("%s: found entry pointing to a different vnode (%p != %p)", 2340 __func__, n2->nc_vp, vp)); 2341 /* 2342 * Entries are supposed to be immutable unless in the 2343 * process of getting destroyed. Accommodating for 2344 * changing timestamps is possible but not worth it. 2345 * This should be harmless in terms of correctness, in 2346 * the worst case resulting in an earlier expiration. 2347 * Alternatively, the found entry can be replaced 2348 * altogether. 
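 * For the time being the existing entry is kept as is and the newly
 * allocated one is discarded via the out_unlock_free label below.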
2349 */ 2350 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 2351 #if 0 2352 if (tsp != NULL) { 2353 KASSERT((n2->nc_flag & NCF_TS) != 0, 2354 ("no NCF_TS")); 2355 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 2356 n2_ts->nc_time = ncp_ts->nc_time; 2357 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2358 if (dtsp != NULL) { 2359 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2360 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2361 } 2362 } 2363 #endif 2364 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, 2365 vp); 2366 goto out_unlock_free; 2367 } 2368 } 2369 2370 if (flag == NCF_ISDOTDOT) { 2371 /* 2372 * See if we are trying to add .. entry, but some other lookup 2373 * has populated v_cache_dd pointer already. 2374 */ 2375 if (dvp->v_cache_dd != NULL) 2376 goto out_unlock_free; 2377 KASSERT(vp == NULL || vp->v_type == VDIR, 2378 ("wrong vnode type %p", vp)); 2379 vn_seqc_write_begin(dvp); 2380 dvp->v_cache_dd = ncp; 2381 vn_seqc_write_end(dvp); 2382 } 2383 2384 if (vp != NULL) { 2385 if (flag != NCF_ISDOTDOT) { 2386 /* 2387 * For this case, the cache entry maps both the 2388 * directory name in it and the name ".." for the 2389 * directory's parent. 2390 */ 2391 vn_seqc_write_begin(vp); 2392 if ((ndd = vp->v_cache_dd) != NULL) { 2393 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2394 cache_zap_locked(ndd); 2395 else 2396 ndd = NULL; 2397 } 2398 vp->v_cache_dd = ncp; 2399 vn_seqc_write_end(vp); 2400 } else if (vp->v_type != VDIR) { 2401 if (vp->v_cache_dd != NULL) { 2402 vn_seqc_write_begin(vp); 2403 vp->v_cache_dd = NULL; 2404 vn_seqc_write_end(vp); 2405 } 2406 } 2407 } 2408 2409 if (flag != NCF_ISDOTDOT) { 2410 if (LIST_EMPTY(&dvp->v_cache_src)) { 2411 cache_hold_vnode(dvp); 2412 } 2413 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2414 } 2415 2416 /* 2417 * If the entry is "negative", we place it into the 2418 * "negative" cache queue, otherwise, we place it into the 2419 * destination vnode's cache entries queue. 2420 */ 2421 if (vp != NULL) { 2422 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2423 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2424 vp); 2425 } else { 2426 if (cnp->cn_flags & ISWHITEOUT) 2427 ncp->nc_flag |= NCF_WHITE; 2428 cache_neg_insert(ncp); 2429 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2430 ncp->nc_name); 2431 } 2432 2433 /* 2434 * Insert the new namecache entry into the appropriate chain 2435 * within the cache entries table. 2436 */ 2437 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2438 2439 atomic_thread_fence_rel(); 2440 /* 2441 * Mark the entry as fully constructed. 2442 * It is immutable past this point until its removal. 
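 *
 * The release fence above ensures that a lockless lookup which
 * observes NCF_WIP cleared also sees the rest of the entry fully
 * initialized; cache_ncp_canuse() provides the matching check on the
 * reader side.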
2443 */ 2444 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2445 2446 cache_enter_unlock(&cel); 2447 if (ndd != NULL) 2448 cache_free(ndd); 2449 return; 2450 out_unlock_free: 2451 cache_enter_unlock(&cel); 2452 cache_free(ncp); 2453 return; 2454 } 2455 2456 static u_int 2457 cache_roundup_2(u_int val) 2458 { 2459 u_int res; 2460 2461 for (res = 1; res <= val; res <<= 1) 2462 continue; 2463 2464 return (res); 2465 } 2466 2467 static struct nchashhead * 2468 nchinittbl(u_long elements, u_long *hashmask) 2469 { 2470 struct nchashhead *hashtbl; 2471 u_long hashsize, i; 2472 2473 hashsize = cache_roundup_2(elements) / 2; 2474 2475 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2476 for (i = 0; i < hashsize; i++) 2477 CK_SLIST_INIT(&hashtbl[i]); 2478 *hashmask = hashsize - 1; 2479 return (hashtbl); 2480 } 2481 2482 static void 2483 ncfreetbl(struct nchashhead *hashtbl) 2484 { 2485 2486 free(hashtbl, M_VFSCACHE); 2487 } 2488 2489 /* 2490 * Name cache initialization, from vfs_init() when we are booting 2491 */ 2492 static void 2493 nchinit(void *dummy __unused) 2494 { 2495 u_int i; 2496 2497 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2498 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2499 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2500 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2501 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2502 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2503 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2504 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2505 2506 VFS_SMR_ZONE_SET(cache_zone_small); 2507 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2508 VFS_SMR_ZONE_SET(cache_zone_large); 2509 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2510 2511 ncsize = desiredvnodes * ncsizefactor; 2512 cache_recalc_neg_min(ncnegminpct); 2513 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2514 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2515 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2516 ncbuckethash = 7; 2517 if (ncbuckethash > nchash) 2518 ncbuckethash = nchash; 2519 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2520 M_WAITOK | M_ZERO); 2521 for (i = 0; i < numbucketlocks; i++) 2522 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2523 ncvnodehash = ncbuckethash; 2524 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2525 M_WAITOK | M_ZERO); 2526 for (i = 0; i < numvnodelocks; i++) 2527 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2528 2529 for (i = 0; i < numneglists; i++) { 2530 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); 2531 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2532 TAILQ_INIT(&neglists[i].nl_list); 2533 TAILQ_INIT(&neglists[i].nl_hotlist); 2534 } 2535 } 2536 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2537 2538 void 2539 cache_vnode_init(struct vnode *vp) 2540 { 2541 2542 LIST_INIT(&vp->v_cache_src); 2543 TAILQ_INIT(&vp->v_cache_dst); 2544 vp->v_cache_dd = NULL; 2545 cache_prehash(vp); 2546 } 2547 2548 void 2549 cache_changesize(u_long newmaxvnodes) 2550 { 2551 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2552 u_long new_nchash, old_nchash; 2553 struct namecache *ncp; 2554 uint32_t hash; 2555 u_long newncsize; 2556 int i; 2557 2558 newncsize = newmaxvnodes * ncsizefactor; 2559 
newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2560 if (newmaxvnodes < numbucketlocks) 2561 newmaxvnodes = numbucketlocks; 2562 2563 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2564 /* If same hash table size, nothing to do */ 2565 if (nchash == new_nchash) { 2566 ncfreetbl(new_nchashtbl); 2567 return; 2568 } 2569 /* 2570 * Move everything from the old hash table to the new table. 2571 * None of the namecache entries in the table can be removed 2572 * because to do so, they have to be removed from the hash table. 2573 */ 2574 cache_lock_all_vnodes(); 2575 cache_lock_all_buckets(); 2576 old_nchashtbl = nchashtbl; 2577 old_nchash = nchash; 2578 nchashtbl = new_nchashtbl; 2579 nchash = new_nchash; 2580 for (i = 0; i <= old_nchash; i++) { 2581 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2582 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2583 ncp->nc_dvp); 2584 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2585 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2586 } 2587 } 2588 ncsize = newncsize; 2589 cache_recalc_neg_min(ncnegminpct); 2590 cache_unlock_all_buckets(); 2591 cache_unlock_all_vnodes(); 2592 ncfreetbl(old_nchashtbl); 2593 } 2594 2595 /* 2596 * Invalidate all entries from and to a particular vnode. 2597 */ 2598 static void 2599 cache_purge_impl(struct vnode *vp) 2600 { 2601 struct cache_freebatch batch; 2602 struct namecache *ncp; 2603 struct mtx *vlp, *vlp2; 2604 2605 TAILQ_INIT(&batch); 2606 vlp = VP2VNODELOCK(vp); 2607 vlp2 = NULL; 2608 mtx_lock(vlp); 2609 retry: 2610 while (!LIST_EMPTY(&vp->v_cache_src)) { 2611 ncp = LIST_FIRST(&vp->v_cache_src); 2612 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2613 goto retry; 2614 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2615 } 2616 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2617 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2618 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2619 goto retry; 2620 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2621 } 2622 ncp = vp->v_cache_dd; 2623 if (ncp != NULL) { 2624 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2625 ("lost dotdot link")); 2626 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2627 goto retry; 2628 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2629 } 2630 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2631 mtx_unlock(vlp); 2632 if (vlp2 != NULL) 2633 mtx_unlock(vlp2); 2634 cache_free_batch(&batch); 2635 } 2636 2637 /* 2638 * Opportunistic check to see if there is anything to do. 2639 */ 2640 static bool 2641 cache_has_entries(struct vnode *vp) 2642 { 2643 2644 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2645 vp->v_cache_dd == NULL) 2646 return (false); 2647 return (true); 2648 } 2649 2650 void 2651 cache_purge(struct vnode *vp) 2652 { 2653 2654 SDT_PROBE1(vfs, namecache, purge, done, vp); 2655 if (!cache_has_entries(vp)) 2656 return; 2657 cache_purge_impl(vp); 2658 } 2659 2660 /* 2661 * Only to be used by vgone. 2662 */ 2663 void 2664 cache_purge_vgone(struct vnode *vp) 2665 { 2666 struct mtx *vlp; 2667 2668 VNPASS(VN_IS_DOOMED(vp), vp); 2669 if (cache_has_entries(vp)) { 2670 cache_purge_impl(vp); 2671 return; 2672 } 2673 2674 /* 2675 * Serialize against a potential thread doing cache_purge. 2676 */ 2677 vlp = VP2VNODELOCK(vp); 2678 mtx_wait_unlocked(vlp); 2679 if (cache_has_entries(vp)) { 2680 cache_purge_impl(vp); 2681 return; 2682 } 2683 return; 2684 } 2685 2686 /* 2687 * Invalidate all negative entries for a particular directory vnode. 
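 *
 * Negative entries hang off the v_cache_src list of the directory, so
 * this is a single pass over that list with the vnode lock held;
 * matching entries are batched and only freed after the lock is
 * dropped.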
2688 */ 2689 void 2690 cache_purge_negative(struct vnode *vp) 2691 { 2692 struct cache_freebatch batch; 2693 struct namecache *ncp, *nnp; 2694 struct mtx *vlp; 2695 2696 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2697 if (LIST_EMPTY(&vp->v_cache_src)) 2698 return; 2699 TAILQ_INIT(&batch); 2700 vlp = VP2VNODELOCK(vp); 2701 mtx_lock(vlp); 2702 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2703 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2704 continue; 2705 cache_zap_negative_locked_vnode_kl(ncp, vp); 2706 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2707 } 2708 mtx_unlock(vlp); 2709 cache_free_batch(&batch); 2710 } 2711 2712 /* 2713 * Entry points for modifying VOP operations. 2714 */ 2715 void 2716 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2717 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2718 { 2719 2720 ASSERT_VOP_IN_SEQC(fdvp); 2721 ASSERT_VOP_IN_SEQC(fvp); 2722 ASSERT_VOP_IN_SEQC(tdvp); 2723 if (tvp != NULL) 2724 ASSERT_VOP_IN_SEQC(tvp); 2725 2726 cache_purge(fvp); 2727 if (tvp != NULL) { 2728 cache_purge(tvp); 2729 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2730 ("%s: lingering negative entry", __func__)); 2731 } else { 2732 cache_remove_cnp(tdvp, tcnp); 2733 } 2734 } 2735 2736 void 2737 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) 2738 { 2739 2740 ASSERT_VOP_IN_SEQC(dvp); 2741 ASSERT_VOP_IN_SEQC(vp); 2742 cache_purge(vp); 2743 } 2744 2745 #ifdef INVARIANTS 2746 /* 2747 * Validate that if an entry exists it matches. 2748 */ 2749 void 2750 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) 2751 { 2752 struct namecache *ncp; 2753 struct mtx *blp; 2754 uint32_t hash; 2755 2756 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2757 if (CK_SLIST_EMPTY(NCHHASH(hash))) 2758 return; 2759 blp = HASH2BUCKETLOCK(hash); 2760 mtx_lock(blp); 2761 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 2762 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 2763 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) { 2764 if (ncp->nc_vp != vp) 2765 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n", 2766 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp, 2767 ncp->nc_vp); 2768 } 2769 } 2770 mtx_unlock(blp); 2771 } 2772 #endif 2773 2774 /* 2775 * Flush all entries referencing a particular filesystem. 2776 */ 2777 void 2778 cache_purgevfs(struct mount *mp) 2779 { 2780 struct vnode *vp, *mvp; 2781 2782 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2783 /* 2784 * Somewhat wasteful iteration over all vnodes. Would be better to 2785 * support filtering and avoid the interlock to begin with. 2786 */ 2787 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2788 if (!cache_has_entries(vp)) { 2789 VI_UNLOCK(vp); 2790 continue; 2791 } 2792 vholdl(vp); 2793 VI_UNLOCK(vp); 2794 cache_purge(vp); 2795 vdrop(vp); 2796 } 2797 } 2798 2799 /* 2800 * Perform canonical checks and cache lookup and pass on to filesystem 2801 * through the vop_cachedlookup only if needed. 
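 *
 * Filesystems which want namecache-backed lookups typically point
 * vop_lookup at this routine and provide their real directory scan as
 * vop_cachedlookup, roughly (illustrative sketch, "foofs" being a
 * placeholder filesystem):
 *
 *	struct vop_vector foofs_vnodeops = {
 *		...
 *		.vop_lookup =		vfs_cache_lookup,
 *		.vop_cachedlookup =	foofs_lookup,
 *		...
 *	};
 *
 * where foofs_lookup is the filesystem's own lookup routine and only
 * runs on a cache miss.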
2802 */ 2803 2804 int 2805 vfs_cache_lookup(struct vop_lookup_args *ap) 2806 { 2807 struct vnode *dvp; 2808 int error; 2809 struct vnode **vpp = ap->a_vpp; 2810 struct componentname *cnp = ap->a_cnp; 2811 int flags = cnp->cn_flags; 2812 2813 *vpp = NULL; 2814 dvp = ap->a_dvp; 2815 2816 if (dvp->v_type != VDIR) 2817 return (ENOTDIR); 2818 2819 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2820 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2821 return (EROFS); 2822 2823 error = vn_dir_check_exec(dvp, cnp); 2824 if (error != 0) 2825 return (error); 2826 2827 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2828 if (error == 0) 2829 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2830 if (error == -1) 2831 return (0); 2832 return (error); 2833 } 2834 2835 /* Implementation of the getcwd syscall. */ 2836 int 2837 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2838 { 2839 char *buf, *retbuf; 2840 size_t buflen; 2841 int error; 2842 2843 buflen = uap->buflen; 2844 if (__predict_false(buflen < 2)) 2845 return (EINVAL); 2846 if (buflen > MAXPATHLEN) 2847 buflen = MAXPATHLEN; 2848 2849 buf = uma_zalloc(namei_zone, M_WAITOK); 2850 error = vn_getcwd(buf, &retbuf, &buflen); 2851 if (error == 0) 2852 error = copyout(retbuf, uap->buf, buflen); 2853 uma_zfree(namei_zone, buf); 2854 return (error); 2855 } 2856 2857 int 2858 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2859 { 2860 struct pwd *pwd; 2861 int error; 2862 2863 vfs_smr_enter(); 2864 pwd = pwd_get_smr(); 2865 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2866 buflen, 0); 2867 VFS_SMR_ASSERT_NOT_ENTERED(); 2868 if (error < 0) { 2869 pwd = pwd_hold(curthread); 2870 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2871 retbuf, buflen); 2872 pwd_drop(pwd); 2873 } 2874 2875 #ifdef KTRACE 2876 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2877 ktrnamei(*retbuf); 2878 #endif 2879 return (error); 2880 } 2881 2882 static int 2883 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2884 size_t size, int flags, enum uio_seg pathseg) 2885 { 2886 struct nameidata nd; 2887 char *retbuf, *freebuf; 2888 int error; 2889 2890 if (flags != 0) 2891 return (EINVAL); 2892 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2893 pathseg, path, fd, &cap_fstat_rights, td); 2894 if ((error = namei(&nd)) != 0) 2895 return (error); 2896 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2897 if (error == 0) { 2898 error = copyout(retbuf, buf, size); 2899 free(freebuf, M_TEMP); 2900 } 2901 NDFREE(&nd, 0); 2902 return (error); 2903 } 2904 2905 int 2906 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2907 { 2908 2909 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2910 uap->flags, UIO_USERSPACE)); 2911 } 2912 2913 /* 2914 * Retrieve the full filesystem path that correspond to a vnode from the name 2915 * cache (if available) 2916 */ 2917 int 2918 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2919 { 2920 struct pwd *pwd; 2921 char *buf; 2922 size_t buflen; 2923 int error; 2924 2925 if (__predict_false(vp == NULL)) 2926 return (EINVAL); 2927 2928 buflen = MAXPATHLEN; 2929 buf = malloc(buflen, M_TEMP, M_WAITOK); 2930 vfs_smr_enter(); 2931 pwd = pwd_get_smr(); 2932 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); 2933 VFS_SMR_ASSERT_NOT_ENTERED(); 2934 if (error < 0) { 2935 pwd = pwd_hold(curthread); 2936 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, 
&buflen); 2937 pwd_drop(pwd); 2938 } 2939 if (error == 0) 2940 *freebuf = buf; 2941 else 2942 free(buf, M_TEMP); 2943 return (error); 2944 } 2945 2946 /* 2947 * This function is similar to vn_fullpath, but it attempts to lookup the 2948 * pathname relative to the global root mount point. This is required for the 2949 * auditing sub-system, as audited pathnames must be absolute, relative to the 2950 * global root mount point. 2951 */ 2952 int 2953 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2954 { 2955 char *buf; 2956 size_t buflen; 2957 int error; 2958 2959 if (__predict_false(vp == NULL)) 2960 return (EINVAL); 2961 buflen = MAXPATHLEN; 2962 buf = malloc(buflen, M_TEMP, M_WAITOK); 2963 vfs_smr_enter(); 2964 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); 2965 VFS_SMR_ASSERT_NOT_ENTERED(); 2966 if (error < 0) { 2967 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2968 } 2969 if (error == 0) 2970 *freebuf = buf; 2971 else 2972 free(buf, M_TEMP); 2973 return (error); 2974 } 2975 2976 static struct namecache * 2977 vn_dd_from_dst(struct vnode *vp) 2978 { 2979 struct namecache *ncp; 2980 2981 cache_assert_vnode_locked(vp); 2982 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2983 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2984 return (ncp); 2985 } 2986 return (NULL); 2987 } 2988 2989 int 2990 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) 2991 { 2992 struct vnode *dvp; 2993 struct namecache *ncp; 2994 struct mtx *vlp; 2995 int error; 2996 2997 vlp = VP2VNODELOCK(*vp); 2998 mtx_lock(vlp); 2999 ncp = (*vp)->v_cache_dd; 3000 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 3001 KASSERT(ncp == vn_dd_from_dst(*vp), 3002 ("%s: mismatch for dd entry (%p != %p)", __func__, 3003 ncp, vn_dd_from_dst(*vp))); 3004 } else { 3005 ncp = vn_dd_from_dst(*vp); 3006 } 3007 if (ncp != NULL) { 3008 if (*buflen < ncp->nc_nlen) { 3009 mtx_unlock(vlp); 3010 vrele(*vp); 3011 counter_u64_add(numfullpathfail4, 1); 3012 error = ENOMEM; 3013 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3014 vp, NULL); 3015 return (error); 3016 } 3017 *buflen -= ncp->nc_nlen; 3018 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3019 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 3020 ncp->nc_name, vp); 3021 dvp = *vp; 3022 *vp = ncp->nc_dvp; 3023 vref(*vp); 3024 mtx_unlock(vlp); 3025 vrele(dvp); 3026 return (0); 3027 } 3028 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 3029 3030 mtx_unlock(vlp); 3031 vn_lock(*vp, LK_SHARED | LK_RETRY); 3032 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); 3033 vput(*vp); 3034 if (error) { 3035 counter_u64_add(numfullpathfail2, 1); 3036 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3037 return (error); 3038 } 3039 3040 *vp = dvp; 3041 if (VN_IS_DOOMED(dvp)) { 3042 /* forced unmount */ 3043 vrele(dvp); 3044 error = ENOENT; 3045 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3046 return (error); 3047 } 3048 /* 3049 * *vp has its use count incremented still. 3050 */ 3051 3052 return (0); 3053 } 3054 3055 /* 3056 * Resolve a directory to a pathname. 3057 * 3058 * The name of the directory can always be found in the namecache or fetched 3059 * from the filesystem. There is also guaranteed to be only one parent, meaning 3060 * we can just follow vnodes up until we find the root. 3061 * 3062 * The vnode must be referenced. 
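 *
 * The path is assembled back to front: each component is copied in
 * front of the previous one with a '/' prepended, so on return *retbuf
 * points into the supplied buffer rather than at its start and *len is
 * updated to the size of the assembled string.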
3063 */ 3064 static int 3065 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3066 size_t *len, size_t addend) 3067 { 3068 #ifdef KDTRACE_HOOKS 3069 struct vnode *startvp = vp; 3070 #endif 3071 struct vnode *vp1; 3072 size_t buflen; 3073 int error; 3074 bool slash_prefixed; 3075 3076 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 3077 VNPASS(vp->v_usecount > 0, vp); 3078 3079 buflen = *len; 3080 3081 slash_prefixed = true; 3082 if (addend == 0) { 3083 MPASS(*len >= 2); 3084 buflen--; 3085 buf[buflen] = '\0'; 3086 slash_prefixed = false; 3087 } 3088 3089 error = 0; 3090 3091 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 3092 counter_u64_add(numfullpathcalls, 1); 3093 while (vp != rdir && vp != rootvnode) { 3094 /* 3095 * The vp vnode must be already fully constructed, 3096 * since it is either found in namecache or obtained 3097 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 3098 * without obtaining the vnode lock. 3099 */ 3100 if ((vp->v_vflag & VV_ROOT) != 0) { 3101 vn_lock(vp, LK_RETRY | LK_SHARED); 3102 3103 /* 3104 * With the vnode locked, check for races with 3105 * unmount, forced or not. Note that we 3106 * already verified that vp is not equal to 3107 * the root vnode, which means that 3108 * mnt_vnodecovered can be NULL only for the 3109 * case of unmount. 3110 */ 3111 if (VN_IS_DOOMED(vp) || 3112 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 3113 vp1->v_mountedhere != vp->v_mount) { 3114 vput(vp); 3115 error = ENOENT; 3116 SDT_PROBE3(vfs, namecache, fullpath, return, 3117 error, vp, NULL); 3118 break; 3119 } 3120 3121 vref(vp1); 3122 vput(vp); 3123 vp = vp1; 3124 continue; 3125 } 3126 if (vp->v_type != VDIR) { 3127 vrele(vp); 3128 counter_u64_add(numfullpathfail1, 1); 3129 error = ENOTDIR; 3130 SDT_PROBE3(vfs, namecache, fullpath, return, 3131 error, vp, NULL); 3132 break; 3133 } 3134 error = vn_vptocnp(&vp, buf, &buflen); 3135 if (error) 3136 break; 3137 if (buflen == 0) { 3138 vrele(vp); 3139 error = ENOMEM; 3140 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3141 startvp, NULL); 3142 break; 3143 } 3144 buf[--buflen] = '/'; 3145 slash_prefixed = true; 3146 } 3147 if (error) 3148 return (error); 3149 if (!slash_prefixed) { 3150 if (buflen == 0) { 3151 vrele(vp); 3152 counter_u64_add(numfullpathfail4, 1); 3153 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 3154 startvp, NULL); 3155 return (ENOMEM); 3156 } 3157 buf[--buflen] = '/'; 3158 } 3159 counter_u64_add(numfullpathfound, 1); 3160 vrele(vp); 3161 3162 *retbuf = buf + buflen; 3163 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 3164 *len -= buflen; 3165 *len += addend; 3166 return (0); 3167 } 3168 3169 /* 3170 * Resolve an arbitrary vnode to a pathname. 
3171 * 3172 * Note 2 caveats: 3173 * - hardlinks are not tracked, thus if the vnode is not a directory this can 3174 * resolve to a different path than the one used to find it 3175 * - namecache is not mandatory, meaning names are not guaranteed to be added 3176 * (in which case resolving fails) 3177 */ 3178 static void __inline 3179 cache_rev_failed_impl(int *reason, int line) 3180 { 3181 3182 *reason = line; 3183 } 3184 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 3185 3186 static int 3187 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 3188 char **retbuf, size_t *buflen, size_t addend) 3189 { 3190 #ifdef KDTRACE_HOOKS 3191 struct vnode *startvp = vp; 3192 #endif 3193 struct vnode *tvp; 3194 struct mount *mp; 3195 struct namecache *ncp; 3196 size_t orig_buflen; 3197 int reason; 3198 int error; 3199 #ifdef KDTRACE_HOOKS 3200 int i; 3201 #endif 3202 seqc_t vp_seqc, tvp_seqc; 3203 u_char nc_flag; 3204 3205 VFS_SMR_ASSERT_ENTERED(); 3206 3207 if (!cache_fast_revlookup) { 3208 vfs_smr_exit(); 3209 return (-1); 3210 } 3211 3212 orig_buflen = *buflen; 3213 3214 if (addend == 0) { 3215 MPASS(*buflen >= 2); 3216 *buflen -= 1; 3217 buf[*buflen] = '\0'; 3218 } 3219 3220 if (vp == rdir || vp == rootvnode) { 3221 if (addend == 0) { 3222 *buflen -= 1; 3223 buf[*buflen] = '/'; 3224 } 3225 goto out_ok; 3226 } 3227 3228 #ifdef KDTRACE_HOOKS 3229 i = 0; 3230 #endif 3231 error = -1; 3232 ncp = NULL; /* for sdt probe down below */ 3233 vp_seqc = vn_seqc_read_any(vp); 3234 if (seqc_in_modify(vp_seqc)) { 3235 cache_rev_failed(&reason); 3236 goto out_abort; 3237 } 3238 3239 for (;;) { 3240 #ifdef KDTRACE_HOOKS 3241 i++; 3242 #endif 3243 if ((vp->v_vflag & VV_ROOT) != 0) { 3244 mp = atomic_load_ptr(&vp->v_mount); 3245 if (mp == NULL) { 3246 cache_rev_failed(&reason); 3247 goto out_abort; 3248 } 3249 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 3250 tvp_seqc = vn_seqc_read_any(tvp); 3251 if (seqc_in_modify(tvp_seqc)) { 3252 cache_rev_failed(&reason); 3253 goto out_abort; 3254 } 3255 if (!vn_seqc_consistent(vp, vp_seqc)) { 3256 cache_rev_failed(&reason); 3257 goto out_abort; 3258 } 3259 vp = tvp; 3260 vp_seqc = tvp_seqc; 3261 continue; 3262 } 3263 ncp = atomic_load_ptr(&vp->v_cache_dd); 3264 if (ncp == NULL) { 3265 cache_rev_failed(&reason); 3266 goto out_abort; 3267 } 3268 nc_flag = atomic_load_char(&ncp->nc_flag); 3269 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3270 cache_rev_failed(&reason); 3271 goto out_abort; 3272 } 3273 if (!cache_ncp_canuse(ncp)) { 3274 cache_rev_failed(&reason); 3275 goto out_abort; 3276 } 3277 if (ncp->nc_nlen >= *buflen) { 3278 cache_rev_failed(&reason); 3279 error = ENOMEM; 3280 goto out_abort; 3281 } 3282 *buflen -= ncp->nc_nlen; 3283 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3284 *buflen -= 1; 3285 buf[*buflen] = '/'; 3286 tvp = ncp->nc_dvp; 3287 tvp_seqc = vn_seqc_read_any(tvp); 3288 if (seqc_in_modify(tvp_seqc)) { 3289 cache_rev_failed(&reason); 3290 goto out_abort; 3291 } 3292 if (!vn_seqc_consistent(vp, vp_seqc)) { 3293 cache_rev_failed(&reason); 3294 goto out_abort; 3295 } 3296 vp = tvp; 3297 vp_seqc = tvp_seqc; 3298 if (vp == rdir || vp == rootvnode) 3299 break; 3300 } 3301 out_ok: 3302 vfs_smr_exit(); 3303 *retbuf = buf + *buflen; 3304 *buflen = orig_buflen - *buflen + addend; 3305 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 3306 return (0); 3307 3308 out_abort: 3309 *buflen = orig_buflen; 3310 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 3311 vfs_smr_exit(); 3312 return (error); 
3313 } 3314 3315 static int 3316 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3317 size_t *buflen) 3318 { 3319 size_t orig_buflen, addend; 3320 int error; 3321 3322 if (*buflen < 2) 3323 return (EINVAL); 3324 3325 orig_buflen = *buflen; 3326 3327 vref(vp); 3328 addend = 0; 3329 if (vp->v_type != VDIR) { 3330 *buflen -= 1; 3331 buf[*buflen] = '\0'; 3332 error = vn_vptocnp(&vp, buf, buflen); 3333 if (error) 3334 return (error); 3335 if (*buflen == 0) { 3336 vrele(vp); 3337 return (ENOMEM); 3338 } 3339 *buflen -= 1; 3340 buf[*buflen] = '/'; 3341 addend = orig_buflen - *buflen; 3342 } 3343 3344 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); 3345 } 3346 3347 /* 3348 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 3349 * 3350 * Since the namecache does not track hardlinks, the caller is expected to first 3351 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 3352 * 3353 * Then we have 2 cases: 3354 * - if the found vnode is a directory, the path can be constructed just by 3355 * following names up the chain 3356 * - otherwise we populate the buffer with the saved name and start resolving 3357 * from the parent 3358 */ 3359 static int 3360 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 3361 size_t *buflen) 3362 { 3363 char *buf, *tmpbuf; 3364 struct pwd *pwd; 3365 struct componentname *cnp; 3366 struct vnode *vp; 3367 size_t addend; 3368 int error; 3369 enum vtype type; 3370 3371 if (*buflen < 2) 3372 return (EINVAL); 3373 if (*buflen > MAXPATHLEN) 3374 *buflen = MAXPATHLEN; 3375 3376 buf = malloc(*buflen, M_TEMP, M_WAITOK); 3377 3378 addend = 0; 3379 vp = ndp->ni_vp; 3380 /* 3381 * Check for VBAD to work around the vp_crossmp bug in lookup(). 3382 * 3383 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 3384 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 3385 * If the type is VDIR (like in this very case) we can skip looking 3386 * at ni_dvp in the first place. However, since vnodes get passed here 3387 * unlocked the target may transition to doomed state (type == VBAD) 3388 * before we get to evaluate the condition. If this happens, we will 3389 * populate part of the buffer and descend to vn_fullpath_dir with 3390 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 3391 * 3392 * This should be atomic_load(&vp->v_type) but it is illegal to take 3393 * an address of a bit field, even if said field is sized to char. 3394 * Work around the problem by reading the value into a full-sized enum 3395 * and then re-reading it with atomic_load which will still prevent 3396 * the compiler from re-reading down the road. 
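 * If the race is detected the request is failed with ENOENT below,
 * consistent with how doomed vnodes are handled elsewhere in this
 * file.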
3397 */ 3398 type = vp->v_type; 3399 type = atomic_load_int(&type); 3400 if (type == VBAD) { 3401 error = ENOENT; 3402 goto out_bad; 3403 } 3404 if (type != VDIR) { 3405 cnp = &ndp->ni_cnd; 3406 addend = cnp->cn_namelen + 2; 3407 if (*buflen < addend) { 3408 error = ENOMEM; 3409 goto out_bad; 3410 } 3411 *buflen -= addend; 3412 tmpbuf = buf + *buflen; 3413 tmpbuf[0] = '/'; 3414 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 3415 tmpbuf[addend - 1] = '\0'; 3416 vp = ndp->ni_dvp; 3417 } 3418 3419 vfs_smr_enter(); 3420 pwd = pwd_get_smr(); 3421 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3422 addend); 3423 VFS_SMR_ASSERT_NOT_ENTERED(); 3424 if (error < 0) { 3425 pwd = pwd_hold(curthread); 3426 vref(vp); 3427 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3428 addend); 3429 pwd_drop(pwd); 3430 if (error != 0) 3431 goto out_bad; 3432 } 3433 3434 *freebuf = buf; 3435 3436 return (0); 3437 out_bad: 3438 free(buf, M_TEMP); 3439 return (error); 3440 } 3441 3442 struct vnode * 3443 vn_dir_dd_ino(struct vnode *vp) 3444 { 3445 struct namecache *ncp; 3446 struct vnode *ddvp; 3447 struct mtx *vlp; 3448 enum vgetstate vs; 3449 3450 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3451 vlp = VP2VNODELOCK(vp); 3452 mtx_lock(vlp); 3453 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3454 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3455 continue; 3456 ddvp = ncp->nc_dvp; 3457 vs = vget_prep(ddvp); 3458 mtx_unlock(vlp); 3459 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3460 return (NULL); 3461 return (ddvp); 3462 } 3463 mtx_unlock(vlp); 3464 return (NULL); 3465 } 3466 3467 int 3468 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3469 { 3470 struct namecache *ncp; 3471 struct mtx *vlp; 3472 int l; 3473 3474 vlp = VP2VNODELOCK(vp); 3475 mtx_lock(vlp); 3476 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3477 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3478 break; 3479 if (ncp == NULL) { 3480 mtx_unlock(vlp); 3481 return (ENOENT); 3482 } 3483 l = min(ncp->nc_nlen, buflen - 1); 3484 memcpy(buf, ncp->nc_name, l); 3485 mtx_unlock(vlp); 3486 buf[l] = '\0'; 3487 return (0); 3488 } 3489 3490 /* 3491 * This function updates path string to vnode's full global path 3492 * and checks the size of the new path string against the pathlen argument. 3493 * 3494 * Requires a locked, referenced vnode. 3495 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3496 * 3497 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3498 * because it falls back to the ".." lookup if the namecache lookup fails. 3499 */ 3500 int 3501 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3502 u_int pathlen) 3503 { 3504 struct nameidata nd; 3505 struct vnode *vp1; 3506 char *rpath, *fbuf; 3507 int error; 3508 3509 ASSERT_VOP_ELOCKED(vp, __func__); 3510 3511 /* Construct global filesystem path from vp. */ 3512 VOP_UNLOCK(vp); 3513 error = vn_fullpath_global(vp, &rpath, &fbuf); 3514 3515 if (error != 0) { 3516 vrele(vp); 3517 return (error); 3518 } 3519 3520 if (strlen(rpath) >= pathlen) { 3521 vrele(vp); 3522 error = ENAMETOOLONG; 3523 goto out; 3524 } 3525 3526 /* 3527 * Re-lookup the vnode by path to detect a possible rename. 3528 * As a side effect, the vnode is relocked. 3529 * If vnode was renamed, return ENOENT. 
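 * The check below only succeeds if the freshly resolved global path
 * still names the very same vnode; only then is the caller's buffer
 * overwritten with rpath, otherwise ENOENT is returned.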
3530 */ 3531 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3532 UIO_SYSSPACE, path, td); 3533 error = namei(&nd); 3534 if (error != 0) { 3535 vrele(vp); 3536 goto out; 3537 } 3538 NDFREE(&nd, NDF_ONLY_PNBUF); 3539 vp1 = nd.ni_vp; 3540 vrele(vp); 3541 if (vp1 == vp) 3542 strcpy(path, rpath); 3543 else { 3544 vput(vp1); 3545 error = ENOENT; 3546 } 3547 3548 out: 3549 free(fbuf, M_TEMP); 3550 return (error); 3551 } 3552 3553 #ifdef DDB 3554 static void 3555 db_print_vpath(struct vnode *vp) 3556 { 3557 3558 while (vp != NULL) { 3559 db_printf("%p: ", vp); 3560 if (vp == rootvnode) { 3561 db_printf("/"); 3562 vp = NULL; 3563 } else { 3564 if (vp->v_vflag & VV_ROOT) { 3565 db_printf("<mount point>"); 3566 vp = vp->v_mount->mnt_vnodecovered; 3567 } else { 3568 struct namecache *ncp; 3569 char *ncn; 3570 int i; 3571 3572 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3573 if (ncp != NULL) { 3574 ncn = ncp->nc_name; 3575 for (i = 0; i < ncp->nc_nlen; i++) 3576 db_printf("%c", *ncn++); 3577 vp = ncp->nc_dvp; 3578 } else { 3579 vp = NULL; 3580 } 3581 } 3582 } 3583 db_printf("\n"); 3584 } 3585 3586 return; 3587 } 3588 3589 DB_SHOW_COMMAND(vpath, db_show_vpath) 3590 { 3591 struct vnode *vp; 3592 3593 if (!have_addr) { 3594 db_printf("usage: show vpath <struct vnode *>\n"); 3595 return; 3596 } 3597 3598 vp = (struct vnode *)addr; 3599 db_print_vpath(vp); 3600 } 3601 3602 #endif 3603 3604 static int cache_fast_lookup = 1; 3605 static char __read_frequently cache_fast_lookup_enabled = true; 3606 3607 #define CACHE_FPL_FAILED -2020 3608 3609 void 3610 cache_fast_lookup_enabled_recalc(void) 3611 { 3612 int lookup_flag; 3613 int mac_on; 3614 3615 #ifdef MAC 3616 mac_on = mac_vnode_check_lookup_enabled(); 3617 mac_on |= mac_vnode_check_readlink_enabled(); 3618 #else 3619 mac_on = 0; 3620 #endif 3621 3622 lookup_flag = atomic_load_int(&cache_fast_lookup); 3623 if (lookup_flag && !mac_on) { 3624 atomic_store_char(&cache_fast_lookup_enabled, true); 3625 } else { 3626 atomic_store_char(&cache_fast_lookup_enabled, false); 3627 } 3628 } 3629 3630 static int 3631 syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS) 3632 { 3633 int error, old; 3634 3635 old = atomic_load_int(&cache_fast_lookup); 3636 error = sysctl_handle_int(oidp, arg1, arg2, req); 3637 if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup)) 3638 cache_fast_lookup_enabled_recalc(); 3639 return (error); 3640 } 3641 SYSCTL_PROC(_vfs, OID_AUTO, cache_fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE, 3642 &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", ""); 3643 3644 /* 3645 * Components of nameidata (or objects it can point to) which may 3646 * need restoring in case fast path lookup fails. 
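 *
 * nameidata_outer holds the state put back when handing control to the
 * regular lookup (see cache_fpl_restore_partial() and
 * cache_fpl_restore_abort()), while nameidata_saved only exists to
 * cross-check the fast path when INVARIANTS is enabled.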
3647 */ 3648 struct nameidata_outer { 3649 size_t ni_pathlen; 3650 int cn_flags; 3651 }; 3652 3653 struct nameidata_saved { 3654 #ifdef INVARIANTS 3655 char *cn_nameptr; 3656 size_t ni_pathlen; 3657 #endif 3658 }; 3659 3660 #ifdef INVARIANTS 3661 struct cache_fpl_debug { 3662 size_t ni_pathlen; 3663 }; 3664 #endif 3665 3666 struct cache_fpl { 3667 struct nameidata *ndp; 3668 struct componentname *cnp; 3669 char *nulchar; 3670 struct pwd **pwd; 3671 struct vnode *dvp; 3672 struct vnode *tvp; 3673 seqc_t dvp_seqc; 3674 seqc_t tvp_seqc; 3675 struct nameidata_saved snd; 3676 struct nameidata_outer snd_outer; 3677 int line; 3678 enum cache_fpl_status status:8; 3679 bool in_smr; 3680 bool fsearch; 3681 bool savename; 3682 #ifdef INVARIANTS 3683 struct cache_fpl_debug debug; 3684 #endif 3685 }; 3686 3687 static bool cache_fplookup_is_mp(struct cache_fpl *fpl); 3688 static int cache_fplookup_cross_mount(struct cache_fpl *fpl); 3689 static int cache_fplookup_partial_setup(struct cache_fpl *fpl); 3690 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl); 3691 static int cache_fplookup_preparse(struct cache_fpl *fpl); 3692 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl); 3693 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl); 3694 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n); 3695 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n); 3696 3697 static void 3698 cache_fpl_cleanup_cnp(struct componentname *cnp) 3699 { 3700 3701 uma_zfree(namei_zone, cnp->cn_pnbuf); 3702 #ifdef DIAGNOSTIC 3703 cnp->cn_pnbuf = NULL; 3704 cnp->cn_nameptr = NULL; 3705 #endif 3706 } 3707 3708 static struct vnode * 3709 cache_fpl_handle_root(struct cache_fpl *fpl) 3710 { 3711 struct nameidata *ndp; 3712 struct componentname *cnp; 3713 3714 ndp = fpl->ndp; 3715 cnp = fpl->cnp; 3716 3717 MPASS(*(cnp->cn_nameptr) == '/'); 3718 cnp->cn_nameptr++; 3719 cache_fpl_pathlen_dec(fpl); 3720 3721 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 3722 do { 3723 cnp->cn_nameptr++; 3724 cache_fpl_pathlen_dec(fpl); 3725 } while (*(cnp->cn_nameptr) == '/'); 3726 } 3727 3728 return (ndp->ni_rootdir); 3729 } 3730 3731 static void 3732 cache_fpl_checkpoint_outer(struct cache_fpl *fpl) 3733 { 3734 3735 fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen; 3736 fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags; 3737 } 3738 3739 static void 3740 cache_fpl_checkpoint(struct cache_fpl *fpl) 3741 { 3742 3743 #ifdef INVARIANTS 3744 fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3745 fpl->snd.ni_pathlen = fpl->debug.ni_pathlen; 3746 #endif 3747 } 3748 3749 static void 3750 cache_fpl_restore_partial(struct cache_fpl *fpl) 3751 { 3752 3753 fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags; 3754 #ifdef INVARIANTS 3755 fpl->debug.ni_pathlen = fpl->snd.ni_pathlen; 3756 #endif 3757 } 3758 3759 static void 3760 cache_fpl_restore_abort(struct cache_fpl *fpl) 3761 { 3762 3763 cache_fpl_restore_partial(fpl); 3764 /* 3765 * It is 0 on entry by API contract. 
3766 */ 3767 fpl->ndp->ni_resflags = 0; 3768 fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf; 3769 fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen; 3770 } 3771 3772 #ifdef INVARIANTS 3773 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3774 struct cache_fpl *_fpl = (fpl); \ 3775 MPASS(_fpl->in_smr == true); \ 3776 VFS_SMR_ASSERT_ENTERED(); \ 3777 }) 3778 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3779 struct cache_fpl *_fpl = (fpl); \ 3780 MPASS(_fpl->in_smr == false); \ 3781 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3782 }) 3783 static void 3784 cache_fpl_assert_status(struct cache_fpl *fpl) 3785 { 3786 3787 switch (fpl->status) { 3788 case CACHE_FPL_STATUS_UNSET: 3789 __assert_unreachable(); 3790 break; 3791 case CACHE_FPL_STATUS_DESTROYED: 3792 case CACHE_FPL_STATUS_ABORTED: 3793 case CACHE_FPL_STATUS_PARTIAL: 3794 case CACHE_FPL_STATUS_HANDLED: 3795 break; 3796 } 3797 } 3798 #else 3799 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3800 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3801 #define cache_fpl_assert_status(fpl) do { } while (0) 3802 #endif 3803 3804 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3805 struct cache_fpl *_fpl = (fpl); \ 3806 vfs_smr_enter(); \ 3807 _fpl->in_smr = true; \ 3808 }) 3809 3810 #define cache_fpl_smr_enter(fpl) ({ \ 3811 struct cache_fpl *_fpl = (fpl); \ 3812 MPASS(_fpl->in_smr == false); \ 3813 vfs_smr_enter(); \ 3814 _fpl->in_smr = true; \ 3815 }) 3816 3817 #define cache_fpl_smr_exit(fpl) ({ \ 3818 struct cache_fpl *_fpl = (fpl); \ 3819 MPASS(_fpl->in_smr == true); \ 3820 vfs_smr_exit(); \ 3821 _fpl->in_smr = false; \ 3822 }) 3823 3824 static int 3825 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line) 3826 { 3827 3828 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3829 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3830 ("%s: converting to abort from %d at %d, set at %d\n", 3831 __func__, fpl->status, line, fpl->line)); 3832 } 3833 cache_fpl_smr_assert_not_entered(fpl); 3834 fpl->status = CACHE_FPL_STATUS_ABORTED; 3835 fpl->line = line; 3836 return (CACHE_FPL_FAILED); 3837 } 3838 3839 #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__) 3840 3841 static int __noinline 3842 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3843 { 3844 struct nameidata *ndp; 3845 struct componentname *cnp; 3846 3847 ndp = fpl->ndp; 3848 cnp = fpl->cnp; 3849 3850 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3851 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3852 ("%s: converting to abort from %d at %d, set at %d\n", 3853 __func__, fpl->status, line, fpl->line)); 3854 } 3855 fpl->status = CACHE_FPL_STATUS_ABORTED; 3856 fpl->line = line; 3857 if (fpl->in_smr) 3858 cache_fpl_smr_exit(fpl); 3859 cache_fpl_restore_abort(fpl); 3860 /* 3861 * Resolving symlinks overwrites data passed by the caller. 3862 * Let namei know. 
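 * The DESTROYED status also means the pathname buffer is freed here,
 * which is why the lookup cannot simply fall back to the slow path as
 * it does for a plain abort.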
3863 */ 3864 if (ndp->ni_loopcnt > 0) { 3865 fpl->status = CACHE_FPL_STATUS_DESTROYED; 3866 cache_fpl_cleanup_cnp(cnp); 3867 } 3868 return (CACHE_FPL_FAILED); 3869 } 3870 3871 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3872 3873 static int __noinline 3874 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3875 { 3876 3877 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3878 ("%s: setting to partial at %d, but already set to %d at %d\n", 3879 __func__, line, fpl->status, fpl->line)); 3880 cache_fpl_smr_assert_entered(fpl); 3881 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3882 fpl->line = line; 3883 return (cache_fplookup_partial_setup(fpl)); 3884 } 3885 3886 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3887 3888 static int 3889 cache_fpl_handled_impl(struct cache_fpl *fpl, int line) 3890 { 3891 3892 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3893 ("%s: setting to handled at %d, but already set to %d at %d\n", 3894 __func__, line, fpl->status, fpl->line)); 3895 cache_fpl_smr_assert_not_entered(fpl); 3896 fpl->status = CACHE_FPL_STATUS_HANDLED; 3897 fpl->line = line; 3898 return (0); 3899 } 3900 3901 #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__) 3902 3903 static int 3904 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line) 3905 { 3906 3907 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3908 ("%s: setting to handled at %d, but already set to %d at %d\n", 3909 __func__, line, fpl->status, fpl->line)); 3910 MPASS(error != 0); 3911 MPASS(error != CACHE_FPL_FAILED); 3912 cache_fpl_smr_assert_not_entered(fpl); 3913 fpl->status = CACHE_FPL_STATUS_HANDLED; 3914 fpl->line = line; 3915 fpl->dvp = NULL; 3916 fpl->tvp = NULL; 3917 fpl->savename = false; 3918 return (error); 3919 } 3920 3921 #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__) 3922 3923 static bool 3924 cache_fpl_terminated(struct cache_fpl *fpl) 3925 { 3926 3927 return (fpl->status != CACHE_FPL_STATUS_UNSET); 3928 } 3929 3930 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3931 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ 3932 FAILIFEXISTS | FOLLOW | LOCKSHARED | SAVENAME | SAVESTART | WILLBEDIR | \ 3933 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3934 3935 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3936 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3937 3938 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3939 "supported and internal flags overlap"); 3940 3941 static bool 3942 cache_fpl_islastcn(struct nameidata *ndp) 3943 { 3944 3945 return (*ndp->ni_next == 0); 3946 } 3947 3948 static bool 3949 cache_fpl_isdotdot(struct componentname *cnp) 3950 { 3951 3952 if (cnp->cn_namelen == 2 && 3953 cnp->cn_nameptr[1] == '.' 
&& cnp->cn_nameptr[0] == '.') 3954 return (true); 3955 return (false); 3956 } 3957 3958 static bool 3959 cache_can_fplookup(struct cache_fpl *fpl) 3960 { 3961 struct nameidata *ndp; 3962 struct componentname *cnp; 3963 struct thread *td; 3964 3965 ndp = fpl->ndp; 3966 cnp = fpl->cnp; 3967 td = cnp->cn_thread; 3968 3969 if (!atomic_load_char(&cache_fast_lookup_enabled)) { 3970 cache_fpl_aborted_early(fpl); 3971 return (false); 3972 } 3973 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3974 cache_fpl_aborted_early(fpl); 3975 return (false); 3976 } 3977 if (IN_CAPABILITY_MODE(td)) { 3978 cache_fpl_aborted_early(fpl); 3979 return (false); 3980 } 3981 if (AUDITING_TD(td)) { 3982 cache_fpl_aborted_early(fpl); 3983 return (false); 3984 } 3985 if (ndp->ni_startdir != NULL) { 3986 cache_fpl_aborted_early(fpl); 3987 return (false); 3988 } 3989 return (true); 3990 } 3991 3992 static int 3993 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 3994 { 3995 struct nameidata *ndp; 3996 int error; 3997 bool fsearch; 3998 3999 ndp = fpl->ndp; 4000 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); 4001 if (__predict_false(error != 0)) { 4002 return (cache_fpl_aborted(fpl)); 4003 } 4004 fpl->fsearch = fsearch; 4005 return (0); 4006 } 4007 4008 static int __noinline 4009 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 4010 uint32_t hash) 4011 { 4012 struct componentname *cnp; 4013 struct vnode *dvp; 4014 4015 cnp = fpl->cnp; 4016 dvp = fpl->dvp; 4017 4018 cache_fpl_smr_exit(fpl); 4019 if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) 4020 return (cache_fpl_handled_error(fpl, ENOENT)); 4021 else 4022 return (cache_fpl_aborted(fpl)); 4023 } 4024 4025 /* 4026 * The target vnode is not supported, prepare for the slow path to take over. 4027 */ 4028 static int __noinline 4029 cache_fplookup_partial_setup(struct cache_fpl *fpl) 4030 { 4031 struct nameidata *ndp; 4032 struct componentname *cnp; 4033 enum vgetstate dvs; 4034 struct vnode *dvp; 4035 struct pwd *pwd; 4036 seqc_t dvp_seqc; 4037 4038 ndp = fpl->ndp; 4039 cnp = fpl->cnp; 4040 pwd = *(fpl->pwd); 4041 dvp = fpl->dvp; 4042 dvp_seqc = fpl->dvp_seqc; 4043 4044 if (!pwd_hold_smr(pwd)) { 4045 return (cache_fpl_aborted(fpl)); 4046 } 4047 4048 /* 4049 * Note that seqc is checked before the vnode is locked, so by 4050 * the time regular lookup gets to it it may have moved. 4051 * 4052 * Ultimately this does not affect correctness, any lookup errors 4053 * are userspace racing with itself. It is guaranteed that any 4054 * path which ultimately gets found could also have been found 4055 * by regular lookup going all the way in absence of concurrent 4056 * modifications. 
4057 */ 4058 dvs = vget_prep_smr(dvp); 4059 cache_fpl_smr_exit(fpl); 4060 if (__predict_false(dvs == VGET_NONE)) { 4061 pwd_drop(pwd); 4062 return (cache_fpl_aborted(fpl)); 4063 } 4064 4065 vget_finish_ref(dvp, dvs); 4066 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4067 vrele(dvp); 4068 pwd_drop(pwd); 4069 return (cache_fpl_aborted(fpl)); 4070 } 4071 4072 cache_fpl_restore_partial(fpl); 4073 #ifdef INVARIANTS 4074 if (cnp->cn_nameptr != fpl->snd.cn_nameptr) { 4075 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__, 4076 cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf); 4077 } 4078 #endif 4079 4080 ndp->ni_startdir = dvp; 4081 cnp->cn_flags |= MAKEENTRY; 4082 if (cache_fpl_islastcn(ndp)) 4083 cnp->cn_flags |= ISLASTCN; 4084 if (cache_fpl_isdotdot(cnp)) 4085 cnp->cn_flags |= ISDOTDOT; 4086 4087 /* 4088 * Skip potential extra slashes parsing did not take care of. 4089 * cache_fplookup_skip_slashes explains the mechanism. 4090 */ 4091 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 4092 do { 4093 cnp->cn_nameptr++; 4094 cache_fpl_pathlen_dec(fpl); 4095 } while (*(cnp->cn_nameptr) == '/'); 4096 } 4097 4098 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; 4099 #ifdef INVARIANTS 4100 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 4101 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 4102 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 4103 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 4104 } 4105 #endif 4106 return (0); 4107 } 4108 4109 static int 4110 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 4111 { 4112 struct componentname *cnp; 4113 struct vnode *tvp; 4114 seqc_t tvp_seqc; 4115 int error, lkflags; 4116 4117 cnp = fpl->cnp; 4118 tvp = fpl->tvp; 4119 tvp_seqc = fpl->tvp_seqc; 4120 4121 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4122 lkflags = LK_SHARED; 4123 if ((cnp->cn_flags & LOCKSHARED) == 0) 4124 lkflags = LK_EXCLUSIVE; 4125 error = vget_finish(tvp, lkflags, tvs); 4126 if (__predict_false(error != 0)) { 4127 return (cache_fpl_aborted(fpl)); 4128 } 4129 } else { 4130 vget_finish_ref(tvp, tvs); 4131 } 4132 4133 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 4134 if ((cnp->cn_flags & LOCKLEAF) != 0) 4135 vput(tvp); 4136 else 4137 vrele(tvp); 4138 return (cache_fpl_aborted(fpl)); 4139 } 4140 4141 return (cache_fpl_handled(fpl)); 4142 } 4143 4144 /* 4145 * They want to possibly modify the state of the namecache. 
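 * That is, the last path component is being resolved for CREATE, DELETE or
 * RENAME (see the asserts below), which the lockless loop cannot complete on
 * its own: the parent ends up locked and VOP_LOOKUP is called for the final
 * component.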
4146 */ 4147 static int __noinline 4148 cache_fplookup_final_modifying(struct cache_fpl *fpl) 4149 { 4150 struct nameidata *ndp; 4151 struct componentname *cnp; 4152 enum vgetstate dvs; 4153 struct vnode *dvp, *tvp; 4154 struct mount *mp; 4155 seqc_t dvp_seqc; 4156 int error; 4157 bool docache; 4158 4159 ndp = fpl->ndp; 4160 cnp = fpl->cnp; 4161 dvp = fpl->dvp; 4162 dvp_seqc = fpl->dvp_seqc; 4163 4164 MPASS(*(cnp->cn_nameptr) != '/'); 4165 MPASS(cache_fpl_islastcn(ndp)); 4166 if ((cnp->cn_flags & LOCKPARENT) == 0) 4167 MPASS((cnp->cn_flags & WANTPARENT) != 0); 4168 MPASS((cnp->cn_flags & TRAILINGSLASH) == 0); 4169 MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE || 4170 cnp->cn_nameiop == RENAME); 4171 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 4172 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 4173 4174 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; 4175 if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) 4176 docache = false; 4177 4178 mp = atomic_load_ptr(&dvp->v_mount); 4179 if (__predict_false(mp == NULL)) { 4180 return (cache_fpl_aborted(fpl)); 4181 } 4182 4183 if (__predict_false(mp->mnt_flag & MNT_RDONLY)) { 4184 cache_fpl_smr_exit(fpl); 4185 /* 4186 * Original code keeps not checking for CREATE which 4187 * might be a bug. For now let the old lookup decide. 4188 */ 4189 if (cnp->cn_nameiop == CREATE) { 4190 return (cache_fpl_aborted(fpl)); 4191 } 4192 return (cache_fpl_handled_error(fpl, EROFS)); 4193 } 4194 4195 if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) { 4196 cache_fpl_smr_exit(fpl); 4197 return (cache_fpl_handled_error(fpl, EEXIST)); 4198 } 4199 4200 /* 4201 * Secure access to dvp; check cache_fplookup_partial_setup for 4202 * reasoning. 4203 * 4204 * XXX At least UFS requires its lookup routine to be called for 4205 * the last path component, which leads to some level of complication 4206 * and inefficiency: 4207 * - the target routine always locks the target vnode, but our caller 4208 * may not need it locked 4209 * - some of the VOP machinery asserts that the parent is locked, which 4210 * once more may be not required 4211 * 4212 * TODO: add a flag for filesystems which don't need this. 
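 *
 * The sequence below therefore roughly mirrors the tail end of the slow path:
 * take a reference while still within SMR, leave SMR, revalidate the sequence
 * counter, lock the parent exclusively and call VOP_LOOKUP for the last
 * component.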
4213 */ 4214 dvs = vget_prep_smr(dvp); 4215 cache_fpl_smr_exit(fpl); 4216 if (__predict_false(dvs == VGET_NONE)) { 4217 return (cache_fpl_aborted(fpl)); 4218 } 4219 4220 vget_finish_ref(dvp, dvs); 4221 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4222 vrele(dvp); 4223 return (cache_fpl_aborted(fpl)); 4224 } 4225 4226 error = vn_lock(dvp, LK_EXCLUSIVE); 4227 if (__predict_false(error != 0)) { 4228 vrele(dvp); 4229 return (cache_fpl_aborted(fpl)); 4230 } 4231 4232 tvp = NULL; 4233 cnp->cn_flags |= ISLASTCN; 4234 if (docache) 4235 cnp->cn_flags |= MAKEENTRY; 4236 if (cache_fpl_isdotdot(cnp)) 4237 cnp->cn_flags |= ISDOTDOT; 4238 cnp->cn_lkflags = LK_EXCLUSIVE; 4239 error = VOP_LOOKUP(dvp, &tvp, cnp); 4240 switch (error) { 4241 case EJUSTRETURN: 4242 case 0: 4243 break; 4244 case ENOTDIR: 4245 case ENOENT: 4246 vput(dvp); 4247 return (cache_fpl_handled_error(fpl, error)); 4248 default: 4249 vput(dvp); 4250 return (cache_fpl_aborted(fpl)); 4251 } 4252 4253 fpl->tvp = tvp; 4254 fpl->savename = (cnp->cn_flags & SAVENAME) != 0; 4255 4256 if (tvp == NULL) { 4257 if ((cnp->cn_flags & SAVESTART) != 0) { 4258 ndp->ni_startdir = dvp; 4259 vrefact(ndp->ni_startdir); 4260 cnp->cn_flags |= SAVENAME; 4261 fpl->savename = true; 4262 } 4263 MPASS(error == EJUSTRETURN); 4264 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4265 VOP_UNLOCK(dvp); 4266 } 4267 return (cache_fpl_handled(fpl)); 4268 } 4269 4270 /* 4271 * There are very hairy corner cases concerning various flag combinations 4272 * and locking state. In particular here we only hold one lock instead of 4273 * two. 4274 * 4275 * Skip the complexity as it is of no significance for normal workloads. 4276 */ 4277 if (__predict_false(tvp == dvp)) { 4278 vput(dvp); 4279 vrele(tvp); 4280 return (cache_fpl_aborted(fpl)); 4281 } 4282 4283 /* 4284 * Check if the target is either a symlink or a mount point. 4285 * Since we expect this to be the terminal vnode it should 4286 * almost never be true. 4287 */ 4288 if (__predict_false(tvp->v_type == VLNK || cache_fplookup_is_mp(fpl))) { 4289 vput(dvp); 4290 vput(tvp); 4291 return (cache_fpl_aborted(fpl)); 4292 } 4293 4294 if ((cnp->cn_flags & FAILIFEXISTS) != 0) { 4295 vput(dvp); 4296 vput(tvp); 4297 return (cache_fpl_handled_error(fpl, EEXIST)); 4298 } 4299 4300 if ((cnp->cn_flags & LOCKLEAF) == 0) { 4301 VOP_UNLOCK(tvp); 4302 } 4303 4304 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4305 VOP_UNLOCK(dvp); 4306 } 4307 4308 if ((cnp->cn_flags & SAVESTART) != 0) { 4309 ndp->ni_startdir = dvp; 4310 vrefact(ndp->ni_startdir); 4311 cnp->cn_flags |= SAVENAME; 4312 fpl->savename = true; 4313 } 4314 4315 return (cache_fpl_handled(fpl)); 4316 } 4317 4318 static int __noinline 4319 cache_fplookup_modifying(struct cache_fpl *fpl) 4320 { 4321 struct nameidata *ndp; 4322 4323 ndp = fpl->ndp; 4324 4325 if (!cache_fpl_islastcn(ndp)) { 4326 return (cache_fpl_partial(fpl)); 4327 } 4328 return (cache_fplookup_final_modifying(fpl)); 4329 } 4330 4331 static int __noinline 4332 cache_fplookup_final_withparent(struct cache_fpl *fpl) 4333 { 4334 struct componentname *cnp; 4335 enum vgetstate dvs, tvs; 4336 struct vnode *dvp, *tvp; 4337 seqc_t dvp_seqc; 4338 int error; 4339 4340 cnp = fpl->cnp; 4341 dvp = fpl->dvp; 4342 dvp_seqc = fpl->dvp_seqc; 4343 tvp = fpl->tvp; 4344 4345 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 4346 4347 /* 4348 * This is less efficient than it can be for simplicity. 
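 * Both the parent and the child are referenced with vget_prep_smr() while
 * still within SMR and only then locked as requested, with everything rolled
 * back should either sequence counter fail to revalidate.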
4349 */ 4350 dvs = vget_prep_smr(dvp); 4351 if (__predict_false(dvs == VGET_NONE)) { 4352 return (cache_fpl_aborted(fpl)); 4353 } 4354 tvs = vget_prep_smr(tvp); 4355 if (__predict_false(tvs == VGET_NONE)) { 4356 cache_fpl_smr_exit(fpl); 4357 vget_abort(dvp, dvs); 4358 return (cache_fpl_aborted(fpl)); 4359 } 4360 4361 cache_fpl_smr_exit(fpl); 4362 4363 if ((cnp->cn_flags & LOCKPARENT) != 0) { 4364 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 4365 if (__predict_false(error != 0)) { 4366 vget_abort(tvp, tvs); 4367 return (cache_fpl_aborted(fpl)); 4368 } 4369 } else { 4370 vget_finish_ref(dvp, dvs); 4371 } 4372 4373 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4374 vget_abort(tvp, tvs); 4375 if ((cnp->cn_flags & LOCKPARENT) != 0) 4376 vput(dvp); 4377 else 4378 vrele(dvp); 4379 return (cache_fpl_aborted(fpl)); 4380 } 4381 4382 error = cache_fplookup_final_child(fpl, tvs); 4383 if (__predict_false(error != 0)) { 4384 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 4385 if ((cnp->cn_flags & LOCKPARENT) != 0) 4386 vput(dvp); 4387 else 4388 vrele(dvp); 4389 return (error); 4390 } 4391 4392 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 4393 return (0); 4394 } 4395 4396 static int 4397 cache_fplookup_final(struct cache_fpl *fpl) 4398 { 4399 struct componentname *cnp; 4400 enum vgetstate tvs; 4401 struct vnode *dvp, *tvp; 4402 seqc_t dvp_seqc; 4403 4404 cnp = fpl->cnp; 4405 dvp = fpl->dvp; 4406 dvp_seqc = fpl->dvp_seqc; 4407 tvp = fpl->tvp; 4408 4409 MPASS(*(cnp->cn_nameptr) != '/'); 4410 4411 if (cnp->cn_nameiop != LOOKUP) { 4412 return (cache_fplookup_final_modifying(fpl)); 4413 } 4414 4415 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 4416 return (cache_fplookup_final_withparent(fpl)); 4417 4418 tvs = vget_prep_smr(tvp); 4419 if (__predict_false(tvs == VGET_NONE)) { 4420 return (cache_fpl_partial(fpl)); 4421 } 4422 4423 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4424 cache_fpl_smr_exit(fpl); 4425 vget_abort(tvp, tvs); 4426 return (cache_fpl_aborted(fpl)); 4427 } 4428 4429 cache_fpl_smr_exit(fpl); 4430 return (cache_fplookup_final_child(fpl, tvs)); 4431 } 4432 4433 /* 4434 * Comment from locked lookup: 4435 * Check for degenerate name (e.g. / or "") which is a way of talking about a 4436 * directory, e.g. like "/." or ".". 
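 *
 * For the fast path this means the directory at hand is itself the result:
 * anything other than a plain LOOKUP gets EISDIR below, while the usual
 * LOCKLEAF/LOCKSHARED/LOCKPARENT/WANTPARENT conventions are honored when
 * returning it.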
4437 */ 4438 static int __noinline 4439 cache_fplookup_degenerate(struct cache_fpl *fpl) 4440 { 4441 struct componentname *cnp; 4442 struct vnode *dvp; 4443 enum vgetstate dvs; 4444 int error, lkflags; 4445 4446 fpl->tvp = fpl->dvp; 4447 fpl->tvp_seqc = fpl->dvp_seqc; 4448 4449 cnp = fpl->cnp; 4450 dvp = fpl->dvp; 4451 4452 if (__predict_false(cnp->cn_nameiop != LOOKUP)) { 4453 cache_fpl_smr_exit(fpl); 4454 return (cache_fpl_handled_error(fpl, EISDIR)); 4455 } 4456 4457 MPASS((cnp->cn_flags & SAVESTART) == 0); 4458 4459 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) { 4460 return (cache_fplookup_final_withparent(fpl)); 4461 } 4462 4463 dvs = vget_prep_smr(dvp); 4464 cache_fpl_smr_exit(fpl); 4465 if (__predict_false(dvs == VGET_NONE)) { 4466 return (cache_fpl_aborted(fpl)); 4467 } 4468 4469 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4470 lkflags = LK_SHARED; 4471 if ((cnp->cn_flags & LOCKSHARED) == 0) 4472 lkflags = LK_EXCLUSIVE; 4473 error = vget_finish(dvp, lkflags, dvs); 4474 if (__predict_false(error != 0)) { 4475 return (cache_fpl_aborted(fpl)); 4476 } 4477 } else { 4478 vget_finish_ref(dvp, dvs); 4479 } 4480 return (cache_fpl_handled(fpl)); 4481 } 4482 4483 static int __noinline 4484 cache_fplookup_noentry(struct cache_fpl *fpl) 4485 { 4486 struct nameidata *ndp; 4487 struct componentname *cnp; 4488 enum vgetstate dvs; 4489 struct vnode *dvp, *tvp; 4490 seqc_t dvp_seqc; 4491 int error; 4492 bool docache; 4493 4494 ndp = fpl->ndp; 4495 cnp = fpl->cnp; 4496 dvp = fpl->dvp; 4497 dvp_seqc = fpl->dvp_seqc; 4498 4499 MPASS(*(cnp->cn_nameptr) != '/'); 4500 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 4501 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 4502 MPASS(!cache_fpl_isdotdot(cnp)); 4503 4504 /* 4505 * Hack: delayed name len checking. 4506 */ 4507 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 4508 cache_fpl_smr_exit(fpl); 4509 return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); 4510 } 4511 4512 if (cnp->cn_nameiop != LOOKUP) { 4513 fpl->tvp = NULL; 4514 return (cache_fplookup_modifying(fpl)); 4515 } 4516 4517 MPASS((cnp->cn_flags & SAVESTART) == 0); 4518 4519 /* 4520 * Only try to fill in the component if it is the last one, 4521 * otherwise not only there may be several to handle but the 4522 * walk may be complicated. 4523 */ 4524 if (!cache_fpl_islastcn(ndp)) { 4525 return (cache_fpl_partial(fpl)); 4526 } 4527 4528 /* 4529 * Secure access to dvp; check cache_fplookup_partial_setup for 4530 * reasoning. 4531 */ 4532 dvs = vget_prep_smr(dvp); 4533 cache_fpl_smr_exit(fpl); 4534 if (__predict_false(dvs == VGET_NONE)) { 4535 return (cache_fpl_aborted(fpl)); 4536 } 4537 4538 vget_finish_ref(dvp, dvs); 4539 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4540 vrele(dvp); 4541 return (cache_fpl_aborted(fpl)); 4542 } 4543 4544 error = vn_lock(dvp, LK_SHARED); 4545 if (__predict_false(error != 0)) { 4546 vrele(dvp); 4547 return (cache_fpl_aborted(fpl)); 4548 } 4549 4550 tvp = NULL; 4551 /* 4552 * TODO: provide variants which don't require locking either vnode. 
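 *
 * For now the parent stays locked (shared, taken above) and VOP_LOOKUP is
 * issued for the last component with cn_lkflags set below, much like the
 * slow path would do.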
4553 */ 4554 cnp->cn_flags |= ISLASTCN; 4555 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; 4556 if (docache) 4557 cnp->cn_flags |= MAKEENTRY; 4558 cnp->cn_lkflags = LK_SHARED; 4559 if ((cnp->cn_flags & LOCKSHARED) == 0) { 4560 cnp->cn_lkflags = LK_EXCLUSIVE; 4561 } 4562 error = VOP_LOOKUP(dvp, &tvp, cnp); 4563 switch (error) { 4564 case EJUSTRETURN: 4565 case 0: 4566 break; 4567 case ENOTDIR: 4568 case ENOENT: 4569 vput(dvp); 4570 return (cache_fpl_handled_error(fpl, error)); 4571 default: 4572 vput(dvp); 4573 return (cache_fpl_aborted(fpl)); 4574 } 4575 4576 fpl->tvp = tvp; 4577 if (!fpl->savename) { 4578 MPASS((cnp->cn_flags & SAVENAME) == 0); 4579 } 4580 4581 if (tvp == NULL) { 4582 MPASS(error == EJUSTRETURN); 4583 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 4584 vput(dvp); 4585 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 4586 VOP_UNLOCK(dvp); 4587 } 4588 return (cache_fpl_handled(fpl)); 4589 } 4590 4591 if (__predict_false(tvp->v_type == VLNK || cache_fplookup_is_mp(fpl))) { 4592 vput(dvp); 4593 vput(tvp); 4594 return (cache_fpl_aborted(fpl)); 4595 } 4596 4597 if ((cnp->cn_flags & LOCKLEAF) == 0) { 4598 VOP_UNLOCK(tvp); 4599 } 4600 4601 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 4602 vput(dvp); 4603 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 4604 VOP_UNLOCK(dvp); 4605 } 4606 return (cache_fpl_handled(fpl)); 4607 } 4608 4609 static int __noinline 4610 cache_fplookup_dot(struct cache_fpl *fpl) 4611 { 4612 int error; 4613 4614 MPASS(!seqc_in_modify(fpl->dvp_seqc)); 4615 /* 4616 * Just re-assign the value. seqc will be checked later for the first 4617 * non-dot path component in line and/or before deciding to return the 4618 * vnode. 4619 */ 4620 fpl->tvp = fpl->dvp; 4621 fpl->tvp_seqc = fpl->dvp_seqc; 4622 4623 counter_u64_add(dothits, 1); 4624 SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp); 4625 4626 error = 0; 4627 if (cache_fplookup_is_mp(fpl)) { 4628 error = cache_fplookup_cross_mount(fpl); 4629 } 4630 return (error); 4631 } 4632 4633 static int __noinline 4634 cache_fplookup_dotdot(struct cache_fpl *fpl) 4635 { 4636 struct nameidata *ndp; 4637 struct componentname *cnp; 4638 struct namecache *ncp; 4639 struct vnode *dvp; 4640 struct prison *pr; 4641 u_char nc_flag; 4642 4643 ndp = fpl->ndp; 4644 cnp = fpl->cnp; 4645 dvp = fpl->dvp; 4646 4647 MPASS(cache_fpl_isdotdot(cnp)); 4648 4649 /* 4650 * XXX this is racy the same way regular lookup is 4651 */ 4652 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 4653 pr = pr->pr_parent) 4654 if (dvp == pr->pr_root) 4655 break; 4656 4657 if (dvp == ndp->ni_rootdir || 4658 dvp == ndp->ni_topdir || 4659 dvp == rootvnode || 4660 pr != NULL) { 4661 fpl->tvp = dvp; 4662 fpl->tvp_seqc = vn_seqc_read_any(dvp); 4663 if (seqc_in_modify(fpl->tvp_seqc)) { 4664 return (cache_fpl_aborted(fpl)); 4665 } 4666 return (0); 4667 } 4668 4669 if ((dvp->v_vflag & VV_ROOT) != 0) { 4670 /* 4671 * TODO 4672 * The opposite of climb mount is needed here. 
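 * That is, when dvp is the root of a mounted filesystem, ".." has to cross
 * down to the vnode covered by the mount the way regular lookup does; lacking
 * that, the lockless walk just bails to the slow path.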
4673 */ 4674 return (cache_fpl_aborted(fpl)); 4675 } 4676 4677 ncp = atomic_load_ptr(&dvp->v_cache_dd); 4678 if (ncp == NULL) { 4679 return (cache_fpl_aborted(fpl)); 4680 } 4681 4682 nc_flag = atomic_load_char(&ncp->nc_flag); 4683 if ((nc_flag & NCF_ISDOTDOT) != 0) { 4684 if ((nc_flag & NCF_NEGATIVE) != 0) 4685 return (cache_fpl_aborted(fpl)); 4686 fpl->tvp = ncp->nc_vp; 4687 } else { 4688 fpl->tvp = ncp->nc_dvp; 4689 } 4690 4691 if (!cache_ncp_canuse(ncp)) { 4692 return (cache_fpl_aborted(fpl)); 4693 } 4694 4695 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 4696 if (seqc_in_modify(fpl->tvp_seqc)) { 4697 return (cache_fpl_partial(fpl)); 4698 } 4699 4700 counter_u64_add(dotdothits, 1); 4701 return (0); 4702 } 4703 4704 static int __noinline 4705 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) 4706 { 4707 u_char nc_flag; 4708 bool neg_promote; 4709 4710 nc_flag = atomic_load_char(&ncp->nc_flag); 4711 MPASS((nc_flag & NCF_NEGATIVE) != 0); 4712 /* 4713 * If they want to create an entry we need to replace this one. 4714 */ 4715 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 4716 fpl->tvp = NULL; 4717 return (cache_fplookup_modifying(fpl)); 4718 } 4719 neg_promote = cache_neg_hit_prep(ncp); 4720 if (!cache_fpl_neg_ncp_canuse(ncp)) { 4721 cache_neg_hit_abort(ncp); 4722 return (cache_fpl_partial(fpl)); 4723 } 4724 if (neg_promote) { 4725 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 4726 } 4727 cache_neg_hit_finish(ncp); 4728 cache_fpl_smr_exit(fpl); 4729 return (cache_fpl_handled_error(fpl, ENOENT)); 4730 } 4731 4732 /* 4733 * Resolve a symlink. Called by filesystem-specific routines. 4734 * 4735 * Code flow is: 4736 * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve 4737 */ 4738 int 4739 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len) 4740 { 4741 struct nameidata *ndp; 4742 struct componentname *cnp; 4743 4744 ndp = fpl->ndp; 4745 cnp = fpl->cnp; 4746 4747 if (__predict_false(len == 0)) { 4748 return (ENOENT); 4749 } 4750 4751 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1; 4752 #ifdef INVARIANTS 4753 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 4754 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 4755 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 4756 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 4757 } 4758 #endif 4759 4760 if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) { 4761 return (ENAMETOOLONG); 4762 } 4763 4764 if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) { 4765 return (ELOOP); 4766 } 4767 4768 if (ndp->ni_pathlen > 1) { 4769 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen); 4770 } else { 4771 cnp->cn_pnbuf[len] = '\0'; 4772 } 4773 bcopy(string, cnp->cn_pnbuf, len); 4774 4775 ndp->ni_pathlen += len; 4776 cache_fpl_pathlen_add(fpl, len); 4777 cnp->cn_nameptr = cnp->cn_pnbuf; 4778 fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; 4779 4780 return (0); 4781 } 4782 4783 static int __noinline 4784 cache_fplookup_symlink(struct cache_fpl *fpl) 4785 { 4786 struct nameidata *ndp; 4787 struct componentname *cnp; 4788 struct vnode *dvp, *tvp; 4789 int error; 4790 4791 ndp = fpl->ndp; 4792 cnp = fpl->cnp; 4793 dvp = fpl->dvp; 4794 tvp = fpl->tvp; 4795 4796 if (cache_fpl_islastcn(ndp)) { 4797 if ((cnp->cn_flags & FOLLOW) == 0) { 4798 return (cache_fplookup_final(fpl)); 4799 } 4800 } 4801 4802 error = VOP_FPLOOKUP_SYMLINK(tvp, fpl); 4803 if (__predict_false(error != 0)) { 4804 
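		/*
		 * VOP_FPLOOKUP_SYMLINK could not resolve the link within the
		 * lockless walk. EAGAIN means the slow path has to take over,
		 * ENOENT/ENAMETOOLONG/ELOOP are definitive errors which can be
		 * returned as-is, anything else aborts to the regular lookup.
		 */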
switch (error) { 4805 case EAGAIN: 4806 return (cache_fpl_partial(fpl)); 4807 case ENOENT: 4808 case ENAMETOOLONG: 4809 case ELOOP: 4810 cache_fpl_smr_exit(fpl); 4811 return (cache_fpl_handled_error(fpl, error)); 4812 default: 4813 return (cache_fpl_aborted(fpl)); 4814 } 4815 } 4816 4817 if (*(cnp->cn_nameptr) == '/') { 4818 fpl->dvp = cache_fpl_handle_root(fpl); 4819 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 4820 if (seqc_in_modify(fpl->dvp_seqc)) { 4821 return (cache_fpl_aborted(fpl)); 4822 } 4823 } 4824 4825 return (cache_fplookup_preparse(fpl)); 4826 } 4827 4828 static int 4829 cache_fplookup_next(struct cache_fpl *fpl) 4830 { 4831 struct componentname *cnp; 4832 struct namecache *ncp; 4833 struct vnode *dvp, *tvp; 4834 u_char nc_flag; 4835 uint32_t hash; 4836 int error; 4837 4838 cnp = fpl->cnp; 4839 dvp = fpl->dvp; 4840 4841 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 4842 if (cnp->cn_namelen == 1) { 4843 return (cache_fplookup_dot(fpl)); 4844 } 4845 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 4846 return (cache_fplookup_dotdot(fpl)); 4847 } 4848 } 4849 4850 MPASS(!cache_fpl_isdotdot(cnp)); 4851 4852 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 4853 4854 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 4855 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 4856 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 4857 break; 4858 } 4859 4860 if (__predict_false(ncp == NULL)) { 4861 if (cnp->cn_nameptr[0] == '/') { 4862 return (cache_fplookup_skip_slashes(fpl)); 4863 } 4864 return (cache_fplookup_noentry(fpl)); 4865 } 4866 4867 tvp = atomic_load_ptr(&ncp->nc_vp); 4868 nc_flag = atomic_load_char(&ncp->nc_flag); 4869 if ((nc_flag & NCF_NEGATIVE) != 0) { 4870 return (cache_fplookup_neg(fpl, ncp, hash)); 4871 } 4872 4873 if (!cache_ncp_canuse(ncp)) { 4874 return (cache_fpl_partial(fpl)); 4875 } 4876 4877 fpl->tvp = tvp; 4878 fpl->tvp_seqc = vn_seqc_read_any(tvp); 4879 if (seqc_in_modify(fpl->tvp_seqc)) { 4880 return (cache_fpl_partial(fpl)); 4881 } 4882 4883 counter_u64_add(numposhits, 1); 4884 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 4885 4886 error = 0; 4887 if (cache_fplookup_is_mp(fpl)) { 4888 error = cache_fplookup_cross_mount(fpl); 4889 } 4890 return (error); 4891 } 4892 4893 static bool 4894 cache_fplookup_mp_supported(struct mount *mp) 4895 { 4896 4897 MPASS(mp != NULL); 4898 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 4899 return (false); 4900 return (true); 4901 } 4902 4903 /* 4904 * Walk up the mount stack (if any). 4905 * 4906 * Correctness is provided in the following ways: 4907 * - all vnodes are protected from freeing with SMR 4908 * - struct mount objects are type stable making them always safe to access 4909 * - stability of the particular mount is provided by busying it 4910 * - relationship between the vnode which is mounted on and the mount is 4911 * verified with the vnode sequence counter after busying 4912 * - association between root vnode of the mount and the mount is protected 4913 * by busy 4914 * 4915 * From that point on we can read the sequence counter of the root vnode 4916 * and get the next mount on the stack (if any) using the same protection. 4917 * 4918 * By the end of successful walk we are guaranteed the reached state was 4919 * indeed present at least at some point which matches the regular lookup. 
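 *
 * Conceptually the loop below boils down to (bail-out checks elided):
 *
 *	while ((mp = vp->v_mountedhere) != NULL) {
 *		busy mp;			// vfs_op_thread_enter_crit()
 *		revalidate vp against vp_seqc;	// or fall back to the slow path
 *		vp = mp->mnt_rootvnode;
 *		vp_seqc = vn_seqc_read_any(vp);
 *	}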
4920 */ 4921 static int __noinline 4922 cache_fplookup_climb_mount(struct cache_fpl *fpl) 4923 { 4924 struct mount *mp, *prev_mp; 4925 struct mount_pcpu *mpcpu, *prev_mpcpu; 4926 struct vnode *vp; 4927 seqc_t vp_seqc; 4928 4929 vp = fpl->tvp; 4930 vp_seqc = fpl->tvp_seqc; 4931 4932 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 4933 mp = atomic_load_ptr(&vp->v_mountedhere); 4934 if (__predict_false(mp == NULL)) { 4935 return (0); 4936 } 4937 4938 prev_mp = NULL; 4939 for (;;) { 4940 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 4941 if (prev_mp != NULL) 4942 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 4943 return (cache_fpl_partial(fpl)); 4944 } 4945 if (prev_mp != NULL) 4946 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 4947 if (!vn_seqc_consistent(vp, vp_seqc)) { 4948 vfs_op_thread_exit_crit(mp, mpcpu); 4949 return (cache_fpl_partial(fpl)); 4950 } 4951 if (!cache_fplookup_mp_supported(mp)) { 4952 vfs_op_thread_exit_crit(mp, mpcpu); 4953 return (cache_fpl_partial(fpl)); 4954 } 4955 vp = atomic_load_ptr(&mp->mnt_rootvnode); 4956 if (vp == NULL) { 4957 vfs_op_thread_exit_crit(mp, mpcpu); 4958 return (cache_fpl_partial(fpl)); 4959 } 4960 vp_seqc = vn_seqc_read_any(vp); 4961 if (seqc_in_modify(vp_seqc)) { 4962 vfs_op_thread_exit_crit(mp, mpcpu); 4963 return (cache_fpl_partial(fpl)); 4964 } 4965 prev_mp = mp; 4966 prev_mpcpu = mpcpu; 4967 mp = atomic_load_ptr(&vp->v_mountedhere); 4968 if (mp == NULL) 4969 break; 4970 } 4971 4972 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 4973 fpl->tvp = vp; 4974 fpl->tvp_seqc = vp_seqc; 4975 return (0); 4976 } 4977 4978 static int __noinline 4979 cache_fplookup_cross_mount(struct cache_fpl *fpl) 4980 { 4981 struct mount *mp; 4982 struct mount_pcpu *mpcpu; 4983 struct vnode *vp; 4984 seqc_t vp_seqc; 4985 4986 vp = fpl->tvp; 4987 vp_seqc = fpl->tvp_seqc; 4988 4989 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 4990 mp = atomic_load_ptr(&vp->v_mountedhere); 4991 if (__predict_false(mp == NULL)) { 4992 return (0); 4993 } 4994 4995 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 4996 return (cache_fpl_partial(fpl)); 4997 } 4998 if (!vn_seqc_consistent(vp, vp_seqc)) { 4999 vfs_op_thread_exit_crit(mp, mpcpu); 5000 return (cache_fpl_partial(fpl)); 5001 } 5002 if (!cache_fplookup_mp_supported(mp)) { 5003 vfs_op_thread_exit_crit(mp, mpcpu); 5004 return (cache_fpl_partial(fpl)); 5005 } 5006 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5007 if (__predict_false(vp == NULL)) { 5008 vfs_op_thread_exit_crit(mp, mpcpu); 5009 return (cache_fpl_partial(fpl)); 5010 } 5011 vp_seqc = vn_seqc_read_any(vp); 5012 vfs_op_thread_exit_crit(mp, mpcpu); 5013 if (seqc_in_modify(vp_seqc)) { 5014 return (cache_fpl_partial(fpl)); 5015 } 5016 mp = atomic_load_ptr(&vp->v_mountedhere); 5017 if (__predict_false(mp != NULL)) { 5018 /* 5019 * There are possibly more mount points on top. 5020 * Normally this does not happen so for simplicity just start 5021 * over. 5022 */ 5023 return (cache_fplookup_climb_mount(fpl)); 5024 } 5025 5026 fpl->tvp = vp; 5027 fpl->tvp_seqc = vp_seqc; 5028 return (0); 5029 } 5030 5031 /* 5032 * Check if a vnode is mounted on. 5033 */ 5034 static bool 5035 cache_fplookup_is_mp(struct cache_fpl *fpl) 5036 { 5037 struct vnode *vp; 5038 5039 vp = fpl->tvp; 5040 return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0); 5041 } 5042 5043 /* 5044 * Parse the path. 5045 * 5046 * The code was originally copy-pasted from regular lookup and despite 5047 * clean ups leaves performance on the table. 
Any modifications here 5048 * must take into account that in case of fallback the resulting 5049 * nameidata state has to be compatible with the original. 5050 */ 5051 5052 /* 5053 * Debug ni_pathlen tracking. 5054 */ 5055 #ifdef INVARIANTS 5056 static void 5057 cache_fpl_pathlen_dec(struct cache_fpl *fpl) 5058 { 5059 5060 cache_fpl_pathlen_sub(fpl, 1); 5061 } 5062 5063 static void 5064 cache_fpl_pathlen_inc(struct cache_fpl *fpl) 5065 { 5066 5067 cache_fpl_pathlen_add(fpl, 1); 5068 } 5069 5070 static void 5071 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) 5072 { 5073 5074 fpl->debug.ni_pathlen += n; 5075 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, 5076 ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen)); 5077 } 5078 5079 static void 5080 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) 5081 { 5082 5083 fpl->debug.ni_pathlen -= n; 5084 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, 5085 ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen)); 5086 } 5087 #else 5088 static void __always_inline 5089 cache_fpl_pathlen_dec(struct cache_fpl *fpl) 5090 { 5091 } 5092 5093 static void __always_inline 5094 cache_fpl_pathlen_inc(struct cache_fpl *fpl) 5095 { 5096 } 5097 5098 static void 5099 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) 5100 { 5101 } 5102 5103 static void 5104 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) 5105 { 5106 } 5107 #endif 5108 5109 static int __always_inline 5110 cache_fplookup_preparse(struct cache_fpl *fpl) 5111 { 5112 struct componentname *cnp; 5113 5114 cnp = fpl->cnp; 5115 5116 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 5117 return (cache_fplookup_degenerate(fpl)); 5118 } 5119 5120 /* 5121 * By this point the shortest possible pathname is one character + nul 5122 * terminator, hence 2. 5123 */ 5124 KASSERT(fpl->debug.ni_pathlen >= 2, ("%s: pathlen %zu\n", __func__, 5125 fpl->debug.ni_pathlen)); 5126 KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 2] == fpl->nulchar - 1, 5127 ("%s: mismatch on string (%p != %p) [%s]\n", __func__, 5128 &cnp->cn_nameptr[fpl->debug.ni_pathlen - 2], fpl->nulchar - 1, 5129 cnp->cn_pnbuf)); 5130 if (__predict_false(*(fpl->nulchar - 1) == '/')) { 5131 /* 5132 * TODO 5133 * Regular lookup performs the following: 5134 * *ndp->ni_next = '\0'; 5135 * cnp->cn_flags |= TRAILINGSLASH; 5136 * 5137 * Which is problematic since it modifies data read 5138 * from userspace. Then if fast path lookup was to 5139 * abort we would have to either restore it or convey 5140 * the flag. Since this is a corner case just ignore 5141 * it for simplicity. 5142 */ 5143 return (cache_fpl_aborted(fpl)); 5144 } 5145 return (0); 5146 } 5147 5148 static int 5149 cache_fplookup_parse(struct cache_fpl *fpl) 5150 { 5151 struct nameidata *ndp; 5152 struct componentname *cnp; 5153 char *cp; 5154 5155 ndp = fpl->ndp; 5156 cnp = fpl->cnp; 5157 5158 /* 5159 * Find the end of this path component, it is either / or nul. 5160 * 5161 * Store / as a temporary sentinel so that we only have one character 5162 * to test for. Pathnames tend to be short so this should not 5163 * result in cache misses.
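 *
 * The effect: with the nul temporarily overwritten by '/', the loop below
 * only ever compares against '/', e.g. for "foo/bar" the scan of "foo" stops
 * at the real slash while the scan of the final "bar" stops at the sentinel;
 * the nul is restored right afterwards.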
5164 */ 5165 KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar, 5166 ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n", 5167 __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1], 5168 fpl->nulchar, cnp->cn_pnbuf)); 5169 KASSERT(*fpl->nulchar == '\0', 5170 ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar, 5171 cnp->cn_pnbuf)); 5172 *fpl->nulchar = '/'; 5173 for (cp = cnp->cn_nameptr; *cp != '/'; cp++) { 5174 KASSERT(*cp != '\0', 5175 ("%s: encountered unexpected nul; string [%s]\n", __func__, 5176 cnp->cn_nameptr)); 5177 continue; 5178 } 5179 *fpl->nulchar = '\0'; 5180 5181 cnp->cn_namelen = cp - cnp->cn_nameptr; 5182 cache_fpl_pathlen_sub(fpl, cnp->cn_namelen); 5183 /* 5184 * Hack: we have to check if the found path component's length exceeds 5185 * NAME_MAX. However, the condition is very rarely true and the check can 5186 * be elided in the common case -- if an entry was found in the cache, 5187 * then it could not have been too long to begin with. 5188 */ 5189 ndp->ni_next = cp; 5190 5191 #ifdef INVARIANTS 5192 /* 5193 * Code below is only here to assure compatibility with regular lookup. 5194 * It covers handling of trailing slashes and names like "/", both of 5195 * which can be taken care of upfront, which lockless lookup does 5196 * in cache_fplookup_preparse. Regular lookup performs these for each 5197 * path component. 5198 */ 5199 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 5200 cp++; 5201 if (*cp == '\0') { 5202 panic("%s: ran into TRAILINGSLASH handling from [%s]\n", 5203 __func__, cnp->cn_pnbuf); 5204 } 5205 } 5206 5207 if (cnp->cn_nameptr[0] == '\0') { 5208 panic("%s: ran into degenerate name from [%s]\n", __func__, cnp->cn_pnbuf); 5209 } 5210 #endif 5211 return (0); 5212 } 5213 5214 static void 5215 cache_fplookup_parse_advance(struct cache_fpl *fpl) 5216 { 5217 struct nameidata *ndp; 5218 struct componentname *cnp; 5219 5220 ndp = fpl->ndp; 5221 cnp = fpl->cnp; 5222 5223 cnp->cn_nameptr = ndp->ni_next; 5224 KASSERT(*(cnp->cn_nameptr) == '/', 5225 ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__, 5226 cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf)); 5227 cnp->cn_nameptr++; 5228 cache_fpl_pathlen_dec(fpl); 5229 } 5230 5231 /* 5232 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry. 5233 * 5234 * Lockless lookup tries to elide checking for spurious slashes and, should they 5235 * be present, is guaranteed to fail to find an entry. In this case the caller 5236 * must check if the name starts with a slash and call this routine. It is 5237 * going to fast forward across the spurious slashes and set the state up for 5238 * retry. 5239 */ 5240 static int __noinline 5241 cache_fplookup_skip_slashes(struct cache_fpl *fpl) 5242 { 5243 struct nameidata *ndp; 5244 struct componentname *cnp; 5245 5246 ndp = fpl->ndp; 5247 cnp = fpl->cnp; 5248 5249 MPASS(*(cnp->cn_nameptr) == '/'); 5250 do { 5251 cnp->cn_nameptr++; 5252 cache_fpl_pathlen_dec(fpl); 5253 } while (*(cnp->cn_nameptr) == '/'); 5254 5255 /* 5256 * Go back to one slash so that cache_fplookup_parse_advance has 5257 * something to skip. 5258 */ 5259 cnp->cn_nameptr--; 5260 cache_fpl_pathlen_inc(fpl); 5261 5262 /* 5263 * cache_fplookup_parse_advance starts from ndp->ni_next 5264 */ 5265 ndp->ni_next = cnp->cn_nameptr; 5266 5267 /* 5268 * See cache_fplookup_dot.
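 * As in the "." case tvp merely aliases dvp here: the main loop treats the
 * component as resolved, advances past the slash backed up to above and
 * re-parses starting from the same directory.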
5269 */ 5270 fpl->tvp = fpl->dvp; 5271 fpl->tvp_seqc = fpl->dvp_seqc; 5272 5273 return (0); 5274 } 5275 5276 /* 5277 * See the API contract for VOP_FPLOOKUP_VEXEC. 5278 */ 5279 static int __noinline 5280 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 5281 { 5282 struct componentname *cnp; 5283 struct vnode *dvp; 5284 seqc_t dvp_seqc; 5285 5286 cnp = fpl->cnp; 5287 dvp = fpl->dvp; 5288 dvp_seqc = fpl->dvp_seqc; 5289 5290 /* 5291 * Hack: delayed name len checking. 5292 */ 5293 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 5294 cache_fpl_smr_exit(fpl); 5295 return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); 5296 } 5297 5298 /* 5299 * Hack: they may be looking up foo/bar, where foo is a 5300 * regular file. In such a case we need to return ENOTDIR, 5301 * but we may happen to get here with a different error. 5302 */ 5303 if (dvp->v_type != VDIR) { 5304 /* 5305 * The check here is predominantly to catch 5306 * EOPNOTSUPP from dead_vnodeops. If the vnode 5307 * gets doomed past this point it is going to 5308 * fail seqc verification. 5309 */ 5310 if (VN_IS_DOOMED(dvp)) { 5311 return (cache_fpl_aborted(fpl)); 5312 } 5313 error = ENOTDIR; 5314 } 5315 5316 /* 5317 * Hack: handle O_SEARCH. 5318 * 5319 * Open Group Base Specifications Issue 7, 2018 edition states: 5320 * If the access mode of the open file description associated with the 5321 * file descriptor is not O_SEARCH, the function shall check whether 5322 * directory searches are permitted using the current permissions of 5323 * the directory underlying the file descriptor. If the access mode is 5324 * O_SEARCH, the function shall not perform the check. 5325 * 5326 * Regular lookup tests for the NOEXECCHECK flag for every path 5327 * component to decide whether to do the permission check. However, 5328 * since most lookups never have the flag (and when they do it is only 5329 * present for the first path component), lockless lookup only acts on 5330 * it if there is a permission problem. Here the flag is represented 5331 * with a boolean so that we don't have to clear it on the way out. 5332 * 5333 * For simplicity this always aborts. 5334 * TODO: check if this is the first lookup and ignore the permission 5335 * problem. Note the flag has to survive fallback (if it happens to be 5336 * performed). 5337 */ 5338 if (fpl->fsearch) { 5339 return (cache_fpl_aborted(fpl)); 5340 } 5341 5342 switch (error) { 5343 case EAGAIN: 5344 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 5345 error = cache_fpl_aborted(fpl); 5346 } else { 5347 cache_fpl_partial(fpl); 5348 } 5349 break; 5350 default: 5351 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 5352 error = cache_fpl_aborted(fpl); 5353 } else { 5354 cache_fpl_smr_exit(fpl); 5355 cache_fpl_handled_error(fpl, error); 5356 } 5357 break; 5358 } 5359 return (error); 5360 } 5361 5362 static int 5363 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 5364 { 5365 struct nameidata *ndp; 5366 struct componentname *cnp; 5367 struct mount *mp; 5368 int error; 5369 5370 ndp = fpl->ndp; 5371 cnp = fpl->cnp; 5372 5373 cache_fpl_checkpoint(fpl); 5374 5375 /* 5376 * The vnode at hand is almost always stable, skip checking for it. 5377 * Worst case this postpones the check towards the end of the iteration 5378 * of the main loop.
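 * (The counter is read with vn_seqc_read_notmodify() below, so a modification
 * which is already in flight is only caught once the counter gets re-checked
 * later in the loop.)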
5379 */ 5380 fpl->dvp = dvp; 5381 fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp); 5382 5383 mp = atomic_load_ptr(&dvp->v_mount); 5384 if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) { 5385 return (cache_fpl_aborted(fpl)); 5386 } 5387 5388 error = cache_fplookup_preparse(fpl); 5389 if (__predict_false(cache_fpl_terminated(fpl))) { 5390 return (error); 5391 } 5392 5393 for (;;) { 5394 error = cache_fplookup_parse(fpl); 5395 if (__predict_false(error != 0)) { 5396 break; 5397 } 5398 5399 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); 5400 if (__predict_false(error != 0)) { 5401 error = cache_fplookup_failed_vexec(fpl, error); 5402 break; 5403 } 5404 5405 error = cache_fplookup_next(fpl); 5406 if (__predict_false(cache_fpl_terminated(fpl))) { 5407 break; 5408 } 5409 5410 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 5411 5412 if (fpl->tvp->v_type == VLNK) { 5413 error = cache_fplookup_symlink(fpl); 5414 if (cache_fpl_terminated(fpl)) { 5415 break; 5416 } 5417 } else { 5418 if (cache_fpl_islastcn(ndp)) { 5419 error = cache_fplookup_final(fpl); 5420 break; 5421 } 5422 5423 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 5424 error = cache_fpl_aborted(fpl); 5425 break; 5426 } 5427 5428 fpl->dvp = fpl->tvp; 5429 fpl->dvp_seqc = fpl->tvp_seqc; 5430 cache_fplookup_parse_advance(fpl); 5431 } 5432 5433 cache_fpl_checkpoint(fpl); 5434 } 5435 5436 return (error); 5437 } 5438 5439 /* 5440 * Fast path lookup protected with SMR and sequence counters. 5441 * 5442 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 5443 * 5444 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria 5445 * outlined below. 5446 * 5447 * Traditional vnode lookup conceptually looks like this: 5448 * 5449 * vn_lock(current); 5450 * for (;;) { 5451 * next = find(); 5452 * vn_lock(next); 5453 * vn_unlock(current); 5454 * current = next; 5455 * if (last) 5456 * break; 5457 * } 5458 * return (current); 5459 * 5460 * Each jump to the next vnode is safe memory-wise and atomic with respect to 5461 * any modifications thanks to holding respective locks. 5462 * 5463 * The same guarantee can be provided with a combination of safe memory 5464 * reclamation and sequence counters instead. If all operations which affect 5465 * the relationship between the current vnode and the one we are looking for 5466 * also modify the counter, we can verify whether all the conditions held as 5467 * we made the jump. This includes things like permissions, mount points etc. 5468 * Counter modification is provided by enclosing relevant places in 5469 * vn_seqc_write_begin()/end() calls. 
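 * For example, operations which change what a lookup crossing a vnode would
 * observe (renames, permission changes, unmount and the like) are expected to
 * be bracketed this way, forcing a concurrent lockless walk to fail the
 * consistency check and fall back.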
5470 * 5471 * Thus this translates to: 5472 * 5473 * vfs_smr_enter(); 5474 * dvp_seqc = seqc_read_any(dvp); 5475 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode 5476 * abort(); 5477 * for (;;) { 5478 * tvp = find(); 5479 * tvp_seqc = seqc_read_any(tvp); 5480 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode 5481 * abort(); 5482 * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode 5483 * abort(); 5484 * dvp = tvp; // we know nothing of importance has changed 5485 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration 5486 * if (last) 5487 * break; 5488 * } 5489 * vget(); // secure the vnode 5490 * if (!seqc_consistent(tvp, tvp_seqc) // final check 5491 * abort(); 5492 * // at this point we know nothing has changed for any parent<->child pair 5493 * // as they were crossed during the lookup, meaning we matched the guarantee 5494 * // of the locked variant 5495 * return (tvp); 5496 * 5497 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: 5498 * - they are called while within vfs_smr protection which they must never exit 5499 * - EAGAIN can be returned to denote checking could not be performed, it is 5500 * always valid to return it 5501 * - if the sequence counter has not changed the result must be valid 5502 * - if the sequence counter has changed both false positives and false negatives 5503 * are permitted (since the result will be rejected later) 5504 * - for simple cases of unix permission checks vaccess_vexec_smr can be used 5505 * 5506 * Caveats to watch out for: 5507 * - vnodes are passed unlocked and unreferenced with nothing stopping 5508 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised 5509 * to use atomic_load_ptr to fetch it. 5510 * - the aforementioned object can also get freed, meaning absent other means it 5511 * should be protected with vfs_smr 5512 * - either safely checking permissions as they are modified or guaranteeing 5513 * their stability is left to the routine 5514 */ 5515 int 5516 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, 5517 struct pwd **pwdp) 5518 { 5519 struct cache_fpl fpl; 5520 struct pwd *pwd; 5521 struct vnode *dvp; 5522 struct componentname *cnp; 5523 int error; 5524 5525 fpl.status = CACHE_FPL_STATUS_UNSET; 5526 fpl.in_smr = false; 5527 fpl.ndp = ndp; 5528 fpl.cnp = cnp = &ndp->ni_cnd; 5529 MPASS(ndp->ni_lcf == 0); 5530 MPASS(curthread == cnp->cn_thread); 5531 KASSERT ((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 5532 ("%s: internal flags found in cn_flags %" PRIx64, __func__, 5533 cnp->cn_flags)); 5534 if ((cnp->cn_flags & SAVESTART) != 0) { 5535 MPASS(cnp->cn_nameiop != LOOKUP); 5536 } 5537 MPASS(cnp->cn_nameptr == cnp->cn_pnbuf); 5538 5539 if (__predict_false(!cache_can_fplookup(&fpl))) { 5540 *status = fpl.status; 5541 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 5542 return (EOPNOTSUPP); 5543 } 5544 5545 cache_fpl_checkpoint_outer(&fpl); 5546 5547 cache_fpl_smr_enter_initial(&fpl); 5548 #ifdef INVARIANTS 5549 fpl.debug.ni_pathlen = ndp->ni_pathlen; 5550 #endif 5551 fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; 5552 fpl.fsearch = false; 5553 fpl.savename = (cnp->cn_flags & SAVENAME) != 0; 5554 fpl.pwd = pwdp; 5555 pwd = pwd_get_smr(); 5556 *(fpl.pwd) = pwd; 5557 ndp->ni_rootdir = pwd->pwd_rdir; 5558 ndp->ni_topdir = pwd->pwd_jdir; 5559 5560 if (cnp->cn_pnbuf[0] == '/') { 5561 dvp = cache_fpl_handle_root(&fpl); 5562 MPASS(ndp->ni_resflags == 0); 5563 ndp->ni_resflags = NIRES_ABS; 
5564 } else { 5565 if (ndp->ni_dirfd == AT_FDCWD) { 5566 dvp = pwd->pwd_cdir; 5567 } else { 5568 error = cache_fplookup_dirfd(&fpl, &dvp); 5569 if (__predict_false(error != 0)) { 5570 goto out; 5571 } 5572 } 5573 } 5574 5575 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); 5576 error = cache_fplookup_impl(dvp, &fpl); 5577 out: 5578 cache_fpl_smr_assert_not_entered(&fpl); 5579 cache_fpl_assert_status(&fpl); 5580 *status = fpl.status; 5581 if (SDT_PROBES_ENABLED()) { 5582 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 5583 if (fpl.status == CACHE_FPL_STATUS_HANDLED) 5584 SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true, 5585 ndp); 5586 } 5587 5588 if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) { 5589 MPASS(error != CACHE_FPL_FAILED); 5590 if (error != 0) { 5591 MPASS(fpl.dvp == NULL); 5592 MPASS(fpl.tvp == NULL); 5593 MPASS(fpl.savename == false); 5594 } 5595 ndp->ni_dvp = fpl.dvp; 5596 ndp->ni_vp = fpl.tvp; 5597 if (fpl.savename) { 5598 cnp->cn_flags |= HASBUF; 5599 } else { 5600 cache_fpl_cleanup_cnp(cnp); 5601 } 5602 } 5603 return (error); 5604 } 5605