1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ddb.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/capsicum.h> 46 #include <sys/counter.h> 47 #include <sys/filedesc.h> 48 #include <sys/fnv_hash.h> 49 #include <sys/kernel.h> 50 #include <sys/ktr.h> 51 #include <sys/lock.h> 52 #include <sys/malloc.h> 53 #include <sys/fcntl.h> 54 #include <sys/jail.h> 55 #include <sys/mount.h> 56 #include <sys/namei.h> 57 #include <sys/proc.h> 58 #include <sys/seqc.h> 59 #include <sys/sdt.h> 60 #include <sys/smr.h> 61 #include <sys/smp.h> 62 #include <sys/syscallsubr.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysproto.h> 65 #include <sys/vnode.h> 66 #include <ck_queue.h> 67 #ifdef KTRACE 68 #include <sys/ktrace.h> 69 #endif 70 71 #include <sys/capsicum.h> 72 73 #include <security/audit/audit.h> 74 #include <security/mac/mac_framework.h> 75 76 #ifdef DDB 77 #include <ddb/ddb.h> 78 #endif 79 80 #include <vm/uma.h> 81 82 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 83 "Name cache"); 84 85 SDT_PROVIDER_DECLARE(vfs); 86 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 87 "struct vnode *"); 88 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *", 89 "struct vnode *"); 90 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 91 "char *"); 92 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *", 93 "const char *"); 94 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *", 95 "struct namecache *", "int", "int"); 96 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 97 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 98 "char *", "struct vnode *"); 99 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 100 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 101 "struct vnode *", "char *"); 102 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 103 "struct vnode *"); 104 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 105 "struct vnode *", "char *"); 106 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 107 "char *"); 108 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", 109 "struct componentname *"); 110 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", 111 "struct componentname *"); 112 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 113 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int"); 114 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 115 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 116 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 117 "struct vnode *"); 118 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 119 "char *"); 120 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *", 121 "char *"); 122 123 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); 124 SDT_PROBE_DECLARE(vfs, namei, lookup, entry); 125 SDT_PROBE_DECLARE(vfs, namei, lookup, return); 126 127 /* 128 * This structure describes the elements in the cache of recent 129 * names looked up by namei. 
130 */ 131 struct negstate { 132 u_char neg_flag; 133 u_char neg_hit; 134 }; 135 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), 136 "the state must fit in a union with a pointer without growing it"); 137 138 struct namecache { 139 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 140 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 141 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */ 142 struct vnode *nc_dvp; /* vnode of parent of name */ 143 union { 144 struct vnode *nu_vp; /* vnode the name refers to */ 145 struct negstate nu_neg;/* negative entry state */ 146 } n_un; 147 u_char nc_flag; /* flag bits */ 148 u_char nc_nlen; /* length of name */ 149 char nc_name[0]; /* segment name + nul */ 150 }; 151 152 /* 153 * struct namecache_ts repeats struct namecache layout up to the 154 * nc_nlen member. 155 * struct namecache_ts is used in place of struct namecache when time(s) need 156 * to be stored. The nc_dotdottime field is used when a cache entry is mapping 157 * both a non-dotdot directory name plus dotdot for the directory's 158 * parent. 159 * 160 * See below for alignment requirement. 161 */ 162 struct namecache_ts { 163 struct timespec nc_time; /* timespec provided by fs */ 164 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 165 int nc_ticks; /* ticks value when entry was added */ 166 int nc_pad; 167 struct namecache nc_nc; 168 }; 169 170 TAILQ_HEAD(cache_freebatch, namecache); 171 172 /* 173 * At least mips n32 performs 64-bit accesses to timespec as found 174 * in namecache_ts and requires them to be aligned. Since others 175 * may be in the same spot suffer a little bit and enforce the 176 * alignment for everyone. Note this is a nop for 64-bit platforms. 177 */ 178 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) 179 180 /* 181 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the 182 * 4.4 BSD codebase. Later on struct namecache was tweaked to become 183 * smaller and the value was bumped to retain the total size, but it 184 * was never re-evaluated for suitability. A simple test counting 185 * lengths during package building shows that the value of 45 covers 186 * about 86% of all added entries, reaching 99% at 65. 187 * 188 * Regardless of the above, use of dedicated zones instead of malloc may be 189 * inducing additional waste. This may be hard to address as said zones are 190 * tied to VFS SMR. Even if retaining them, the current split should be 191 * re-evaluated. 
192 */ 193 #ifdef __LP64__ 194 #define CACHE_PATH_CUTOFF 45 195 #define CACHE_LARGE_PAD 6 196 #else 197 #define CACHE_PATH_CUTOFF 41 198 #define CACHE_LARGE_PAD 2 199 #endif 200 201 #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1) 202 #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE) 203 #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD) 204 #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE) 205 206 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 207 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 208 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 209 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 210 211 #define nc_vp n_un.nu_vp 212 #define nc_neg n_un.nu_neg 213 214 /* 215 * Flags in namecache.nc_flag 216 */ 217 #define NCF_WHITE 0x01 218 #define NCF_ISDOTDOT 0x02 219 #define NCF_TS 0x04 220 #define NCF_DTS 0x08 221 #define NCF_DVDROP 0x10 222 #define NCF_NEGATIVE 0x20 223 #define NCF_INVALID 0x40 224 #define NCF_WIP 0x80 225 226 /* 227 * Flags in negstate.neg_flag 228 */ 229 #define NEG_HOT 0x01 230 231 static bool cache_neg_evict_cond(u_long lnumcache); 232 233 /* 234 * Mark an entry as invalid. 235 * 236 * This is called before it starts getting deconstructed. 237 */ 238 static void 239 cache_ncp_invalidate(struct namecache *ncp) 240 { 241 242 KASSERT((ncp->nc_flag & NCF_INVALID) == 0, 243 ("%s: entry %p already invalid", __func__, ncp)); 244 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); 245 atomic_thread_fence_rel(); 246 } 247 248 /* 249 * Check whether the entry can be safely used. 250 * 251 * All places which elide locks are supposed to call this after they are 252 * done with reading from an entry. 253 */ 254 #define cache_ncp_canuse(ncp) ({ \ 255 struct namecache *_ncp = (ncp); \ 256 u_char _nc_flag; \ 257 \ 258 atomic_thread_fence_acq(); \ 259 _nc_flag = atomic_load_char(&_ncp->nc_flag); \ 260 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \ 261 }) 262 263 /* 264 * Name caching works as follows: 265 * 266 * Names found by directory scans are retained in a cache 267 * for future reference. It is managed LRU, so frequently 268 * used names will hang around. Cache is indexed by hash value 269 * obtained from (dvp, name) where dvp refers to the directory 270 * containing name. 271 * 272 * If it is a "negative" entry, (i.e. for a name that is known NOT to 273 * exist) the vnode pointer will be NULL. 274 * 275 * Upon reaching the last segment of a path, if the reference 276 * is for DELETE, or NOCACHE is set (rewrite), and the 277 * name is located in the cache, it will be dropped. 278 * 279 * These locks are used (in the order in which they can be taken): 280 * NAME TYPE ROLE 281 * vnodelock mtx vnode lists and v_cache_dd field protection 282 * bucketlock mtx for access to given set of hash buckets 283 * neglist mtx negative entry LRU management 284 * 285 * It is legal to take multiple vnodelock and bucketlock locks. The locking 286 * order is lower address first. Both are recursive. 287 * 288 * "." lookups are lockless. 289 * 290 * ".." and vnode -> name lookups require vnodelock. 291 * 292 * name -> vnode lookup requires the relevant bucketlock to be held for reading. 
293 * 294 * Insertions and removals of entries require involved vnodes and bucketlocks 295 * to be locked to provide safe operation against other threads modifying the 296 * cache. 297 * 298 * Some lookups result in removal of the found entry (e.g. getting rid of a 299 * negative entry with the intent to create a positive one), which poses a 300 * problem when multiple threads reach the state. Similarly, two different 301 * threads can purge two different vnodes and try to remove the same name. 302 * 303 * If the already held vnode lock is lower than the second required lock, we 304 * can just take the other lock. However, in the opposite case, this could 305 * deadlock. As such, this is resolved by trylocking and if that fails unlocking 306 * the first node, locking everything in order and revalidating the state. 307 */ 308 309 VFS_SMR_DECLARE; 310 311 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 312 "Name cache parameters"); 313 314 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ 315 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0, 316 "Total namecache capacity"); 317 318 u_int ncsizefactor = 2; 319 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0, 320 "Size factor for namecache"); 321 322 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ 323 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0, 324 "Ratio of negative namecache entries"); 325 326 /* 327 * Negative entry % of namecache capacity above which automatic eviction is allowed. 328 * 329 * Check cache_neg_evict_cond for details. 330 */ 331 static u_int ncnegminpct = 3; 332 333 static u_int __read_mostly neg_min; /* the above recomputed against ncsize */ 334 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0, 335 "Negative entry count above which automatic eviction is allowed"); 336 337 /* 338 * Structures associated with name caching. 
339 */ 340 #define NCHHASH(hash) \ 341 (&nchashtbl[(hash) & nchash]) 342 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 343 static u_long __read_mostly nchash; /* size of hash table */ 344 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 345 "Size of namecache hash table"); 346 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 347 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 348 349 struct nchstats nchstats; /* cache effectiveness statistics */ 350 351 static bool __read_frequently cache_fast_revlookup = true; 352 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW, 353 &cache_fast_revlookup, 0, ""); 354 355 static u_int __exclusive_cache_line neg_cycle; 356 357 #define ncneghash 3 358 #define numneglists (ncneghash + 1) 359 360 struct neglist { 361 struct mtx nl_evict_lock; 362 struct mtx nl_lock __aligned(CACHE_LINE_SIZE); 363 TAILQ_HEAD(, namecache) nl_list; 364 TAILQ_HEAD(, namecache) nl_hotlist; 365 u_long nl_hotnum; 366 } __aligned(CACHE_LINE_SIZE); 367 368 static struct neglist neglists[numneglists]; 369 370 static inline struct neglist * 371 NCP2NEGLIST(struct namecache *ncp) 372 { 373 374 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 375 } 376 377 static inline struct negstate * 378 NCP2NEGSTATE(struct namecache *ncp) 379 { 380 381 MPASS(ncp->nc_flag & NCF_NEGATIVE); 382 return (&ncp->nc_neg); 383 } 384 385 #define numbucketlocks (ncbuckethash + 1) 386 static u_int __read_mostly ncbuckethash; 387 static struct mtx_padalign __read_mostly *bucketlocks; 388 #define HASH2BUCKETLOCK(hash) \ 389 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)])) 390 391 #define numvnodelocks (ncvnodehash + 1) 392 static u_int __read_mostly ncvnodehash; 393 static struct mtx __read_mostly *vnodelocks; 394 static inline struct mtx * 395 VP2VNODELOCK(struct vnode *vp) 396 { 397 398 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 399 } 400 401 static void 402 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 403 { 404 struct namecache_ts *ncp_ts; 405 406 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 407 (tsp == NULL && ticksp == NULL), 408 ("No NCF_TS")); 409 410 if (tsp == NULL) 411 return; 412 413 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 414 *tsp = ncp_ts->nc_time; 415 *ticksp = ncp_ts->nc_ticks; 416 } 417 418 #ifdef DEBUG_CACHE 419 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 420 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 421 "VFS namecache enabled"); 422 #endif 423 424 /* Export size information to userland */ 425 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 426 sizeof(struct namecache), "sizeof(struct namecache)"); 427 428 /* 429 * The new name cache statistics 430 */ 431 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 432 "Name cache statistics"); 433 434 #define STATNODE_ULONG(name, varname, descr) \ 435 SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 436 #define STATNODE_COUNTER(name, varname, descr) \ 437 static COUNTER_U64_DEFINE_EARLY(varname); \ 438 SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \ 439 descr); 440 STATNODE_ULONG(neg, numneg, "Number of negative cache entries"); 441 STATNODE_ULONG(count, numcache, "Number of cache entries"); 442 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes 
held"); 443 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit"); 444 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits"); 445 STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits"); 446 STATNODE_COUNTER(miss, nummiss, "Number of cache misses"); 447 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache"); 448 STATNODE_COUNTER(posszaps, numposzaps, 449 "Number of cache hits (positive) we do not want to cache"); 450 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)"); 451 STATNODE_COUNTER(negzaps, numnegzaps, 452 "Number of cache hits (negative) we do not want to cache"); 453 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)"); 454 /* These count for vn_getcwd(), too. */ 455 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls"); 456 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 457 STATNODE_COUNTER(fullpathfail2, numfullpathfail2, 458 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 459 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 460 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls"); 461 462 /* 463 * Debug or developer statistics. 464 */ 465 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 466 "Name cache debugging"); 467 #define DEBUGNODE_ULONG(name, varname, descr) \ 468 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 469 #define DEBUGNODE_COUNTER(name, varname, descr) \ 470 static COUNTER_U64_DEFINE_EARLY(varname); \ 471 SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \ 472 descr); 473 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success, 474 "Number of successful removals after relocking"); 475 static long zap_bucket_fail; 476 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, ""); 477 static long zap_bucket_fail2; 478 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, ""); 479 static long cache_lock_vnodes_cel_3_failures; 480 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures, 481 "Number of times 3-way vnode locking failed"); 482 483 static void cache_zap_locked(struct namecache *ncp); 484 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, 485 char **freebuf, size_t *buflen); 486 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 487 char **retbuf, size_t *buflen, size_t addend); 488 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 489 char **retbuf, size_t *buflen); 490 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 491 char **retbuf, size_t *len, size_t addend); 492 493 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 494 495 static inline void 496 cache_assert_vlp_locked(struct mtx *vlp) 497 { 498 499 if (vlp != NULL) 500 mtx_assert(vlp, MA_OWNED); 501 } 502 503 static inline void 504 cache_assert_vnode_locked(struct vnode *vp) 505 { 506 struct mtx *vlp; 507 508 vlp = VP2VNODELOCK(vp); 509 cache_assert_vlp_locked(vlp); 510 } 511 512 /* 513 * Directory vnodes with entries are held for two reasons: 514 * 1. make them less of a target for reclamation in vnlru 515 * 2. suffer smaller performance penalty in locked lookup as requeieing is avoided 516 * 517 * Note this preferably would not be done and it's a hold over from. 
It will be 518 * feasible to eliminate altogether if all filesystems start supporting 519 * lockless lookup. 520 */ 521 static void 522 cache_hold_vnode(struct vnode *vp) 523 { 524 525 cache_assert_vnode_locked(vp); 526 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); 527 vhold(vp); 528 counter_u64_add(numcachehv, 1); 529 } 530 531 static void 532 cache_drop_vnode(struct vnode *vp) 533 { 534 535 /* 536 * Called after all locks are dropped, meaning we can't assert 537 * on the state of v_cache_src. 538 */ 539 vdrop(vp); 540 counter_u64_add(numcachehv, -1); 541 } 542 543 /* 544 * UMA zones. 545 */ 546 static uma_zone_t __read_mostly cache_zone_small; 547 static uma_zone_t __read_mostly cache_zone_small_ts; 548 static uma_zone_t __read_mostly cache_zone_large; 549 static uma_zone_t __read_mostly cache_zone_large_ts; 550 551 static struct namecache * 552 cache_alloc_uma(int len, bool ts) 553 { 554 struct namecache_ts *ncp_ts; 555 struct namecache *ncp; 556 557 if (__predict_false(ts)) { 558 if (len <= CACHE_PATH_CUTOFF) 559 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); 560 else 561 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); 562 ncp = &ncp_ts->nc_nc; 563 } else { 564 if (len <= CACHE_PATH_CUTOFF) 565 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); 566 else 567 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); 568 } 569 return (ncp); 570 } 571 572 static void 573 cache_free_uma(struct namecache *ncp) 574 { 575 struct namecache_ts *ncp_ts; 576 577 if (__predict_false(ncp->nc_flag & NCF_TS)) { 578 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 579 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 580 uma_zfree_smr(cache_zone_small_ts, ncp_ts); 581 else 582 uma_zfree_smr(cache_zone_large_ts, ncp_ts); 583 } else { 584 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 585 uma_zfree_smr(cache_zone_small, ncp); 586 else 587 uma_zfree_smr(cache_zone_large, ncp); 588 } 589 } 590 591 static struct namecache * 592 cache_alloc(int len, bool ts) 593 { 594 u_long lnumcache; 595 596 /* 597 * Avoid blowout in namecache entries. 598 * 599 * Bugs: 600 * 1. filesystems may end up trying to add an already existing entry 601 * (for example this can happen after a cache miss during concurrent 602 * lookup), in which case we will call cache_neg_evict despite not 603 * adding anything. 604 * 2. the routine may fail to free anything and no provisions are made 605 * to make it try harder (see the inside for failure modes) 606 * 3. it only ever looks at negative entries. 
607 */ 608 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 609 if (cache_neg_evict_cond(lnumcache)) { 610 lnumcache = atomic_load_long(&numcache); 611 } 612 if (__predict_false(lnumcache >= ncsize)) { 613 atomic_subtract_long(&numcache, 1); 614 counter_u64_add(numdrops, 1); 615 return (NULL); 616 } 617 return (cache_alloc_uma(len, ts)); 618 } 619 620 static void 621 cache_free(struct namecache *ncp) 622 { 623 624 MPASS(ncp != NULL); 625 if ((ncp->nc_flag & NCF_DVDROP) != 0) { 626 cache_drop_vnode(ncp->nc_dvp); 627 } 628 cache_free_uma(ncp); 629 atomic_subtract_long(&numcache, 1); 630 } 631 632 static void 633 cache_free_batch(struct cache_freebatch *batch) 634 { 635 struct namecache *ncp, *nnp; 636 int i; 637 638 i = 0; 639 if (TAILQ_EMPTY(batch)) 640 goto out; 641 TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) { 642 if ((ncp->nc_flag & NCF_DVDROP) != 0) { 643 cache_drop_vnode(ncp->nc_dvp); 644 } 645 cache_free_uma(ncp); 646 i++; 647 } 648 atomic_subtract_long(&numcache, i); 649 out: 650 SDT_PROBE1(vfs, namecache, purge, batch, i); 651 } 652 653 /* 654 * TODO: With the value stored we can do better than computing the hash based 655 * on the address. The choice of FNV should also be revisited. 656 */ 657 static void 658 cache_prehash(struct vnode *vp) 659 { 660 661 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); 662 } 663 664 static uint32_t 665 cache_get_hash(char *name, u_char len, struct vnode *dvp) 666 { 667 668 return (fnv_32_buf(name, len, dvp->v_nchash)); 669 } 670 671 static inline struct nchashhead * 672 NCP2BUCKET(struct namecache *ncp) 673 { 674 uint32_t hash; 675 676 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 677 return (NCHHASH(hash)); 678 } 679 680 static inline struct mtx * 681 NCP2BUCKETLOCK(struct namecache *ncp) 682 { 683 uint32_t hash; 684 685 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 686 return (HASH2BUCKETLOCK(hash)); 687 } 688 689 #ifdef INVARIANTS 690 static void 691 cache_assert_bucket_locked(struct namecache *ncp) 692 { 693 struct mtx *blp; 694 695 blp = NCP2BUCKETLOCK(ncp); 696 mtx_assert(blp, MA_OWNED); 697 } 698 699 static void 700 cache_assert_bucket_unlocked(struct namecache *ncp) 701 { 702 struct mtx *blp; 703 704 blp = NCP2BUCKETLOCK(ncp); 705 mtx_assert(blp, MA_NOTOWNED); 706 } 707 #else 708 #define cache_assert_bucket_locked(x) do { } while (0) 709 #define cache_assert_bucket_unlocked(x) do { } while (0) 710 #endif 711 712 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 713 static void 714 _cache_sort_vnodes(void **p1, void **p2) 715 { 716 void *tmp; 717 718 MPASS(*p1 != NULL || *p2 != NULL); 719 720 if (*p1 > *p2) { 721 tmp = *p2; 722 *p2 = *p1; 723 *p1 = tmp; 724 } 725 } 726 727 static void 728 cache_lock_all_buckets(void) 729 { 730 u_int i; 731 732 for (i = 0; i < numbucketlocks; i++) 733 mtx_lock(&bucketlocks[i]); 734 } 735 736 static void 737 cache_unlock_all_buckets(void) 738 { 739 u_int i; 740 741 for (i = 0; i < numbucketlocks; i++) 742 mtx_unlock(&bucketlocks[i]); 743 } 744 745 static void 746 cache_lock_all_vnodes(void) 747 { 748 u_int i; 749 750 for (i = 0; i < numvnodelocks; i++) 751 mtx_lock(&vnodelocks[i]); 752 } 753 754 static void 755 cache_unlock_all_vnodes(void) 756 { 757 u_int i; 758 759 for (i = 0; i < numvnodelocks; i++) 760 mtx_unlock(&vnodelocks[i]); 761 } 762 763 static int 764 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 765 { 766 767 cache_sort_vnodes(&vlp1, &vlp2); 768 769 if (vlp1 != NULL) { 770 if (!mtx_trylock(vlp1)) 771 return 
(EAGAIN); 772 } 773 if (!mtx_trylock(vlp2)) { 774 if (vlp1 != NULL) 775 mtx_unlock(vlp1); 776 return (EAGAIN); 777 } 778 779 return (0); 780 } 781 782 static void 783 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 784 { 785 786 MPASS(vlp1 != NULL || vlp2 != NULL); 787 MPASS(vlp1 <= vlp2); 788 789 if (vlp1 != NULL) 790 mtx_lock(vlp1); 791 if (vlp2 != NULL) 792 mtx_lock(vlp2); 793 } 794 795 static void 796 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 797 { 798 799 MPASS(vlp1 != NULL || vlp2 != NULL); 800 801 if (vlp1 != NULL) 802 mtx_unlock(vlp1); 803 if (vlp2 != NULL) 804 mtx_unlock(vlp2); 805 } 806 807 static int 808 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 809 { 810 struct nchstats snap; 811 812 if (req->oldptr == NULL) 813 return (SYSCTL_OUT(req, 0, sizeof(snap))); 814 815 snap = nchstats; 816 snap.ncs_goodhits = counter_u64_fetch(numposhits); 817 snap.ncs_neghits = counter_u64_fetch(numneghits); 818 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 819 counter_u64_fetch(numnegzaps); 820 snap.ncs_miss = counter_u64_fetch(nummisszap) + 821 counter_u64_fetch(nummiss); 822 823 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 824 } 825 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 826 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 827 "VFS cache effectiveness statistics"); 828 829 static void 830 cache_recalc_neg_min(u_int val) 831 { 832 833 neg_min = (ncsize * val) / 100; 834 } 835 836 static int 837 sysctl_negminpct(SYSCTL_HANDLER_ARGS) 838 { 839 u_int val; 840 int error; 841 842 val = ncnegminpct; 843 error = sysctl_handle_int(oidp, &val, 0, req); 844 if (error != 0 || req->newptr == NULL) 845 return (error); 846 847 if (val == ncnegminpct) 848 return (0); 849 if (val < 0 || val > 99) 850 return (EINVAL); 851 ncnegminpct = val; 852 cache_recalc_neg_min(val); 853 return (0); 854 } 855 856 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct, 857 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct, 858 "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed"); 859 860 #ifdef DIAGNOSTIC 861 /* 862 * Grab an atomic snapshot of the name cache hash chain lengths 863 */ 864 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 865 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 866 "hash table stats"); 867 868 static int 869 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 870 { 871 struct nchashhead *ncpp; 872 struct namecache *ncp; 873 int i, error, n_nchash, *cntbuf; 874 875 retry: 876 n_nchash = nchash + 1; /* nchash is max index, not count */ 877 if (req->oldptr == NULL) 878 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 879 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 880 cache_lock_all_buckets(); 881 if (n_nchash != nchash + 1) { 882 cache_unlock_all_buckets(); 883 free(cntbuf, M_TEMP); 884 goto retry; 885 } 886 /* Scan hash tables counting entries */ 887 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 888 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) 889 cntbuf[i]++; 890 cache_unlock_all_buckets(); 891 for (error = 0, i = 0; i < n_nchash; i++) 892 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 893 break; 894 free(cntbuf, M_TEMP); 895 return (error); 896 } 897 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 898 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 899 "nchash chain lengths"); 900 901 static int 902 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 903 { 904 int error; 905 struct nchashhead *ncpp; 906 struct namecache *ncp; 907 
int n_nchash; 908 int count, maxlength, used, pct; 909 910 if (!req->oldptr) 911 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 912 913 cache_lock_all_buckets(); 914 n_nchash = nchash + 1; /* nchash is max index, not count */ 915 used = 0; 916 maxlength = 0; 917 918 /* Scan hash tables for applicable entries */ 919 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 920 count = 0; 921 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { 922 count++; 923 } 924 if (count) 925 used++; 926 if (maxlength < count) 927 maxlength = count; 928 } 929 n_nchash = nchash + 1; 930 cache_unlock_all_buckets(); 931 pct = (used * 100) / (n_nchash / 100); 932 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 933 if (error) 934 return (error); 935 error = SYSCTL_OUT(req, &used, sizeof(used)); 936 if (error) 937 return (error); 938 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 939 if (error) 940 return (error); 941 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 942 if (error) 943 return (error); 944 return (0); 945 } 946 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 947 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 948 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 949 #endif 950 951 /* 952 * Negative entries management 953 * 954 * Various workloads create plenty of negative entries and barely use them 955 * afterwards. Moreover malicious users can keep performing bogus lookups 956 * adding even more entries. For example "make tinderbox" as of writing this 957 * comment ends up with 2.6M namecache entries in total, 1.2M of which are 958 * negative. 959 * 960 * As such, a rather aggressive eviction method is needed. The currently 961 * employed method is a placeholder. 962 * 963 * Entries are split over numneglists separate lists, each of which is further 964 * split into hot and cold entries. Entries get promoted after getting a hit. 965 * Eviction happens on addition of new entry. 
966 */ 967 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 968 "Name cache negative entry statistics"); 969 970 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0, 971 "Number of negative cache entries"); 972 973 static COUNTER_U64_DEFINE_EARLY(neg_created); 974 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created, 975 "Number of created negative entries"); 976 977 static COUNTER_U64_DEFINE_EARLY(neg_evicted); 978 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted, 979 "Number of evicted negative entries"); 980 981 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty); 982 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD, 983 &neg_evict_skipped_empty, 984 "Number of times evicting failed due to lack of entries"); 985 986 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed); 987 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD, 988 &neg_evict_skipped_missed, 989 "Number of times evicting failed due to target entry disappearing"); 990 991 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended); 992 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD, 993 &neg_evict_skipped_contended, 994 "Number of times evicting failed due to contention"); 995 996 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits, 997 "Number of cache hits (negative)"); 998 999 static int 1000 sysctl_neg_hot(SYSCTL_HANDLER_ARGS) 1001 { 1002 int i, out; 1003 1004 out = 0; 1005 for (i = 0; i < numneglists; i++) 1006 out += neglists[i].nl_hotnum; 1007 1008 return (SYSCTL_OUT(req, &out, sizeof(out))); 1009 } 1010 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD | 1011 CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I", 1012 "Number of hot negative entries"); 1013 1014 static void 1015 cache_neg_init(struct namecache *ncp) 1016 { 1017 struct negstate *ns; 1018 1019 ncp->nc_flag |= NCF_NEGATIVE; 1020 ns = NCP2NEGSTATE(ncp); 1021 ns->neg_flag = 0; 1022 ns->neg_hit = 0; 1023 counter_u64_add(neg_created, 1); 1024 } 1025 1026 #define CACHE_NEG_PROMOTION_THRESH 2 1027 1028 static bool 1029 cache_neg_hit_prep(struct namecache *ncp) 1030 { 1031 struct negstate *ns; 1032 u_char n; 1033 1034 ns = NCP2NEGSTATE(ncp); 1035 n = atomic_load_char(&ns->neg_hit); 1036 for (;;) { 1037 if (n >= CACHE_NEG_PROMOTION_THRESH) 1038 return (false); 1039 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1)) 1040 break; 1041 } 1042 return (n + 1 == CACHE_NEG_PROMOTION_THRESH); 1043 } 1044 1045 /* 1046 * Nothing to do here but it is provided for completeness as some 1047 * cache_neg_hit_prep callers may end up returning without even 1048 * trying to promote. 1049 */ 1050 #define cache_neg_hit_abort(ncp) do { } while (0) 1051 1052 static void 1053 cache_neg_hit_finish(struct namecache *ncp) 1054 { 1055 1056 SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name); 1057 counter_u64_add(numneghits, 1); 1058 } 1059 1060 /* 1061 * Move a negative entry to the hot list. 
1062 */ 1063 static void 1064 cache_neg_promote_locked(struct namecache *ncp) 1065 { 1066 struct neglist *nl; 1067 struct negstate *ns; 1068 1069 ns = NCP2NEGSTATE(ncp); 1070 nl = NCP2NEGLIST(ncp); 1071 mtx_assert(&nl->nl_lock, MA_OWNED); 1072 if ((ns->neg_flag & NEG_HOT) == 0) { 1073 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 1074 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst); 1075 nl->nl_hotnum++; 1076 ns->neg_flag |= NEG_HOT; 1077 } 1078 } 1079 1080 /* 1081 * Move a hot negative entry to the cold list. 1082 */ 1083 static void 1084 cache_neg_demote_locked(struct namecache *ncp) 1085 { 1086 struct neglist *nl; 1087 struct negstate *ns; 1088 1089 ns = NCP2NEGSTATE(ncp); 1090 nl = NCP2NEGLIST(ncp); 1091 mtx_assert(&nl->nl_lock, MA_OWNED); 1092 MPASS(ns->neg_flag & NEG_HOT); 1093 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 1094 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 1095 nl->nl_hotnum--; 1096 ns->neg_flag &= ~NEG_HOT; 1097 atomic_store_char(&ns->neg_hit, 0); 1098 } 1099 1100 /* 1101 * Move a negative entry to the hot list if it matches the lookup. 1102 * 1103 * We have to take locks, but they may be contended and in the worst 1104 * case we may need to go off CPU. We don't want to spin within the 1105 * smr section and we can't block with it. Exiting the section means 1106 * the found entry could have been evicted. We are going to look it 1107 * up again. 1108 */ 1109 static bool 1110 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp, 1111 struct namecache *oncp, uint32_t hash) 1112 { 1113 struct namecache *ncp; 1114 struct neglist *nl; 1115 u_char nc_flag; 1116 1117 nl = NCP2NEGLIST(oncp); 1118 1119 mtx_lock(&nl->nl_lock); 1120 /* 1121 * For hash iteration. 1122 */ 1123 vfs_smr_enter(); 1124 1125 /* 1126 * Avoid all surprises by only succeeding if we got the same entry and 1127 * bailing completely otherwise. 1128 * XXX There are no provisions to keep the vnode around, meaning we may 1129 * end up promoting a negative entry for a *new* vnode and returning 1130 * ENOENT on its account. This is the error we want to return anyway 1131 * and promotion is harmless. 1132 * 1133 * In particular at this point there can be a new ncp which matches the 1134 * search but hashes to a different neglist. 1135 */ 1136 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1137 if (ncp == oncp) 1138 break; 1139 } 1140 1141 /* 1142 * No match to begin with. 1143 */ 1144 if (__predict_false(ncp == NULL)) { 1145 goto out_abort; 1146 } 1147 1148 /* 1149 * The newly found entry may be something different... 1150 */ 1151 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1152 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 1153 goto out_abort; 1154 } 1155 1156 /* 1157 * ... and not even negative. 
1158 */ 1159 nc_flag = atomic_load_char(&ncp->nc_flag); 1160 if ((nc_flag & NCF_NEGATIVE) == 0) { 1161 goto out_abort; 1162 } 1163 1164 if (!cache_ncp_canuse(ncp)) { 1165 goto out_abort; 1166 } 1167 1168 cache_neg_promote_locked(ncp); 1169 cache_neg_hit_finish(ncp); 1170 vfs_smr_exit(); 1171 mtx_unlock(&nl->nl_lock); 1172 return (true); 1173 out_abort: 1174 vfs_smr_exit(); 1175 mtx_unlock(&nl->nl_lock); 1176 return (false); 1177 } 1178 1179 static void 1180 cache_neg_promote(struct namecache *ncp) 1181 { 1182 struct neglist *nl; 1183 1184 nl = NCP2NEGLIST(ncp); 1185 mtx_lock(&nl->nl_lock); 1186 cache_neg_promote_locked(ncp); 1187 mtx_unlock(&nl->nl_lock); 1188 } 1189 1190 static void 1191 cache_neg_insert(struct namecache *ncp) 1192 { 1193 struct neglist *nl; 1194 1195 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1196 cache_assert_bucket_locked(ncp); 1197 nl = NCP2NEGLIST(ncp); 1198 mtx_lock(&nl->nl_lock); 1199 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 1200 mtx_unlock(&nl->nl_lock); 1201 atomic_add_long(&numneg, 1); 1202 } 1203 1204 static void 1205 cache_neg_remove(struct namecache *ncp) 1206 { 1207 struct neglist *nl; 1208 struct negstate *ns; 1209 1210 cache_assert_bucket_locked(ncp); 1211 nl = NCP2NEGLIST(ncp); 1212 ns = NCP2NEGSTATE(ncp); 1213 mtx_lock(&nl->nl_lock); 1214 if ((ns->neg_flag & NEG_HOT) != 0) { 1215 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 1216 nl->nl_hotnum--; 1217 } else { 1218 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 1219 } 1220 mtx_unlock(&nl->nl_lock); 1221 atomic_subtract_long(&numneg, 1); 1222 } 1223 1224 static struct neglist * 1225 cache_neg_evict_select_list(void) 1226 { 1227 struct neglist *nl; 1228 u_int c; 1229 1230 c = atomic_fetchadd_int(&neg_cycle, 1) + 1; 1231 nl = &neglists[c % numneglists]; 1232 if (!mtx_trylock(&nl->nl_evict_lock)) { 1233 counter_u64_add(neg_evict_skipped_contended, 1); 1234 return (NULL); 1235 } 1236 return (nl); 1237 } 1238 1239 static struct namecache * 1240 cache_neg_evict_select_entry(struct neglist *nl) 1241 { 1242 struct namecache *ncp, *lncp; 1243 struct negstate *ns, *lns; 1244 int i; 1245 1246 mtx_assert(&nl->nl_evict_lock, MA_OWNED); 1247 mtx_assert(&nl->nl_lock, MA_OWNED); 1248 ncp = TAILQ_FIRST(&nl->nl_list); 1249 if (ncp == NULL) 1250 return (NULL); 1251 lncp = ncp; 1252 lns = NCP2NEGSTATE(lncp); 1253 for (i = 1; i < 4; i++) { 1254 ncp = TAILQ_NEXT(ncp, nc_dst); 1255 if (ncp == NULL) 1256 break; 1257 ns = NCP2NEGSTATE(ncp); 1258 if (ns->neg_hit < lns->neg_hit) { 1259 lncp = ncp; 1260 lns = ns; 1261 } 1262 } 1263 return (lncp); 1264 } 1265 1266 static bool 1267 cache_neg_evict(void) 1268 { 1269 struct namecache *ncp, *ncp2; 1270 struct neglist *nl; 1271 struct vnode *dvp; 1272 struct mtx *dvlp; 1273 struct mtx *blp; 1274 uint32_t hash; 1275 u_char nlen; 1276 bool evicted; 1277 1278 nl = cache_neg_evict_select_list(); 1279 if (nl == NULL) { 1280 return (false); 1281 } 1282 1283 mtx_lock(&nl->nl_lock); 1284 ncp = TAILQ_FIRST(&nl->nl_hotlist); 1285 if (ncp != NULL) { 1286 cache_neg_demote_locked(ncp); 1287 } 1288 ncp = cache_neg_evict_select_entry(nl); 1289 if (ncp == NULL) { 1290 counter_u64_add(neg_evict_skipped_empty, 1); 1291 mtx_unlock(&nl->nl_lock); 1292 mtx_unlock(&nl->nl_evict_lock); 1293 return (false); 1294 } 1295 nlen = ncp->nc_nlen; 1296 dvp = ncp->nc_dvp; 1297 hash = cache_get_hash(ncp->nc_name, nlen, dvp); 1298 dvlp = VP2VNODELOCK(dvp); 1299 blp = HASH2BUCKETLOCK(hash); 1300 mtx_unlock(&nl->nl_lock); 1301 mtx_unlock(&nl->nl_evict_lock); 1302 mtx_lock(dvlp); 1303 mtx_lock(blp); 1304 /* 1305 * Note that since all 
locks were dropped above, the entry may be 1306 * gone or reallocated to be something else. 1307 */ 1308 CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) { 1309 if (ncp2 == ncp && ncp2->nc_dvp == dvp && 1310 ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0) 1311 break; 1312 } 1313 if (ncp2 == NULL) { 1314 counter_u64_add(neg_evict_skipped_missed, 1); 1315 ncp = NULL; 1316 evicted = false; 1317 } else { 1318 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp)); 1319 MPASS(blp == NCP2BUCKETLOCK(ncp)); 1320 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp, 1321 ncp->nc_name); 1322 cache_zap_locked(ncp); 1323 counter_u64_add(neg_evicted, 1); 1324 evicted = true; 1325 } 1326 mtx_unlock(blp); 1327 mtx_unlock(dvlp); 1328 if (ncp != NULL) 1329 cache_free(ncp); 1330 return (evicted); 1331 } 1332 1333 /* 1334 * Maybe evict a negative entry to create more room. 1335 * 1336 * The ncnegfactor parameter limits what fraction of the total count 1337 * can comprise of negative entries. However, if the cache is just 1338 * warming up this leads to excessive evictions. As such, ncnegminpct 1339 * (recomputed to neg_min) dictates whether the above should be 1340 * applied. 1341 * 1342 * Try evicting if the cache is close to full capacity regardless of 1343 * other considerations. 1344 */ 1345 static bool 1346 cache_neg_evict_cond(u_long lnumcache) 1347 { 1348 u_long lnumneg; 1349 1350 if (ncsize - 1000 < lnumcache) 1351 goto out_evict; 1352 lnumneg = atomic_load_long(&numneg); 1353 if (lnumneg < neg_min) 1354 return (false); 1355 if (lnumneg * ncnegfactor < lnumcache) 1356 return (false); 1357 out_evict: 1358 return (cache_neg_evict()); 1359 } 1360 1361 /* 1362 * cache_zap_locked(): 1363 * 1364 * Removes a namecache entry from cache, whether it contains an actual 1365 * pointer to a vnode or if it is just a negative cache entry. 
1366 */ 1367 static void 1368 cache_zap_locked(struct namecache *ncp) 1369 { 1370 struct nchashhead *ncpp; 1371 1372 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1373 cache_assert_vnode_locked(ncp->nc_vp); 1374 cache_assert_vnode_locked(ncp->nc_dvp); 1375 cache_assert_bucket_locked(ncp); 1376 1377 cache_ncp_invalidate(ncp); 1378 1379 ncpp = NCP2BUCKET(ncp); 1380 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); 1381 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 1382 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 1383 ncp->nc_name, ncp->nc_vp); 1384 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 1385 if (ncp == ncp->nc_vp->v_cache_dd) { 1386 vn_seqc_write_begin_unheld(ncp->nc_vp); 1387 ncp->nc_vp->v_cache_dd = NULL; 1388 vn_seqc_write_end(ncp->nc_vp); 1389 } 1390 } else { 1391 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 1392 ncp->nc_name); 1393 cache_neg_remove(ncp); 1394 } 1395 if (ncp->nc_flag & NCF_ISDOTDOT) { 1396 if (ncp == ncp->nc_dvp->v_cache_dd) { 1397 vn_seqc_write_begin_unheld(ncp->nc_dvp); 1398 ncp->nc_dvp->v_cache_dd = NULL; 1399 vn_seqc_write_end(ncp->nc_dvp); 1400 } 1401 } else { 1402 LIST_REMOVE(ncp, nc_src); 1403 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 1404 ncp->nc_flag |= NCF_DVDROP; 1405 } 1406 } 1407 } 1408 1409 static void 1410 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 1411 { 1412 struct mtx *blp; 1413 1414 MPASS(ncp->nc_dvp == vp); 1415 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1416 cache_assert_vnode_locked(vp); 1417 1418 blp = NCP2BUCKETLOCK(ncp); 1419 mtx_lock(blp); 1420 cache_zap_locked(ncp); 1421 mtx_unlock(blp); 1422 } 1423 1424 static bool 1425 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 1426 struct mtx **vlpp) 1427 { 1428 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1429 struct mtx *blp; 1430 1431 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1432 cache_assert_vnode_locked(vp); 1433 1434 if (ncp->nc_flag & NCF_NEGATIVE) { 1435 if (*vlpp != NULL) { 1436 mtx_unlock(*vlpp); 1437 *vlpp = NULL; 1438 } 1439 cache_zap_negative_locked_vnode_kl(ncp, vp); 1440 return (true); 1441 } 1442 1443 pvlp = VP2VNODELOCK(vp); 1444 blp = NCP2BUCKETLOCK(ncp); 1445 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1446 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1447 1448 if (*vlpp == vlp1 || *vlpp == vlp2) { 1449 to_unlock = *vlpp; 1450 *vlpp = NULL; 1451 } else { 1452 if (*vlpp != NULL) { 1453 mtx_unlock(*vlpp); 1454 *vlpp = NULL; 1455 } 1456 cache_sort_vnodes(&vlp1, &vlp2); 1457 if (vlp1 == pvlp) { 1458 mtx_lock(vlp2); 1459 to_unlock = vlp2; 1460 } else { 1461 if (!mtx_trylock(vlp1)) 1462 goto out_relock; 1463 to_unlock = vlp1; 1464 } 1465 } 1466 mtx_lock(blp); 1467 cache_zap_locked(ncp); 1468 mtx_unlock(blp); 1469 if (to_unlock != NULL) 1470 mtx_unlock(to_unlock); 1471 return (true); 1472 1473 out_relock: 1474 mtx_unlock(vlp2); 1475 mtx_lock(vlp1); 1476 mtx_lock(vlp2); 1477 MPASS(*vlpp == NULL); 1478 *vlpp = vlp1; 1479 return (false); 1480 } 1481 1482 /* 1483 * If trylocking failed we can get here. We know enough to take all needed locks 1484 * in the right order and re-lookup the entry. 
1485 */ 1486 static int 1487 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1488 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1489 struct mtx *blp) 1490 { 1491 struct namecache *rncp; 1492 1493 cache_assert_bucket_unlocked(ncp); 1494 1495 cache_sort_vnodes(&dvlp, &vlp); 1496 cache_lock_vnodes(dvlp, vlp); 1497 mtx_lock(blp); 1498 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1499 if (rncp == ncp && rncp->nc_dvp == dvp && 1500 rncp->nc_nlen == cnp->cn_namelen && 1501 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1502 break; 1503 } 1504 if (rncp != NULL) { 1505 cache_zap_locked(rncp); 1506 mtx_unlock(blp); 1507 cache_unlock_vnodes(dvlp, vlp); 1508 counter_u64_add(zap_bucket_relock_success, 1); 1509 return (0); 1510 } 1511 1512 mtx_unlock(blp); 1513 cache_unlock_vnodes(dvlp, vlp); 1514 return (EAGAIN); 1515 } 1516 1517 static int __noinline 1518 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp, 1519 uint32_t hash, struct mtx *blp) 1520 { 1521 struct mtx *dvlp, *vlp; 1522 struct vnode *dvp; 1523 1524 cache_assert_bucket_locked(ncp); 1525 1526 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1527 vlp = NULL; 1528 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1529 vlp = VP2VNODELOCK(ncp->nc_vp); 1530 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1531 cache_zap_locked(ncp); 1532 mtx_unlock(blp); 1533 cache_unlock_vnodes(dvlp, vlp); 1534 return (0); 1535 } 1536 1537 dvp = ncp->nc_dvp; 1538 mtx_unlock(blp); 1539 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1540 } 1541 1542 static __noinline int 1543 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) 1544 { 1545 struct namecache *ncp; 1546 struct mtx *blp; 1547 struct mtx *dvlp, *dvlp2; 1548 uint32_t hash; 1549 int error; 1550 1551 if (cnp->cn_namelen == 2 && 1552 cnp->cn_nameptr[0] == '.' 
&& cnp->cn_nameptr[1] == '.') { 1553 dvlp = VP2VNODELOCK(dvp); 1554 dvlp2 = NULL; 1555 mtx_lock(dvlp); 1556 retry_dotdot: 1557 ncp = dvp->v_cache_dd; 1558 if (ncp == NULL) { 1559 mtx_unlock(dvlp); 1560 if (dvlp2 != NULL) 1561 mtx_unlock(dvlp2); 1562 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1563 return (0); 1564 } 1565 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1566 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) 1567 goto retry_dotdot; 1568 MPASS(dvp->v_cache_dd == NULL); 1569 mtx_unlock(dvlp); 1570 if (dvlp2 != NULL) 1571 mtx_unlock(dvlp2); 1572 cache_free(ncp); 1573 } else { 1574 vn_seqc_write_begin(dvp); 1575 dvp->v_cache_dd = NULL; 1576 vn_seqc_write_end(dvp); 1577 mtx_unlock(dvlp); 1578 if (dvlp2 != NULL) 1579 mtx_unlock(dvlp2); 1580 } 1581 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1582 return (1); 1583 } 1584 1585 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1586 blp = HASH2BUCKETLOCK(hash); 1587 retry: 1588 if (CK_SLIST_EMPTY(NCHHASH(hash))) 1589 goto out_no_entry; 1590 1591 mtx_lock(blp); 1592 1593 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1594 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1595 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1596 break; 1597 } 1598 1599 if (ncp == NULL) { 1600 mtx_unlock(blp); 1601 goto out_no_entry; 1602 } 1603 1604 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1605 if (__predict_false(error != 0)) { 1606 zap_bucket_fail++; 1607 goto retry; 1608 } 1609 counter_u64_add(numposzaps, 1); 1610 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1611 cache_free(ncp); 1612 return (1); 1613 out_no_entry: 1614 counter_u64_add(nummisszap, 1); 1615 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1616 return (0); 1617 } 1618 1619 static int __noinline 1620 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1621 struct timespec *tsp, int *ticksp) 1622 { 1623 int ltype; 1624 1625 *vpp = dvp; 1626 counter_u64_add(dothits, 1); 1627 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1628 if (tsp != NULL) 1629 timespecclear(tsp); 1630 if (ticksp != NULL) 1631 *ticksp = ticks; 1632 vrefact(*vpp); 1633 /* 1634 * When we lookup "." we still can be asked to lock it 1635 * differently... 
1636 */ 1637 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1638 if (ltype != VOP_ISLOCKED(*vpp)) { 1639 if (ltype == LK_EXCLUSIVE) { 1640 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1641 if (VN_IS_DOOMED((*vpp))) { 1642 /* forced unmount */ 1643 vrele(*vpp); 1644 *vpp = NULL; 1645 return (ENOENT); 1646 } 1647 } else 1648 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1649 } 1650 return (-1); 1651 } 1652 1653 static int __noinline 1654 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1655 struct timespec *tsp, int *ticksp) 1656 { 1657 struct namecache_ts *ncp_ts; 1658 struct namecache *ncp; 1659 struct mtx *dvlp; 1660 enum vgetstate vs; 1661 int error, ltype; 1662 bool whiteout; 1663 1664 MPASS((cnp->cn_flags & ISDOTDOT) != 0); 1665 1666 if ((cnp->cn_flags & MAKEENTRY) == 0) { 1667 cache_remove_cnp(dvp, cnp); 1668 return (0); 1669 } 1670 1671 counter_u64_add(dotdothits, 1); 1672 retry: 1673 dvlp = VP2VNODELOCK(dvp); 1674 mtx_lock(dvlp); 1675 ncp = dvp->v_cache_dd; 1676 if (ncp == NULL) { 1677 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL); 1678 mtx_unlock(dvlp); 1679 return (0); 1680 } 1681 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1682 if (ncp->nc_flag & NCF_NEGATIVE) 1683 *vpp = NULL; 1684 else 1685 *vpp = ncp->nc_vp; 1686 } else 1687 *vpp = ncp->nc_dvp; 1688 if (*vpp == NULL) 1689 goto negative_success; 1690 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); 1691 cache_out_ts(ncp, tsp, ticksp); 1692 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1693 NCF_DTS && tsp != NULL) { 1694 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1695 *tsp = ncp_ts->nc_dotdottime; 1696 } 1697 1698 MPASS(dvp != *vpp); 1699 ltype = VOP_ISLOCKED(dvp); 1700 VOP_UNLOCK(dvp); 1701 vs = vget_prep(*vpp); 1702 mtx_unlock(dvlp); 1703 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1704 vn_lock(dvp, ltype | LK_RETRY); 1705 if (VN_IS_DOOMED(dvp)) { 1706 if (error == 0) 1707 vput(*vpp); 1708 *vpp = NULL; 1709 return (ENOENT); 1710 } 1711 if (error) { 1712 *vpp = NULL; 1713 goto retry; 1714 } 1715 return (-1); 1716 negative_success: 1717 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1718 if (cnp->cn_flags & ISLASTCN) { 1719 counter_u64_add(numnegzaps, 1); 1720 cache_zap_negative_locked_vnode_kl(ncp, dvp); 1721 mtx_unlock(dvlp); 1722 cache_free(ncp); 1723 return (0); 1724 } 1725 } 1726 1727 whiteout = (ncp->nc_flag & NCF_WHITE); 1728 cache_out_ts(ncp, tsp, ticksp); 1729 if (cache_neg_hit_prep(ncp)) 1730 cache_neg_promote(ncp); 1731 else 1732 cache_neg_hit_finish(ncp); 1733 mtx_unlock(dvlp); 1734 if (whiteout) 1735 cnp->cn_flags |= ISWHITEOUT; 1736 return (ENOENT); 1737 } 1738 1739 /** 1740 * Lookup a name in the name cache 1741 * 1742 * # Arguments 1743 * 1744 * - dvp: Parent directory in which to search. 1745 * - vpp: Return argument. Will contain desired vnode on cache hit. 1746 * - cnp: Parameters of the name search. The most interesting bits of 1747 * the cn_flags field have the following meanings: 1748 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1749 * it up. 1750 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1751 * - tsp: Return storage for cache timestamp. On a successful (positive 1752 * or negative) lookup, tsp will be filled with any timespec that 1753 * was stored when this cache entry was created. However, it will 1754 * be clear for "." entries. 1755 * - ticks: Return storage for alternate cache timestamp. 
On a successful 1756 * (positive or negative) lookup, it will contain the ticks value 1757 * that was current when the cache entry was created, unless cnp 1758 * was ".". 1759 * 1760 * Either both tsp and ticks have to be provided or neither of them. 1761 * 1762 * # Returns 1763 * 1764 * - -1: A positive cache hit. vpp will contain the desired vnode. 1765 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1766 * to a forced unmount. vpp will not be modified. If the entry 1767 * is a whiteout, then the ISWHITEOUT flag will be set in 1768 * cnp->cn_flags. 1769 * - 0: A cache miss. vpp will not be modified. 1770 * 1771 * # Locking 1772 * 1773 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1774 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1775 * lock is not recursively acquired. 1776 */ 1777 static int __noinline 1778 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1779 struct timespec *tsp, int *ticksp) 1780 { 1781 struct namecache *ncp; 1782 struct mtx *blp; 1783 uint32_t hash; 1784 enum vgetstate vs; 1785 int error; 1786 bool whiteout; 1787 1788 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 1789 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0); 1790 1791 retry: 1792 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1793 blp = HASH2BUCKETLOCK(hash); 1794 mtx_lock(blp); 1795 1796 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1797 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1798 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1799 break; 1800 } 1801 1802 if (__predict_false(ncp == NULL)) { 1803 mtx_unlock(blp); 1804 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1805 NULL); 1806 counter_u64_add(nummiss, 1); 1807 return (0); 1808 } 1809 1810 if (ncp->nc_flag & NCF_NEGATIVE) 1811 goto negative_success; 1812 1813 counter_u64_add(numposhits, 1); 1814 *vpp = ncp->nc_vp; 1815 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1816 cache_out_ts(ncp, tsp, ticksp); 1817 MPASS(dvp != *vpp); 1818 vs = vget_prep(*vpp); 1819 mtx_unlock(blp); 1820 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1821 if (error) { 1822 *vpp = NULL; 1823 goto retry; 1824 } 1825 return (-1); 1826 negative_success: 1827 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1828 if (cnp->cn_flags & ISLASTCN) { 1829 counter_u64_add(numnegzaps, 1); 1830 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1831 if (__predict_false(error != 0)) { 1832 zap_bucket_fail2++; 1833 goto retry; 1834 } 1835 cache_free(ncp); 1836 return (0); 1837 } 1838 } 1839 1840 whiteout = (ncp->nc_flag & NCF_WHITE); 1841 cache_out_ts(ncp, tsp, ticksp); 1842 if (cache_neg_hit_prep(ncp)) 1843 cache_neg_promote(ncp); 1844 else 1845 cache_neg_hit_finish(ncp); 1846 mtx_unlock(blp); 1847 if (whiteout) 1848 cnp->cn_flags |= ISWHITEOUT; 1849 return (ENOENT); 1850 } 1851 1852 int 1853 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1854 struct timespec *tsp, int *ticksp) 1855 { 1856 struct namecache *ncp; 1857 uint32_t hash; 1858 enum vgetstate vs; 1859 int error; 1860 bool whiteout, neg_promote; 1861 u_short nc_flag; 1862 1863 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); 1864 1865 #ifdef DEBUG_CACHE 1866 if (__predict_false(!doingcache)) { 1867 cnp->cn_flags &= ~MAKEENTRY; 1868 return (0); 1869 } 1870 #endif 1871 1872 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1873 if (cnp->cn_namelen == 1) 1874 return 
(cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1875 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') 1876 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); 1877 } 1878 1879 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 1880 1881 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { 1882 cache_remove_cnp(dvp, cnp); 1883 return (0); 1884 } 1885 1886 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1887 vfs_smr_enter(); 1888 1889 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1890 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1891 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1892 break; 1893 } 1894 1895 if (__predict_false(ncp == NULL)) { 1896 vfs_smr_exit(); 1897 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1898 NULL); 1899 counter_u64_add(nummiss, 1); 1900 return (0); 1901 } 1902 1903 nc_flag = atomic_load_char(&ncp->nc_flag); 1904 if (nc_flag & NCF_NEGATIVE) 1905 goto negative_success; 1906 1907 counter_u64_add(numposhits, 1); 1908 *vpp = ncp->nc_vp; 1909 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1910 cache_out_ts(ncp, tsp, ticksp); 1911 MPASS(dvp != *vpp); 1912 if (!cache_ncp_canuse(ncp)) { 1913 vfs_smr_exit(); 1914 *vpp = NULL; 1915 goto out_fallback; 1916 } 1917 vs = vget_prep_smr(*vpp); 1918 vfs_smr_exit(); 1919 if (__predict_false(vs == VGET_NONE)) { 1920 *vpp = NULL; 1921 goto out_fallback; 1922 } 1923 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1924 if (error) { 1925 *vpp = NULL; 1926 goto out_fallback; 1927 } 1928 return (-1); 1929 negative_success: 1930 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1931 if (cnp->cn_flags & ISLASTCN) { 1932 vfs_smr_exit(); 1933 goto out_fallback; 1934 } 1935 } 1936 1937 cache_out_ts(ncp, tsp, ticksp); 1938 whiteout = (ncp->nc_flag & NCF_WHITE); 1939 neg_promote = cache_neg_hit_prep(ncp); 1940 if (!cache_ncp_canuse(ncp)) { 1941 cache_neg_hit_abort(ncp); 1942 vfs_smr_exit(); 1943 goto out_fallback; 1944 } 1945 if (neg_promote) { 1946 vfs_smr_exit(); 1947 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) 1948 goto out_fallback; 1949 } else { 1950 cache_neg_hit_finish(ncp); 1951 vfs_smr_exit(); 1952 } 1953 if (whiteout) 1954 cnp->cn_flags |= ISWHITEOUT; 1955 return (ENOENT); 1956 out_fallback: 1957 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp)); 1958 } 1959 1960 struct celockstate { 1961 struct mtx *vlp[3]; 1962 struct mtx *blp[2]; 1963 }; 1964 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1965 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1966 1967 static inline void 1968 cache_celockstate_init(struct celockstate *cel) 1969 { 1970 1971 bzero(cel, sizeof(*cel)); 1972 } 1973 1974 static void 1975 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1976 struct vnode *dvp) 1977 { 1978 struct mtx *vlp1, *vlp2; 1979 1980 MPASS(cel->vlp[0] == NULL); 1981 MPASS(cel->vlp[1] == NULL); 1982 MPASS(cel->vlp[2] == NULL); 1983 1984 MPASS(vp != NULL || dvp != NULL); 1985 1986 vlp1 = VP2VNODELOCK(vp); 1987 vlp2 = VP2VNODELOCK(dvp); 1988 cache_sort_vnodes(&vlp1, &vlp2); 1989 1990 if (vlp1 != NULL) { 1991 mtx_lock(vlp1); 1992 cel->vlp[0] = vlp1; 1993 } 1994 mtx_lock(vlp2); 1995 cel->vlp[1] = vlp2; 1996 } 1997 1998 static void 1999 cache_unlock_vnodes_cel(struct celockstate *cel) 2000 { 2001 2002 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 2003 2004 if (cel->vlp[0] != NULL) 2005 mtx_unlock(cel->vlp[0]); 2006 if (cel->vlp[1] != NULL) 2007 mtx_unlock(cel->vlp[1]); 2008 if (cel->vlp[2] != NULL) 2009 
mtx_unlock(cel->vlp[2]); 2010 } 2011 2012 static bool 2013 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 2014 { 2015 struct mtx *vlp; 2016 bool ret; 2017 2018 cache_assert_vlp_locked(cel->vlp[0]); 2019 cache_assert_vlp_locked(cel->vlp[1]); 2020 MPASS(cel->vlp[2] == NULL); 2021 2022 MPASS(vp != NULL); 2023 vlp = VP2VNODELOCK(vp); 2024 2025 ret = true; 2026 if (vlp >= cel->vlp[1]) { 2027 mtx_lock(vlp); 2028 } else { 2029 if (mtx_trylock(vlp)) 2030 goto out; 2031 cache_lock_vnodes_cel_3_failures++; 2032 cache_unlock_vnodes_cel(cel); 2033 if (vlp < cel->vlp[0]) { 2034 mtx_lock(vlp); 2035 mtx_lock(cel->vlp[0]); 2036 mtx_lock(cel->vlp[1]); 2037 } else { 2038 if (cel->vlp[0] != NULL) 2039 mtx_lock(cel->vlp[0]); 2040 mtx_lock(vlp); 2041 mtx_lock(cel->vlp[1]); 2042 } 2043 ret = false; 2044 } 2045 out: 2046 cel->vlp[2] = vlp; 2047 return (ret); 2048 } 2049 2050 static void 2051 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 2052 struct mtx *blp2) 2053 { 2054 2055 MPASS(cel->blp[0] == NULL); 2056 MPASS(cel->blp[1] == NULL); 2057 2058 cache_sort_vnodes(&blp1, &blp2); 2059 2060 if (blp1 != NULL) { 2061 mtx_lock(blp1); 2062 cel->blp[0] = blp1; 2063 } 2064 mtx_lock(blp2); 2065 cel->blp[1] = blp2; 2066 } 2067 2068 static void 2069 cache_unlock_buckets_cel(struct celockstate *cel) 2070 { 2071 2072 if (cel->blp[0] != NULL) 2073 mtx_unlock(cel->blp[0]); 2074 mtx_unlock(cel->blp[1]); 2075 } 2076 2077 /* 2078 * Lock part of the cache affected by the insertion. 2079 * 2080 * This means vnodelocks for dvp, vp and the relevant bucketlock. 2081 * However, insertion can result in removal of an old entry. In this 2082 * case we have an additional vnode and bucketlock pair to lock. 2083 * 2084 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 2085 * preserving the locking order (smaller address first). 2086 */ 2087 static void 2088 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2089 uint32_t hash) 2090 { 2091 struct namecache *ncp; 2092 struct mtx *blps[2]; 2093 2094 blps[0] = HASH2BUCKETLOCK(hash); 2095 for (;;) { 2096 blps[1] = NULL; 2097 cache_lock_vnodes_cel(cel, dvp, vp); 2098 if (vp == NULL || vp->v_type != VDIR) 2099 break; 2100 ncp = vp->v_cache_dd; 2101 if (ncp == NULL) 2102 break; 2103 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2104 break; 2105 MPASS(ncp->nc_dvp == vp); 2106 blps[1] = NCP2BUCKETLOCK(ncp); 2107 if (ncp->nc_flag & NCF_NEGATIVE) 2108 break; 2109 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2110 break; 2111 /* 2112 * All vnodes got re-locked. Re-validate the state and if 2113 * nothing changed we are done. Otherwise restart. 
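 *
 * (Re-validation is necessary because cache_lock_vnodes_cel_3() may have
 * dropped and re-acquired the vnode locks to preserve lock order, which
 * opens a window for v_cache_dd or the ".." entry's bucket to change.)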
2114 */ 2115 if (ncp == vp->v_cache_dd && 2116 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2117 blps[1] == NCP2BUCKETLOCK(ncp) && 2118 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2119 break; 2120 cache_unlock_vnodes_cel(cel); 2121 cel->vlp[0] = NULL; 2122 cel->vlp[1] = NULL; 2123 cel->vlp[2] = NULL; 2124 } 2125 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2126 } 2127 2128 static void 2129 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2130 uint32_t hash) 2131 { 2132 struct namecache *ncp; 2133 struct mtx *blps[2]; 2134 2135 blps[0] = HASH2BUCKETLOCK(hash); 2136 for (;;) { 2137 blps[1] = NULL; 2138 cache_lock_vnodes_cel(cel, dvp, vp); 2139 ncp = dvp->v_cache_dd; 2140 if (ncp == NULL) 2141 break; 2142 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2143 break; 2144 MPASS(ncp->nc_dvp == dvp); 2145 blps[1] = NCP2BUCKETLOCK(ncp); 2146 if (ncp->nc_flag & NCF_NEGATIVE) 2147 break; 2148 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2149 break; 2150 if (ncp == dvp->v_cache_dd && 2151 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2152 blps[1] == NCP2BUCKETLOCK(ncp) && 2153 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2154 break; 2155 cache_unlock_vnodes_cel(cel); 2156 cel->vlp[0] = NULL; 2157 cel->vlp[1] = NULL; 2158 cel->vlp[2] = NULL; 2159 } 2160 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2161 } 2162 2163 static void 2164 cache_enter_unlock(struct celockstate *cel) 2165 { 2166 2167 cache_unlock_buckets_cel(cel); 2168 cache_unlock_vnodes_cel(cel); 2169 } 2170 2171 static void __noinline 2172 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 2173 struct componentname *cnp) 2174 { 2175 struct celockstate cel; 2176 struct namecache *ncp; 2177 uint32_t hash; 2178 int len; 2179 2180 if (dvp->v_cache_dd == NULL) 2181 return; 2182 len = cnp->cn_namelen; 2183 cache_celockstate_init(&cel); 2184 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2185 cache_enter_lock_dd(&cel, dvp, vp, hash); 2186 vn_seqc_write_begin(dvp); 2187 ncp = dvp->v_cache_dd; 2188 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 2189 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 2190 cache_zap_locked(ncp); 2191 } else { 2192 ncp = NULL; 2193 } 2194 dvp->v_cache_dd = NULL; 2195 vn_seqc_write_end(dvp); 2196 cache_enter_unlock(&cel); 2197 if (ncp != NULL) 2198 cache_free(ncp); 2199 } 2200 2201 /* 2202 * Add an entry to the cache. 
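 *
 * A typical consumer is a filesystem lookup routine feeding back a result
 * it had to resolve the hard way.  A minimal sketch of such a caller
 * (hypothetical, with dvp/vpp/cnp as in VOP_LOOKUP(9)):
 *
 *	if (cnp->cn_flags & MAKEENTRY)
 *		cache_enter(dvp, *vpp, cnp);
 *
 * where cache_enter() is the wrapper which passes NULL timestamps to
 * cache_enter_time().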
2203 */ 2204 void 2205 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2206 struct timespec *tsp, struct timespec *dtsp) 2207 { 2208 struct celockstate cel; 2209 struct namecache *ncp, *n2, *ndd; 2210 struct namecache_ts *ncp_ts; 2211 struct nchashhead *ncpp; 2212 uint32_t hash; 2213 int flag; 2214 int len; 2215 2216 VNPASS(dvp != vp, dvp); 2217 VNPASS(!VN_IS_DOOMED(dvp), dvp); 2218 VNPASS(dvp->v_type != VNON, dvp); 2219 if (vp != NULL) { 2220 VNPASS(!VN_IS_DOOMED(vp), vp); 2221 VNPASS(vp->v_type != VNON, vp); 2222 } 2223 2224 #ifdef DEBUG_CACHE 2225 if (__predict_false(!doingcache)) 2226 return; 2227 #endif 2228 2229 flag = 0; 2230 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2231 if (cnp->cn_namelen == 1) 2232 return; 2233 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 2234 cache_enter_dotdot_prep(dvp, vp, cnp); 2235 flag = NCF_ISDOTDOT; 2236 } 2237 } 2238 2239 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 2240 if (ncp == NULL) 2241 return; 2242 2243 cache_celockstate_init(&cel); 2244 ndd = NULL; 2245 ncp_ts = NULL; 2246 2247 /* 2248 * Calculate the hash key and setup as much of the new 2249 * namecache entry as possible before acquiring the lock. 2250 */ 2251 ncp->nc_flag = flag | NCF_WIP; 2252 ncp->nc_vp = vp; 2253 if (vp == NULL) 2254 cache_neg_init(ncp); 2255 ncp->nc_dvp = dvp; 2256 if (tsp != NULL) { 2257 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 2258 ncp_ts->nc_time = *tsp; 2259 ncp_ts->nc_ticks = ticks; 2260 ncp_ts->nc_nc.nc_flag |= NCF_TS; 2261 if (dtsp != NULL) { 2262 ncp_ts->nc_dotdottime = *dtsp; 2263 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 2264 } 2265 } 2266 len = ncp->nc_nlen = cnp->cn_namelen; 2267 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2268 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 2269 ncp->nc_name[len] = '\0'; 2270 cache_enter_lock(&cel, dvp, vp, hash); 2271 2272 /* 2273 * See if this vnode or negative entry is already in the cache 2274 * with this name. This can happen with concurrent lookups of 2275 * the same path name. 2276 */ 2277 ncpp = NCHHASH(hash); 2278 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 2279 if (n2->nc_dvp == dvp && 2280 n2->nc_nlen == cnp->cn_namelen && 2281 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 2282 MPASS(cache_ncp_canuse(n2)); 2283 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 2284 KASSERT(vp == NULL, 2285 ("%s: found entry pointing to a different vnode (%p != %p)", 2286 __func__, NULL, vp)); 2287 else 2288 KASSERT(n2->nc_vp == vp, 2289 ("%s: found entry pointing to a different vnode (%p != %p)", 2290 __func__, n2->nc_vp, vp)); 2291 /* 2292 * Entries are supposed to be immutable unless in the 2293 * process of getting destroyed. Accommodating for 2294 * changing timestamps is possible but not worth it. 2295 * This should be harmless in terms of correctness, in 2296 * the worst case resulting in an earlier expiration. 2297 * Alternatively, the found entry can be replaced 2298 * altogether. 
2299 */ 2300 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 2301 #if 0 2302 if (tsp != NULL) { 2303 KASSERT((n2->nc_flag & NCF_TS) != 0, 2304 ("no NCF_TS")); 2305 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 2306 n2_ts->nc_time = ncp_ts->nc_time; 2307 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2308 if (dtsp != NULL) { 2309 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2310 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2311 } 2312 } 2313 #endif 2314 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, 2315 vp); 2316 goto out_unlock_free; 2317 } 2318 } 2319 2320 if (flag == NCF_ISDOTDOT) { 2321 /* 2322 * See if we are trying to add .. entry, but some other lookup 2323 * has populated v_cache_dd pointer already. 2324 */ 2325 if (dvp->v_cache_dd != NULL) 2326 goto out_unlock_free; 2327 KASSERT(vp == NULL || vp->v_type == VDIR, 2328 ("wrong vnode type %p", vp)); 2329 vn_seqc_write_begin(dvp); 2330 dvp->v_cache_dd = ncp; 2331 vn_seqc_write_end(dvp); 2332 } 2333 2334 if (vp != NULL) { 2335 if (flag != NCF_ISDOTDOT) { 2336 /* 2337 * For this case, the cache entry maps both the 2338 * directory name in it and the name ".." for the 2339 * directory's parent. 2340 */ 2341 vn_seqc_write_begin(vp); 2342 if ((ndd = vp->v_cache_dd) != NULL) { 2343 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2344 cache_zap_locked(ndd); 2345 else 2346 ndd = NULL; 2347 } 2348 vp->v_cache_dd = ncp; 2349 vn_seqc_write_end(vp); 2350 } else if (vp->v_type != VDIR) { 2351 if (vp->v_cache_dd != NULL) { 2352 vn_seqc_write_begin(vp); 2353 vp->v_cache_dd = NULL; 2354 vn_seqc_write_end(vp); 2355 } 2356 } 2357 } 2358 2359 if (flag != NCF_ISDOTDOT) { 2360 if (LIST_EMPTY(&dvp->v_cache_src)) { 2361 cache_hold_vnode(dvp); 2362 } 2363 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2364 } 2365 2366 /* 2367 * If the entry is "negative", we place it into the 2368 * "negative" cache queue, otherwise, we place it into the 2369 * destination vnode's cache entries queue. 2370 */ 2371 if (vp != NULL) { 2372 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2373 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2374 vp); 2375 } else { 2376 if (cnp->cn_flags & ISWHITEOUT) 2377 ncp->nc_flag |= NCF_WHITE; 2378 cache_neg_insert(ncp); 2379 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2380 ncp->nc_name); 2381 } 2382 2383 /* 2384 * Insert the new namecache entry into the appropriate chain 2385 * within the cache entries table. 2386 */ 2387 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2388 2389 atomic_thread_fence_rel(); 2390 /* 2391 * Mark the entry as fully constructed. 2392 * It is immutable past this point until its removal. 
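 *
 * The release fence above pairs with the NCF_WIP check performed by
 * lockless (SMR) readers in cache_ncp_canuse(): they either still observe
 * the flag and fall back, or observe a fully initialized entry.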
2393 */ 2394 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2395 2396 cache_enter_unlock(&cel); 2397 if (ndd != NULL) 2398 cache_free(ndd); 2399 return; 2400 out_unlock_free: 2401 cache_enter_unlock(&cel); 2402 cache_free(ncp); 2403 return; 2404 } 2405 2406 static u_int 2407 cache_roundup_2(u_int val) 2408 { 2409 u_int res; 2410 2411 for (res = 1; res <= val; res <<= 1) 2412 continue; 2413 2414 return (res); 2415 } 2416 2417 static struct nchashhead * 2418 nchinittbl(u_long elements, u_long *hashmask) 2419 { 2420 struct nchashhead *hashtbl; 2421 u_long hashsize, i; 2422 2423 hashsize = cache_roundup_2(elements) / 2; 2424 2425 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2426 for (i = 0; i < hashsize; i++) 2427 CK_SLIST_INIT(&hashtbl[i]); 2428 *hashmask = hashsize - 1; 2429 return (hashtbl); 2430 } 2431 2432 static void 2433 ncfreetbl(struct nchashhead *hashtbl) 2434 { 2435 2436 free(hashtbl, M_VFSCACHE); 2437 } 2438 2439 /* 2440 * Name cache initialization, from vfs_init() when we are booting 2441 */ 2442 static void 2443 nchinit(void *dummy __unused) 2444 { 2445 u_int i; 2446 2447 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2448 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2449 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2450 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2451 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2452 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2453 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2454 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2455 2456 VFS_SMR_ZONE_SET(cache_zone_small); 2457 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2458 VFS_SMR_ZONE_SET(cache_zone_large); 2459 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2460 2461 ncsize = desiredvnodes * ncsizefactor; 2462 cache_recalc_neg_min(ncnegminpct); 2463 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2464 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2465 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2466 ncbuckethash = 7; 2467 if (ncbuckethash > nchash) 2468 ncbuckethash = nchash; 2469 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2470 M_WAITOK | M_ZERO); 2471 for (i = 0; i < numbucketlocks; i++) 2472 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2473 ncvnodehash = ncbuckethash; 2474 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2475 M_WAITOK | M_ZERO); 2476 for (i = 0; i < numvnodelocks; i++) 2477 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2478 2479 for (i = 0; i < numneglists; i++) { 2480 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); 2481 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2482 TAILQ_INIT(&neglists[i].nl_list); 2483 TAILQ_INIT(&neglists[i].nl_hotlist); 2484 } 2485 } 2486 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2487 2488 void 2489 cache_vnode_init(struct vnode *vp) 2490 { 2491 2492 LIST_INIT(&vp->v_cache_src); 2493 TAILQ_INIT(&vp->v_cache_dst); 2494 vp->v_cache_dd = NULL; 2495 cache_prehash(vp); 2496 } 2497 2498 void 2499 cache_changesize(u_long newmaxvnodes) 2500 { 2501 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2502 u_long new_nchash, old_nchash; 2503 struct namecache *ncp; 2504 uint32_t hash; 2505 u_long newncsize; 2506 int i; 2507 2508 newncsize = newmaxvnodes * ncsizefactor; 2509 
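	/*
	 * Scale the requested size up and round it to a power of 2, making
	 * sure the new table is not smaller than the number of bucket locks.
	 */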
newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2510 if (newmaxvnodes < numbucketlocks) 2511 newmaxvnodes = numbucketlocks; 2512 2513 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2514 /* If same hash table size, nothing to do */ 2515 if (nchash == new_nchash) { 2516 ncfreetbl(new_nchashtbl); 2517 return; 2518 } 2519 /* 2520 * Move everything from the old hash table to the new table. 2521 * None of the namecache entries in the table can be removed 2522 * because to do so, they have to be removed from the hash table. 2523 */ 2524 cache_lock_all_vnodes(); 2525 cache_lock_all_buckets(); 2526 old_nchashtbl = nchashtbl; 2527 old_nchash = nchash; 2528 nchashtbl = new_nchashtbl; 2529 nchash = new_nchash; 2530 for (i = 0; i <= old_nchash; i++) { 2531 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2532 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2533 ncp->nc_dvp); 2534 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2535 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2536 } 2537 } 2538 ncsize = newncsize; 2539 cache_recalc_neg_min(ncnegminpct); 2540 cache_unlock_all_buckets(); 2541 cache_unlock_all_vnodes(); 2542 ncfreetbl(old_nchashtbl); 2543 } 2544 2545 /* 2546 * Invalidate all entries from and to a particular vnode. 2547 */ 2548 static void 2549 cache_purge_impl(struct vnode *vp) 2550 { 2551 struct cache_freebatch batch; 2552 struct namecache *ncp; 2553 struct mtx *vlp, *vlp2; 2554 2555 TAILQ_INIT(&batch); 2556 vlp = VP2VNODELOCK(vp); 2557 vlp2 = NULL; 2558 mtx_lock(vlp); 2559 retry: 2560 while (!LIST_EMPTY(&vp->v_cache_src)) { 2561 ncp = LIST_FIRST(&vp->v_cache_src); 2562 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2563 goto retry; 2564 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2565 } 2566 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2567 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2568 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2569 goto retry; 2570 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2571 } 2572 ncp = vp->v_cache_dd; 2573 if (ncp != NULL) { 2574 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2575 ("lost dotdot link")); 2576 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2577 goto retry; 2578 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2579 } 2580 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2581 mtx_unlock(vlp); 2582 if (vlp2 != NULL) 2583 mtx_unlock(vlp2); 2584 cache_free_batch(&batch); 2585 } 2586 2587 /* 2588 * Opportunistic check to see if there is anything to do. 2589 */ 2590 static bool 2591 cache_has_entries(struct vnode *vp) 2592 { 2593 2594 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2595 vp->v_cache_dd == NULL) 2596 return (false); 2597 return (true); 2598 } 2599 2600 void 2601 cache_purge(struct vnode *vp) 2602 { 2603 2604 SDT_PROBE1(vfs, namecache, purge, done, vp); 2605 if (!cache_has_entries(vp)) 2606 return; 2607 cache_purge_impl(vp); 2608 } 2609 2610 /* 2611 * Only to be used by vgone. 2612 */ 2613 void 2614 cache_purge_vgone(struct vnode *vp) 2615 { 2616 struct mtx *vlp; 2617 2618 VNPASS(VN_IS_DOOMED(vp), vp); 2619 if (cache_has_entries(vp)) { 2620 cache_purge_impl(vp); 2621 return; 2622 } 2623 2624 /* 2625 * Serialize against a potential thread doing cache_purge. 2626 */ 2627 vlp = VP2VNODELOCK(vp); 2628 mtx_wait_unlocked(vlp); 2629 if (cache_has_entries(vp)) { 2630 cache_purge_impl(vp); 2631 return; 2632 } 2633 return; 2634 } 2635 2636 /* 2637 * Invalidate all negative entries for a particular directory vnode. 
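 *
 * Hypothetical usage sketch: a filesystem about to make new names appear
 * in a directory (e.g. as part of a rename) could call
 *
 *	cache_purge_negative(dvp);
 *
 * to drop any negative entries cached for that directory so they do not
 * shadow the new names.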
2638 */ 2639 void 2640 cache_purge_negative(struct vnode *vp) 2641 { 2642 struct cache_freebatch batch; 2643 struct namecache *ncp, *nnp; 2644 struct mtx *vlp; 2645 2646 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2647 if (LIST_EMPTY(&vp->v_cache_src)) 2648 return; 2649 TAILQ_INIT(&batch); 2650 vlp = VP2VNODELOCK(vp); 2651 mtx_lock(vlp); 2652 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2653 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2654 continue; 2655 cache_zap_negative_locked_vnode_kl(ncp, vp); 2656 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2657 } 2658 mtx_unlock(vlp); 2659 cache_free_batch(&batch); 2660 } 2661 2662 /* 2663 * Entry points for modifying VOP operations. 2664 */ 2665 void 2666 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2667 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2668 { 2669 2670 ASSERT_VOP_IN_SEQC(fdvp); 2671 ASSERT_VOP_IN_SEQC(fvp); 2672 ASSERT_VOP_IN_SEQC(tdvp); 2673 if (tvp != NULL) 2674 ASSERT_VOP_IN_SEQC(tvp); 2675 2676 cache_purge(fvp); 2677 if (tvp != NULL) { 2678 cache_purge(tvp); 2679 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2680 ("%s: lingering negative entry", __func__)); 2681 } else { 2682 cache_remove_cnp(tdvp, tcnp); 2683 } 2684 } 2685 2686 void 2687 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) 2688 { 2689 2690 ASSERT_VOP_IN_SEQC(dvp); 2691 ASSERT_VOP_IN_SEQC(vp); 2692 cache_purge(vp); 2693 } 2694 2695 #ifdef INVARIANTS 2696 /* 2697 * Validate that if an entry exists it matches. 2698 */ 2699 void 2700 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) 2701 { 2702 struct namecache *ncp; 2703 struct mtx *blp; 2704 uint32_t hash; 2705 2706 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2707 if (CK_SLIST_EMPTY(NCHHASH(hash))) 2708 return; 2709 blp = HASH2BUCKETLOCK(hash); 2710 mtx_lock(blp); 2711 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 2712 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 2713 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) { 2714 if (ncp->nc_vp != vp) 2715 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n", 2716 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp, 2717 ncp->nc_vp); 2718 } 2719 } 2720 mtx_unlock(blp); 2721 } 2722 #endif 2723 2724 /* 2725 * Flush all entries referencing a particular filesystem. 2726 */ 2727 void 2728 cache_purgevfs(struct mount *mp) 2729 { 2730 struct vnode *vp, *mvp; 2731 2732 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2733 /* 2734 * Somewhat wasteful iteration over all vnodes. Would be better to 2735 * support filtering and avoid the interlock to begin with. 2736 */ 2737 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2738 if (!cache_has_entries(vp)) { 2739 VI_UNLOCK(vp); 2740 continue; 2741 } 2742 vholdl(vp); 2743 VI_UNLOCK(vp); 2744 cache_purge(vp); 2745 vdrop(vp); 2746 } 2747 } 2748 2749 /* 2750 * Perform canonical checks and cache lookup and pass on to filesystem 2751 * through the vop_cachedlookup only if needed. 
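 *
 * Filesystems opt in by pointing vop_lookup at this routine and supplying
 * the real work in vop_cachedlookup.  A sketch for a hypothetical "foofs":
 *
 *	struct vop_vector foofs_vnodeops = {
 *		.vop_default =		&default_vnodeops,
 *		.vop_lookup =		vfs_cache_lookup,
 *		.vop_cachedlookup =	foofs_lookup,
 *		...
 *	};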
2752 */ 2753 2754 int 2755 vfs_cache_lookup(struct vop_lookup_args *ap) 2756 { 2757 struct vnode *dvp; 2758 int error; 2759 struct vnode **vpp = ap->a_vpp; 2760 struct componentname *cnp = ap->a_cnp; 2761 int flags = cnp->cn_flags; 2762 2763 *vpp = NULL; 2764 dvp = ap->a_dvp; 2765 2766 if (dvp->v_type != VDIR) 2767 return (ENOTDIR); 2768 2769 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2770 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2771 return (EROFS); 2772 2773 error = vn_dir_check_exec(dvp, cnp); 2774 if (error != 0) 2775 return (error); 2776 2777 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2778 if (error == 0) 2779 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2780 if (error == -1) 2781 return (0); 2782 return (error); 2783 } 2784 2785 /* Implementation of the getcwd syscall. */ 2786 int 2787 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2788 { 2789 char *buf, *retbuf; 2790 size_t buflen; 2791 int error; 2792 2793 buflen = uap->buflen; 2794 if (__predict_false(buflen < 2)) 2795 return (EINVAL); 2796 if (buflen > MAXPATHLEN) 2797 buflen = MAXPATHLEN; 2798 2799 buf = uma_zalloc(namei_zone, M_WAITOK); 2800 error = vn_getcwd(buf, &retbuf, &buflen); 2801 if (error == 0) 2802 error = copyout(retbuf, uap->buf, buflen); 2803 uma_zfree(namei_zone, buf); 2804 return (error); 2805 } 2806 2807 int 2808 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2809 { 2810 struct pwd *pwd; 2811 int error; 2812 2813 vfs_smr_enter(); 2814 pwd = pwd_get_smr(); 2815 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2816 buflen, 0); 2817 VFS_SMR_ASSERT_NOT_ENTERED(); 2818 if (error < 0) { 2819 pwd = pwd_hold(curthread); 2820 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2821 retbuf, buflen); 2822 pwd_drop(pwd); 2823 } 2824 2825 #ifdef KTRACE 2826 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2827 ktrnamei(*retbuf); 2828 #endif 2829 return (error); 2830 } 2831 2832 static int 2833 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2834 size_t size, int flags, enum uio_seg pathseg) 2835 { 2836 struct nameidata nd; 2837 char *retbuf, *freebuf; 2838 int error; 2839 2840 if (flags != 0) 2841 return (EINVAL); 2842 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2843 pathseg, path, fd, &cap_fstat_rights, td); 2844 if ((error = namei(&nd)) != 0) 2845 return (error); 2846 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2847 if (error == 0) { 2848 error = copyout(retbuf, buf, size); 2849 free(freebuf, M_TEMP); 2850 } 2851 NDFREE(&nd, 0); 2852 return (error); 2853 } 2854 2855 int 2856 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2857 { 2858 2859 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2860 uap->flags, UIO_USERSPACE)); 2861 } 2862 2863 /* 2864 * Retrieve the full filesystem path that corresponds to a vnode from the name 2865 * cache (if available). 2866 */ 2867 int 2868 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2869 { 2870 struct pwd *pwd; 2871 char *buf; 2872 size_t buflen; 2873 int error; 2874 2875 if (__predict_false(vp == NULL)) 2876 return (EINVAL); 2877 2878 buflen = MAXPATHLEN; 2879 buf = malloc(buflen, M_TEMP, M_WAITOK); 2880 vfs_smr_enter(); 2881 pwd = pwd_get_smr(); 2882 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); 2883 VFS_SMR_ASSERT_NOT_ENTERED(); 2884 if (error < 0) { 2885 pwd = pwd_hold(curthread); 2886 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf,
&buflen); 2887 pwd_drop(pwd); 2888 } 2889 if (error == 0) 2890 *freebuf = buf; 2891 else 2892 free(buf, M_TEMP); 2893 return (error); 2894 } 2895 2896 /* 2897 * This function is similar to vn_fullpath, but it attempts to lookup the 2898 * pathname relative to the global root mount point. This is required for the 2899 * auditing sub-system, as audited pathnames must be absolute, relative to the 2900 * global root mount point. 2901 */ 2902 int 2903 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2904 { 2905 char *buf; 2906 size_t buflen; 2907 int error; 2908 2909 if (__predict_false(vp == NULL)) 2910 return (EINVAL); 2911 buflen = MAXPATHLEN; 2912 buf = malloc(buflen, M_TEMP, M_WAITOK); 2913 vfs_smr_enter(); 2914 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); 2915 VFS_SMR_ASSERT_NOT_ENTERED(); 2916 if (error < 0) { 2917 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2918 } 2919 if (error == 0) 2920 *freebuf = buf; 2921 else 2922 free(buf, M_TEMP); 2923 return (error); 2924 } 2925 2926 static struct namecache * 2927 vn_dd_from_dst(struct vnode *vp) 2928 { 2929 struct namecache *ncp; 2930 2931 cache_assert_vnode_locked(vp); 2932 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2933 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2934 return (ncp); 2935 } 2936 return (NULL); 2937 } 2938 2939 int 2940 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) 2941 { 2942 struct vnode *dvp; 2943 struct namecache *ncp; 2944 struct mtx *vlp; 2945 int error; 2946 2947 vlp = VP2VNODELOCK(*vp); 2948 mtx_lock(vlp); 2949 ncp = (*vp)->v_cache_dd; 2950 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 2951 KASSERT(ncp == vn_dd_from_dst(*vp), 2952 ("%s: mismatch for dd entry (%p != %p)", __func__, 2953 ncp, vn_dd_from_dst(*vp))); 2954 } else { 2955 ncp = vn_dd_from_dst(*vp); 2956 } 2957 if (ncp != NULL) { 2958 if (*buflen < ncp->nc_nlen) { 2959 mtx_unlock(vlp); 2960 vrele(*vp); 2961 counter_u64_add(numfullpathfail4, 1); 2962 error = ENOMEM; 2963 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2964 vp, NULL); 2965 return (error); 2966 } 2967 *buflen -= ncp->nc_nlen; 2968 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2969 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2970 ncp->nc_name, vp); 2971 dvp = *vp; 2972 *vp = ncp->nc_dvp; 2973 vref(*vp); 2974 mtx_unlock(vlp); 2975 vrele(dvp); 2976 return (0); 2977 } 2978 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2979 2980 mtx_unlock(vlp); 2981 vn_lock(*vp, LK_SHARED | LK_RETRY); 2982 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); 2983 vput(*vp); 2984 if (error) { 2985 counter_u64_add(numfullpathfail2, 1); 2986 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2987 return (error); 2988 } 2989 2990 *vp = dvp; 2991 if (VN_IS_DOOMED(dvp)) { 2992 /* forced unmount */ 2993 vrele(dvp); 2994 error = ENOENT; 2995 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2996 return (error); 2997 } 2998 /* 2999 * *vp has its use count incremented still. 3000 */ 3001 3002 return (0); 3003 } 3004 3005 /* 3006 * Resolve a directory to a pathname. 3007 * 3008 * The name of the directory can always be found in the namecache or fetched 3009 * from the filesystem. There is also guaranteed to be only one parent, meaning 3010 * we can just follow vnodes up until we find the root. 3011 * 3012 * The vnode must be referenced. 
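 * The reference is consumed; the vnode is released before this function
 * returns, on both the success and the error paths.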
3013 */ 3014 static int 3015 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3016 size_t *len, size_t addend) 3017 { 3018 #ifdef KDTRACE_HOOKS 3019 struct vnode *startvp = vp; 3020 #endif 3021 struct vnode *vp1; 3022 size_t buflen; 3023 int error; 3024 bool slash_prefixed; 3025 3026 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 3027 VNPASS(vp->v_usecount > 0, vp); 3028 3029 buflen = *len; 3030 3031 slash_prefixed = true; 3032 if (addend == 0) { 3033 MPASS(*len >= 2); 3034 buflen--; 3035 buf[buflen] = '\0'; 3036 slash_prefixed = false; 3037 } 3038 3039 error = 0; 3040 3041 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 3042 counter_u64_add(numfullpathcalls, 1); 3043 while (vp != rdir && vp != rootvnode) { 3044 /* 3045 * The vp vnode must be already fully constructed, 3046 * since it is either found in namecache or obtained 3047 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 3048 * without obtaining the vnode lock. 3049 */ 3050 if ((vp->v_vflag & VV_ROOT) != 0) { 3051 vn_lock(vp, LK_RETRY | LK_SHARED); 3052 3053 /* 3054 * With the vnode locked, check for races with 3055 * unmount, forced or not. Note that we 3056 * already verified that vp is not equal to 3057 * the root vnode, which means that 3058 * mnt_vnodecovered can be NULL only for the 3059 * case of unmount. 3060 */ 3061 if (VN_IS_DOOMED(vp) || 3062 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 3063 vp1->v_mountedhere != vp->v_mount) { 3064 vput(vp); 3065 error = ENOENT; 3066 SDT_PROBE3(vfs, namecache, fullpath, return, 3067 error, vp, NULL); 3068 break; 3069 } 3070 3071 vref(vp1); 3072 vput(vp); 3073 vp = vp1; 3074 continue; 3075 } 3076 if (vp->v_type != VDIR) { 3077 vrele(vp); 3078 counter_u64_add(numfullpathfail1, 1); 3079 error = ENOTDIR; 3080 SDT_PROBE3(vfs, namecache, fullpath, return, 3081 error, vp, NULL); 3082 break; 3083 } 3084 error = vn_vptocnp(&vp, buf, &buflen); 3085 if (error) 3086 break; 3087 if (buflen == 0) { 3088 vrele(vp); 3089 error = ENOMEM; 3090 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3091 startvp, NULL); 3092 break; 3093 } 3094 buf[--buflen] = '/'; 3095 slash_prefixed = true; 3096 } 3097 if (error) 3098 return (error); 3099 if (!slash_prefixed) { 3100 if (buflen == 0) { 3101 vrele(vp); 3102 counter_u64_add(numfullpathfail4, 1); 3103 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 3104 startvp, NULL); 3105 return (ENOMEM); 3106 } 3107 buf[--buflen] = '/'; 3108 } 3109 counter_u64_add(numfullpathfound, 1); 3110 vrele(vp); 3111 3112 *retbuf = buf + buflen; 3113 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 3114 *len -= buflen; 3115 *len += addend; 3116 return (0); 3117 } 3118 3119 /* 3120 * Resolve an arbitrary vnode to a pathname. 
3121 * 3122 * Note 2 caveats: 3123 * - hardlinks are not tracked, thus if the vnode is not a directory this can 3124 * resolve to a different path than the one used to find it 3125 * - namecache is not mandatory, meaning names are not guaranteed to be added 3126 * (in which case resolving fails) 3127 */ 3128 static void __inline 3129 cache_rev_failed_impl(int *reason, int line) 3130 { 3131 3132 *reason = line; 3133 } 3134 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 3135 3136 static int 3137 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 3138 char **retbuf, size_t *buflen, size_t addend) 3139 { 3140 #ifdef KDTRACE_HOOKS 3141 struct vnode *startvp = vp; 3142 #endif 3143 struct vnode *tvp; 3144 struct mount *mp; 3145 struct namecache *ncp; 3146 size_t orig_buflen; 3147 int reason; 3148 int error; 3149 #ifdef KDTRACE_HOOKS 3150 int i; 3151 #endif 3152 seqc_t vp_seqc, tvp_seqc; 3153 u_char nc_flag; 3154 3155 VFS_SMR_ASSERT_ENTERED(); 3156 3157 if (!cache_fast_revlookup) { 3158 vfs_smr_exit(); 3159 return (-1); 3160 } 3161 3162 orig_buflen = *buflen; 3163 3164 if (addend == 0) { 3165 MPASS(*buflen >= 2); 3166 *buflen -= 1; 3167 buf[*buflen] = '\0'; 3168 } 3169 3170 if (vp == rdir || vp == rootvnode) { 3171 if (addend == 0) { 3172 *buflen -= 1; 3173 buf[*buflen] = '/'; 3174 } 3175 goto out_ok; 3176 } 3177 3178 #ifdef KDTRACE_HOOKS 3179 i = 0; 3180 #endif 3181 error = -1; 3182 ncp = NULL; /* for sdt probe down below */ 3183 vp_seqc = vn_seqc_read_any(vp); 3184 if (seqc_in_modify(vp_seqc)) { 3185 cache_rev_failed(&reason); 3186 goto out_abort; 3187 } 3188 3189 for (;;) { 3190 #ifdef KDTRACE_HOOKS 3191 i++; 3192 #endif 3193 if ((vp->v_vflag & VV_ROOT) != 0) { 3194 mp = atomic_load_ptr(&vp->v_mount); 3195 if (mp == NULL) { 3196 cache_rev_failed(&reason); 3197 goto out_abort; 3198 } 3199 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 3200 tvp_seqc = vn_seqc_read_any(tvp); 3201 if (seqc_in_modify(tvp_seqc)) { 3202 cache_rev_failed(&reason); 3203 goto out_abort; 3204 } 3205 if (!vn_seqc_consistent(vp, vp_seqc)) { 3206 cache_rev_failed(&reason); 3207 goto out_abort; 3208 } 3209 vp = tvp; 3210 vp_seqc = tvp_seqc; 3211 continue; 3212 } 3213 ncp = atomic_load_ptr(&vp->v_cache_dd); 3214 if (ncp == NULL) { 3215 cache_rev_failed(&reason); 3216 goto out_abort; 3217 } 3218 nc_flag = atomic_load_char(&ncp->nc_flag); 3219 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3220 cache_rev_failed(&reason); 3221 goto out_abort; 3222 } 3223 if (!cache_ncp_canuse(ncp)) { 3224 cache_rev_failed(&reason); 3225 goto out_abort; 3226 } 3227 if (ncp->nc_nlen >= *buflen) { 3228 cache_rev_failed(&reason); 3229 error = ENOMEM; 3230 goto out_abort; 3231 } 3232 *buflen -= ncp->nc_nlen; 3233 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3234 *buflen -= 1; 3235 buf[*buflen] = '/'; 3236 tvp = ncp->nc_dvp; 3237 tvp_seqc = vn_seqc_read_any(tvp); 3238 if (seqc_in_modify(tvp_seqc)) { 3239 cache_rev_failed(&reason); 3240 goto out_abort; 3241 } 3242 if (!vn_seqc_consistent(vp, vp_seqc)) { 3243 cache_rev_failed(&reason); 3244 goto out_abort; 3245 } 3246 vp = tvp; 3247 vp_seqc = tvp_seqc; 3248 if (vp == rdir || vp == rootvnode) 3249 break; 3250 } 3251 out_ok: 3252 vfs_smr_exit(); 3253 *retbuf = buf + *buflen; 3254 *buflen = orig_buflen - *buflen + addend; 3255 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 3256 return (0); 3257 3258 out_abort: 3259 *buflen = orig_buflen; 3260 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 3261 vfs_smr_exit(); 3262 return (error); 
3263 } 3264 3265 static int 3266 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3267 size_t *buflen) 3268 { 3269 size_t orig_buflen, addend; 3270 int error; 3271 3272 if (*buflen < 2) 3273 return (EINVAL); 3274 3275 orig_buflen = *buflen; 3276 3277 vref(vp); 3278 addend = 0; 3279 if (vp->v_type != VDIR) { 3280 *buflen -= 1; 3281 buf[*buflen] = '\0'; 3282 error = vn_vptocnp(&vp, buf, buflen); 3283 if (error) 3284 return (error); 3285 if (*buflen == 0) { 3286 vrele(vp); 3287 return (ENOMEM); 3288 } 3289 *buflen -= 1; 3290 buf[*buflen] = '/'; 3291 addend = orig_buflen - *buflen; 3292 } 3293 3294 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); 3295 } 3296 3297 /* 3298 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 3299 * 3300 * Since the namecache does not track hardlinks, the caller is expected to first 3301 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 3302 * 3303 * Then we have 2 cases: 3304 * - if the found vnode is a directory, the path can be constructed just by 3305 * following names up the chain 3306 * - otherwise we populate the buffer with the saved name and start resolving 3307 * from the parent 3308 */ 3309 static int 3310 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 3311 size_t *buflen) 3312 { 3313 char *buf, *tmpbuf; 3314 struct pwd *pwd; 3315 struct componentname *cnp; 3316 struct vnode *vp; 3317 size_t addend; 3318 int error; 3319 enum vtype type; 3320 3321 if (*buflen < 2) 3322 return (EINVAL); 3323 if (*buflen > MAXPATHLEN) 3324 *buflen = MAXPATHLEN; 3325 3326 buf = malloc(*buflen, M_TEMP, M_WAITOK); 3327 3328 addend = 0; 3329 vp = ndp->ni_vp; 3330 /* 3331 * Check for VBAD to work around the vp_crossmp bug in lookup(). 3332 * 3333 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 3334 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 3335 * If the type is VDIR (like in this very case) we can skip looking 3336 * at ni_dvp in the first place. However, since vnodes get passed here 3337 * unlocked the target may transition to doomed state (type == VBAD) 3338 * before we get to evaluate the condition. If this happens, we will 3339 * populate part of the buffer and descend to vn_fullpath_dir with 3340 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 3341 * 3342 * This should be atomic_load(&vp->v_type) but it is illegal to take 3343 * an address of a bit field, even if said field is sized to char. 3344 * Work around the problem by reading the value into a full-sized enum 3345 * and then re-reading it with atomic_load which will still prevent 3346 * the compiler from re-reading down the road. 
3347 */ 3348 type = vp->v_type; 3349 type = atomic_load_int(&type); 3350 if (type == VBAD) { 3351 error = ENOENT; 3352 goto out_bad; 3353 } 3354 if (type != VDIR) { 3355 cnp = &ndp->ni_cnd; 3356 addend = cnp->cn_namelen + 2; 3357 if (*buflen < addend) { 3358 error = ENOMEM; 3359 goto out_bad; 3360 } 3361 *buflen -= addend; 3362 tmpbuf = buf + *buflen; 3363 tmpbuf[0] = '/'; 3364 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 3365 tmpbuf[addend - 1] = '\0'; 3366 vp = ndp->ni_dvp; 3367 } 3368 3369 vfs_smr_enter(); 3370 pwd = pwd_get_smr(); 3371 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3372 addend); 3373 VFS_SMR_ASSERT_NOT_ENTERED(); 3374 if (error < 0) { 3375 pwd = pwd_hold(curthread); 3376 vref(vp); 3377 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3378 addend); 3379 pwd_drop(pwd); 3380 if (error != 0) 3381 goto out_bad; 3382 } 3383 3384 *freebuf = buf; 3385 3386 return (0); 3387 out_bad: 3388 free(buf, M_TEMP); 3389 return (error); 3390 } 3391 3392 struct vnode * 3393 vn_dir_dd_ino(struct vnode *vp) 3394 { 3395 struct namecache *ncp; 3396 struct vnode *ddvp; 3397 struct mtx *vlp; 3398 enum vgetstate vs; 3399 3400 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3401 vlp = VP2VNODELOCK(vp); 3402 mtx_lock(vlp); 3403 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3404 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3405 continue; 3406 ddvp = ncp->nc_dvp; 3407 vs = vget_prep(ddvp); 3408 mtx_unlock(vlp); 3409 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3410 return (NULL); 3411 return (ddvp); 3412 } 3413 mtx_unlock(vlp); 3414 return (NULL); 3415 } 3416 3417 int 3418 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3419 { 3420 struct namecache *ncp; 3421 struct mtx *vlp; 3422 int l; 3423 3424 vlp = VP2VNODELOCK(vp); 3425 mtx_lock(vlp); 3426 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3427 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3428 break; 3429 if (ncp == NULL) { 3430 mtx_unlock(vlp); 3431 return (ENOENT); 3432 } 3433 l = min(ncp->nc_nlen, buflen - 1); 3434 memcpy(buf, ncp->nc_name, l); 3435 mtx_unlock(vlp); 3436 buf[l] = '\0'; 3437 return (0); 3438 } 3439 3440 /* 3441 * This function updates path string to vnode's full global path 3442 * and checks the size of the new path string against the pathlen argument. 3443 * 3444 * Requires a locked, referenced vnode. 3445 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3446 * 3447 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3448 * because it falls back to the ".." lookup if the namecache lookup fails. 3449 */ 3450 int 3451 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3452 u_int pathlen) 3453 { 3454 struct nameidata nd; 3455 struct vnode *vp1; 3456 char *rpath, *fbuf; 3457 int error; 3458 3459 ASSERT_VOP_ELOCKED(vp, __func__); 3460 3461 /* Construct global filesystem path from vp. */ 3462 VOP_UNLOCK(vp); 3463 error = vn_fullpath_global(vp, &rpath, &fbuf); 3464 3465 if (error != 0) { 3466 vrele(vp); 3467 return (error); 3468 } 3469 3470 if (strlen(rpath) >= pathlen) { 3471 vrele(vp); 3472 error = ENAMETOOLONG; 3473 goto out; 3474 } 3475 3476 /* 3477 * Re-lookup the vnode by path to detect a possible rename. 3478 * As a side effect, the vnode is relocked. 3479 * If vnode was renamed, return ENOENT. 
3480 */ 3481 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3482 UIO_SYSSPACE, path, td); 3483 error = namei(&nd); 3484 if (error != 0) { 3485 vrele(vp); 3486 goto out; 3487 } 3488 NDFREE(&nd, NDF_ONLY_PNBUF); 3489 vp1 = nd.ni_vp; 3490 vrele(vp); 3491 if (vp1 == vp) 3492 strcpy(path, rpath); 3493 else { 3494 vput(vp1); 3495 error = ENOENT; 3496 } 3497 3498 out: 3499 free(fbuf, M_TEMP); 3500 return (error); 3501 } 3502 3503 #ifdef DDB 3504 static void 3505 db_print_vpath(struct vnode *vp) 3506 { 3507 3508 while (vp != NULL) { 3509 db_printf("%p: ", vp); 3510 if (vp == rootvnode) { 3511 db_printf("/"); 3512 vp = NULL; 3513 } else { 3514 if (vp->v_vflag & VV_ROOT) { 3515 db_printf("<mount point>"); 3516 vp = vp->v_mount->mnt_vnodecovered; 3517 } else { 3518 struct namecache *ncp; 3519 char *ncn; 3520 int i; 3521 3522 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3523 if (ncp != NULL) { 3524 ncn = ncp->nc_name; 3525 for (i = 0; i < ncp->nc_nlen; i++) 3526 db_printf("%c", *ncn++); 3527 vp = ncp->nc_dvp; 3528 } else { 3529 vp = NULL; 3530 } 3531 } 3532 } 3533 db_printf("\n"); 3534 } 3535 3536 return; 3537 } 3538 3539 DB_SHOW_COMMAND(vpath, db_show_vpath) 3540 { 3541 struct vnode *vp; 3542 3543 if (!have_addr) { 3544 db_printf("usage: show vpath <struct vnode *>\n"); 3545 return; 3546 } 3547 3548 vp = (struct vnode *)addr; 3549 db_print_vpath(vp); 3550 } 3551 3552 #endif 3553 3554 static bool __read_frequently cache_fast_lookup = true; 3555 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3556 &cache_fast_lookup, 0, ""); 3557 3558 #define CACHE_FPL_FAILED -2020 3559 3560 static void 3561 cache_fpl_cleanup_cnp(struct componentname *cnp) 3562 { 3563 3564 uma_zfree(namei_zone, cnp->cn_pnbuf); 3565 #ifdef DIAGNOSTIC 3566 cnp->cn_pnbuf = NULL; 3567 cnp->cn_nameptr = NULL; 3568 #endif 3569 } 3570 3571 static void 3572 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3573 { 3574 struct componentname *cnp; 3575 3576 cnp = &ndp->ni_cnd; 3577 while (*(cnp->cn_nameptr) == '/') { 3578 cnp->cn_nameptr++; 3579 ndp->ni_pathlen--; 3580 } 3581 3582 *dpp = ndp->ni_rootdir; 3583 } 3584 3585 /* 3586 * Components of nameidata (or objects it can point to) which may 3587 * need restoring in case fast path lookup fails. 
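 *
 * The fast path snapshots this state with cache_fpl_checkpoint() and rolls
 * it back with cache_fpl_restore_partial() or cache_fpl_restore_abort()
 * when the regular lookup has to take over.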
3588 */ 3589 struct nameidata_saved { 3590 long cn_namelen; 3591 char *cn_nameptr; 3592 size_t ni_pathlen; 3593 int cn_flags; 3594 }; 3595 3596 struct cache_fpl { 3597 struct nameidata *ndp; 3598 struct componentname *cnp; 3599 struct pwd *pwd; 3600 struct vnode *dvp; 3601 struct vnode *tvp; 3602 seqc_t dvp_seqc; 3603 seqc_t tvp_seqc; 3604 struct nameidata_saved snd; 3605 int line; 3606 enum cache_fpl_status status:8; 3607 bool in_smr; 3608 bool fsearch; 3609 }; 3610 3611 static void 3612 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3613 { 3614 3615 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3616 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3617 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3618 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3619 } 3620 3621 static void 3622 cache_fpl_restore_partial(struct cache_fpl *fpl, struct nameidata_saved *snd) 3623 { 3624 3625 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3626 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3627 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3628 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3629 } 3630 3631 static void 3632 cache_fpl_restore_abort(struct cache_fpl *fpl, struct nameidata_saved *snd) 3633 { 3634 3635 cache_fpl_restore_partial(fpl, snd); 3636 /* 3637 * It is 0 on entry by API contract. 3638 */ 3639 fpl->ndp->ni_resflags = 0; 3640 } 3641 3642 #ifdef INVARIANTS 3643 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3644 struct cache_fpl *_fpl = (fpl); \ 3645 MPASS(_fpl->in_smr == true); \ 3646 VFS_SMR_ASSERT_ENTERED(); \ 3647 }) 3648 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3649 struct cache_fpl *_fpl = (fpl); \ 3650 MPASS(_fpl->in_smr == false); \ 3651 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3652 }) 3653 #else 3654 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3655 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3656 #endif 3657 3658 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3659 struct cache_fpl *_fpl = (fpl); \ 3660 vfs_smr_enter(); \ 3661 _fpl->in_smr = true; \ 3662 }) 3663 3664 #define cache_fpl_smr_enter(fpl) ({ \ 3665 struct cache_fpl *_fpl = (fpl); \ 3666 MPASS(_fpl->in_smr == false); \ 3667 vfs_smr_enter(); \ 3668 _fpl->in_smr = true; \ 3669 }) 3670 3671 #define cache_fpl_smr_exit(fpl) ({ \ 3672 struct cache_fpl *_fpl = (fpl); \ 3673 MPASS(_fpl->in_smr == true); \ 3674 vfs_smr_exit(); \ 3675 _fpl->in_smr = false; \ 3676 }) 3677 3678 static int 3679 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3680 { 3681 3682 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3683 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3684 ("%s: converting to abort from %d at %d, set at %d\n", 3685 __func__, fpl->status, line, fpl->line)); 3686 } 3687 fpl->status = CACHE_FPL_STATUS_ABORTED; 3688 fpl->line = line; 3689 return (CACHE_FPL_FAILED); 3690 } 3691 3692 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3693 3694 static int 3695 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3696 { 3697 3698 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3699 ("%s: setting to partial at %d, but already set to %d at %d\n", 3700 __func__, line, fpl->status, fpl->line)); 3701 cache_fpl_smr_assert_entered(fpl); 3702 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3703 fpl->line = line; 3704 return (CACHE_FPL_FAILED); 3705 } 3706 3707 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3708 3709 static int 3710 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3711 { 3712 3713 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3714 
("%s: setting to handled at %d, but already set to %d at %d\n", 3715 __func__, line, fpl->status, fpl->line)); 3716 cache_fpl_smr_assert_not_entered(fpl); 3717 MPASS(error != CACHE_FPL_FAILED); 3718 fpl->status = CACHE_FPL_STATUS_HANDLED; 3719 fpl->line = line; 3720 return (error); 3721 } 3722 3723 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3724 3725 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3726 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ 3727 FOLLOW | LOCKSHARED | SAVENAME | SAVESTART | WILLBEDIR | ISOPEN | \ 3728 NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3729 3730 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3731 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3732 3733 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3734 "supported and internal flags overlap"); 3735 3736 static bool 3737 cache_fpl_islastcn(struct nameidata *ndp) 3738 { 3739 3740 return (*ndp->ni_next == 0); 3741 } 3742 3743 static bool 3744 cache_fpl_isdotdot(struct componentname *cnp) 3745 { 3746 3747 if (cnp->cn_namelen == 2 && 3748 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3749 return (true); 3750 return (false); 3751 } 3752 3753 static bool 3754 cache_can_fplookup(struct cache_fpl *fpl) 3755 { 3756 struct nameidata *ndp; 3757 struct componentname *cnp; 3758 struct thread *td; 3759 3760 ndp = fpl->ndp; 3761 cnp = fpl->cnp; 3762 td = cnp->cn_thread; 3763 3764 if (!cache_fast_lookup) { 3765 cache_fpl_aborted(fpl); 3766 return (false); 3767 } 3768 #ifdef MAC 3769 if (mac_vnode_check_lookup_enabled()) { 3770 cache_fpl_aborted(fpl); 3771 return (false); 3772 } 3773 #endif 3774 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3775 cache_fpl_aborted(fpl); 3776 return (false); 3777 } 3778 if (IN_CAPABILITY_MODE(td)) { 3779 cache_fpl_aborted(fpl); 3780 return (false); 3781 } 3782 if (AUDITING_TD(td)) { 3783 cache_fpl_aborted(fpl); 3784 return (false); 3785 } 3786 if (ndp->ni_startdir != NULL) { 3787 cache_fpl_aborted(fpl); 3788 return (false); 3789 } 3790 return (true); 3791 } 3792 3793 static int 3794 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 3795 { 3796 struct nameidata *ndp; 3797 int error; 3798 bool fsearch; 3799 3800 ndp = fpl->ndp; 3801 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); 3802 if (__predict_false(error != 0)) { 3803 cache_fpl_smr_exit(fpl); 3804 return (cache_fpl_aborted(fpl)); 3805 } 3806 fpl->fsearch = fsearch; 3807 return (0); 3808 } 3809 3810 static bool 3811 cache_fplookup_vnode_supported(struct vnode *vp) 3812 { 3813 3814 return (vp->v_type != VLNK); 3815 } 3816 3817 static int __noinline 3818 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3819 uint32_t hash) 3820 { 3821 struct componentname *cnp; 3822 struct vnode *dvp; 3823 3824 cnp = fpl->cnp; 3825 dvp = fpl->dvp; 3826 3827 cache_fpl_smr_exit(fpl); 3828 if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) 3829 return (cache_fpl_handled(fpl, ENOENT)); 3830 else 3831 return (cache_fpl_aborted(fpl)); 3832 } 3833 3834 /* 3835 * The target vnode is not supported, prepare for the slow path to take over. 
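 *
 * That is: acquire references on the pwd and dvp while still inside the
 * SMR section, re-validate dvp against its sequence counter, restore the
 * saved nameidata state and export dvp as ni_startdir with MAKEENTRY (and
 * ISLASTCN/ISDOTDOT as appropriate) set, so the regular lookup resumes
 * from this point.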
3836 */ 3837 static int __noinline 3838 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3839 { 3840 struct nameidata *ndp; 3841 struct componentname *cnp; 3842 enum vgetstate dvs; 3843 struct vnode *dvp; 3844 struct pwd *pwd; 3845 seqc_t dvp_seqc; 3846 3847 ndp = fpl->ndp; 3848 cnp = fpl->cnp; 3849 pwd = fpl->pwd; 3850 dvp = fpl->dvp; 3851 dvp_seqc = fpl->dvp_seqc; 3852 3853 if (!pwd_hold_smr(pwd)) { 3854 cache_fpl_smr_exit(fpl); 3855 return (cache_fpl_aborted(fpl)); 3856 } 3857 3858 dvs = vget_prep_smr(dvp); 3859 cache_fpl_smr_exit(fpl); 3860 if (__predict_false(dvs == VGET_NONE)) { 3861 pwd_drop(pwd); 3862 return (cache_fpl_aborted(fpl)); 3863 } 3864 3865 vget_finish_ref(dvp, dvs); 3866 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3867 vrele(dvp); 3868 pwd_drop(pwd); 3869 return (cache_fpl_aborted(fpl)); 3870 } 3871 3872 cache_fpl_restore_partial(fpl, &fpl->snd); 3873 3874 ndp->ni_startdir = dvp; 3875 cnp->cn_flags |= MAKEENTRY; 3876 if (cache_fpl_islastcn(ndp)) 3877 cnp->cn_flags |= ISLASTCN; 3878 if (cache_fpl_isdotdot(cnp)) 3879 cnp->cn_flags |= ISDOTDOT; 3880 3881 return (0); 3882 } 3883 3884 static int 3885 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3886 { 3887 struct componentname *cnp; 3888 struct vnode *tvp; 3889 seqc_t tvp_seqc; 3890 int error, lkflags; 3891 3892 cnp = fpl->cnp; 3893 tvp = fpl->tvp; 3894 tvp_seqc = fpl->tvp_seqc; 3895 3896 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3897 lkflags = LK_SHARED; 3898 if ((cnp->cn_flags & LOCKSHARED) == 0) 3899 lkflags = LK_EXCLUSIVE; 3900 error = vget_finish(tvp, lkflags, tvs); 3901 if (__predict_false(error != 0)) { 3902 return (cache_fpl_aborted(fpl)); 3903 } 3904 } else { 3905 vget_finish_ref(tvp, tvs); 3906 } 3907 3908 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3909 if ((cnp->cn_flags & LOCKLEAF) != 0) 3910 vput(tvp); 3911 else 3912 vrele(tvp); 3913 return (cache_fpl_aborted(fpl)); 3914 } 3915 3916 return (cache_fpl_handled(fpl, 0)); 3917 } 3918 3919 /* 3920 * They want to possibly modify the state of the namecache. 3921 * 3922 * Don't try to match the API contract, just leave. 3923 * TODO: this leaves scalability on the table 3924 */ 3925 static int 3926 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3927 { 3928 struct componentname *cnp; 3929 3930 cnp = fpl->cnp; 3931 MPASS(cnp->cn_nameiop != LOOKUP); 3932 return (cache_fpl_partial(fpl)); 3933 } 3934 3935 static int __noinline 3936 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3937 { 3938 struct componentname *cnp; 3939 enum vgetstate dvs, tvs; 3940 struct vnode *dvp, *tvp; 3941 seqc_t dvp_seqc; 3942 int error; 3943 3944 cnp = fpl->cnp; 3945 dvp = fpl->dvp; 3946 dvp_seqc = fpl->dvp_seqc; 3947 tvp = fpl->tvp; 3948 3949 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3950 3951 /* 3952 * This is less efficient than it can be for simplicity. 
3953 */ 3954 dvs = vget_prep_smr(dvp); 3955 if (__predict_false(dvs == VGET_NONE)) { 3956 return (cache_fpl_aborted(fpl)); 3957 } 3958 tvs = vget_prep_smr(tvp); 3959 if (__predict_false(tvs == VGET_NONE)) { 3960 cache_fpl_smr_exit(fpl); 3961 vget_abort(dvp, dvs); 3962 return (cache_fpl_aborted(fpl)); 3963 } 3964 3965 cache_fpl_smr_exit(fpl); 3966 3967 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3968 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3969 if (__predict_false(error != 0)) { 3970 vget_abort(tvp, tvs); 3971 return (cache_fpl_aborted(fpl)); 3972 } 3973 } else { 3974 vget_finish_ref(dvp, dvs); 3975 } 3976 3977 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3978 vget_abort(tvp, tvs); 3979 if ((cnp->cn_flags & LOCKPARENT) != 0) 3980 vput(dvp); 3981 else 3982 vrele(dvp); 3983 return (cache_fpl_aborted(fpl)); 3984 } 3985 3986 error = cache_fplookup_final_child(fpl, tvs); 3987 if (__predict_false(error != 0)) { 3988 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3989 if ((cnp->cn_flags & LOCKPARENT) != 0) 3990 vput(dvp); 3991 else 3992 vrele(dvp); 3993 return (error); 3994 } 3995 3996 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3997 return (0); 3998 } 3999 4000 static int 4001 cache_fplookup_final(struct cache_fpl *fpl) 4002 { 4003 struct componentname *cnp; 4004 enum vgetstate tvs; 4005 struct vnode *dvp, *tvp; 4006 seqc_t dvp_seqc; 4007 4008 cnp = fpl->cnp; 4009 dvp = fpl->dvp; 4010 dvp_seqc = fpl->dvp_seqc; 4011 tvp = fpl->tvp; 4012 4013 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 4014 4015 if (cnp->cn_nameiop != LOOKUP) { 4016 return (cache_fplookup_final_modifying(fpl)); 4017 } 4018 4019 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 4020 return (cache_fplookup_final_withparent(fpl)); 4021 4022 tvs = vget_prep_smr(tvp); 4023 if (__predict_false(tvs == VGET_NONE)) { 4024 return (cache_fpl_partial(fpl)); 4025 } 4026 4027 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4028 cache_fpl_smr_exit(fpl); 4029 vget_abort(tvp, tvs); 4030 return (cache_fpl_aborted(fpl)); 4031 } 4032 4033 cache_fpl_smr_exit(fpl); 4034 return (cache_fplookup_final_child(fpl, tvs)); 4035 } 4036 4037 static int __noinline 4038 cache_fplookup_dot(struct cache_fpl *fpl) 4039 { 4040 struct vnode *dvp; 4041 4042 dvp = fpl->dvp; 4043 4044 fpl->tvp = dvp; 4045 fpl->tvp_seqc = vn_seqc_read_any(dvp); 4046 if (seqc_in_modify(fpl->tvp_seqc)) { 4047 return (cache_fpl_aborted(fpl)); 4048 } 4049 4050 counter_u64_add(dothits, 1); 4051 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 4052 4053 return (0); 4054 } 4055 4056 static int __noinline 4057 cache_fplookup_dotdot(struct cache_fpl *fpl) 4058 { 4059 struct nameidata *ndp; 4060 struct componentname *cnp; 4061 struct namecache *ncp; 4062 struct vnode *dvp; 4063 struct prison *pr; 4064 u_char nc_flag; 4065 4066 ndp = fpl->ndp; 4067 cnp = fpl->cnp; 4068 dvp = fpl->dvp; 4069 4070 /* 4071 * XXX this is racy the same way regular lookup is 4072 */ 4073 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 4074 pr = pr->pr_parent) 4075 if (dvp == pr->pr_root) 4076 break; 4077 4078 if (dvp == ndp->ni_rootdir || 4079 dvp == ndp->ni_topdir || 4080 dvp == rootvnode || 4081 pr != NULL) { 4082 fpl->tvp = dvp; 4083 fpl->tvp_seqc = vn_seqc_read_any(dvp); 4084 if (seqc_in_modify(fpl->tvp_seqc)) { 4085 return (cache_fpl_aborted(fpl)); 4086 } 4087 return (0); 4088 } 4089 4090 if ((dvp->v_vflag & VV_ROOT) != 0) { 4091 /* 4092 * TODO 4093 * The opposite of climb mount is needed here. 
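 * That is, ".." at the root of a mounted filesystem needs to descend to
 * the covered vnode (mnt_vnodecovered) before continuing; until that is
 * implemented the fast path aborts and the regular lookup handles it.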

static int __noinline
cache_fplookup_dotdot(struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        struct namecache *ncp;
        struct vnode *dvp;
        struct prison *pr;
        u_char nc_flag;

        ndp = fpl->ndp;
        cnp = fpl->cnp;
        dvp = fpl->dvp;

        /*
         * XXX this is racy the same way regular lookup is
         */
        for (pr = cnp->cn_cred->cr_prison; pr != NULL;
            pr = pr->pr_parent)
                if (dvp == pr->pr_root)
                        break;

        if (dvp == ndp->ni_rootdir ||
            dvp == ndp->ni_topdir ||
            dvp == rootvnode ||
            pr != NULL) {
                fpl->tvp = dvp;
                fpl->tvp_seqc = vn_seqc_read_any(dvp);
                if (seqc_in_modify(fpl->tvp_seqc)) {
                        return (cache_fpl_aborted(fpl));
                }
                return (0);
        }

        if ((dvp->v_vflag & VV_ROOT) != 0) {
                /*
                 * TODO
                 * The opposite of climb mount is needed here.
                 */
                return (cache_fpl_aborted(fpl));
        }

        ncp = atomic_load_ptr(&dvp->v_cache_dd);
        if (ncp == NULL) {
                return (cache_fpl_aborted(fpl));
        }

        nc_flag = atomic_load_char(&ncp->nc_flag);
        if ((nc_flag & NCF_ISDOTDOT) != 0) {
                if ((nc_flag & NCF_NEGATIVE) != 0)
                        return (cache_fpl_aborted(fpl));
                fpl->tvp = ncp->nc_vp;
        } else {
                fpl->tvp = ncp->nc_dvp;
        }

        if (!cache_ncp_canuse(ncp)) {
                return (cache_fpl_aborted(fpl));
        }

        fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
        if (seqc_in_modify(fpl->tvp_seqc)) {
                return (cache_fpl_partial(fpl));
        }

        counter_u64_add(dotdothits, 1);
        return (0);
}

static int __noinline
cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
{
        u_char nc_flag;
        bool neg_promote;

        nc_flag = atomic_load_char(&ncp->nc_flag);
        MPASS((nc_flag & NCF_NEGATIVE) != 0);
        /*
         * If they want to create an entry we need to replace this one.
         */
        if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
                /*
                 * TODO
                 * This should call something similar to
                 * cache_fplookup_final_modifying.
                 */
                return (cache_fpl_partial(fpl));
        }
        neg_promote = cache_neg_hit_prep(ncp);
        if (!cache_ncp_canuse(ncp)) {
                cache_neg_hit_abort(ncp);
                return (cache_fpl_partial(fpl));
        }
        if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
                cache_neg_hit_abort(ncp);
                return (cache_fpl_partial(fpl));
        }
        if (neg_promote) {
                return (cache_fplookup_negative_promote(fpl, ncp, hash));
        }
        cache_neg_hit_finish(ncp);
        cache_fpl_smr_exit(fpl);
        return (cache_fpl_handled(fpl, ENOENT));
}

static int
cache_fplookup_next(struct cache_fpl *fpl)
{
        struct componentname *cnp;
        struct namecache *ncp;
        struct vnode *dvp, *tvp;
        u_char nc_flag;
        uint32_t hash;

        cnp = fpl->cnp;
        dvp = fpl->dvp;

        if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
                return (cache_fplookup_dot(fpl));
        }

        hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);

        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
                    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
                        break;
        }

        /*
         * If there is no entry we have to punt to the slow path to perform
         * actual lookup. Should there be nothing with this name a negative
         * entry will be created.
         */
        if (__predict_false(ncp == NULL)) {
                return (cache_fpl_partial(fpl));
        }

        tvp = atomic_load_ptr(&ncp->nc_vp);
        nc_flag = atomic_load_char(&ncp->nc_flag);
        if ((nc_flag & NCF_NEGATIVE) != 0) {
                return (cache_fplookup_neg(fpl, ncp, hash));
        }

        if (!cache_ncp_canuse(ncp)) {
                return (cache_fpl_partial(fpl));
        }

        fpl->tvp = tvp;
        fpl->tvp_seqc = vn_seqc_read_any(tvp);
        if (seqc_in_modify(fpl->tvp_seqc)) {
                return (cache_fpl_partial(fpl));
        }

        if (!cache_fplookup_vnode_supported(tvp)) {
                return (cache_fpl_partial(fpl));
        }

        counter_u64_add(numposhits, 1);
        SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
        return (0);
}

static bool
cache_fplookup_mp_supported(struct mount *mp)
{

        if (mp == NULL)
                return (false);
        if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
                return (false);
        return (true);
}

/*
 * Walk up the mount stack (if any).
 *
 * Correctness is provided in the following ways:
 * - all vnodes are protected from freeing with SMR
 * - struct mount objects are type stable, making them always safe to access
 * - stability of the particular mount is provided by busying it
 * - relationship between the vnode which is mounted on and the mount is
 *   verified with the vnode sequence counter after busying
 * - association between root vnode of the mount and the mount is protected
 *   by busy
 *
 * From that point on we can read the sequence counter of the root vnode
 * and get the next mount on the stack (if any) using the same protection.
 *
 * By the end of a successful walk we are guaranteed the reached state was
 * indeed present at least at some point, which matches the guarantee of the
 * regular lookup.
 */
static int __noinline
cache_fplookup_climb_mount(struct cache_fpl *fpl)
{
        struct mount *mp, *prev_mp;
        struct mount_pcpu *mpcpu, *prev_mpcpu;
        struct vnode *vp;
        seqc_t vp_seqc;

        vp = fpl->tvp;
        vp_seqc = fpl->tvp_seqc;

        VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
        mp = atomic_load_ptr(&vp->v_mountedhere);
        if (mp == NULL)
                return (0);

        prev_mp = NULL;
        for (;;) {
                if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
                        if (prev_mp != NULL)
                                vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
                        return (cache_fpl_partial(fpl));
                }
                if (prev_mp != NULL)
                        vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
                if (!vn_seqc_consistent(vp, vp_seqc)) {
                        vfs_op_thread_exit_crit(mp, mpcpu);
                        return (cache_fpl_partial(fpl));
                }
                if (!cache_fplookup_mp_supported(mp)) {
                        vfs_op_thread_exit_crit(mp, mpcpu);
                        return (cache_fpl_partial(fpl));
                }
                vp = atomic_load_ptr(&mp->mnt_rootvnode);
                if (vp == NULL || VN_IS_DOOMED(vp)) {
                        vfs_op_thread_exit_crit(mp, mpcpu);
                        return (cache_fpl_partial(fpl));
                }
                vp_seqc = vn_seqc_read_any(vp);
                if (seqc_in_modify(vp_seqc)) {
                        vfs_op_thread_exit_crit(mp, mpcpu);
                        return (cache_fpl_partial(fpl));
                }
                prev_mp = mp;
                prev_mpcpu = mpcpu;
                mp = atomic_load_ptr(&vp->v_mountedhere);
                if (mp == NULL)
                        break;
        }

        vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
        fpl->tvp = vp;
        fpl->tvp_seqc = vp_seqc;
        return (0);
}

static bool
cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
{
        struct mount *mp;
        struct vnode *vp;

        vp = fpl->tvp;

        /*
         * Hack: while this is a union, the pointer tends to be NULL so save on
         * a branch.
         */
        mp = atomic_load_ptr(&vp->v_mountedhere);
        if (mp == NULL)
                return (false);
        if (vp->v_type == VDIR)
                return (true);
        return (false);
}
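
/*
 * Worked example of the walk above under a hypothetical stacked setup:
 * a ufs filesystem is mounted on /mnt and a nullfs is in turn mounted on
 * the root of that ufs. Starting from the vnode covering /mnt the loop
 * follows v_mountedhere to the ufs mount, busies it, switches to its
 * mnt_rootvnode, finds another mount there, hops to the nullfs root and
 * stops once v_mountedhere is NULL. Every hop re-checks the sequence
 * counter of the vnode it departed from, so the final tvp/tvp_seqc pair
 * reflects a mount stack which was really in place at some point.
 */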

/*
 * Parse the path.
 *
 * The code was originally copy-pasted from regular lookup and despite
 * cleanups still leaves performance on the table. Any modifications here
 * must take into account that in case of fallback the resulting
 * nameidata state has to be compatible with the original.
 */
static int
cache_fplookup_parse(struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        char *cp;

        ndp = fpl->ndp;
        cnp = fpl->cnp;

        /*
         * Search a new directory.
         *
         * The last component of the filename is left accessible via
         * cnp->cn_nameptr for callers that need the name. Callers needing
         * the name set the SAVENAME flag. When done, they assume
         * responsibility for freeing the pathname buffer.
         */
        for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
                continue;
        cnp->cn_namelen = cp - cnp->cn_nameptr;
        if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
                cache_fpl_smr_exit(fpl);
                return (cache_fpl_handled(fpl, ENAMETOOLONG));
        }
        ndp->ni_pathlen -= cnp->cn_namelen;
        KASSERT(ndp->ni_pathlen <= PATH_MAX,
            ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
        ndp->ni_next = cp;

        /*
         * Replace multiple slashes by a single slash and trailing slashes
         * by a null. This must be done before VOP_LOOKUP() because some
         * fs's don't know about trailing slashes. Remember if there were
         * trailing slashes to handle symlinks, existing non-directories
         * and non-existing files that won't be directories specially later.
         */
        while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
                cp++;
                ndp->ni_pathlen--;
                if (*cp == '\0') {
                        /*
                         * TODO
                         * Regular lookup performs the following:
                         * *ndp->ni_next = '\0';
                         * cnp->cn_flags |= TRAILINGSLASH;
                         *
                         * Which is problematic since it modifies data read
                         * from userspace. Then if fast path lookup was to
                         * abort we would have to either restore it or convey
                         * the flag. Since this is a corner case just ignore
                         * it for simplicity.
                         */
                        return (cache_fpl_partial(fpl));
                }
        }
        ndp->ni_next = cp;

        /*
         * Check for degenerate name (e.g. / or "")
         * which is a way of talking about a directory,
         * e.g. like "/." or ".".
         *
         * TODO
         * Another corner case handled by the regular lookup
         */
        if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
                return (cache_fpl_partial(fpl));
        }
        return (0);
}

static void
cache_fplookup_parse_advance(struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;

        ndp = fpl->ndp;
        cnp = fpl->cnp;

        cnp->cn_nameptr = ndp->ni_next;
        while (*cnp->cn_nameptr == '/') {
                cnp->cn_nameptr++;
                ndp->ni_pathlen--;
        }
}
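
/*
 * Worked example of the two helpers above for a hypothetical lookup of
 * "etc/rc.conf" relative to some directory:
 *
 *      initial state:  cn_nameptr -> "etc/rc.conf"
 *      parse:          cn_namelen = 3, ni_next -> "/rc.conf"
 *      parse_advance:  cn_nameptr -> "rc.conf"
 *      parse:          cn_namelen = 7, ni_next -> "" (last component)
 *
 * Consecutive slashes are skipped by cache_fplookup_parse_advance, while a
 * trailing slash makes cache_fplookup_parse bail to the slow path (see the
 * TODO about TRAILINGSLASH above).
 */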

/*
 * See the API contract for VOP_FPLOOKUP_VEXEC.
 */
static int __noinline
cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
{
        struct vnode *dvp;
        seqc_t dvp_seqc;

        dvp = fpl->dvp;
        dvp_seqc = fpl->dvp_seqc;

        /*
         * Hack: they may be looking up foo/bar, where foo is a
         * regular file. In such a case we need to return ENOTDIR,
         * but we may happen to get here with a different error.
         */
        if (dvp->v_type != VDIR) {
                /*
                 * The check here is predominantly to catch
                 * EOPNOTSUPP from dead_vnodeops. If the vnode
                 * gets doomed past this point it is going to
                 * fail seqc verification.
                 */
                if (VN_IS_DOOMED(dvp)) {
                        return (cache_fpl_aborted(fpl));
                }
                error = ENOTDIR;
        }

        /*
         * Hack: handle O_SEARCH.
         *
         * Open Group Base Specifications Issue 7, 2018 edition states:
         * If the access mode of the open file description associated with the
         * file descriptor is not O_SEARCH, the function shall check whether
         * directory searches are permitted using the current permissions of
         * the directory underlying the file descriptor. If the access mode is
         * O_SEARCH, the function shall not perform the check.
         *
         * Regular lookup tests for the NOEXECCHECK flag for every path
         * component to decide whether to do the permission check. However,
         * since most lookups never have the flag (and when they do it is only
         * present for the first path component), lockless lookup only acts on
         * it if there is a permission problem. Here the flag is represented
         * with a boolean so that we don't have to clear it on the way out.
         *
         * For simplicity this always aborts.
         * TODO: check if this is the first lookup and ignore the permission
         * problem. Note the flag has to survive fallback (if it happens to be
         * performed).
         */
        if (fpl->fsearch) {
                return (cache_fpl_aborted(fpl));
        }

        switch (error) {
        case EAGAIN:
                if (!vn_seqc_consistent(dvp, dvp_seqc)) {
                        error = cache_fpl_aborted(fpl);
                } else {
                        cache_fpl_partial(fpl);
                }
                break;
        default:
                if (!vn_seqc_consistent(dvp, dvp_seqc)) {
                        error = cache_fpl_aborted(fpl);
                } else {
                        cache_fpl_smr_exit(fpl);
                        cache_fpl_handled(fpl, error);
                }
                break;
        }
        return (error);
}

static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        struct mount *mp;
        int error;

        error = CACHE_FPL_FAILED;
        ndp = fpl->ndp;
        cnp = fpl->cnp;

        cache_fpl_checkpoint(fpl, &fpl->snd);

        fpl->dvp = dvp;
        fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
        if (seqc_in_modify(fpl->dvp_seqc)) {
                cache_fpl_aborted(fpl);
                goto out;
        }
        mp = atomic_load_ptr(&fpl->dvp->v_mount);
        if (!cache_fplookup_mp_supported(mp)) {
                cache_fpl_aborted(fpl);
                goto out;
        }

        VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

        for (;;) {
                error = cache_fplookup_parse(fpl);
                if (__predict_false(error != 0)) {
                        break;
                }

                VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

                error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
                if (__predict_false(error != 0)) {
                        error = cache_fplookup_failed_vexec(fpl, error);
                        break;
                }

                if (__predict_false(cache_fpl_isdotdot(cnp))) {
                        error = cache_fplookup_dotdot(fpl);
                        if (__predict_false(error != 0)) {
                                break;
                        }
                } else {
                        error = cache_fplookup_next(fpl);
                        if (__predict_false(error != 0)) {
                                break;
                        }

                        VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

                        if (cache_fplookup_need_climb_mount(fpl)) {
                                error = cache_fplookup_climb_mount(fpl);
                                if (__predict_false(error != 0)) {
                                        break;
                                }
                        }
                }

                VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

                if (cache_fpl_islastcn(ndp)) {
                        error = cache_fplookup_final(fpl);
                        break;
                }

                if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
                        error = cache_fpl_aborted(fpl);
                        break;
                }

                fpl->dvp = fpl->tvp;
                fpl->dvp_seqc = fpl->tvp_seqc;

                cache_fplookup_parse_advance(fpl);
                cache_fpl_checkpoint(fpl, &fpl->snd);
        }
out:
        switch (fpl->status) {
        case CACHE_FPL_STATUS_UNSET:
                __assert_unreachable();
                break;
        case CACHE_FPL_STATUS_PARTIAL:
                cache_fpl_smr_assert_entered(fpl);
                return (cache_fplookup_partial_setup(fpl));
        case CACHE_FPL_STATUS_ABORTED:
                if (fpl->in_smr)
                        cache_fpl_smr_exit(fpl);
                return (CACHE_FPL_FAILED);
        case CACHE_FPL_STATUS_HANDLED:
                MPASS(error != CACHE_FPL_FAILED);
                cache_fpl_smr_assert_not_entered(fpl);
                if (__predict_false(error != 0)) {
                        ndp->ni_dvp = NULL;
                        ndp->ni_vp = NULL;
                        cache_fpl_cleanup_cnp(cnp);
                        return (error);
                }
                ndp->ni_dvp = fpl->dvp;
                ndp->ni_vp = fpl->tvp;
                if (cnp->cn_flags & SAVENAME)
                        cnp->cn_flags |= HASBUF;
                else
                        cache_fpl_cleanup_cnp(cnp);
                return (error);
        }
}

/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the
 * criteria outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 *      vn_lock(current);
 *      for (;;) {
 *              next = find();
 *              vn_lock(next);
 *              vn_unlock(current);
 *              current = next;
 *              if (last)
 *                      break;
 *      }
 *      return (current);
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
 * we made the jump. This includes things like permissions, mount points etc.
 * Counter modification is provided by enclosing relevant places in
 * vn_seqc_write_begin()/end() calls.
 *
 * Thus this translates to:
 *
 *      vfs_smr_enter();
 *      dvp_seqc = seqc_read_any(dvp);
 *      if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 *              abort();
 *      for (;;) {
 *              tvp = find();
 *              tvp_seqc = seqc_read_any(tvp);
 *              if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 *                      abort();
 *              if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 *                      abort();
 *              dvp = tvp; // we know nothing of importance has changed
 *              dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 *              if (last)
 *                      break;
 *      }
 *      vget(); // secure the vnode
 *      if (!seqc_consistent(tvp, tvp_seqc)) // final check
 *              abort();
 *      // at this point we know nothing has changed for any parent<->child pair
 *      // as they were crossed during the lookup, meaning we matched the guarantee
 *      // of the locked variant
 *      return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection which they must never exit
 * - EAGAIN can be returned to denote the check could not be performed; it is
 *   always valid to return it
 * - if the sequence counter has not changed the result must be valid
 * - if the sequence counter has changed both false positives and false negatives
 *   are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *   (see the sketch following this comment)
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning absent other means it
 *   should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 */
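/*
 * Illustrative sketch of a filesystem-side VOP_FPLOOKUP_VEXEC routine
 * honoring the contract above. The examplefs_* names are hypothetical;
 * real consumers follow the same shape:
 *
 *      static int
 *      examplefs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *      {
 *              struct vnode *vp = v->a_vp;
 *              struct examplefs_node *node;
 *
 *              // ->v_data may be ripped out from under us, load it once
 *              node = atomic_load_ptr(&vp->v_data);
 *              if (__predict_false(node == NULL))
 *                      return (EAGAIN); // punt to the locked lookup
 *
 *              // plain unix permissions, checked without taking any locks
 *              return (vaccess_vexec_smr(node->n_mode, node->n_uid,
 *                  node->n_gid, v->a_cred));
 *      }
 *
 * Returning EAGAIN is always safe; a stale success is also harmless because
 * the caller re-validates dvp against its sequence counter afterwards.
 */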
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
        struct cache_fpl fpl;
        struct pwd *pwd;
        struct vnode *dvp;
        struct componentname *cnp;
        struct nameidata_saved orig;
        int error;

        MPASS(ndp->ni_lcf == 0);

        fpl.status = CACHE_FPL_STATUS_UNSET;
        fpl.ndp = ndp;
        fpl.cnp = &ndp->ni_cnd;
        MPASS(curthread == fpl.cnp->cn_thread);

        if ((fpl.cnp->cn_flags & SAVESTART) != 0)
                MPASS(fpl.cnp->cn_nameiop != LOOKUP);

        if (!cache_can_fplookup(&fpl)) {
                SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
                *status = fpl.status;
                return (EOPNOTSUPP);
        }

        cache_fpl_checkpoint(&fpl, &orig);

        cache_fpl_smr_enter_initial(&fpl);
        fpl.fsearch = false;
        pwd = pwd_get_smr();
        fpl.pwd = pwd;
        ndp->ni_rootdir = pwd->pwd_rdir;
        ndp->ni_topdir = pwd->pwd_jdir;

        cnp = fpl.cnp;
        cnp->cn_nameptr = cnp->cn_pnbuf;
        if (cnp->cn_pnbuf[0] == '/') {
                cache_fpl_handle_root(ndp, &dvp);
                ndp->ni_resflags |= NIRES_ABS;
        } else {
                if (ndp->ni_dirfd == AT_FDCWD) {
                        dvp = pwd->pwd_cdir;
                } else {
                        error = cache_fplookup_dirfd(&fpl, &dvp);
                        if (__predict_false(error != 0)) {
                                goto out;
                        }
                }
        }

        SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);

        error = cache_fplookup_impl(dvp, &fpl);
out:
        cache_fpl_smr_assert_not_entered(&fpl);
        SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);

        *status = fpl.status;
        switch (fpl.status) {
        case CACHE_FPL_STATUS_UNSET:
                __assert_unreachable();
                break;
        case CACHE_FPL_STATUS_HANDLED:
                SDT_PROBE3(vfs, namei, lookup, return, error,
                    (error == 0 ? ndp->ni_vp : NULL), true);
                break;
        case CACHE_FPL_STATUS_PARTIAL:
                *pwdp = fpl.pwd;
                /*
                 * Status restored by cache_fplookup_partial_setup.
                 */
                break;
        case CACHE_FPL_STATUS_ABORTED:
                cache_fpl_restore_abort(&fpl, &orig);
                break;
        }
        return (error);
}
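
/*
 * Rough sketch of the expected caller-side handling (the real consumer is
 * namei(); only the dispatch on the returned status is shown):
 *
 *      error = cache_fplookup(ndp, &status, &pwd);
 *      switch (status) {
 *      case CACHE_FPL_STATUS_HANDLED:
 *              // the lookup was fully serviced, error is the final result
 *              break;
 *      case CACHE_FPL_STATUS_PARTIAL:
 *              // resume the regular lookup from ndp->ni_startdir, which was
 *              // referenced by cache_fplookup_partial_setup
 *              break;
 *      default:
 *              // aborted or unsupported, redo the lookup the locked way
 *              break;
 *      }
 */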