/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#ifdef INVARIANTS
#include <machine/_inttypes.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache");

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

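/*
 * As a usage note (assuming dtrace(1) and the SDT provider are available),
 * the probes defined above can be observed from userland with one-liners
 * along the lines of:
 *
 *	dtrace -n 'vfs:namecache:lookup:hit { @[stringof(arg1)] = count(); }'
 */
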
/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
	u_char neg_hit;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct	vnode *nu_vp;	/* vnode the name refers to */
		struct	negstate nu_neg;/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct	timespec nc_time;	/* timespec provided by fs */
	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	int	nc_pad;
	struct namecache nc_nc;
};

TAILQ_HEAD(cache_freebatch, namecache);

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned.  Since others
 * may be in the same spot, suffer a little bit and enforce the
 * alignment for everyone.  Note this is a nop for 64-bit platforms.
 */
#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)

/*
 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
 * 4.4 BSD codebase.  Later on struct namecache was tweaked to become
 * smaller and the value was bumped to retain the total size, but it
 * was never re-evaluated for suitability.  A simple test counting
 * lengths during package building shows that the value of 45 covers
 * about 86% of all added entries, reaching 99% at 65.
 *
 * Regardless of the above, use of dedicated zones instead of malloc may be
 * inducing additional waste.  This may be hard to address as said zones are
 * tied to VFS SMR.  Even if retaining them, the current split should be
 * re-evaluated.
 */
#ifdef __LP64__
#define	CACHE_PATH_CUTOFF	45
#define	CACHE_LARGE_PAD		6
#else
#define	CACHE_PATH_CUTOFF	41
#define	CACHE_LARGE_PAD		2
#endif

#define CACHE_ZONE_SMALL_SIZE		(offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
#define CACHE_ZONE_LARGE_SIZE		(offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
#define CACHE_ZONE_LARGE_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

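/*
 * Illustration of the above (an assumption about the common LP64 layout, not
 * something the code asserts): with 8-byte pointers and no internal padding,
 * offsetof(struct namecache, nc_name) works out to 58 (two doubly-linked list
 * entries at 16 bytes each, an slist entry, nc_dvp and the union at 8 bytes
 * each, plus two u_chars).  CACHE_ZONE_SMALL_SIZE is then 58 + 45 + 1 = 104
 * and CACHE_ZONE_LARGE_SIZE is 58 + 255 + 1 + 6 = 320, both multiples of the
 * 8-byte alignment of time_t, which is what CACHE_LARGE_PAD exists to ensure
 * and what the asserts above verify.
 */
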
#define	nc_vp	n_un.nu_vp
#define nc_neg	n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	0x01
#define NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define NCF_INVALID	0x40
#define NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define NEG_HOT		0x01

static bool	cache_neg_evict_cond(u_long lnumcache);

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
#define cache_ncp_canuse(ncp)	({					\
	struct namecache *_ncp = (ncp);					\
	u_char _nc_flag;						\
									\
	atomic_thread_fence_acq();					\
	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);	\
})

/*
 * Like the above but also checks NCF_WHITE.
 */
#define cache_fpl_neg_ncp_canuse(ncp)	({				\
	struct namecache *_ncp = (ncp);					\
	u_char _nc_flag;						\
									\
	atomic_thread_fence_acq();					\
	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0); \
})

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	mtx	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * It is legal to take multiple vnodelock and bucketlock locks.  The locking
 * order is lower address first.  Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be locked to provide safe operation against other threads modifying the
 * cache.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case, this could
 * deadlock.  As such, this is resolved by trylocking and if that fails unlocking
 * the first node, locking everything in order and revalidating the state.
 */

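/*
 * A minimal sketch of the deadlock avoidance scheme described above, with
 * hypothetical names (the real logic lives in cache_zap_locked_vnode_kl2()
 * and cache_zap_unlocked_bucket() further down); kept under #if 0 as it is
 * illustration only.
 */
#if 0
static bool
example_lock_second(struct mtx *held, struct mtx *wanted)
{

	if (held < wanted) {
		/* Taking a higher address while holding a lower one is fine. */
		mtx_lock(wanted);
		return (true);
	}
	if (mtx_trylock(wanted))
		return (true);
	/*
	 * Wrong order and the trylock failed: back off, take both locks in
	 * address order and have the caller revalidate whatever state was
	 * computed under the first lock.
	 */
	mtx_unlock(held);
	mtx_lock(wanted);
	mtx_lock(held);
	return (false);
}
#endif
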
VFS_SMR_DECLARE;

static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache parameters");

static u_int __read_mostly	ncsize;	/* the size as computed on creation or resizing */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
    "Total namecache capacity");

u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");

static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");

/*
 * Negative entry % of namecache capacity above which automatic eviction is allowed.
 *
 * Check cache_neg_evict_cond for details.
 */
static u_int ncnegminpct = 3;

static u_int __read_mostly	neg_min;	/* the above recomputed against ncsize */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
    "Negative entry count above which automatic eviction is allowed");

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");

static bool __read_mostly cache_rename_add = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_rename_add, CTLFLAG_RW,
    &cache_rename_add, 0, "");

static u_int __exclusive_cache_line neg_cycle;

#define	ncneghash	3
#define	numneglists	(ncneghash + 1)

struct neglist {
	struct mtx		nl_evict_lock;
	struct mtx		nl_lock __aligned(CACHE_LINE_SIZE);
	TAILQ_HEAD(, namecache) nl_list;
	TAILQ_HEAD(, namecache) nl_hotlist;
	u_long			nl_hotnum;
} __aligned(CACHE_LINE_SIZE);

static struct neglist neglists[numneglists];

static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly	ncbuckethash;
static struct mtx_padalign __read_mostly	*bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly	ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	*tsp = ncp_ts->nc_time;
	*ticksp = ncp_ts->nc_ticks;
}

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");

#define STATNODE_ULONG(name, varname, descr)					\
	SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define STATNODE_COUNTER(name, varname, descr)					\
	static COUNTER_U64_DEFINE_EARLY(varname);				\
	SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
	    descr);
STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
STATNODE_ULONG(count, numcache, "Number of cache entries");
STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, dotdothits, "Number of '..' hits");
STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(posszaps, numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(negzaps, numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
/* These count for vn_getcwd(), too. */
STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");

/*
 * Debug or developer statistics.
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache debugging");
#define DEBUGNODE_ULONG(name, varname, descr)					\
	SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define DEBUGNODE_COUNTER(name, varname, descr)					\
	static COUNTER_U64_DEFINE_EARLY(varname);				\
	SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
	    descr);
DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
    "Number of successful removals after relocking");
static long zap_bucket_fail;
DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
static long zap_bucket_fail2;
DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
static long cache_lock_vnodes_cel_3_failures;
DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");

static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
    char **freebuf, size_t *buflen);
static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, size_t addend);
static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen);
static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *len, size_t addend);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

static inline void
cache_assert_vlp_locked(struct mtx *vlp)
{

	if (vlp != NULL)
		mtx_assert(vlp, MA_OWNED);
}

static inline void
cache_assert_vnode_locked(struct vnode *vp)
{
	struct mtx *vlp;

	vlp = VP2VNODELOCK(vp);
	cache_assert_vlp_locked(vlp);
}

/*
 * Directory vnodes with entries are held for two reasons:
 * 1. make them less of a target for reclamation in vnlru
 * 2. suffer smaller performance penalty in locked lookup as requeueing is avoided
 *
 * It will be feasible to stop doing it altogether if all filesystems start
 * supporting lockless lookup.
 */
static void
cache_hold_vnode(struct vnode *vp)
{

	cache_assert_vnode_locked(vp);
	VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
	vhold(vp);
	counter_u64_add(numcachehv, 1);
}

static void
cache_drop_vnode(struct vnode *vp)
{

	/*
	 * Called after all locks are dropped, meaning we can't assert
	 * on the state of v_cache_src.
	 */
	vdrop(vp);
	counter_u64_add(numcachehv, -1);
}

/*
 * UMA zones.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

char *
cache_symlink_alloc(size_t size, int flags)
{

	if (size < CACHE_ZONE_SMALL_SIZE) {
		return (uma_zalloc_smr(cache_zone_small, flags));
	}
	if (size < CACHE_ZONE_LARGE_SIZE) {
		return (uma_zalloc_smr(cache_zone_large, flags));
	}
	counter_u64_add(symlinktoobig, 1);
	SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
	return (NULL);
}

void
cache_symlink_free(char *string, size_t size)
{

	MPASS(string != NULL);
	KASSERT(size < CACHE_ZONE_LARGE_SIZE,
	    ("%s: size %zu too big", __func__, size));

	if (size < CACHE_ZONE_SMALL_SIZE) {
		uma_zfree_smr(cache_zone_small, string);
		return;
	}
	if (size < CACHE_ZONE_LARGE_SIZE) {
		uma_zfree_smr(cache_zone_large, string);
		return;
	}
	__assert_unreachable();
}

static struct namecache *
cache_alloc_uma(int len, bool ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free_uma(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static struct namecache *
cache_alloc(int len, bool ts)
{
	u_long lnumcache;

	/*
	 * Avoid blowout in namecache entries.
	 *
	 * Bugs:
	 * 1. filesystems may end up trying to add an already existing entry
	 * (for example this can happen after a cache miss during concurrent
	 * lookup), in which case we will call cache_neg_evict despite not
	 * adding anything.
	 * 2. the routine may fail to free anything and no provisions are made
	 * to make it try harder (see the inside for failure modes)
	 * 3. it only ever looks at negative entries.
	 */
	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
	if (cache_neg_evict_cond(lnumcache)) {
		lnumcache = atomic_load_long(&numcache);
	}
	if (__predict_false(lnumcache >= ncsize)) {
		atomic_subtract_long(&numcache, 1);
		counter_u64_add(numdrops, 1);
		return (NULL);
	}
	return (cache_alloc_uma(len, ts));
}

static void
cache_free(struct namecache *ncp)
{

	MPASS(ncp != NULL);
	if ((ncp->nc_flag & NCF_DVDROP) != 0) {
		cache_drop_vnode(ncp->nc_dvp);
	}
	cache_free_uma(ncp);
	atomic_subtract_long(&numcache, 1);
}

static void
cache_free_batch(struct cache_freebatch *batch)
{
	struct namecache *ncp, *nnp;
	int i;

	i = 0;
	if (TAILQ_EMPTY(batch))
		goto out;
	TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
		if ((ncp->nc_flag & NCF_DVDROP) != 0) {
			cache_drop_vnode(ncp->nc_dvp);
		}
		cache_free_uma(ncp);
		i++;
	}
	atomic_subtract_long(&numcache, i);
out:
	SDT_PROBE1(vfs, namecache, purge, batch, i);
}

/*
 * TODO: With the value stored we can do better than computing the hash based
 * on the address.  The choice of FNV should also be revisited.
 */
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}

static uint32_t
cache_get_hash_iter_start(struct vnode *dvp)
{

	return (dvp->v_nchash);
}

static uint32_t
cache_get_hash_iter(char c, uint32_t hash)
{

	return (fnv_32_buf(&c, 1, hash));
}

static uint32_t
cache_get_hash_iter_finish(uint32_t hash)
{

	return (hash);
}

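/*
 * A sketch (illustration only, hence #if 0) of how the iterator variants are
 * meant to compose: feeding the name one byte at a time through
 * cache_get_hash_iter() yields the same value as a single cache_get_hash()
 * call, since fnv_32_buf() folds in bytes sequentially.  The function name is
 * made up for the example.
 */
#if 0
static uint32_t
cache_get_hash_iter_demo(char *name, u_char len, struct vnode *dvp)
{
	uint32_t hash;
	u_char i;

	hash = cache_get_hash_iter_start(dvp);
	for (i = 0; i < len; i++)
		hash = cache_get_hash_iter(name[i], hash);
	hash = cache_get_hash_iter_finish(hash);
	MPASS(hash == cache_get_hash(name, len, dvp));
	return (hash);
}
#endif
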
static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct mtx *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_OWNED);
}

static void
cache_assert_bucket_unlocked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_NOTOWNED);
}
#else
#define cache_assert_bucket_locked(x) do { } while (0)
#define cache_assert_bucket_unlocked(x) do { } while (0)
#endif

#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_lock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_unlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

static void
cache_recalc_neg_min(u_int val)
{

	neg_min = (ncsize * val) / 100;
}

static int
sysctl_negminpct(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int error;

	val = ncnegminpct;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (val == ncnegminpct)
		return (0);
	if (val > 99)
		return (EINVAL);
	ncnegminpct = val;
	cache_recalc_neg_min(val);
	return (0);
}

SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
    "I", "Negative entry % of namecache capacity above which automatic eviction is allowed");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * Various workloads create plenty of negative entries and barely use them
 * afterwards.  Moreover malicious users can keep performing bogus lookups
 * adding even more entries.  For example "make tinderbox" as of writing this
 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 * negative.
 *
 * As such, a rather aggressive eviction method is needed.  The currently
 * employed method is a placeholder.
 *
 * Entries are split over numneglists separate lists, each of which is further
 * split into hot and cold entries.  Entries get promoted after getting a hit.
 * Eviction happens on addition of new entry.
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache negative entry statistics");

SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
    "Number of negative cache entries");

static COUNTER_U64_DEFINE_EARLY(neg_created);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
    "Number of created negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evicted);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
    "Number of evicted negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
    &neg_evict_skipped_empty,
    "Number of times evicting failed due to lack of entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
    &neg_evict_skipped_missed,
    "Number of times evicting failed due to target entry disappearing");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
    &neg_evict_skipped_contended,
    "Number of times evicting failed due to contention");

SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
    "Number of cache hits (negative)");

static int
sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
{
	int i, out;

	out = 0;
	for (i = 0; i < numneglists; i++)
		out += neglists[i].nl_hotnum;

	return (SYSCTL_OUT(req, &out, sizeof(out)));
}
SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
    "Number of hot negative entries");

static void
cache_neg_init(struct namecache *ncp)
{
	struct negstate *ns;

	ncp->nc_flag |= NCF_NEGATIVE;
	ns = NCP2NEGSTATE(ncp);
	ns->neg_flag = 0;
	ns->neg_hit = 0;
	counter_u64_add(neg_created, 1);
}

#define CACHE_NEG_PROMOTION_THRESH 2

static bool
cache_neg_hit_prep(struct namecache *ncp)
{
	struct negstate *ns;
	u_char n;

	ns = NCP2NEGSTATE(ncp);
	n = atomic_load_char(&ns->neg_hit);
	for (;;) {
		if (n >= CACHE_NEG_PROMOTION_THRESH)
			return (false);
		if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
			break;
	}
	return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
}

/*
 * Nothing to do here but it is provided for completeness as some
 * cache_neg_hit_prep callers may end up returning without even
 * trying to promote.
 */
#define cache_neg_hit_abort(ncp)	do { } while (0)

static void
cache_neg_hit_finish(struct namecache *ncp)
{

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
	counter_u64_add(numneghits, 1);
}

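/*
 * Note on the protocol above (descriptive only): cache_neg_hit_prep() returns
 * true for exactly one hit, the one which bumps neg_hit up to
 * CACHE_NEG_PROMOTION_THRESH, so callers follow the pattern of "prep, then
 * either promote (cache_neg_promote()/cache_neg_promote_cond()) or
 * cache_neg_hit_finish()" seen in cache_lookup() and friends; at most one
 * thread ends up taking the neglist lock to move the entry to the hot list.
 */
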
/*
 * Move a negative entry to the hot list.
 */
static void
cache_neg_promote_locked(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	if ((ns->neg_flag & NEG_HOT) == 0) {
		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
		nl->nl_hotnum++;
		ns->neg_flag |= NEG_HOT;
	}
}

/*
 * Move a hot negative entry to the cold list.
 */
static void
cache_neg_demote_locked(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	MPASS(ns->neg_flag & NEG_HOT);
	TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	nl->nl_hotnum--;
	ns->neg_flag &= ~NEG_HOT;
	atomic_store_char(&ns->neg_hit, 0);
}

/*
 * Move a negative entry to the hot list if it matches the lookup.
 *
 * We have to take locks, but they may be contended and in the worst
 * case we may need to go off CPU.  We don't want to spin within the
 * smr section and we can't block with it.  Exiting the section means
 * the found entry could have been evicted.  We are going to look it
 * up again.
 */
static bool
cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
    struct namecache *oncp, uint32_t hash)
{
	struct namecache *ncp;
	struct neglist *nl;
	u_char nc_flag;

	nl = NCP2NEGLIST(oncp);

	mtx_lock(&nl->nl_lock);
	/*
	 * For hash iteration.
	 */
	vfs_smr_enter();

	/*
	 * Avoid all surprises by only succeeding if we got the same entry and
	 * bailing completely otherwise.
	 * XXX There are no provisions to keep the vnode around, meaning we may
	 * end up promoting a negative entry for a *new* vnode and returning
	 * ENOENT on its account.  This is the error we want to return anyway
	 * and promotion is harmless.
	 *
	 * In particular at this point there can be a new ncp which matches the
	 * search but hashes to a different neglist.
	 */
	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp == oncp)
			break;
	}

	/*
	 * No match to begin with.
	 */
	if (__predict_false(ncp == NULL)) {
		goto out_abort;
	}

	/*
	 * The newly found entry may be something different...
	 */
	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
		goto out_abort;
	}

	/*
	 * ... and not even negative.
	 */
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_NEGATIVE) == 0) {
		goto out_abort;
	}

	if (!cache_ncp_canuse(ncp)) {
		goto out_abort;
	}

	cache_neg_promote_locked(ncp);
	cache_neg_hit_finish(ncp);
	vfs_smr_exit();
	mtx_unlock(&nl->nl_lock);
	return (true);
out_abort:
	vfs_smr_exit();
	mtx_unlock(&nl->nl_lock);
	return (false);
}

static void
cache_neg_promote(struct namecache *ncp)
{
	struct neglist *nl;

	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	cache_neg_promote_locked(ncp);
	mtx_unlock(&nl->nl_lock);
}

static void
cache_neg_insert(struct namecache *ncp)
{
	struct neglist *nl;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	mtx_unlock(&nl->nl_lock);
	atomic_add_long(&numneg, 1);
}

static void
cache_neg_remove(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	ns = NCP2NEGSTATE(ncp);
	mtx_lock(&nl->nl_lock);
	if ((ns->neg_flag & NEG_HOT) != 0) {
		TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
		nl->nl_hotnum--;
	} else {
		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
	}
	mtx_unlock(&nl->nl_lock);
	atomic_subtract_long(&numneg, 1);
}

static struct neglist *
cache_neg_evict_select_list(void)
{
	struct neglist *nl;
	u_int c;

	c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
	nl = &neglists[c % numneglists];
	if (!mtx_trylock(&nl->nl_evict_lock)) {
		counter_u64_add(neg_evict_skipped_contended, 1);
		return (NULL);
	}
	return (nl);
}

static struct namecache *
cache_neg_evict_select_entry(struct neglist *nl)
{
	struct namecache *ncp, *lncp;
	struct negstate *ns, *lns;
	int i;

	mtx_assert(&nl->nl_evict_lock, MA_OWNED);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	ncp = TAILQ_FIRST(&nl->nl_list);
	if (ncp == NULL)
		return (NULL);
	lncp = ncp;
	lns = NCP2NEGSTATE(lncp);
	for (i = 1; i < 4; i++) {
		ncp = TAILQ_NEXT(ncp, nc_dst);
		if (ncp == NULL)
			break;
		ns = NCP2NEGSTATE(ncp);
		if (ns->neg_hit < lns->neg_hit) {
			lncp = ncp;
			lns = ns;
		}
	}
	return (lncp);
}

static bool
cache_neg_evict(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *nl;
	struct vnode *dvp;
	struct mtx *dvlp;
	struct mtx *blp;
	uint32_t hash;
	u_char nlen;
	bool evicted;

	nl = cache_neg_evict_select_list();
	if (nl == NULL) {
		return (false);
	}

	mtx_lock(&nl->nl_lock);
	ncp = TAILQ_FIRST(&nl->nl_hotlist);
	if (ncp != NULL) {
		cache_neg_demote_locked(ncp);
	}
	ncp = cache_neg_evict_select_entry(nl);
	if (ncp == NULL) {
		counter_u64_add(neg_evict_skipped_empty, 1);
		mtx_unlock(&nl->nl_lock);
		mtx_unlock(&nl->nl_evict_lock);
		return (false);
	}
	nlen = ncp->nc_nlen;
	dvp = ncp->nc_dvp;
	hash = cache_get_hash(ncp->nc_name, nlen, dvp);
	dvlp = VP2VNODELOCK(dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_unlock(&nl->nl_lock);
	mtx_unlock(&nl->nl_evict_lock);
	mtx_lock(dvlp);
	mtx_lock(blp);
	/*
	 * Note that since all locks were dropped above, the entry may be
	 * gone or reallocated to be something else.
	 */
	CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
		if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
		    ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
			break;
	}
	if (ncp2 == NULL) {
		counter_u64_add(neg_evict_skipped_missed, 1);
		ncp = NULL;
		evicted = false;
	} else {
		MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
		MPASS(blp == NCP2BUCKETLOCK(ncp));
		SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(neg_evicted, 1);
		evicted = true;
	}
	mtx_unlock(blp);
	mtx_unlock(dvlp);
	if (ncp != NULL)
		cache_free(ncp);
	return (evicted);
}

/*
 * Maybe evict a negative entry to create more room.
 *
 * The ncnegfactor parameter limits what fraction of the total count
 * can be made up of negative entries.  However, if the cache is just
 * warming up this leads to excessive evictions.  As such, ncnegminpct
 * (recomputed to neg_min) dictates whether the above should be
 * applied.
 *
 * Try evicting if the cache is close to full capacity regardless of
 * other considerations.
 */
static bool
cache_neg_evict_cond(u_long lnumcache)
{
	u_long lnumneg;

	if (ncsize - 1000 < lnumcache)
		goto out_evict;
	lnumneg = atomic_load_long(&numneg);
	if (lnumneg < neg_min)
		return (false);
	if (lnumneg * ncnegfactor < lnumcache)
		return (false);
out_evict:
	return (cache_neg_evict());
}

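/*
 * Worked example of the policy above (numbers are illustrative, assuming the
 * defaults ncnegfactor = 5 and ncnegminpct = 3 with ncsize = 100000, hence
 * neg_min = 3000): eviction is attempted once numcache exceeds 99000
 * regardless of anything else; below that it is attempted only when there
 * are at least 3000 negative entries and they make up at least 1/5 of all
 * entries, e.g. 10000 negative entries out of 50000 total qualifies, while
 * 2000 out of 10000 does not.
 */
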
/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;
	struct vnode *dvp, *vp;

	dvp = ncp->nc_dvp;
	vp = ncp->nc_vp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(vp);
	cache_assert_vnode_locked(dvp);
	cache_assert_bucket_locked(ncp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
		TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
		if (ncp == vp->v_cache_dd) {
			atomic_store_ptr(&vp->v_cache_dd, NULL);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
		cache_neg_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == dvp->v_cache_dd) {
			atomic_store_ptr(&dvp->v_cache_dd, NULL);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
		}
	}
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct mtx *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

/*
 * If trylocking failed we can get here.  We know enough to take all needed locks
 * in the right order and re-lookup the entry.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct mtx *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_unlocked(ncp);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	mtx_lock(blp);
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_bucket_relock_success, 1);
		return (0);
	}

	mtx_unlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct mtx *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	mtx_unlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct mtx *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			atomic_store_ptr(&dvp->v_cache_dd, NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
		return (1);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (ncp == NULL) {
		mtx_unlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_bucket_fail++;
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
	cache_free(ncp);
	return (1);
out_no_entry:
	counter_u64_add(nummisszap, 1);
	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
	return (0);
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static int __noinline
cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct mtx *dvlp;
	enum vgetstate vs;
	int error, ltype;
	bool whiteout;

	MPASS((cnp->cn_flags & ISDOTDOT) != 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	counter_u64_add(dotdothits, 1);
retry:
	dvlp = VP2VNODELOCK(dvp);
	mtx_lock(dvlp);
	ncp = dvp->v_cache_dd;
	if (ncp == NULL) {
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
		mtx_unlock(dvlp);
		return (0);
	}
	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
		if (ncp->nc_flag & NCF_NEGATIVE)
			*vpp = NULL;
		else
			*vpp = ncp->nc_vp;
	} else
		*vpp = ncp->nc_dvp;
	if (*vpp == NULL)
		goto negative_success;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
	    NCF_DTS && tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		*tsp = ncp_ts->nc_dotdottime;
	}

	MPASS(dvp != *vpp);
	ltype = VOP_ISLOCKED(dvp);
	VOP_UNLOCK(dvp);
	vs = vget_prep(*vpp);
	mtx_unlock(dvlp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	vn_lock(dvp, ltype | LK_RETRY);
	if (VN_IS_DOOMED(dvp)) {
		if (error == 0)
			vput(*vpp);
		*vpp = NULL;
		return (ENOENT);
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			cache_zap_negative_locked_vnode_kl(ncp, dvp);
			mtx_unlock(dvlp);
			cache_free(ncp);
			return (0);
		}
	}

	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_out_ts(ncp, tsp, ticksp);
	if (cache_neg_hit_prep(ncp))
		cache_neg_promote(ncp);
	else
		cache_neg_hit_finish(ncp);
	mtx_unlock(dvlp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it will
 *		be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * Either both tsp and ticks have to be provided or neither of them.
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  vpp will not be modified.  If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
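/*
 * Hypothetical caller sketch (illustration only, hence #if 0; not a function
 * from this file) showing how the return value convention documented above
 * is typically consumed.
 */
#if 0
static int
example_cache_lookup(struct vnode *dvp, struct componentname *cnp,
    struct vnode **vpp)
{
	int error;

	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
	if (error == -1) {
		/* Positive hit: *vpp is referenced and locked per cn_lkflags. */
		return (0);
	}
	if (error == ENOENT) {
		/* Negative hit (or doomed dvp): the name is known to be absent. */
		return (ENOENT);
	}
	MPASS(error == 0);
	/* Miss: the caller has to fall back to VOP_LOOKUP() on dvp. */
	return (EJUSTRETURN);
}
#endif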
static int __noinline
cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct mtx *blp;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout;

	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
	MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);

retry:
	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		mtx_unlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	vs = vget_prep(*vpp);
	mtx_unlock(blp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	/*
	 * We don't get here with regular lookup apart from corner cases.
1900 */ 1901 if (__predict_true(cnp->cn_nameiop == CREATE)) { 1902 if (cnp->cn_flags & ISLASTCN) { 1903 counter_u64_add(numnegzaps, 1); 1904 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1905 if (__predict_false(error != 0)) { 1906 zap_bucket_fail2++; 1907 goto retry; 1908 } 1909 cache_free(ncp); 1910 return (0); 1911 } 1912 } 1913 1914 whiteout = (ncp->nc_flag & NCF_WHITE); 1915 cache_out_ts(ncp, tsp, ticksp); 1916 if (cache_neg_hit_prep(ncp)) 1917 cache_neg_promote(ncp); 1918 else 1919 cache_neg_hit_finish(ncp); 1920 mtx_unlock(blp); 1921 if (whiteout) 1922 cnp->cn_flags |= ISWHITEOUT; 1923 return (ENOENT); 1924 } 1925 1926 int 1927 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1928 struct timespec *tsp, int *ticksp) 1929 { 1930 struct namecache *ncp; 1931 uint32_t hash; 1932 enum vgetstate vs; 1933 int error; 1934 bool whiteout, neg_promote; 1935 u_short nc_flag; 1936 1937 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); 1938 1939 #ifdef DEBUG_CACHE 1940 if (__predict_false(!doingcache)) { 1941 cnp->cn_flags &= ~MAKEENTRY; 1942 return (0); 1943 } 1944 #endif 1945 1946 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1947 if (cnp->cn_namelen == 1) 1948 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1949 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') 1950 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); 1951 } 1952 1953 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 1954 1955 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { 1956 cache_remove_cnp(dvp, cnp); 1957 return (0); 1958 } 1959 1960 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1961 vfs_smr_enter(); 1962 1963 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1964 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1965 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1966 break; 1967 } 1968 1969 if (__predict_false(ncp == NULL)) { 1970 vfs_smr_exit(); 1971 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1972 NULL); 1973 counter_u64_add(nummiss, 1); 1974 return (0); 1975 } 1976 1977 nc_flag = atomic_load_char(&ncp->nc_flag); 1978 if (nc_flag & NCF_NEGATIVE) 1979 goto negative_success; 1980 1981 counter_u64_add(numposhits, 1); 1982 *vpp = ncp->nc_vp; 1983 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1984 cache_out_ts(ncp, tsp, ticksp); 1985 MPASS(dvp != *vpp); 1986 if (!cache_ncp_canuse(ncp)) { 1987 vfs_smr_exit(); 1988 *vpp = NULL; 1989 goto out_fallback; 1990 } 1991 vs = vget_prep_smr(*vpp); 1992 vfs_smr_exit(); 1993 if (__predict_false(vs == VGET_NONE)) { 1994 *vpp = NULL; 1995 goto out_fallback; 1996 } 1997 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1998 if (error) { 1999 *vpp = NULL; 2000 goto out_fallback; 2001 } 2002 return (-1); 2003 negative_success: 2004 if (cnp->cn_nameiop == CREATE) { 2005 if (cnp->cn_flags & ISLASTCN) { 2006 vfs_smr_exit(); 2007 goto out_fallback; 2008 } 2009 } 2010 2011 cache_out_ts(ncp, tsp, ticksp); 2012 whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE); 2013 neg_promote = cache_neg_hit_prep(ncp); 2014 if (!cache_ncp_canuse(ncp)) { 2015 cache_neg_hit_abort(ncp); 2016 vfs_smr_exit(); 2017 goto out_fallback; 2018 } 2019 if (neg_promote) { 2020 vfs_smr_exit(); 2021 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) 2022 goto out_fallback; 2023 } else { 2024 cache_neg_hit_finish(ncp); 2025 vfs_smr_exit(); 2026 } 2027 if (whiteout) 2028 cnp->cn_flags |= ISWHITEOUT; 2029 return (ENOENT); 2030 out_fallback: 2031 return 
(cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp)); 2032 } 2033 2034 struct celockstate { 2035 struct mtx *vlp[3]; 2036 struct mtx *blp[2]; 2037 }; 2038 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 2039 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 2040 2041 static inline void 2042 cache_celockstate_init(struct celockstate *cel) 2043 { 2044 2045 bzero(cel, sizeof(*cel)); 2046 } 2047 2048 static void 2049 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 2050 struct vnode *dvp) 2051 { 2052 struct mtx *vlp1, *vlp2; 2053 2054 MPASS(cel->vlp[0] == NULL); 2055 MPASS(cel->vlp[1] == NULL); 2056 MPASS(cel->vlp[2] == NULL); 2057 2058 MPASS(vp != NULL || dvp != NULL); 2059 2060 vlp1 = VP2VNODELOCK(vp); 2061 vlp2 = VP2VNODELOCK(dvp); 2062 cache_sort_vnodes(&vlp1, &vlp2); 2063 2064 if (vlp1 != NULL) { 2065 mtx_lock(vlp1); 2066 cel->vlp[0] = vlp1; 2067 } 2068 mtx_lock(vlp2); 2069 cel->vlp[1] = vlp2; 2070 } 2071 2072 static void 2073 cache_unlock_vnodes_cel(struct celockstate *cel) 2074 { 2075 2076 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 2077 2078 if (cel->vlp[0] != NULL) 2079 mtx_unlock(cel->vlp[0]); 2080 if (cel->vlp[1] != NULL) 2081 mtx_unlock(cel->vlp[1]); 2082 if (cel->vlp[2] != NULL) 2083 mtx_unlock(cel->vlp[2]); 2084 } 2085 2086 static bool 2087 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 2088 { 2089 struct mtx *vlp; 2090 bool ret; 2091 2092 cache_assert_vlp_locked(cel->vlp[0]); 2093 cache_assert_vlp_locked(cel->vlp[1]); 2094 MPASS(cel->vlp[2] == NULL); 2095 2096 MPASS(vp != NULL); 2097 vlp = VP2VNODELOCK(vp); 2098 2099 ret = true; 2100 if (vlp >= cel->vlp[1]) { 2101 mtx_lock(vlp); 2102 } else { 2103 if (mtx_trylock(vlp)) 2104 goto out; 2105 cache_lock_vnodes_cel_3_failures++; 2106 cache_unlock_vnodes_cel(cel); 2107 if (vlp < cel->vlp[0]) { 2108 mtx_lock(vlp); 2109 mtx_lock(cel->vlp[0]); 2110 mtx_lock(cel->vlp[1]); 2111 } else { 2112 if (cel->vlp[0] != NULL) 2113 mtx_lock(cel->vlp[0]); 2114 mtx_lock(vlp); 2115 mtx_lock(cel->vlp[1]); 2116 } 2117 ret = false; 2118 } 2119 out: 2120 cel->vlp[2] = vlp; 2121 return (ret); 2122 } 2123 2124 static void 2125 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 2126 struct mtx *blp2) 2127 { 2128 2129 MPASS(cel->blp[0] == NULL); 2130 MPASS(cel->blp[1] == NULL); 2131 2132 cache_sort_vnodes(&blp1, &blp2); 2133 2134 if (blp1 != NULL) { 2135 mtx_lock(blp1); 2136 cel->blp[0] = blp1; 2137 } 2138 mtx_lock(blp2); 2139 cel->blp[1] = blp2; 2140 } 2141 2142 static void 2143 cache_unlock_buckets_cel(struct celockstate *cel) 2144 { 2145 2146 if (cel->blp[0] != NULL) 2147 mtx_unlock(cel->blp[0]); 2148 mtx_unlock(cel->blp[1]); 2149 } 2150 2151 /* 2152 * Lock part of the cache affected by the insertion. 2153 * 2154 * This means vnodelocks for dvp, vp and the relevant bucketlock. 2155 * However, insertion can result in removal of an old entry. In this 2156 * case we have an additional vnode and bucketlock pair to lock. 2157 * 2158 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 2159 * preserving the locking order (smaller address first). 
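 *
 * For illustration (hypothetical entry): adding "a" with vp being a
 * directory which already has a ".." entry means taking the vnode locks
 * of dvp, vp and the vnode the old ".." entry points to, plus the bucket
 * locks of the new entry's hash chain and of the chain holding the stale
 * ".." entry, all acquired in address order by the helpers above.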
2160 */ 2161 static void 2162 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2163 uint32_t hash) 2164 { 2165 struct namecache *ncp; 2166 struct mtx *blps[2]; 2167 u_char nc_flag; 2168 2169 blps[0] = HASH2BUCKETLOCK(hash); 2170 for (;;) { 2171 blps[1] = NULL; 2172 cache_lock_vnodes_cel(cel, dvp, vp); 2173 if (vp == NULL || vp->v_type != VDIR) 2174 break; 2175 ncp = atomic_load_consume_ptr(&vp->v_cache_dd); 2176 if (ncp == NULL) 2177 break; 2178 nc_flag = atomic_load_char(&ncp->nc_flag); 2179 if ((nc_flag & NCF_ISDOTDOT) == 0) 2180 break; 2181 MPASS(ncp->nc_dvp == vp); 2182 blps[1] = NCP2BUCKETLOCK(ncp); 2183 if ((nc_flag & NCF_NEGATIVE) != 0) 2184 break; 2185 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2186 break; 2187 /* 2188 * All vnodes got re-locked. Re-validate the state and if 2189 * nothing changed we are done. Otherwise restart. 2190 */ 2191 if (ncp == vp->v_cache_dd && 2192 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2193 blps[1] == NCP2BUCKETLOCK(ncp) && 2194 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2195 break; 2196 cache_unlock_vnodes_cel(cel); 2197 cel->vlp[0] = NULL; 2198 cel->vlp[1] = NULL; 2199 cel->vlp[2] = NULL; 2200 } 2201 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2202 } 2203 2204 static void 2205 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2206 uint32_t hash) 2207 { 2208 struct namecache *ncp; 2209 struct mtx *blps[2]; 2210 u_char nc_flag; 2211 2212 blps[0] = HASH2BUCKETLOCK(hash); 2213 for (;;) { 2214 blps[1] = NULL; 2215 cache_lock_vnodes_cel(cel, dvp, vp); 2216 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); 2217 if (ncp == NULL) 2218 break; 2219 nc_flag = atomic_load_char(&ncp->nc_flag); 2220 if ((nc_flag & NCF_ISDOTDOT) == 0) 2221 break; 2222 MPASS(ncp->nc_dvp == dvp); 2223 blps[1] = NCP2BUCKETLOCK(ncp); 2224 if ((nc_flag & NCF_NEGATIVE) != 0) 2225 break; 2226 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2227 break; 2228 if (ncp == dvp->v_cache_dd && 2229 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2230 blps[1] == NCP2BUCKETLOCK(ncp) && 2231 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2232 break; 2233 cache_unlock_vnodes_cel(cel); 2234 cel->vlp[0] = NULL; 2235 cel->vlp[1] = NULL; 2236 cel->vlp[2] = NULL; 2237 } 2238 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2239 } 2240 2241 static void 2242 cache_enter_unlock(struct celockstate *cel) 2243 { 2244 2245 cache_unlock_buckets_cel(cel); 2246 cache_unlock_vnodes_cel(cel); 2247 } 2248 2249 static void __noinline 2250 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 2251 struct componentname *cnp) 2252 { 2253 struct celockstate cel; 2254 struct namecache *ncp; 2255 uint32_t hash; 2256 int len; 2257 2258 if (atomic_load_ptr(&dvp->v_cache_dd) == NULL) 2259 return; 2260 len = cnp->cn_namelen; 2261 cache_celockstate_init(&cel); 2262 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2263 cache_enter_lock_dd(&cel, dvp, vp, hash); 2264 ncp = dvp->v_cache_dd; 2265 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 2266 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 2267 cache_zap_locked(ncp); 2268 } else { 2269 ncp = NULL; 2270 } 2271 atomic_store_ptr(&dvp->v_cache_dd, NULL); 2272 cache_enter_unlock(&cel); 2273 if (ncp != NULL) 2274 cache_free(ncp); 2275 } 2276 2277 /* 2278 * Add an entry to the cache. 
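 *
 * A NULL vp adds a negative entry (marked as a whiteout when ISWHITEOUT
 * is set in cnp).  Timestamps are recorded only when tsp is non-NULL;
 * dtsp additionally stores the timestamp handed out for subsequent ".."
 * lookups (see cache_lookup_dotdot()).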
2279 */ 2280 void 2281 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2282 struct timespec *tsp, struct timespec *dtsp) 2283 { 2284 struct celockstate cel; 2285 struct namecache *ncp, *n2, *ndd; 2286 struct namecache_ts *ncp_ts; 2287 struct nchashhead *ncpp; 2288 uint32_t hash; 2289 int flag; 2290 int len; 2291 2292 KASSERT(cnp->cn_namelen <= NAME_MAX, 2293 ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen, 2294 NAME_MAX)); 2295 VNPASS(dvp != vp, dvp); 2296 VNPASS(!VN_IS_DOOMED(dvp), dvp); 2297 VNPASS(dvp->v_type != VNON, dvp); 2298 if (vp != NULL) { 2299 VNPASS(!VN_IS_DOOMED(vp), vp); 2300 VNPASS(vp->v_type != VNON, vp); 2301 } 2302 2303 #ifdef DEBUG_CACHE 2304 if (__predict_false(!doingcache)) 2305 return; 2306 #endif 2307 2308 flag = 0; 2309 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2310 if (cnp->cn_namelen == 1) 2311 return; 2312 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 2313 cache_enter_dotdot_prep(dvp, vp, cnp); 2314 flag = NCF_ISDOTDOT; 2315 } 2316 } 2317 2318 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 2319 if (ncp == NULL) 2320 return; 2321 2322 cache_celockstate_init(&cel); 2323 ndd = NULL; 2324 ncp_ts = NULL; 2325 2326 /* 2327 * Calculate the hash key and setup as much of the new 2328 * namecache entry as possible before acquiring the lock. 2329 */ 2330 ncp->nc_flag = flag | NCF_WIP; 2331 ncp->nc_vp = vp; 2332 if (vp == NULL) 2333 cache_neg_init(ncp); 2334 ncp->nc_dvp = dvp; 2335 if (tsp != NULL) { 2336 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 2337 ncp_ts->nc_time = *tsp; 2338 ncp_ts->nc_ticks = ticks; 2339 ncp_ts->nc_nc.nc_flag |= NCF_TS; 2340 if (dtsp != NULL) { 2341 ncp_ts->nc_dotdottime = *dtsp; 2342 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 2343 } 2344 } 2345 len = ncp->nc_nlen = cnp->cn_namelen; 2346 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2347 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 2348 ncp->nc_name[len] = '\0'; 2349 cache_enter_lock(&cel, dvp, vp, hash); 2350 2351 /* 2352 * See if this vnode or negative entry is already in the cache 2353 * with this name. This can happen with concurrent lookups of 2354 * the same path name. 2355 */ 2356 ncpp = NCHHASH(hash); 2357 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 2358 if (n2->nc_dvp == dvp && 2359 n2->nc_nlen == cnp->cn_namelen && 2360 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 2361 MPASS(cache_ncp_canuse(n2)); 2362 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 2363 KASSERT(vp == NULL, 2364 ("%s: found entry pointing to a different vnode (%p != %p)", 2365 __func__, NULL, vp)); 2366 else 2367 KASSERT(n2->nc_vp == vp, 2368 ("%s: found entry pointing to a different vnode (%p != %p)", 2369 __func__, n2->nc_vp, vp)); 2370 /* 2371 * Entries are supposed to be immutable unless in the 2372 * process of getting destroyed. Accommodating for 2373 * changing timestamps is possible but not worth it. 2374 * This should be harmless in terms of correctness, in 2375 * the worst case resulting in an earlier expiration. 2376 * Alternatively, the found entry can be replaced 2377 * altogether. 
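 * (The #if 0 block below is the timestamp-update variant, kept for
 * reference.)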
2378 */ 2379 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 2380 #if 0 2381 if (tsp != NULL) { 2382 KASSERT((n2->nc_flag & NCF_TS) != 0, 2383 ("no NCF_TS")); 2384 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 2385 n2_ts->nc_time = ncp_ts->nc_time; 2386 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2387 if (dtsp != NULL) { 2388 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2389 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2390 } 2391 } 2392 #endif 2393 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, 2394 vp); 2395 goto out_unlock_free; 2396 } 2397 } 2398 2399 if (flag == NCF_ISDOTDOT) { 2400 /* 2401 * See if we are trying to add .. entry, but some other lookup 2402 * has populated v_cache_dd pointer already. 2403 */ 2404 if (dvp->v_cache_dd != NULL) 2405 goto out_unlock_free; 2406 KASSERT(vp == NULL || vp->v_type == VDIR, 2407 ("wrong vnode type %p", vp)); 2408 atomic_thread_fence_rel(); 2409 atomic_store_ptr(&dvp->v_cache_dd, ncp); 2410 } 2411 2412 if (vp != NULL) { 2413 if (flag != NCF_ISDOTDOT) { 2414 /* 2415 * For this case, the cache entry maps both the 2416 * directory name in it and the name ".." for the 2417 * directory's parent. 2418 */ 2419 if ((ndd = vp->v_cache_dd) != NULL) { 2420 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2421 cache_zap_locked(ndd); 2422 else 2423 ndd = NULL; 2424 } 2425 atomic_thread_fence_rel(); 2426 atomic_store_ptr(&vp->v_cache_dd, ncp); 2427 } else if (vp->v_type != VDIR) { 2428 if (vp->v_cache_dd != NULL) { 2429 atomic_store_ptr(&vp->v_cache_dd, NULL); 2430 } 2431 } 2432 } 2433 2434 if (flag != NCF_ISDOTDOT) { 2435 if (LIST_EMPTY(&dvp->v_cache_src)) { 2436 cache_hold_vnode(dvp); 2437 } 2438 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2439 } 2440 2441 /* 2442 * If the entry is "negative", we place it into the 2443 * "negative" cache queue, otherwise, we place it into the 2444 * destination vnode's cache entries queue. 2445 */ 2446 if (vp != NULL) { 2447 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2448 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2449 vp); 2450 } else { 2451 if (cnp->cn_flags & ISWHITEOUT) 2452 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE); 2453 cache_neg_insert(ncp); 2454 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2455 ncp->nc_name); 2456 } 2457 2458 /* 2459 * Insert the new namecache entry into the appropriate chain 2460 * within the cache entries table. 2461 */ 2462 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2463 2464 atomic_thread_fence_rel(); 2465 /* 2466 * Mark the entry as fully constructed. 2467 * It is immutable past this point until its removal. 
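 * Lockless readers (see cache_ncp_canuse()) test NCF_WIP to decide
 * whether an entry is usable; the release fence above publishes the
 * fully initialized fields before the flag is cleared.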
2468 */ 2469 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2470 2471 cache_enter_unlock(&cel); 2472 if (ndd != NULL) 2473 cache_free(ndd); 2474 return; 2475 out_unlock_free: 2476 cache_enter_unlock(&cel); 2477 cache_free(ncp); 2478 return; 2479 } 2480 2481 static u_int 2482 cache_roundup_2(u_int val) 2483 { 2484 u_int res; 2485 2486 for (res = 1; res <= val; res <<= 1) 2487 continue; 2488 2489 return (res); 2490 } 2491 2492 static struct nchashhead * 2493 nchinittbl(u_long elements, u_long *hashmask) 2494 { 2495 struct nchashhead *hashtbl; 2496 u_long hashsize, i; 2497 2498 hashsize = cache_roundup_2(elements) / 2; 2499 2500 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2501 for (i = 0; i < hashsize; i++) 2502 CK_SLIST_INIT(&hashtbl[i]); 2503 *hashmask = hashsize - 1; 2504 return (hashtbl); 2505 } 2506 2507 static void 2508 ncfreetbl(struct nchashhead *hashtbl) 2509 { 2510 2511 free(hashtbl, M_VFSCACHE); 2512 } 2513 2514 /* 2515 * Name cache initialization, from vfs_init() when we are booting 2516 */ 2517 static void 2518 nchinit(void *dummy __unused) 2519 { 2520 u_int i; 2521 2522 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2523 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2524 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2525 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2526 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2527 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2528 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2529 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2530 2531 VFS_SMR_ZONE_SET(cache_zone_small); 2532 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2533 VFS_SMR_ZONE_SET(cache_zone_large); 2534 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2535 2536 ncsize = desiredvnodes * ncsizefactor; 2537 cache_recalc_neg_min(ncnegminpct); 2538 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2539 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2540 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2541 ncbuckethash = 7; 2542 if (ncbuckethash > nchash) 2543 ncbuckethash = nchash; 2544 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2545 M_WAITOK | M_ZERO); 2546 for (i = 0; i < numbucketlocks; i++) 2547 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2548 ncvnodehash = ncbuckethash; 2549 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2550 M_WAITOK | M_ZERO); 2551 for (i = 0; i < numvnodelocks; i++) 2552 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2553 2554 for (i = 0; i < numneglists; i++) { 2555 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); 2556 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2557 TAILQ_INIT(&neglists[i].nl_list); 2558 TAILQ_INIT(&neglists[i].nl_hotlist); 2559 } 2560 } 2561 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2562 2563 void 2564 cache_vnode_init(struct vnode *vp) 2565 { 2566 2567 LIST_INIT(&vp->v_cache_src); 2568 TAILQ_INIT(&vp->v_cache_dst); 2569 vp->v_cache_dd = NULL; 2570 cache_prehash(vp); 2571 } 2572 2573 void 2574 cache_changesize(u_long newmaxvnodes) 2575 { 2576 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2577 u_long new_nchash, old_nchash; 2578 struct namecache *ncp; 2579 uint32_t hash; 2580 u_long newncsize; 2581 int i; 2582 2583 newncsize = newmaxvnodes * ncsizefactor; 2584 
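/*
 * Worked example of the sizing below (hypothetical numbers): with
 * newmaxvnodes == 1000, cache_roundup_2(2000) yields 2048 and
 * nchinittbl() halves its own rounding of that, giving a 2048-bucket
 * table.
 */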
newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2585 if (newmaxvnodes < numbucketlocks) 2586 newmaxvnodes = numbucketlocks; 2587 2588 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2589 /* If same hash table size, nothing to do */ 2590 if (nchash == new_nchash) { 2591 ncfreetbl(new_nchashtbl); 2592 return; 2593 } 2594 /* 2595 * Move everything from the old hash table to the new table. 2596 * None of the namecache entries in the table can be removed 2597 * because to do so, they have to be removed from the hash table. 2598 */ 2599 cache_lock_all_vnodes(); 2600 cache_lock_all_buckets(); 2601 old_nchashtbl = nchashtbl; 2602 old_nchash = nchash; 2603 nchashtbl = new_nchashtbl; 2604 nchash = new_nchash; 2605 for (i = 0; i <= old_nchash; i++) { 2606 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2607 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2608 ncp->nc_dvp); 2609 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2610 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2611 } 2612 } 2613 ncsize = newncsize; 2614 cache_recalc_neg_min(ncnegminpct); 2615 cache_unlock_all_buckets(); 2616 cache_unlock_all_vnodes(); 2617 ncfreetbl(old_nchashtbl); 2618 } 2619 2620 /* 2621 * Remove all entries from and to a particular vnode. 2622 */ 2623 static void 2624 cache_purge_impl(struct vnode *vp) 2625 { 2626 struct cache_freebatch batch; 2627 struct namecache *ncp; 2628 struct mtx *vlp, *vlp2; 2629 2630 TAILQ_INIT(&batch); 2631 vlp = VP2VNODELOCK(vp); 2632 vlp2 = NULL; 2633 mtx_lock(vlp); 2634 retry: 2635 while (!LIST_EMPTY(&vp->v_cache_src)) { 2636 ncp = LIST_FIRST(&vp->v_cache_src); 2637 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2638 goto retry; 2639 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2640 } 2641 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2642 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2643 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2644 goto retry; 2645 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2646 } 2647 ncp = vp->v_cache_dd; 2648 if (ncp != NULL) { 2649 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2650 ("lost dotdot link")); 2651 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2652 goto retry; 2653 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2654 } 2655 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2656 mtx_unlock(vlp); 2657 if (vlp2 != NULL) 2658 mtx_unlock(vlp2); 2659 cache_free_batch(&batch); 2660 } 2661 2662 /* 2663 * Opportunistic check to see if there is anything to do. 2664 */ 2665 static bool 2666 cache_has_entries(struct vnode *vp) 2667 { 2668 2669 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2670 atomic_load_ptr(&vp->v_cache_dd) == NULL) 2671 return (false); 2672 return (true); 2673 } 2674 2675 void 2676 cache_purge(struct vnode *vp) 2677 { 2678 2679 SDT_PROBE1(vfs, namecache, purge, done, vp); 2680 if (!cache_has_entries(vp)) 2681 return; 2682 cache_purge_impl(vp); 2683 } 2684 2685 /* 2686 * Only to be used by vgone. 2687 */ 2688 void 2689 cache_purge_vgone(struct vnode *vp) 2690 { 2691 struct mtx *vlp; 2692 2693 VNPASS(VN_IS_DOOMED(vp), vp); 2694 if (cache_has_entries(vp)) { 2695 cache_purge_impl(vp); 2696 return; 2697 } 2698 2699 /* 2700 * Serialize against a potential thread doing cache_purge. 2701 */ 2702 vlp = VP2VNODELOCK(vp); 2703 mtx_wait_unlocked(vlp); 2704 if (cache_has_entries(vp)) { 2705 cache_purge_impl(vp); 2706 return; 2707 } 2708 return; 2709 } 2710 2711 /* 2712 * Remove all negative entries for a particular directory vnode. 
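 *
 * Positive entries hanging off the directory are left alone; contrast
 * with cache_purge() above, which removes everything from and to the
 * vnode.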
2713 */ 2714 void 2715 cache_purge_negative(struct vnode *vp) 2716 { 2717 struct cache_freebatch batch; 2718 struct namecache *ncp, *nnp; 2719 struct mtx *vlp; 2720 2721 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2722 if (LIST_EMPTY(&vp->v_cache_src)) 2723 return; 2724 TAILQ_INIT(&batch); 2725 vlp = VP2VNODELOCK(vp); 2726 mtx_lock(vlp); 2727 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2728 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2729 continue; 2730 cache_zap_negative_locked_vnode_kl(ncp, vp); 2731 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2732 } 2733 mtx_unlock(vlp); 2734 cache_free_batch(&batch); 2735 } 2736 2737 /* 2738 * Entry points for modifying VOP operations. 2739 */ 2740 void 2741 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2742 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2743 { 2744 2745 ASSERT_VOP_IN_SEQC(fdvp); 2746 ASSERT_VOP_IN_SEQC(fvp); 2747 ASSERT_VOP_IN_SEQC(tdvp); 2748 if (tvp != NULL) 2749 ASSERT_VOP_IN_SEQC(tvp); 2750 2751 cache_purge(fvp); 2752 if (tvp != NULL) { 2753 cache_purge(tvp); 2754 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2755 ("%s: lingering negative entry", __func__)); 2756 } else { 2757 cache_remove_cnp(tdvp, tcnp); 2758 } 2759 2760 /* 2761 * TODO 2762 * 2763 * Historically renaming always purged all relevant entries, 2764 * but that's quite wasteful. In particular it turns out that in many cases 2765 * the target file is immediately accessed after rename, inducing a cache 2766 * miss. 2767 * 2768 * Recode this to reduce relocking and reuse the existing entry (if any) 2769 * instead of just removing it above and allocating a new one here. 2770 */ 2771 if (cache_rename_add) { 2772 cache_enter(tdvp, fvp, tcnp); 2773 } 2774 } 2775 2776 void 2777 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) 2778 { 2779 2780 ASSERT_VOP_IN_SEQC(dvp); 2781 ASSERT_VOP_IN_SEQC(vp); 2782 cache_purge(vp); 2783 } 2784 2785 #ifdef INVARIANTS 2786 /* 2787 * Validate that if an entry exists it matches. 2788 */ 2789 void 2790 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) 2791 { 2792 struct namecache *ncp; 2793 struct mtx *blp; 2794 uint32_t hash; 2795 2796 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2797 if (CK_SLIST_EMPTY(NCHHASH(hash))) 2798 return; 2799 blp = HASH2BUCKETLOCK(hash); 2800 mtx_lock(blp); 2801 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 2802 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 2803 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) { 2804 if (ncp->nc_vp != vp) 2805 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n", 2806 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp, 2807 ncp->nc_vp); 2808 } 2809 } 2810 mtx_unlock(blp); 2811 } 2812 #endif 2813 2814 /* 2815 * Flush all entries referencing a particular filesystem. 2816 */ 2817 void 2818 cache_purgevfs(struct mount *mp) 2819 { 2820 struct vnode *vp, *mvp; 2821 2822 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2823 /* 2824 * Somewhat wasteful iteration over all vnodes. Would be better to 2825 * support filtering and avoid the interlock to begin with. 2826 */ 2827 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2828 if (!cache_has_entries(vp)) { 2829 VI_UNLOCK(vp); 2830 continue; 2831 } 2832 vholdl(vp); 2833 VI_UNLOCK(vp); 2834 cache_purge(vp); 2835 vdrop(vp); 2836 } 2837 } 2838 2839 /* 2840 * Perform canonical checks and a cache lookup, and pass on to the filesystem 2841 * through vop_cachedlookup only if needed.
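 *
 * A filesystem typically wires this up in its vnode operations vector,
 * e.g. (hypothetical "foofs"):
 *
 *	.vop_lookup =		vfs_cache_lookup,
 *	.vop_cachedlookup =	foofs_lookup,
 *
 * so that only cache misses make it to foofs_lookup().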
2842 */ 2843 2844 int 2845 vfs_cache_lookup(struct vop_lookup_args *ap) 2846 { 2847 struct vnode *dvp; 2848 int error; 2849 struct vnode **vpp = ap->a_vpp; 2850 struct componentname *cnp = ap->a_cnp; 2851 int flags = cnp->cn_flags; 2852 2853 *vpp = NULL; 2854 dvp = ap->a_dvp; 2855 2856 if (dvp->v_type != VDIR) 2857 return (ENOTDIR); 2858 2859 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2860 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2861 return (EROFS); 2862 2863 error = vn_dir_check_exec(dvp, cnp); 2864 if (error != 0) 2865 return (error); 2866 2867 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2868 if (error == 0) 2869 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2870 if (error == -1) 2871 return (0); 2872 return (error); 2873 } 2874 2875 /* Implementation of the getcwd syscall. */ 2876 int 2877 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2878 { 2879 char *buf, *retbuf; 2880 size_t buflen; 2881 int error; 2882 2883 buflen = uap->buflen; 2884 if (__predict_false(buflen < 2)) 2885 return (EINVAL); 2886 if (buflen > MAXPATHLEN) 2887 buflen = MAXPATHLEN; 2888 2889 buf = uma_zalloc(namei_zone, M_WAITOK); 2890 error = vn_getcwd(buf, &retbuf, &buflen); 2891 if (error == 0) 2892 error = copyout(retbuf, uap->buf, buflen); 2893 uma_zfree(namei_zone, buf); 2894 return (error); 2895 } 2896 2897 int 2898 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2899 { 2900 struct pwd *pwd; 2901 int error; 2902 2903 vfs_smr_enter(); 2904 pwd = pwd_get_smr(); 2905 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2906 buflen, 0); 2907 VFS_SMR_ASSERT_NOT_ENTERED(); 2908 if (error < 0) { 2909 pwd = pwd_hold(curthread); 2910 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2911 retbuf, buflen); 2912 pwd_drop(pwd); 2913 } 2914 2915 #ifdef KTRACE 2916 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2917 ktrnamei(*retbuf); 2918 #endif 2919 return (error); 2920 } 2921 2922 static int 2923 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2924 size_t size, int flags, enum uio_seg pathseg) 2925 { 2926 struct nameidata nd; 2927 char *retbuf, *freebuf; 2928 int error; 2929 2930 if (flags != 0) 2931 return (EINVAL); 2932 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2933 pathseg, path, fd, &cap_fstat_rights, td); 2934 if ((error = namei(&nd)) != 0) 2935 return (error); 2936 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2937 if (error == 0) { 2938 error = copyout(retbuf, buf, size); 2939 free(freebuf, M_TEMP); 2940 } 2941 NDFREE(&nd, 0); 2942 return (error); 2943 } 2944 2945 int 2946 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2947 { 2948 2949 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2950 uap->flags, UIO_USERSPACE)); 2951 } 2952 2953 /* 2954 * Retrieve the full filesystem path that correspond to a vnode from the name 2955 * cache (if available) 2956 */ 2957 int 2958 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2959 { 2960 struct pwd *pwd; 2961 char *buf; 2962 size_t buflen; 2963 int error; 2964 2965 if (__predict_false(vp == NULL)) 2966 return (EINVAL); 2967 2968 buflen = MAXPATHLEN; 2969 buf = malloc(buflen, M_TEMP, M_WAITOK); 2970 vfs_smr_enter(); 2971 pwd = pwd_get_smr(); 2972 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); 2973 VFS_SMR_ASSERT_NOT_ENTERED(); 2974 if (error < 0) { 2975 pwd = pwd_hold(curthread); 2976 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, 
&buflen); 2977 pwd_drop(pwd); 2978 } 2979 if (error == 0) 2980 *freebuf = buf; 2981 else 2982 free(buf, M_TEMP); 2983 return (error); 2984 } 2985 2986 /* 2987 * This function is similar to vn_fullpath, but it attempts to lookup the 2988 * pathname relative to the global root mount point. This is required for the 2989 * auditing sub-system, as audited pathnames must be absolute, relative to the 2990 * global root mount point. 2991 */ 2992 int 2993 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2994 { 2995 char *buf; 2996 size_t buflen; 2997 int error; 2998 2999 if (__predict_false(vp == NULL)) 3000 return (EINVAL); 3001 buflen = MAXPATHLEN; 3002 buf = malloc(buflen, M_TEMP, M_WAITOK); 3003 vfs_smr_enter(); 3004 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); 3005 VFS_SMR_ASSERT_NOT_ENTERED(); 3006 if (error < 0) { 3007 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 3008 } 3009 if (error == 0) 3010 *freebuf = buf; 3011 else 3012 free(buf, M_TEMP); 3013 return (error); 3014 } 3015 3016 static struct namecache * 3017 vn_dd_from_dst(struct vnode *vp) 3018 { 3019 struct namecache *ncp; 3020 3021 cache_assert_vnode_locked(vp); 3022 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 3023 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3024 return (ncp); 3025 } 3026 return (NULL); 3027 } 3028 3029 int 3030 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) 3031 { 3032 struct vnode *dvp; 3033 struct namecache *ncp; 3034 struct mtx *vlp; 3035 int error; 3036 3037 vlp = VP2VNODELOCK(*vp); 3038 mtx_lock(vlp); 3039 ncp = (*vp)->v_cache_dd; 3040 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 3041 KASSERT(ncp == vn_dd_from_dst(*vp), 3042 ("%s: mismatch for dd entry (%p != %p)", __func__, 3043 ncp, vn_dd_from_dst(*vp))); 3044 } else { 3045 ncp = vn_dd_from_dst(*vp); 3046 } 3047 if (ncp != NULL) { 3048 if (*buflen < ncp->nc_nlen) { 3049 mtx_unlock(vlp); 3050 vrele(*vp); 3051 counter_u64_add(numfullpathfail4, 1); 3052 error = ENOMEM; 3053 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3054 vp, NULL); 3055 return (error); 3056 } 3057 *buflen -= ncp->nc_nlen; 3058 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3059 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 3060 ncp->nc_name, vp); 3061 dvp = *vp; 3062 *vp = ncp->nc_dvp; 3063 vref(*vp); 3064 mtx_unlock(vlp); 3065 vrele(dvp); 3066 return (0); 3067 } 3068 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 3069 3070 mtx_unlock(vlp); 3071 vn_lock(*vp, LK_SHARED | LK_RETRY); 3072 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); 3073 vput(*vp); 3074 if (error) { 3075 counter_u64_add(numfullpathfail2, 1); 3076 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3077 return (error); 3078 } 3079 3080 *vp = dvp; 3081 if (VN_IS_DOOMED(dvp)) { 3082 /* forced unmount */ 3083 vrele(dvp); 3084 error = ENOENT; 3085 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3086 return (error); 3087 } 3088 /* 3089 * *vp has its use count incremented still. 3090 */ 3091 3092 return (0); 3093 } 3094 3095 /* 3096 * Resolve a directory to a pathname. 3097 * 3098 * The name of the directory can always be found in the namecache or fetched 3099 * from the filesystem. There is also guaranteed to be only one parent, meaning 3100 * we can just follow vnodes up until we find the root. 3101 * 3102 * The vnode must be referenced. 
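 * The reference is always consumed (vrele/vput on every path).  On
 * success the path is placed at the end of buf and returned via retbuf,
 * with *len updated to the produced length.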
3103 */ 3104 static int 3105 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3106 size_t *len, size_t addend) 3107 { 3108 #ifdef KDTRACE_HOOKS 3109 struct vnode *startvp = vp; 3110 #endif 3111 struct vnode *vp1; 3112 size_t buflen; 3113 int error; 3114 bool slash_prefixed; 3115 3116 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 3117 VNPASS(vp->v_usecount > 0, vp); 3118 3119 buflen = *len; 3120 3121 slash_prefixed = true; 3122 if (addend == 0) { 3123 MPASS(*len >= 2); 3124 buflen--; 3125 buf[buflen] = '\0'; 3126 slash_prefixed = false; 3127 } 3128 3129 error = 0; 3130 3131 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 3132 counter_u64_add(numfullpathcalls, 1); 3133 while (vp != rdir && vp != rootvnode) { 3134 /* 3135 * The vp vnode must be already fully constructed, 3136 * since it is either found in namecache or obtained 3137 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 3138 * without obtaining the vnode lock. 3139 */ 3140 if ((vp->v_vflag & VV_ROOT) != 0) { 3141 vn_lock(vp, LK_RETRY | LK_SHARED); 3142 3143 /* 3144 * With the vnode locked, check for races with 3145 * unmount, forced or not. Note that we 3146 * already verified that vp is not equal to 3147 * the root vnode, which means that 3148 * mnt_vnodecovered can be NULL only for the 3149 * case of unmount. 3150 */ 3151 if (VN_IS_DOOMED(vp) || 3152 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 3153 vp1->v_mountedhere != vp->v_mount) { 3154 vput(vp); 3155 error = ENOENT; 3156 SDT_PROBE3(vfs, namecache, fullpath, return, 3157 error, vp, NULL); 3158 break; 3159 } 3160 3161 vref(vp1); 3162 vput(vp); 3163 vp = vp1; 3164 continue; 3165 } 3166 if (vp->v_type != VDIR) { 3167 vrele(vp); 3168 counter_u64_add(numfullpathfail1, 1); 3169 error = ENOTDIR; 3170 SDT_PROBE3(vfs, namecache, fullpath, return, 3171 error, vp, NULL); 3172 break; 3173 } 3174 error = vn_vptocnp(&vp, buf, &buflen); 3175 if (error) 3176 break; 3177 if (buflen == 0) { 3178 vrele(vp); 3179 error = ENOMEM; 3180 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3181 startvp, NULL); 3182 break; 3183 } 3184 buf[--buflen] = '/'; 3185 slash_prefixed = true; 3186 } 3187 if (error) 3188 return (error); 3189 if (!slash_prefixed) { 3190 if (buflen == 0) { 3191 vrele(vp); 3192 counter_u64_add(numfullpathfail4, 1); 3193 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 3194 startvp, NULL); 3195 return (ENOMEM); 3196 } 3197 buf[--buflen] = '/'; 3198 } 3199 counter_u64_add(numfullpathfound, 1); 3200 vrele(vp); 3201 3202 *retbuf = buf + buflen; 3203 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 3204 *len -= buflen; 3205 *len += addend; 3206 return (0); 3207 } 3208 3209 /* 3210 * Resolve an arbitrary vnode to a pathname. 
3211 * 3212 * Note 2 caveats: 3213 * - hardlinks are not tracked, thus if the vnode is not a directory this can 3214 * resolve to a different path than the one used to find it 3215 * - namecache is not mandatory, meaning names are not guaranteed to be added 3216 * (in which case resolving fails) 3217 */ 3218 static void __inline 3219 cache_rev_failed_impl(int *reason, int line) 3220 { 3221 3222 *reason = line; 3223 } 3224 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 3225 3226 static int 3227 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 3228 char **retbuf, size_t *buflen, size_t addend) 3229 { 3230 #ifdef KDTRACE_HOOKS 3231 struct vnode *startvp = vp; 3232 #endif 3233 struct vnode *tvp; 3234 struct mount *mp; 3235 struct namecache *ncp; 3236 size_t orig_buflen; 3237 int reason; 3238 int error; 3239 #ifdef KDTRACE_HOOKS 3240 int i; 3241 #endif 3242 seqc_t vp_seqc, tvp_seqc; 3243 u_char nc_flag; 3244 3245 VFS_SMR_ASSERT_ENTERED(); 3246 3247 if (!cache_fast_revlookup) { 3248 vfs_smr_exit(); 3249 return (-1); 3250 } 3251 3252 orig_buflen = *buflen; 3253 3254 if (addend == 0) { 3255 MPASS(*buflen >= 2); 3256 *buflen -= 1; 3257 buf[*buflen] = '\0'; 3258 } 3259 3260 if (vp == rdir || vp == rootvnode) { 3261 if (addend == 0) { 3262 *buflen -= 1; 3263 buf[*buflen] = '/'; 3264 } 3265 goto out_ok; 3266 } 3267 3268 #ifdef KDTRACE_HOOKS 3269 i = 0; 3270 #endif 3271 error = -1; 3272 ncp = NULL; /* for sdt probe down below */ 3273 vp_seqc = vn_seqc_read_any(vp); 3274 if (seqc_in_modify(vp_seqc)) { 3275 cache_rev_failed(&reason); 3276 goto out_abort; 3277 } 3278 3279 for (;;) { 3280 #ifdef KDTRACE_HOOKS 3281 i++; 3282 #endif 3283 if ((vp->v_vflag & VV_ROOT) != 0) { 3284 mp = atomic_load_ptr(&vp->v_mount); 3285 if (mp == NULL) { 3286 cache_rev_failed(&reason); 3287 goto out_abort; 3288 } 3289 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 3290 tvp_seqc = vn_seqc_read_any(tvp); 3291 if (seqc_in_modify(tvp_seqc)) { 3292 cache_rev_failed(&reason); 3293 goto out_abort; 3294 } 3295 if (!vn_seqc_consistent(vp, vp_seqc)) { 3296 cache_rev_failed(&reason); 3297 goto out_abort; 3298 } 3299 vp = tvp; 3300 vp_seqc = tvp_seqc; 3301 continue; 3302 } 3303 ncp = atomic_load_consume_ptr(&vp->v_cache_dd); 3304 if (ncp == NULL) { 3305 cache_rev_failed(&reason); 3306 goto out_abort; 3307 } 3308 nc_flag = atomic_load_char(&ncp->nc_flag); 3309 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3310 cache_rev_failed(&reason); 3311 goto out_abort; 3312 } 3313 if (ncp->nc_nlen >= *buflen) { 3314 cache_rev_failed(&reason); 3315 error = ENOMEM; 3316 goto out_abort; 3317 } 3318 *buflen -= ncp->nc_nlen; 3319 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3320 *buflen -= 1; 3321 buf[*buflen] = '/'; 3322 tvp = ncp->nc_dvp; 3323 tvp_seqc = vn_seqc_read_any(tvp); 3324 if (seqc_in_modify(tvp_seqc)) { 3325 cache_rev_failed(&reason); 3326 goto out_abort; 3327 } 3328 if (!vn_seqc_consistent(vp, vp_seqc)) { 3329 cache_rev_failed(&reason); 3330 goto out_abort; 3331 } 3332 /* 3333 * Acquire fence provided by vn_seqc_read_any above. 
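 * Re-checking v_cache_dd and cache_ncp_canuse() below, after the name
 * has already been copied out, confirms the entry stayed valid for the
 * whole window; otherwise the partially filled buffer is discarded via
 * out_abort.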
3334 */ 3335 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) { 3336 cache_rev_failed(&reason); 3337 goto out_abort; 3338 } 3339 if (!cache_ncp_canuse(ncp)) { 3340 cache_rev_failed(&reason); 3341 goto out_abort; 3342 } 3343 vp = tvp; 3344 vp_seqc = tvp_seqc; 3345 if (vp == rdir || vp == rootvnode) 3346 break; 3347 } 3348 out_ok: 3349 vfs_smr_exit(); 3350 *retbuf = buf + *buflen; 3351 *buflen = orig_buflen - *buflen + addend; 3352 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 3353 return (0); 3354 3355 out_abort: 3356 *buflen = orig_buflen; 3357 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 3358 vfs_smr_exit(); 3359 return (error); 3360 } 3361 3362 static int 3363 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3364 size_t *buflen) 3365 { 3366 size_t orig_buflen, addend; 3367 int error; 3368 3369 if (*buflen < 2) 3370 return (EINVAL); 3371 3372 orig_buflen = *buflen; 3373 3374 vref(vp); 3375 addend = 0; 3376 if (vp->v_type != VDIR) { 3377 *buflen -= 1; 3378 buf[*buflen] = '\0'; 3379 error = vn_vptocnp(&vp, buf, buflen); 3380 if (error) 3381 return (error); 3382 if (*buflen == 0) { 3383 vrele(vp); 3384 return (ENOMEM); 3385 } 3386 *buflen -= 1; 3387 buf[*buflen] = '/'; 3388 addend = orig_buflen - *buflen; 3389 } 3390 3391 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); 3392 } 3393 3394 /* 3395 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 3396 * 3397 * Since the namecache does not track hardlinks, the caller is expected to first 3398 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 3399 * 3400 * Then we have 2 cases: 3401 * - if the found vnode is a directory, the path can be constructed just by 3402 * following names up the chain 3403 * - otherwise we populate the buffer with the saved name and start resolving 3404 * from the parent 3405 */ 3406 static int 3407 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 3408 size_t *buflen) 3409 { 3410 char *buf, *tmpbuf; 3411 struct pwd *pwd; 3412 struct componentname *cnp; 3413 struct vnode *vp; 3414 size_t addend; 3415 int error; 3416 enum vtype type; 3417 3418 if (*buflen < 2) 3419 return (EINVAL); 3420 if (*buflen > MAXPATHLEN) 3421 *buflen = MAXPATHLEN; 3422 3423 buf = malloc(*buflen, M_TEMP, M_WAITOK); 3424 3425 addend = 0; 3426 vp = ndp->ni_vp; 3427 /* 3428 * Check for VBAD to work around the vp_crossmp bug in lookup(). 3429 * 3430 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 3431 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 3432 * If the type is VDIR (like in this very case) we can skip looking 3433 * at ni_dvp in the first place. However, since vnodes get passed here 3434 * unlocked the target may transition to doomed state (type == VBAD) 3435 * before we get to evaluate the condition. If this happens, we will 3436 * populate part of the buffer and descend to vn_fullpath_dir with 3437 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 3438 * 3439 * This should be atomic_load(&vp->v_type) but it is illegal to take 3440 * an address of a bit field, even if said field is sized to char. 3441 * Work around the problem by reading the value into a full-sized enum 3442 * and then re-reading it with atomic_load which will still prevent 3443 * the compiler from re-reading down the road. 
3444 */ 3445 type = vp->v_type; 3446 type = atomic_load_int(&type); 3447 if (type == VBAD) { 3448 error = ENOENT; 3449 goto out_bad; 3450 } 3451 if (type != VDIR) { 3452 cnp = &ndp->ni_cnd; 3453 addend = cnp->cn_namelen + 2; 3454 if (*buflen < addend) { 3455 error = ENOMEM; 3456 goto out_bad; 3457 } 3458 *buflen -= addend; 3459 tmpbuf = buf + *buflen; 3460 tmpbuf[0] = '/'; 3461 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 3462 tmpbuf[addend - 1] = '\0'; 3463 vp = ndp->ni_dvp; 3464 } 3465 3466 vfs_smr_enter(); 3467 pwd = pwd_get_smr(); 3468 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3469 addend); 3470 VFS_SMR_ASSERT_NOT_ENTERED(); 3471 if (error < 0) { 3472 pwd = pwd_hold(curthread); 3473 vref(vp); 3474 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3475 addend); 3476 pwd_drop(pwd); 3477 if (error != 0) 3478 goto out_bad; 3479 } 3480 3481 *freebuf = buf; 3482 3483 return (0); 3484 out_bad: 3485 free(buf, M_TEMP); 3486 return (error); 3487 } 3488 3489 struct vnode * 3490 vn_dir_dd_ino(struct vnode *vp) 3491 { 3492 struct namecache *ncp; 3493 struct vnode *ddvp; 3494 struct mtx *vlp; 3495 enum vgetstate vs; 3496 3497 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3498 vlp = VP2VNODELOCK(vp); 3499 mtx_lock(vlp); 3500 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3501 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3502 continue; 3503 ddvp = ncp->nc_dvp; 3504 vs = vget_prep(ddvp); 3505 mtx_unlock(vlp); 3506 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3507 return (NULL); 3508 return (ddvp); 3509 } 3510 mtx_unlock(vlp); 3511 return (NULL); 3512 } 3513 3514 int 3515 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3516 { 3517 struct namecache *ncp; 3518 struct mtx *vlp; 3519 int l; 3520 3521 vlp = VP2VNODELOCK(vp); 3522 mtx_lock(vlp); 3523 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3524 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3525 break; 3526 if (ncp == NULL) { 3527 mtx_unlock(vlp); 3528 return (ENOENT); 3529 } 3530 l = min(ncp->nc_nlen, buflen - 1); 3531 memcpy(buf, ncp->nc_name, l); 3532 mtx_unlock(vlp); 3533 buf[l] = '\0'; 3534 return (0); 3535 } 3536 3537 /* 3538 * This function updates path string to vnode's full global path 3539 * and checks the size of the new path string against the pathlen argument. 3540 * 3541 * Requires a locked, referenced vnode. 3542 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3543 * 3544 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3545 * because it falls back to the ".." lookup if the namecache lookup fails. 3546 */ 3547 int 3548 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3549 u_int pathlen) 3550 { 3551 struct nameidata nd; 3552 struct vnode *vp1; 3553 char *rpath, *fbuf; 3554 int error; 3555 3556 ASSERT_VOP_ELOCKED(vp, __func__); 3557 3558 /* Construct global filesystem path from vp. */ 3559 VOP_UNLOCK(vp); 3560 error = vn_fullpath_global(vp, &rpath, &fbuf); 3561 3562 if (error != 0) { 3563 vrele(vp); 3564 return (error); 3565 } 3566 3567 if (strlen(rpath) >= pathlen) { 3568 vrele(vp); 3569 error = ENAMETOOLONG; 3570 goto out; 3571 } 3572 3573 /* 3574 * Re-lookup the vnode by path to detect a possible rename. 3575 * As a side effect, the vnode is relocked. 3576 * If vnode was renamed, return ENOENT. 
3577 */ 3578 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3579 UIO_SYSSPACE, path, td); 3580 error = namei(&nd); 3581 if (error != 0) { 3582 vrele(vp); 3583 goto out; 3584 } 3585 NDFREE(&nd, NDF_ONLY_PNBUF); 3586 vp1 = nd.ni_vp; 3587 vrele(vp); 3588 if (vp1 == vp) 3589 strcpy(path, rpath); 3590 else { 3591 vput(vp1); 3592 error = ENOENT; 3593 } 3594 3595 out: 3596 free(fbuf, M_TEMP); 3597 return (error); 3598 } 3599 3600 #ifdef DDB 3601 static void 3602 db_print_vpath(struct vnode *vp) 3603 { 3604 3605 while (vp != NULL) { 3606 db_printf("%p: ", vp); 3607 if (vp == rootvnode) { 3608 db_printf("/"); 3609 vp = NULL; 3610 } else { 3611 if (vp->v_vflag & VV_ROOT) { 3612 db_printf("<mount point>"); 3613 vp = vp->v_mount->mnt_vnodecovered; 3614 } else { 3615 struct namecache *ncp; 3616 char *ncn; 3617 int i; 3618 3619 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3620 if (ncp != NULL) { 3621 ncn = ncp->nc_name; 3622 for (i = 0; i < ncp->nc_nlen; i++) 3623 db_printf("%c", *ncn++); 3624 vp = ncp->nc_dvp; 3625 } else { 3626 vp = NULL; 3627 } 3628 } 3629 } 3630 db_printf("\n"); 3631 } 3632 3633 return; 3634 } 3635 3636 DB_SHOW_COMMAND(vpath, db_show_vpath) 3637 { 3638 struct vnode *vp; 3639 3640 if (!have_addr) { 3641 db_printf("usage: show vpath <struct vnode *>\n"); 3642 return; 3643 } 3644 3645 vp = (struct vnode *)addr; 3646 db_print_vpath(vp); 3647 } 3648 3649 #endif 3650 3651 static int cache_fast_lookup = 1; 3652 static char __read_frequently cache_fast_lookup_enabled = true; 3653 3654 #define CACHE_FPL_FAILED -2020 3655 3656 void 3657 cache_fast_lookup_enabled_recalc(void) 3658 { 3659 int lookup_flag; 3660 int mac_on; 3661 3662 #ifdef MAC 3663 mac_on = mac_vnode_check_lookup_enabled(); 3664 mac_on |= mac_vnode_check_readlink_enabled(); 3665 #else 3666 mac_on = 0; 3667 #endif 3668 3669 lookup_flag = atomic_load_int(&cache_fast_lookup); 3670 if (lookup_flag && !mac_on) { 3671 atomic_store_char(&cache_fast_lookup_enabled, true); 3672 } else { 3673 atomic_store_char(&cache_fast_lookup_enabled, false); 3674 } 3675 } 3676 3677 static int 3678 syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS) 3679 { 3680 int error, old; 3681 3682 old = atomic_load_int(&cache_fast_lookup); 3683 error = sysctl_handle_int(oidp, arg1, arg2, req); 3684 if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup)) 3685 cache_fast_lookup_enabled_recalc(); 3686 return (error); 3687 } 3688 SYSCTL_PROC(_vfs, OID_AUTO, cache_fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE, 3689 &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", ""); 3690 3691 /* 3692 * Components of nameidata (or objects it can point to) which may 3693 * need restoring in case fast path lookup fails. 
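 *
 * nameidata_outer holds the caller-visible bits (ni_pathlen, cn_flags)
 * put back by the restore routines below; nameidata_saved only carries
 * INVARIANTS-time cross-checks.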
3694 */ 3695 struct nameidata_outer { 3696 size_t ni_pathlen; 3697 int cn_flags; 3698 }; 3699 3700 struct nameidata_saved { 3701 #ifdef INVARIANTS 3702 char *cn_nameptr; 3703 size_t ni_pathlen; 3704 #endif 3705 }; 3706 3707 #ifdef INVARIANTS 3708 struct cache_fpl_debug { 3709 size_t ni_pathlen; 3710 }; 3711 #endif 3712 3713 struct cache_fpl { 3714 struct nameidata *ndp; 3715 struct componentname *cnp; 3716 char *nulchar; 3717 struct vnode *dvp; 3718 struct vnode *tvp; 3719 seqc_t dvp_seqc; 3720 seqc_t tvp_seqc; 3721 uint32_t hash; 3722 struct nameidata_saved snd; 3723 struct nameidata_outer snd_outer; 3724 int line; 3725 enum cache_fpl_status status:8; 3726 bool in_smr; 3727 bool fsearch; 3728 bool savename; 3729 struct pwd **pwd; 3730 #ifdef INVARIANTS 3731 struct cache_fpl_debug debug; 3732 #endif 3733 }; 3734 3735 static bool cache_fplookup_is_mp(struct cache_fpl *fpl); 3736 static int cache_fplookup_cross_mount(struct cache_fpl *fpl); 3737 static int cache_fplookup_partial_setup(struct cache_fpl *fpl); 3738 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl); 3739 static int cache_fplookup_trailingslash(struct cache_fpl *fpl); 3740 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl); 3741 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl); 3742 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n); 3743 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n); 3744 3745 static void 3746 cache_fpl_cleanup_cnp(struct componentname *cnp) 3747 { 3748 3749 uma_zfree(namei_zone, cnp->cn_pnbuf); 3750 #ifdef DIAGNOSTIC 3751 cnp->cn_pnbuf = NULL; 3752 cnp->cn_nameptr = NULL; 3753 #endif 3754 } 3755 3756 static struct vnode * 3757 cache_fpl_handle_root(struct cache_fpl *fpl) 3758 { 3759 struct nameidata *ndp; 3760 struct componentname *cnp; 3761 3762 ndp = fpl->ndp; 3763 cnp = fpl->cnp; 3764 3765 MPASS(*(cnp->cn_nameptr) == '/'); 3766 cnp->cn_nameptr++; 3767 cache_fpl_pathlen_dec(fpl); 3768 3769 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 3770 do { 3771 cnp->cn_nameptr++; 3772 cache_fpl_pathlen_dec(fpl); 3773 } while (*(cnp->cn_nameptr) == '/'); 3774 } 3775 3776 return (ndp->ni_rootdir); 3777 } 3778 3779 static void 3780 cache_fpl_checkpoint_outer(struct cache_fpl *fpl) 3781 { 3782 3783 fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen; 3784 fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags; 3785 } 3786 3787 static void 3788 cache_fpl_checkpoint(struct cache_fpl *fpl) 3789 { 3790 3791 #ifdef INVARIANTS 3792 fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3793 fpl->snd.ni_pathlen = fpl->debug.ni_pathlen; 3794 #endif 3795 } 3796 3797 static void 3798 cache_fpl_restore_partial(struct cache_fpl *fpl) 3799 { 3800 3801 fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags; 3802 #ifdef INVARIANTS 3803 fpl->debug.ni_pathlen = fpl->snd.ni_pathlen; 3804 #endif 3805 } 3806 3807 static void 3808 cache_fpl_restore_abort(struct cache_fpl *fpl) 3809 { 3810 3811 cache_fpl_restore_partial(fpl); 3812 /* 3813 * It is 0 on entry by API contract. 
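 * Clearing it below simply puts it back in its on-entry state, dropping
 * anything the fast path may have set.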
3814 */ 3815 fpl->ndp->ni_resflags = 0; 3816 fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf; 3817 fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen; 3818 } 3819 3820 #ifdef INVARIANTS 3821 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3822 struct cache_fpl *_fpl = (fpl); \ 3823 MPASS(_fpl->in_smr == true); \ 3824 VFS_SMR_ASSERT_ENTERED(); \ 3825 }) 3826 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3827 struct cache_fpl *_fpl = (fpl); \ 3828 MPASS(_fpl->in_smr == false); \ 3829 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3830 }) 3831 static void 3832 cache_fpl_assert_status(struct cache_fpl *fpl) 3833 { 3834 3835 switch (fpl->status) { 3836 case CACHE_FPL_STATUS_UNSET: 3837 __assert_unreachable(); 3838 break; 3839 case CACHE_FPL_STATUS_DESTROYED: 3840 case CACHE_FPL_STATUS_ABORTED: 3841 case CACHE_FPL_STATUS_PARTIAL: 3842 case CACHE_FPL_STATUS_HANDLED: 3843 break; 3844 } 3845 } 3846 #else 3847 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3848 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3849 #define cache_fpl_assert_status(fpl) do { } while (0) 3850 #endif 3851 3852 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3853 struct cache_fpl *_fpl = (fpl); \ 3854 vfs_smr_enter(); \ 3855 _fpl->in_smr = true; \ 3856 }) 3857 3858 #define cache_fpl_smr_enter(fpl) ({ \ 3859 struct cache_fpl *_fpl = (fpl); \ 3860 MPASS(_fpl->in_smr == false); \ 3861 vfs_smr_enter(); \ 3862 _fpl->in_smr = true; \ 3863 }) 3864 3865 #define cache_fpl_smr_exit(fpl) ({ \ 3866 struct cache_fpl *_fpl = (fpl); \ 3867 MPASS(_fpl->in_smr == true); \ 3868 vfs_smr_exit(); \ 3869 _fpl->in_smr = false; \ 3870 }) 3871 3872 static int 3873 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line) 3874 { 3875 3876 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3877 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3878 ("%s: converting to abort from %d at %d, set at %d\n", 3879 __func__, fpl->status, line, fpl->line)); 3880 } 3881 cache_fpl_smr_assert_not_entered(fpl); 3882 fpl->status = CACHE_FPL_STATUS_ABORTED; 3883 fpl->line = line; 3884 return (CACHE_FPL_FAILED); 3885 } 3886 3887 #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__) 3888 3889 static int __noinline 3890 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3891 { 3892 struct nameidata *ndp; 3893 struct componentname *cnp; 3894 3895 ndp = fpl->ndp; 3896 cnp = fpl->cnp; 3897 3898 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3899 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3900 ("%s: converting to abort from %d at %d, set at %d\n", 3901 __func__, fpl->status, line, fpl->line)); 3902 } 3903 fpl->status = CACHE_FPL_STATUS_ABORTED; 3904 fpl->line = line; 3905 if (fpl->in_smr) 3906 cache_fpl_smr_exit(fpl); 3907 cache_fpl_restore_abort(fpl); 3908 /* 3909 * Resolving symlinks overwrites data passed by the caller. 3910 * Let namei know. 
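 * ni_loopcnt > 0 (checked below) means at least one symlink was
 * traversed, so the original pathname buffer no longer holds what the
 * caller passed in; the lookup is reported as DESTROYED rather than
 * ABORTED and the buffer is freed here.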
3911 */ 3912 if (ndp->ni_loopcnt > 0) { 3913 fpl->status = CACHE_FPL_STATUS_DESTROYED; 3914 cache_fpl_cleanup_cnp(cnp); 3915 } 3916 return (CACHE_FPL_FAILED); 3917 } 3918 3919 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3920 3921 static int __noinline 3922 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3923 { 3924 3925 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3926 ("%s: setting to partial at %d, but already set to %d at %d\n", 3927 __func__, line, fpl->status, fpl->line)); 3928 cache_fpl_smr_assert_entered(fpl); 3929 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3930 fpl->line = line; 3931 return (cache_fplookup_partial_setup(fpl)); 3932 } 3933 3934 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3935 3936 static int 3937 cache_fpl_handled_impl(struct cache_fpl *fpl, int line) 3938 { 3939 3940 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3941 ("%s: setting to handled at %d, but already set to %d at %d\n", 3942 __func__, line, fpl->status, fpl->line)); 3943 cache_fpl_smr_assert_not_entered(fpl); 3944 fpl->status = CACHE_FPL_STATUS_HANDLED; 3945 fpl->line = line; 3946 return (0); 3947 } 3948 3949 #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__) 3950 3951 static int 3952 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line) 3953 { 3954 3955 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3956 ("%s: setting to handled at %d, but already set to %d at %d\n", 3957 __func__, line, fpl->status, fpl->line)); 3958 MPASS(error != 0); 3959 MPASS(error != CACHE_FPL_FAILED); 3960 cache_fpl_smr_assert_not_entered(fpl); 3961 fpl->status = CACHE_FPL_STATUS_HANDLED; 3962 fpl->line = line; 3963 fpl->dvp = NULL; 3964 fpl->tvp = NULL; 3965 fpl->savename = false; 3966 return (error); 3967 } 3968 3969 #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__) 3970 3971 static bool 3972 cache_fpl_terminated(struct cache_fpl *fpl) 3973 { 3974 3975 return (fpl->status != CACHE_FPL_STATUS_UNSET); 3976 } 3977 3978 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3979 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ 3980 FAILIFEXISTS | FOLLOW | LOCKSHARED | SAVENAME | SAVESTART | WILLBEDIR | \ 3981 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3982 3983 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3984 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3985 3986 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3987 "supported and internal flags overlap"); 3988 3989 static bool 3990 cache_fpl_islastcn(struct nameidata *ndp) 3991 { 3992 3993 return (*ndp->ni_next == 0); 3994 } 3995 3996 static bool 3997 cache_fpl_istrailingslash(struct cache_fpl *fpl) 3998 { 3999 4000 return (*(fpl->nulchar - 1) == '/'); 4001 } 4002 4003 static bool 4004 cache_fpl_isdotdot(struct componentname *cnp) 4005 { 4006 4007 if (cnp->cn_namelen == 2 && 4008 cnp->cn_nameptr[1] == '.' 
&& cnp->cn_nameptr[0] == '.') 4009 return (true); 4010 return (false); 4011 } 4012 4013 static bool 4014 cache_can_fplookup(struct cache_fpl *fpl) 4015 { 4016 struct nameidata *ndp; 4017 struct componentname *cnp; 4018 struct thread *td; 4019 4020 ndp = fpl->ndp; 4021 cnp = fpl->cnp; 4022 td = cnp->cn_thread; 4023 4024 if (!atomic_load_char(&cache_fast_lookup_enabled)) { 4025 cache_fpl_aborted_early(fpl); 4026 return (false); 4027 } 4028 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 4029 cache_fpl_aborted_early(fpl); 4030 return (false); 4031 } 4032 if (IN_CAPABILITY_MODE(td)) { 4033 cache_fpl_aborted_early(fpl); 4034 return (false); 4035 } 4036 if (AUDITING_TD(td)) { 4037 cache_fpl_aborted_early(fpl); 4038 return (false); 4039 } 4040 if (ndp->ni_startdir != NULL) { 4041 cache_fpl_aborted_early(fpl); 4042 return (false); 4043 } 4044 return (true); 4045 } 4046 4047 static int 4048 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 4049 { 4050 struct nameidata *ndp; 4051 int error; 4052 bool fsearch; 4053 4054 ndp = fpl->ndp; 4055 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); 4056 if (__predict_false(error != 0)) { 4057 return (cache_fpl_aborted(fpl)); 4058 } 4059 fpl->fsearch = fsearch; 4060 return (0); 4061 } 4062 4063 static int __noinline 4064 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 4065 uint32_t hash) 4066 { 4067 struct componentname *cnp; 4068 struct vnode *dvp; 4069 4070 cnp = fpl->cnp; 4071 dvp = fpl->dvp; 4072 4073 cache_fpl_smr_exit(fpl); 4074 if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) 4075 return (cache_fpl_handled_error(fpl, ENOENT)); 4076 else 4077 return (cache_fpl_aborted(fpl)); 4078 } 4079 4080 /* 4081 * The target vnode is not supported, prepare for the slow path to take over. 4082 */ 4083 static int __noinline 4084 cache_fplookup_partial_setup(struct cache_fpl *fpl) 4085 { 4086 struct nameidata *ndp; 4087 struct componentname *cnp; 4088 enum vgetstate dvs; 4089 struct vnode *dvp; 4090 struct pwd *pwd; 4091 seqc_t dvp_seqc; 4092 4093 ndp = fpl->ndp; 4094 cnp = fpl->cnp; 4095 pwd = *(fpl->pwd); 4096 dvp = fpl->dvp; 4097 dvp_seqc = fpl->dvp_seqc; 4098 4099 if (!pwd_hold_smr(pwd)) { 4100 return (cache_fpl_aborted(fpl)); 4101 } 4102 4103 /* 4104 * Note that seqc is checked before the vnode is locked, so by 4105 * the time regular lookup gets to it it may have moved. 4106 * 4107 * Ultimately this does not affect correctness, any lookup errors 4108 * are userspace racing with itself. It is guaranteed that any 4109 * path which ultimately gets found could also have been found 4110 * by regular lookup going all the way in absence of concurrent 4111 * modifications. 
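 *
 * In rough pseudo-code, the handoff performed below is:
 *
 *	dvs = vget_prep_smr(dvp);	// done while still within SMR
 *	cache_fpl_smr_exit(fpl);
 *	vget_finish_ref(dvp, dvs);	// secure a reference
 *	if (!vn_seqc_consistent(dvp, dvp_seqc))
 *		abort();		// dvp changed, fall back to the slow path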
4112 */ 4113 dvs = vget_prep_smr(dvp); 4114 cache_fpl_smr_exit(fpl); 4115 if (__predict_false(dvs == VGET_NONE)) { 4116 pwd_drop(pwd); 4117 return (cache_fpl_aborted(fpl)); 4118 } 4119 4120 vget_finish_ref(dvp, dvs); 4121 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4122 vrele(dvp); 4123 pwd_drop(pwd); 4124 return (cache_fpl_aborted(fpl)); 4125 } 4126 4127 cache_fpl_restore_partial(fpl); 4128 #ifdef INVARIANTS 4129 if (cnp->cn_nameptr != fpl->snd.cn_nameptr) { 4130 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__, 4131 cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf); 4132 } 4133 #endif 4134 4135 ndp->ni_startdir = dvp; 4136 cnp->cn_flags |= MAKEENTRY; 4137 if (cache_fpl_islastcn(ndp)) 4138 cnp->cn_flags |= ISLASTCN; 4139 if (cache_fpl_isdotdot(cnp)) 4140 cnp->cn_flags |= ISDOTDOT; 4141 4142 /* 4143 * Skip potential extra slashes parsing did not take care of. 4144 * cache_fplookup_skip_slashes explains the mechanism. 4145 */ 4146 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 4147 do { 4148 cnp->cn_nameptr++; 4149 cache_fpl_pathlen_dec(fpl); 4150 } while (*(cnp->cn_nameptr) == '/'); 4151 } 4152 4153 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; 4154 #ifdef INVARIANTS 4155 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 4156 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 4157 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 4158 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 4159 } 4160 #endif 4161 return (0); 4162 } 4163 4164 static int 4165 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 4166 { 4167 struct componentname *cnp; 4168 struct vnode *tvp; 4169 seqc_t tvp_seqc; 4170 int error, lkflags; 4171 4172 cnp = fpl->cnp; 4173 tvp = fpl->tvp; 4174 tvp_seqc = fpl->tvp_seqc; 4175 4176 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4177 lkflags = LK_SHARED; 4178 if ((cnp->cn_flags & LOCKSHARED) == 0) 4179 lkflags = LK_EXCLUSIVE; 4180 error = vget_finish(tvp, lkflags, tvs); 4181 if (__predict_false(error != 0)) { 4182 return (cache_fpl_aborted(fpl)); 4183 } 4184 } else { 4185 vget_finish_ref(tvp, tvs); 4186 } 4187 4188 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 4189 if ((cnp->cn_flags & LOCKLEAF) != 0) 4190 vput(tvp); 4191 else 4192 vrele(tvp); 4193 return (cache_fpl_aborted(fpl)); 4194 } 4195 4196 return (cache_fpl_handled(fpl)); 4197 } 4198 4199 /* 4200 * They want to possibly modify the state of the namecache. 4201 */ 4202 static int __noinline 4203 cache_fplookup_final_modifying(struct cache_fpl *fpl) 4204 { 4205 struct nameidata *ndp; 4206 struct componentname *cnp; 4207 enum vgetstate dvs; 4208 struct vnode *dvp, *tvp; 4209 struct mount *mp; 4210 seqc_t dvp_seqc; 4211 int error; 4212 bool docache; 4213 4214 ndp = fpl->ndp; 4215 cnp = fpl->cnp; 4216 dvp = fpl->dvp; 4217 dvp_seqc = fpl->dvp_seqc; 4218 4219 MPASS(*(cnp->cn_nameptr) != '/'); 4220 MPASS(cache_fpl_islastcn(ndp)); 4221 if ((cnp->cn_flags & LOCKPARENT) == 0) 4222 MPASS((cnp->cn_flags & WANTPARENT) != 0); 4223 MPASS((cnp->cn_flags & TRAILINGSLASH) == 0); 4224 MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE || 4225 cnp->cn_nameiop == RENAME); 4226 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 4227 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 4228 4229 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; 4230 if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) 4231 docache = false; 4232 4233 /* 4234 * Regular lookup nulifies the slash, which we don't do here. 
4235 * Don't take chances with filesystem routines seeing it for 4236 * the last entry. 4237 */ 4238 if (cache_fpl_istrailingslash(fpl)) { 4239 return (cache_fpl_partial(fpl)); 4240 } 4241 4242 mp = atomic_load_ptr(&dvp->v_mount); 4243 if (__predict_false(mp == NULL)) { 4244 return (cache_fpl_aborted(fpl)); 4245 } 4246 4247 if (__predict_false(mp->mnt_flag & MNT_RDONLY)) { 4248 cache_fpl_smr_exit(fpl); 4249 /* 4250 * Original code keeps not checking for CREATE which 4251 * might be a bug. For now let the old lookup decide. 4252 */ 4253 if (cnp->cn_nameiop == CREATE) { 4254 return (cache_fpl_aborted(fpl)); 4255 } 4256 return (cache_fpl_handled_error(fpl, EROFS)); 4257 } 4258 4259 if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) { 4260 cache_fpl_smr_exit(fpl); 4261 return (cache_fpl_handled_error(fpl, EEXIST)); 4262 } 4263 4264 /* 4265 * Secure access to dvp; check cache_fplookup_partial_setup for 4266 * reasoning. 4267 * 4268 * XXX At least UFS requires its lookup routine to be called for 4269 * the last path component, which leads to some level of complication 4270 * and inefficiency: 4271 * - the target routine always locks the target vnode, but our caller 4272 * may not need it locked 4273 * - some of the VOP machinery asserts that the parent is locked, which 4274 * once more may be not required 4275 * 4276 * TODO: add a flag for filesystems which don't need this. 4277 */ 4278 dvs = vget_prep_smr(dvp); 4279 cache_fpl_smr_exit(fpl); 4280 if (__predict_false(dvs == VGET_NONE)) { 4281 return (cache_fpl_aborted(fpl)); 4282 } 4283 4284 vget_finish_ref(dvp, dvs); 4285 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4286 vrele(dvp); 4287 return (cache_fpl_aborted(fpl)); 4288 } 4289 4290 error = vn_lock(dvp, LK_EXCLUSIVE); 4291 if (__predict_false(error != 0)) { 4292 vrele(dvp); 4293 return (cache_fpl_aborted(fpl)); 4294 } 4295 4296 tvp = NULL; 4297 cnp->cn_flags |= ISLASTCN; 4298 if (docache) 4299 cnp->cn_flags |= MAKEENTRY; 4300 if (cache_fpl_isdotdot(cnp)) 4301 cnp->cn_flags |= ISDOTDOT; 4302 cnp->cn_lkflags = LK_EXCLUSIVE; 4303 error = VOP_LOOKUP(dvp, &tvp, cnp); 4304 switch (error) { 4305 case EJUSTRETURN: 4306 case 0: 4307 break; 4308 case ENOTDIR: 4309 case ENOENT: 4310 vput(dvp); 4311 return (cache_fpl_handled_error(fpl, error)); 4312 default: 4313 vput(dvp); 4314 return (cache_fpl_aborted(fpl)); 4315 } 4316 4317 fpl->tvp = tvp; 4318 fpl->savename = (cnp->cn_flags & SAVENAME) != 0; 4319 4320 if (tvp == NULL) { 4321 if ((cnp->cn_flags & SAVESTART) != 0) { 4322 ndp->ni_startdir = dvp; 4323 vrefact(ndp->ni_startdir); 4324 cnp->cn_flags |= SAVENAME; 4325 fpl->savename = true; 4326 } 4327 MPASS(error == EJUSTRETURN); 4328 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4329 VOP_UNLOCK(dvp); 4330 } 4331 return (cache_fpl_handled(fpl)); 4332 } 4333 4334 /* 4335 * There are very hairy corner cases concerning various flag combinations 4336 * and locking state. In particular here we only hold one lock instead of 4337 * two. 4338 * 4339 * Skip the complexity as it is of no significance for normal workloads. 4340 */ 4341 if (__predict_false(tvp == dvp)) { 4342 vput(dvp); 4343 vrele(tvp); 4344 return (cache_fpl_aborted(fpl)); 4345 } 4346 4347 /* 4348 * If they want the symlink itself we are fine, but if they want to 4349 * follow it regular lookup has to be engaged. 
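 * At this point the SMR section was already left and both vnodes are
 * locked, so rather than restarting the walk the lookup simply aborts to
 * the slow path.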
4350 */ 4351 if (tvp->v_type == VLNK) { 4352 if ((cnp->cn_flags & FOLLOW) != 0) { 4353 vput(dvp); 4354 vput(tvp); 4355 return (cache_fpl_aborted(fpl)); 4356 } 4357 } 4358 4359 /* 4360 * Since we expect this to be the terminal vnode it should almost never 4361 * be a mount point. 4362 */ 4363 if (__predict_false(cache_fplookup_is_mp(fpl))) { 4364 vput(dvp); 4365 vput(tvp); 4366 return (cache_fpl_aborted(fpl)); 4367 } 4368 4369 if ((cnp->cn_flags & FAILIFEXISTS) != 0) { 4370 vput(dvp); 4371 vput(tvp); 4372 return (cache_fpl_handled_error(fpl, EEXIST)); 4373 } 4374 4375 if ((cnp->cn_flags & LOCKLEAF) == 0) { 4376 VOP_UNLOCK(tvp); 4377 } 4378 4379 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4380 VOP_UNLOCK(dvp); 4381 } 4382 4383 if ((cnp->cn_flags & SAVESTART) != 0) { 4384 ndp->ni_startdir = dvp; 4385 vrefact(ndp->ni_startdir); 4386 cnp->cn_flags |= SAVENAME; 4387 fpl->savename = true; 4388 } 4389 4390 return (cache_fpl_handled(fpl)); 4391 } 4392 4393 static int __noinline 4394 cache_fplookup_modifying(struct cache_fpl *fpl) 4395 { 4396 struct nameidata *ndp; 4397 4398 ndp = fpl->ndp; 4399 4400 if (!cache_fpl_islastcn(ndp)) { 4401 return (cache_fpl_partial(fpl)); 4402 } 4403 return (cache_fplookup_final_modifying(fpl)); 4404 } 4405 4406 static int __noinline 4407 cache_fplookup_final_withparent(struct cache_fpl *fpl) 4408 { 4409 struct componentname *cnp; 4410 enum vgetstate dvs, tvs; 4411 struct vnode *dvp, *tvp; 4412 seqc_t dvp_seqc; 4413 int error; 4414 4415 cnp = fpl->cnp; 4416 dvp = fpl->dvp; 4417 dvp_seqc = fpl->dvp_seqc; 4418 tvp = fpl->tvp; 4419 4420 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 4421 4422 /* 4423 * This is less efficient than it can be for simplicity. 4424 */ 4425 dvs = vget_prep_smr(dvp); 4426 if (__predict_false(dvs == VGET_NONE)) { 4427 return (cache_fpl_aborted(fpl)); 4428 } 4429 tvs = vget_prep_smr(tvp); 4430 if (__predict_false(tvs == VGET_NONE)) { 4431 cache_fpl_smr_exit(fpl); 4432 vget_abort(dvp, dvs); 4433 return (cache_fpl_aborted(fpl)); 4434 } 4435 4436 cache_fpl_smr_exit(fpl); 4437 4438 if ((cnp->cn_flags & LOCKPARENT) != 0) { 4439 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 4440 if (__predict_false(error != 0)) { 4441 vget_abort(tvp, tvs); 4442 return (cache_fpl_aborted(fpl)); 4443 } 4444 } else { 4445 vget_finish_ref(dvp, dvs); 4446 } 4447 4448 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4449 vget_abort(tvp, tvs); 4450 if ((cnp->cn_flags & LOCKPARENT) != 0) 4451 vput(dvp); 4452 else 4453 vrele(dvp); 4454 return (cache_fpl_aborted(fpl)); 4455 } 4456 4457 error = cache_fplookup_final_child(fpl, tvs); 4458 if (__predict_false(error != 0)) { 4459 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 4460 if ((cnp->cn_flags & LOCKPARENT) != 0) 4461 vput(dvp); 4462 else 4463 vrele(dvp); 4464 return (error); 4465 } 4466 4467 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 4468 return (0); 4469 } 4470 4471 static int 4472 cache_fplookup_final(struct cache_fpl *fpl) 4473 { 4474 struct componentname *cnp; 4475 enum vgetstate tvs; 4476 struct vnode *dvp, *tvp; 4477 seqc_t dvp_seqc; 4478 4479 cnp = fpl->cnp; 4480 dvp = fpl->dvp; 4481 dvp_seqc = fpl->dvp_seqc; 4482 tvp = fpl->tvp; 4483 4484 MPASS(*(cnp->cn_nameptr) != '/'); 4485 4486 if (cnp->cn_nameiop != LOOKUP) { 4487 return (cache_fplookup_final_modifying(fpl)); 4488 } 4489 4490 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 4491 return (cache_fplookup_final_withparent(fpl)); 4492 4493 tvs = vget_prep_smr(tvp); 4494 if (__predict_false(tvs == VGET_NONE)) { 4495 return (cache_fpl_partial(fpl)); 4496 } 4497 4498 
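	/*
	 * Validate dvp before leaving SMR: if the parent got modified after
	 * tvp was found, the observed dvp -> tvp transition can no longer be
	 * trusted and the lookup falls back to the locked variant.
	 */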
if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4499 cache_fpl_smr_exit(fpl); 4500 vget_abort(tvp, tvs); 4501 return (cache_fpl_aborted(fpl)); 4502 } 4503 4504 cache_fpl_smr_exit(fpl); 4505 return (cache_fplookup_final_child(fpl, tvs)); 4506 } 4507 4508 /* 4509 * Comment from locked lookup: 4510 * Check for degenerate name (e.g. / or "") which is a way of talking about a 4511 * directory, e.g. like "/." or ".". 4512 */ 4513 static int __noinline 4514 cache_fplookup_degenerate(struct cache_fpl *fpl) 4515 { 4516 struct componentname *cnp; 4517 struct vnode *dvp; 4518 enum vgetstate dvs; 4519 int error, lkflags; 4520 #ifdef INVARIANTS 4521 char *cp; 4522 #endif 4523 4524 fpl->tvp = fpl->dvp; 4525 fpl->tvp_seqc = fpl->dvp_seqc; 4526 4527 cnp = fpl->cnp; 4528 dvp = fpl->dvp; 4529 4530 #ifdef INVARIANTS 4531 for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) { 4532 KASSERT(*cp == '/', 4533 ("%s: encountered non-slash; string [%s]\n", __func__, 4534 cnp->cn_pnbuf)); 4535 } 4536 #endif 4537 4538 if (__predict_false(cnp->cn_nameiop != LOOKUP)) { 4539 cache_fpl_smr_exit(fpl); 4540 return (cache_fpl_handled_error(fpl, EISDIR)); 4541 } 4542 4543 MPASS((cnp->cn_flags & SAVESTART) == 0); 4544 4545 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) { 4546 return (cache_fplookup_final_withparent(fpl)); 4547 } 4548 4549 dvs = vget_prep_smr(dvp); 4550 cache_fpl_smr_exit(fpl); 4551 if (__predict_false(dvs == VGET_NONE)) { 4552 return (cache_fpl_aborted(fpl)); 4553 } 4554 4555 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4556 lkflags = LK_SHARED; 4557 if ((cnp->cn_flags & LOCKSHARED) == 0) 4558 lkflags = LK_EXCLUSIVE; 4559 error = vget_finish(dvp, lkflags, dvs); 4560 if (__predict_false(error != 0)) { 4561 return (cache_fpl_aborted(fpl)); 4562 } 4563 } else { 4564 vget_finish_ref(dvp, dvs); 4565 } 4566 return (cache_fpl_handled(fpl)); 4567 } 4568 4569 static int __noinline 4570 cache_fplookup_noentry(struct cache_fpl *fpl) 4571 { 4572 struct nameidata *ndp; 4573 struct componentname *cnp; 4574 enum vgetstate dvs; 4575 struct vnode *dvp, *tvp; 4576 seqc_t dvp_seqc; 4577 int error; 4578 bool docache; 4579 4580 ndp = fpl->ndp; 4581 cnp = fpl->cnp; 4582 dvp = fpl->dvp; 4583 dvp_seqc = fpl->dvp_seqc; 4584 4585 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 4586 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 4587 MPASS(!cache_fpl_isdotdot(cnp)); 4588 4589 /* 4590 * Hack: delayed name len checking. 4591 */ 4592 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 4593 cache_fpl_smr_exit(fpl); 4594 return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); 4595 } 4596 4597 if (cnp->cn_nameptr[0] == '/') { 4598 return (cache_fplookup_skip_slashes(fpl)); 4599 } 4600 4601 if (cnp->cn_nameptr[0] == '\0') { 4602 if (fpl->tvp == NULL) { 4603 return (cache_fplookup_degenerate(fpl)); 4604 } 4605 return (cache_fplookup_trailingslash(fpl)); 4606 } 4607 4608 if (cnp->cn_nameiop != LOOKUP) { 4609 fpl->tvp = NULL; 4610 return (cache_fplookup_modifying(fpl)); 4611 } 4612 4613 MPASS((cnp->cn_flags & SAVESTART) == 0); 4614 4615 /* 4616 * Only try to fill in the component if it is the last one, 4617 * otherwise not only there may be several to handle but the 4618 * walk may be complicated. 4619 */ 4620 if (!cache_fpl_islastcn(ndp)) { 4621 return (cache_fpl_partial(fpl)); 4622 } 4623 4624 /* 4625 * Regular lookup nulifies the slash, which we don't do here. 4626 * Don't take chances with filesystem routines seeing it for 4627 * the last entry. 
4628 */ 4629 if (cache_fpl_istrailingslash(fpl)) { 4630 return (cache_fpl_partial(fpl)); 4631 } 4632 4633 /* 4634 * Secure access to dvp; check cache_fplookup_partial_setup for 4635 * reasoning. 4636 */ 4637 dvs = vget_prep_smr(dvp); 4638 cache_fpl_smr_exit(fpl); 4639 if (__predict_false(dvs == VGET_NONE)) { 4640 return (cache_fpl_aborted(fpl)); 4641 } 4642 4643 vget_finish_ref(dvp, dvs); 4644 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4645 vrele(dvp); 4646 return (cache_fpl_aborted(fpl)); 4647 } 4648 4649 error = vn_lock(dvp, LK_SHARED); 4650 if (__predict_false(error != 0)) { 4651 vrele(dvp); 4652 return (cache_fpl_aborted(fpl)); 4653 } 4654 4655 tvp = NULL; 4656 /* 4657 * TODO: provide variants which don't require locking either vnode. 4658 */ 4659 cnp->cn_flags |= ISLASTCN; 4660 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; 4661 if (docache) 4662 cnp->cn_flags |= MAKEENTRY; 4663 cnp->cn_lkflags = LK_SHARED; 4664 if ((cnp->cn_flags & LOCKSHARED) == 0) { 4665 cnp->cn_lkflags = LK_EXCLUSIVE; 4666 } 4667 error = VOP_LOOKUP(dvp, &tvp, cnp); 4668 switch (error) { 4669 case EJUSTRETURN: 4670 case 0: 4671 break; 4672 case ENOTDIR: 4673 case ENOENT: 4674 vput(dvp); 4675 return (cache_fpl_handled_error(fpl, error)); 4676 default: 4677 vput(dvp); 4678 return (cache_fpl_aborted(fpl)); 4679 } 4680 4681 fpl->tvp = tvp; 4682 if (!fpl->savename) { 4683 MPASS((cnp->cn_flags & SAVENAME) == 0); 4684 } 4685 4686 if (tvp == NULL) { 4687 MPASS(error == EJUSTRETURN); 4688 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 4689 vput(dvp); 4690 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 4691 VOP_UNLOCK(dvp); 4692 } 4693 return (cache_fpl_handled(fpl)); 4694 } 4695 4696 if (tvp->v_type == VLNK) { 4697 if ((cnp->cn_flags & FOLLOW) != 0) { 4698 vput(dvp); 4699 vput(tvp); 4700 return (cache_fpl_aborted(fpl)); 4701 } 4702 } 4703 4704 if (__predict_false(cache_fplookup_is_mp(fpl))) { 4705 vput(dvp); 4706 vput(tvp); 4707 return (cache_fpl_aborted(fpl)); 4708 } 4709 4710 if ((cnp->cn_flags & LOCKLEAF) == 0) { 4711 VOP_UNLOCK(tvp); 4712 } 4713 4714 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 4715 vput(dvp); 4716 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 4717 VOP_UNLOCK(dvp); 4718 } 4719 return (cache_fpl_handled(fpl)); 4720 } 4721 4722 static int __noinline 4723 cache_fplookup_dot(struct cache_fpl *fpl) 4724 { 4725 int error; 4726 4727 MPASS(!seqc_in_modify(fpl->dvp_seqc)); 4728 /* 4729 * Just re-assign the value. seqc will be checked later for the first 4730 * non-dot path component in line and/or before deciding to return the 4731 * vnode. 
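 *
 * Note the vnode may have something mounted on it; if so the
 * cache_fplookup_is_mp() check below takes care of crossing into it.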
4732 */ 4733 fpl->tvp = fpl->dvp; 4734 fpl->tvp_seqc = fpl->dvp_seqc; 4735 4736 counter_u64_add(dothits, 1); 4737 SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp); 4738 4739 error = 0; 4740 if (cache_fplookup_is_mp(fpl)) { 4741 error = cache_fplookup_cross_mount(fpl); 4742 } 4743 return (error); 4744 } 4745 4746 static int __noinline 4747 cache_fplookup_dotdot(struct cache_fpl *fpl) 4748 { 4749 struct nameidata *ndp; 4750 struct componentname *cnp; 4751 struct namecache *ncp; 4752 struct vnode *dvp; 4753 struct prison *pr; 4754 u_char nc_flag; 4755 4756 ndp = fpl->ndp; 4757 cnp = fpl->cnp; 4758 dvp = fpl->dvp; 4759 4760 MPASS(cache_fpl_isdotdot(cnp)); 4761 4762 /* 4763 * XXX this is racy the same way regular lookup is 4764 */ 4765 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 4766 pr = pr->pr_parent) 4767 if (dvp == pr->pr_root) 4768 break; 4769 4770 if (dvp == ndp->ni_rootdir || 4771 dvp == ndp->ni_topdir || 4772 dvp == rootvnode || 4773 pr != NULL) { 4774 fpl->tvp = dvp; 4775 fpl->tvp_seqc = vn_seqc_read_any(dvp); 4776 if (seqc_in_modify(fpl->tvp_seqc)) { 4777 return (cache_fpl_aborted(fpl)); 4778 } 4779 return (0); 4780 } 4781 4782 if ((dvp->v_vflag & VV_ROOT) != 0) { 4783 /* 4784 * TODO 4785 * The opposite of climb mount is needed here. 4786 */ 4787 return (cache_fpl_partial(fpl)); 4788 } 4789 4790 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); 4791 if (ncp == NULL) { 4792 return (cache_fpl_aborted(fpl)); 4793 } 4794 4795 nc_flag = atomic_load_char(&ncp->nc_flag); 4796 if ((nc_flag & NCF_ISDOTDOT) != 0) { 4797 if ((nc_flag & NCF_NEGATIVE) != 0) 4798 return (cache_fpl_aborted(fpl)); 4799 fpl->tvp = ncp->nc_vp; 4800 } else { 4801 fpl->tvp = ncp->nc_dvp; 4802 } 4803 4804 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 4805 if (seqc_in_modify(fpl->tvp_seqc)) { 4806 return (cache_fpl_partial(fpl)); 4807 } 4808 4809 /* 4810 * Acquire fence provided by vn_seqc_read_any above. 4811 */ 4812 if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) { 4813 return (cache_fpl_aborted(fpl)); 4814 } 4815 4816 if (!cache_ncp_canuse(ncp)) { 4817 return (cache_fpl_aborted(fpl)); 4818 } 4819 4820 counter_u64_add(dotdothits, 1); 4821 return (0); 4822 } 4823 4824 static int __noinline 4825 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) 4826 { 4827 u_char nc_flag; 4828 bool neg_promote; 4829 4830 nc_flag = atomic_load_char(&ncp->nc_flag); 4831 MPASS((nc_flag & NCF_NEGATIVE) != 0); 4832 /* 4833 * If they want to create an entry we need to replace this one. 4834 */ 4835 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 4836 fpl->tvp = NULL; 4837 return (cache_fplookup_modifying(fpl)); 4838 } 4839 neg_promote = cache_neg_hit_prep(ncp); 4840 if (!cache_fpl_neg_ncp_canuse(ncp)) { 4841 cache_neg_hit_abort(ncp); 4842 return (cache_fpl_partial(fpl)); 4843 } 4844 if (neg_promote) { 4845 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 4846 } 4847 cache_neg_hit_finish(ncp); 4848 cache_fpl_smr_exit(fpl); 4849 return (cache_fpl_handled_error(fpl, ENOENT)); 4850 } 4851 4852 /* 4853 * Resolve a symlink. Called by filesystem-specific routines. 4854 * 4855 * Code flow is: 4856 * ... 
-> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve 4857 */ 4858 int 4859 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len) 4860 { 4861 struct nameidata *ndp; 4862 struct componentname *cnp; 4863 size_t adjust; 4864 4865 ndp = fpl->ndp; 4866 cnp = fpl->cnp; 4867 4868 if (__predict_false(len == 0)) { 4869 return (ENOENT); 4870 } 4871 4872 if (__predict_false(len > MAXPATHLEN - 2)) { 4873 if (cache_fpl_istrailingslash(fpl)) { 4874 return (EAGAIN); 4875 } 4876 } 4877 4878 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1; 4879 #ifdef INVARIANTS 4880 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 4881 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 4882 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 4883 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 4884 } 4885 #endif 4886 4887 if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) { 4888 return (ENAMETOOLONG); 4889 } 4890 4891 if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) { 4892 return (ELOOP); 4893 } 4894 4895 adjust = len; 4896 if (ndp->ni_pathlen > 1) { 4897 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen); 4898 } else { 4899 if (cache_fpl_istrailingslash(fpl)) { 4900 adjust = len + 1; 4901 cnp->cn_pnbuf[len] = '/'; 4902 cnp->cn_pnbuf[len + 1] = '\0'; 4903 } else { 4904 cnp->cn_pnbuf[len] = '\0'; 4905 } 4906 } 4907 bcopy(string, cnp->cn_pnbuf, len); 4908 4909 ndp->ni_pathlen += adjust; 4910 cache_fpl_pathlen_add(fpl, adjust); 4911 cnp->cn_nameptr = cnp->cn_pnbuf; 4912 fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; 4913 fpl->tvp = NULL; 4914 return (0); 4915 } 4916 4917 static int __noinline 4918 cache_fplookup_symlink(struct cache_fpl *fpl) 4919 { 4920 struct mount *mp; 4921 struct nameidata *ndp; 4922 struct componentname *cnp; 4923 struct vnode *dvp, *tvp; 4924 int error; 4925 4926 ndp = fpl->ndp; 4927 cnp = fpl->cnp; 4928 dvp = fpl->dvp; 4929 tvp = fpl->tvp; 4930 4931 if (cache_fpl_islastcn(ndp)) { 4932 if ((cnp->cn_flags & FOLLOW) == 0) { 4933 return (cache_fplookup_final(fpl)); 4934 } 4935 } 4936 4937 mp = atomic_load_ptr(&dvp->v_mount); 4938 if (__predict_false(mp == NULL)) { 4939 return (cache_fpl_aborted(fpl)); 4940 } 4941 4942 /* 4943 * Note this check races against setting the flag just like regular 4944 * lookup. 
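 * Worst case a freshly set MNT_NOSYMFOLLOW may not be observed, which the
 * locked variant can also produce, so no extra synchronization is done.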
4945 */ 4946 if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) { 4947 cache_fpl_smr_exit(fpl); 4948 return (cache_fpl_handled_error(fpl, EACCES)); 4949 } 4950 4951 error = VOP_FPLOOKUP_SYMLINK(tvp, fpl); 4952 if (__predict_false(error != 0)) { 4953 switch (error) { 4954 case EAGAIN: 4955 return (cache_fpl_partial(fpl)); 4956 case ENOENT: 4957 case ENAMETOOLONG: 4958 case ELOOP: 4959 cache_fpl_smr_exit(fpl); 4960 return (cache_fpl_handled_error(fpl, error)); 4961 default: 4962 return (cache_fpl_aborted(fpl)); 4963 } 4964 } 4965 4966 if (*(cnp->cn_nameptr) == '/') { 4967 fpl->dvp = cache_fpl_handle_root(fpl); 4968 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 4969 if (seqc_in_modify(fpl->dvp_seqc)) { 4970 return (cache_fpl_aborted(fpl)); 4971 } 4972 } 4973 return (0); 4974 } 4975 4976 static int 4977 cache_fplookup_next(struct cache_fpl *fpl) 4978 { 4979 struct componentname *cnp; 4980 struct namecache *ncp; 4981 struct vnode *dvp, *tvp; 4982 u_char nc_flag; 4983 uint32_t hash; 4984 int error; 4985 4986 cnp = fpl->cnp; 4987 dvp = fpl->dvp; 4988 hash = fpl->hash; 4989 4990 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 4991 if (cnp->cn_namelen == 1) { 4992 return (cache_fplookup_dot(fpl)); 4993 } 4994 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 4995 return (cache_fplookup_dotdot(fpl)); 4996 } 4997 } 4998 4999 MPASS(!cache_fpl_isdotdot(cnp)); 5000 5001 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 5002 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 5003 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 5004 break; 5005 } 5006 5007 if (__predict_false(ncp == NULL)) { 5008 return (cache_fplookup_noentry(fpl)); 5009 } 5010 5011 tvp = atomic_load_ptr(&ncp->nc_vp); 5012 nc_flag = atomic_load_char(&ncp->nc_flag); 5013 if ((nc_flag & NCF_NEGATIVE) != 0) { 5014 return (cache_fplookup_neg(fpl, ncp, hash)); 5015 } 5016 5017 if (!cache_ncp_canuse(ncp)) { 5018 return (cache_fpl_partial(fpl)); 5019 } 5020 5021 fpl->tvp = tvp; 5022 fpl->tvp_seqc = vn_seqc_read_any(tvp); 5023 if (seqc_in_modify(fpl->tvp_seqc)) { 5024 return (cache_fpl_partial(fpl)); 5025 } 5026 5027 counter_u64_add(numposhits, 1); 5028 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 5029 5030 error = 0; 5031 if (cache_fplookup_is_mp(fpl)) { 5032 error = cache_fplookup_cross_mount(fpl); 5033 } 5034 return (error); 5035 } 5036 5037 static bool 5038 cache_fplookup_mp_supported(struct mount *mp) 5039 { 5040 5041 MPASS(mp != NULL); 5042 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 5043 return (false); 5044 return (true); 5045 } 5046 5047 /* 5048 * Walk up the mount stack (if any). 5049 * 5050 * Correctness is provided in the following ways: 5051 * - all vnodes are protected from freeing with SMR 5052 * - struct mount objects are type stable making them always safe to access 5053 * - stability of the particular mount is provided by busying it 5054 * - relationship between the vnode which is mounted on and the mount is 5055 * verified with the vnode sequence counter after busying 5056 * - association between root vnode of the mount and the mount is protected 5057 * by busy 5058 * 5059 * From that point on we can read the sequence counter of the root vnode 5060 * and get the next mount on the stack (if any) using the same protection. 5061 * 5062 * By the end of successful walk we are guaranteed the reached state was 5063 * indeed present at least at some point which matches the regular lookup. 
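 *
 * In pseudo-code, each step of the walk below is roughly:
 *
 *	vfs_op_thread_enter_crit(mp, mpcpu);	// busy the mount
 *	if (!vn_seqc_consistent(vp, vp_seqc) || !cache_fplookup_mp_supported(mp))
 *		return (cache_fpl_partial(fpl));	// punt to the slow path
 *	vp = mp->mnt_rootvnode;
 *	vp_seqc = vn_seqc_read_any(vp);
 *	mp = vp->v_mountedhere;			// NULL terminates the walk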
5064 */ 5065 static int __noinline 5066 cache_fplookup_climb_mount(struct cache_fpl *fpl) 5067 { 5068 struct mount *mp, *prev_mp; 5069 struct mount_pcpu *mpcpu, *prev_mpcpu; 5070 struct vnode *vp; 5071 seqc_t vp_seqc; 5072 5073 vp = fpl->tvp; 5074 vp_seqc = fpl->tvp_seqc; 5075 5076 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 5077 mp = atomic_load_ptr(&vp->v_mountedhere); 5078 if (__predict_false(mp == NULL)) { 5079 return (0); 5080 } 5081 5082 prev_mp = NULL; 5083 for (;;) { 5084 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 5085 if (prev_mp != NULL) 5086 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5087 return (cache_fpl_partial(fpl)); 5088 } 5089 if (prev_mp != NULL) 5090 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5091 if (!vn_seqc_consistent(vp, vp_seqc)) { 5092 vfs_op_thread_exit_crit(mp, mpcpu); 5093 return (cache_fpl_partial(fpl)); 5094 } 5095 if (!cache_fplookup_mp_supported(mp)) { 5096 vfs_op_thread_exit_crit(mp, mpcpu); 5097 return (cache_fpl_partial(fpl)); 5098 } 5099 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5100 if (vp == NULL) { 5101 vfs_op_thread_exit_crit(mp, mpcpu); 5102 return (cache_fpl_partial(fpl)); 5103 } 5104 vp_seqc = vn_seqc_read_any(vp); 5105 if (seqc_in_modify(vp_seqc)) { 5106 vfs_op_thread_exit_crit(mp, mpcpu); 5107 return (cache_fpl_partial(fpl)); 5108 } 5109 prev_mp = mp; 5110 prev_mpcpu = mpcpu; 5111 mp = atomic_load_ptr(&vp->v_mountedhere); 5112 if (mp == NULL) 5113 break; 5114 } 5115 5116 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5117 fpl->tvp = vp; 5118 fpl->tvp_seqc = vp_seqc; 5119 return (0); 5120 } 5121 5122 static int __noinline 5123 cache_fplookup_cross_mount(struct cache_fpl *fpl) 5124 { 5125 struct mount *mp; 5126 struct mount_pcpu *mpcpu; 5127 struct vnode *vp; 5128 seqc_t vp_seqc; 5129 5130 vp = fpl->tvp; 5131 vp_seqc = fpl->tvp_seqc; 5132 5133 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 5134 mp = atomic_load_ptr(&vp->v_mountedhere); 5135 if (__predict_false(mp == NULL)) { 5136 return (0); 5137 } 5138 5139 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 5140 return (cache_fpl_partial(fpl)); 5141 } 5142 if (!vn_seqc_consistent(vp, vp_seqc)) { 5143 vfs_op_thread_exit_crit(mp, mpcpu); 5144 return (cache_fpl_partial(fpl)); 5145 } 5146 if (!cache_fplookup_mp_supported(mp)) { 5147 vfs_op_thread_exit_crit(mp, mpcpu); 5148 return (cache_fpl_partial(fpl)); 5149 } 5150 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5151 if (__predict_false(vp == NULL)) { 5152 vfs_op_thread_exit_crit(mp, mpcpu); 5153 return (cache_fpl_partial(fpl)); 5154 } 5155 vp_seqc = vn_seqc_read_any(vp); 5156 vfs_op_thread_exit_crit(mp, mpcpu); 5157 if (seqc_in_modify(vp_seqc)) { 5158 return (cache_fpl_partial(fpl)); 5159 } 5160 mp = atomic_load_ptr(&vp->v_mountedhere); 5161 if (__predict_false(mp != NULL)) { 5162 /* 5163 * There are possibly more mount points on top. 5164 * Normally this does not happen so for simplicity just start 5165 * over. 5166 */ 5167 return (cache_fplookup_climb_mount(fpl)); 5168 } 5169 5170 fpl->tvp = vp; 5171 fpl->tvp_seqc = vp_seqc; 5172 return (0); 5173 } 5174 5175 /* 5176 * Check if a vnode is mounted on. 5177 */ 5178 static bool 5179 cache_fplookup_is_mp(struct cache_fpl *fpl) 5180 { 5181 struct vnode *vp; 5182 5183 vp = fpl->tvp; 5184 return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0); 5185 } 5186 5187 /* 5188 * Parse the path. 5189 * 5190 * The code was originally copy-pasted from regular lookup and despite 5191 * clean ups leaves performance on the table. 
Any modifications here
5192 * must take into account that in case of fallback the resulting
5193 * nameidata state has to be compatible with the original.
5194 */
5195
5196 /*
5197 * Debug ni_pathlen tracking.
5198 */
5199 #ifdef INVARIANTS
5200 static void
5201 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5202 {
5203
5204 fpl->debug.ni_pathlen += n;
5205 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5206 ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5207 }
5208
5209 static void
5210 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5211 {
5212
5213 fpl->debug.ni_pathlen -= n;
5214 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5215 ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5216 }
5217
5218 static void
5219 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5220 {
5221
5222 cache_fpl_pathlen_add(fpl, 1);
5223 }
5224
5225 static void
5226 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5227 {
5228
5229 cache_fpl_pathlen_sub(fpl, 1);
5230 }
5231 #else
5232 static void
5233 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5234 {
5235 }
5236
5237 static void
5238 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5239 {
5240 }
5241
5242 static void
5243 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5244 {
5245 }
5246
5247 static void
5248 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5249 {
5250 }
5251 #endif
5252
5253 static void
5254 cache_fplookup_parse(struct cache_fpl *fpl)
5255 {
5256 struct nameidata *ndp;
5257 struct componentname *cnp;
5258 struct vnode *dvp;
5259 char *cp;
5260 uint32_t hash;
5261
5262 ndp = fpl->ndp;
5263 cnp = fpl->cnp;
5264 dvp = fpl->dvp;
5265
5266 /*
5267 * Find the end of this path component; it is either / or nul.
5268 *
5269 * Store / as a temporary sentinel so that we only have one character
5270 * to test for. Pathnames tend to be short so this should not
5271 * result in cache misses.
5272 *
5273 * TODO: fix this to be word-sized.
5274 */
5275 KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
5276 ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
5277 __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
5278 fpl->nulchar, cnp->cn_pnbuf));
5279 KASSERT(*fpl->nulchar == '\0',
5280 ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
5281 cnp->cn_pnbuf));
5282 hash = cache_get_hash_iter_start(dvp);
5283 *fpl->nulchar = '/';
5284 for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
5285 KASSERT(*cp != '\0',
5286 ("%s: encountered unexpected nul; string [%s]\n", __func__,
5287 cnp->cn_nameptr));
5288 hash = cache_get_hash_iter(*cp, hash);
5289 continue;
5290 }
5291 *fpl->nulchar = '\0';
5292 fpl->hash = cache_get_hash_iter_finish(hash);
5293
5294 cnp->cn_namelen = cp - cnp->cn_nameptr;
5295 cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
5296
5297 #ifdef INVARIANTS
5298 if (cnp->cn_namelen <= NAME_MAX) {
5299 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
5300 panic("%s: mismatched hash for [%s] len %ld", __func__,
5301 cnp->cn_nameptr, cnp->cn_namelen);
5302 }
5303 }
5304 #endif
5305
5306 /*
5307 * Hack: we have to check if the found path component's length exceeds
5308 * NAME_MAX. However, the condition is very rarely true and the check can
5309 * be elided in the common case -- if an entry was found in the cache,
5310 * then it could not have been too long to begin with.
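 * The delayed check is performed in cache_fplookup_noentry() and
 * cache_fplookup_failed_vexec(), both of which return ENAMETOOLONG for an
 * oversized component.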
5311 */
5312 ndp->ni_next = cp;
5313 }
5314
5315 static void
5316 cache_fplookup_parse_advance(struct cache_fpl *fpl)
5317 {
5318 struct nameidata *ndp;
5319 struct componentname *cnp;
5320
5321 ndp = fpl->ndp;
5322 cnp = fpl->cnp;
5323
5324 cnp->cn_nameptr = ndp->ni_next;
5325 KASSERT(*(cnp->cn_nameptr) == '/',
5326 ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
5327 cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
5328 cnp->cn_nameptr++;
5329 cache_fpl_pathlen_dec(fpl);
5330 }
5331
5332 /*
5333 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
5334 *
5335 * Lockless lookup elides checking for spurious slashes; should any be
5336 * present, it is guaranteed to fail to find an entry. In that case the
5337 * caller must check whether the name starts with a slash and call this
5338 * routine, which fast-forwards across the spurious slashes and sets the
5339 * state up for a retry.
5340 */
5341 static int __noinline
5342 cache_fplookup_skip_slashes(struct cache_fpl *fpl)
5343 {
5344 struct nameidata *ndp;
5345 struct componentname *cnp;
5346
5347 ndp = fpl->ndp;
5348 cnp = fpl->cnp;
5349
5350 MPASS(*(cnp->cn_nameptr) == '/');
5351 do {
5352 cnp->cn_nameptr++;
5353 cache_fpl_pathlen_dec(fpl);
5354 } while (*(cnp->cn_nameptr) == '/');
5355
5356 /*
5357 * Go back by one slash so that cache_fplookup_parse_advance has
5358 * something to skip.
5359 */
5360 cnp->cn_nameptr--;
5361 cache_fpl_pathlen_inc(fpl);
5362
5363 /*
5364 * cache_fplookup_parse_advance starts from ndp->ni_next.
5365 */
5366 ndp->ni_next = cnp->cn_nameptr;
5367
5368 /*
5369 * See cache_fplookup_dot.
5370 */
5371 fpl->tvp = fpl->dvp;
5372 fpl->tvp_seqc = fpl->dvp_seqc;
5373
5374 return (0);
5375 }
5376
5377 /*
5378 * Handle trailing slashes (e.g., "foo/").
5379 *
5380 * If a trailing slash is found the terminal vnode must be a directory.
5381 * Regular lookup shortens the path by nullifying the first trailing slash and
5382 * sets the TRAILINGSLASH flag to denote this took place. Several checks on
5383 * it are performed later.
5384 *
5385 * Similarly to spurious slashes, lockless lookup handles this in a speculative
5386 * manner relying on an invariant that a non-directory vnode will get a miss.
5387 * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
5388 *
5389 * Thus for a path like "foo/bar/" the code unwinds the state back to 'bar/'
5390 * and denotes this is the last path component, which avoids looping back.
5391 *
5392 * Only plain lookups are supported for now to limit the corner cases which need handling.
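 *
 * To illustrate with "foo/bar/": the routine is entered with cn_nameptr
 * pointing at the terminating nul and cn_namelen == 0; after unwinding,
 * cn_nameptr points at "bar/", cn_namelen == 3 and ni_next points at the
 * nul, making "bar" the last component.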
5393 */ 5394 static int __noinline 5395 cache_fplookup_trailingslash(struct cache_fpl *fpl) 5396 { 5397 #ifdef INVARIANTS 5398 size_t ni_pathlen; 5399 #endif 5400 struct nameidata *ndp; 5401 struct componentname *cnp; 5402 struct namecache *ncp; 5403 struct vnode *tvp; 5404 char *cn_nameptr_orig, *cn_nameptr_slash; 5405 seqc_t tvp_seqc; 5406 u_char nc_flag; 5407 5408 ndp = fpl->ndp; 5409 cnp = fpl->cnp; 5410 tvp = fpl->tvp; 5411 tvp_seqc = fpl->tvp_seqc; 5412 5413 MPASS(fpl->dvp == fpl->tvp); 5414 KASSERT(cache_fpl_istrailingslash(fpl), 5415 ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1, 5416 cnp->cn_pnbuf)); 5417 KASSERT(cnp->cn_nameptr[0] == '\0', 5418 ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0], 5419 cnp->cn_pnbuf)); 5420 KASSERT(cnp->cn_namelen == 0, 5421 ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen, 5422 cnp->cn_pnbuf)); 5423 MPASS(cnp->cn_nameptr > cnp->cn_pnbuf); 5424 5425 if (cnp->cn_nameiop != LOOKUP) { 5426 return (cache_fpl_aborted(fpl)); 5427 } 5428 5429 if (__predict_false(tvp->v_type != VDIR)) { 5430 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 5431 return (cache_fpl_aborted(fpl)); 5432 } 5433 cache_fpl_smr_exit(fpl); 5434 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5435 } 5436 5437 /* 5438 * Denote the last component. 5439 */ 5440 ndp->ni_next = &cnp->cn_nameptr[0]; 5441 MPASS(cache_fpl_islastcn(ndp)); 5442 5443 /* 5444 * Unwind trailing slashes. 5445 */ 5446 cn_nameptr_orig = cnp->cn_nameptr; 5447 while (cnp->cn_nameptr >= cnp->cn_pnbuf) { 5448 cnp->cn_nameptr--; 5449 if (cnp->cn_nameptr[0] != '/') { 5450 break; 5451 } 5452 } 5453 5454 /* 5455 * Unwind to the beginning of the path component. 5456 * 5457 * Note the path may or may not have started with a slash. 5458 */ 5459 cn_nameptr_slash = cnp->cn_nameptr; 5460 while (cnp->cn_nameptr > cnp->cn_pnbuf) { 5461 cnp->cn_nameptr--; 5462 if (cnp->cn_nameptr[0] == '/') { 5463 break; 5464 } 5465 } 5466 if (cnp->cn_nameptr[0] == '/') { 5467 cnp->cn_nameptr++; 5468 } 5469 5470 cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1; 5471 cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr); 5472 cache_fpl_checkpoint(fpl); 5473 5474 #ifdef INVARIANTS 5475 ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; 5476 if (ni_pathlen != fpl->debug.ni_pathlen) { 5477 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 5478 __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 5479 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 5480 } 5481 #endif 5482 5483 /* 5484 * The previous directory is this one. 5485 */ 5486 if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) { 5487 return (0); 5488 } 5489 5490 /* 5491 * The previous directory is something else. 5492 */ 5493 tvp = fpl->tvp; 5494 ncp = atomic_load_consume_ptr(&tvp->v_cache_dd); 5495 if (__predict_false(ncp == NULL)) { 5496 return (cache_fpl_aborted(fpl)); 5497 } 5498 nc_flag = atomic_load_char(&ncp->nc_flag); 5499 if ((nc_flag & NCF_ISDOTDOT) != 0) { 5500 return (cache_fpl_aborted(fpl)); 5501 } 5502 fpl->dvp = ncp->nc_dvp; 5503 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 5504 if (seqc_in_modify(fpl->dvp_seqc)) { 5505 return (cache_fpl_aborted(fpl)); 5506 } 5507 return (0); 5508 } 5509 5510 /* 5511 * See the API contract for VOP_FPLOOKUP_VEXEC. 
5512 */ 5513 static int __noinline 5514 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 5515 { 5516 struct componentname *cnp; 5517 struct vnode *dvp; 5518 seqc_t dvp_seqc; 5519 5520 cnp = fpl->cnp; 5521 dvp = fpl->dvp; 5522 dvp_seqc = fpl->dvp_seqc; 5523 5524 /* 5525 * Hack: delayed degenerate path checking. 5526 */ 5527 if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) { 5528 return (cache_fplookup_degenerate(fpl)); 5529 } 5530 5531 /* 5532 * Hack: delayed name len checking. 5533 */ 5534 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 5535 cache_fpl_smr_exit(fpl); 5536 return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); 5537 } 5538 5539 /* 5540 * Hack: they may be looking up foo/bar, where foo is not a directory. 5541 * In such a case we need to return ENOTDIR, but we may happen to get 5542 * here with a different error. 5543 */ 5544 if (dvp->v_type != VDIR) { 5545 error = ENOTDIR; 5546 } 5547 5548 /* 5549 * Hack: handle O_SEARCH. 5550 * 5551 * Open Group Base Specifications Issue 7, 2018 edition states: 5552 * <quote> 5553 * If the access mode of the open file description associated with the 5554 * file descriptor is not O_SEARCH, the function shall check whether 5555 * directory searches are permitted using the current permissions of 5556 * the directory underlying the file descriptor. If the access mode is 5557 * O_SEARCH, the function shall not perform the check. 5558 * </quote> 5559 * 5560 * Regular lookup tests for the NOEXECCHECK flag for every path 5561 * component to decide whether to do the permission check. However, 5562 * since most lookups never have the flag (and when they do it is only 5563 * present for the first path component), lockless lookup only acts on 5564 * it if there is a permission problem. Here the flag is represented 5565 * with a boolean so that we don't have to clear it on the way out. 5566 * 5567 * For simplicity this always aborts. 5568 * TODO: check if this is the first lookup and ignore the permission 5569 * problem. Note the flag has to survive fallback (if it happens to be 5570 * performed). 5571 */ 5572 if (fpl->fsearch) { 5573 return (cache_fpl_aborted(fpl)); 5574 } 5575 5576 switch (error) { 5577 case EAGAIN: 5578 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 5579 error = cache_fpl_aborted(fpl); 5580 } else { 5581 cache_fpl_partial(fpl); 5582 } 5583 break; 5584 default: 5585 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 5586 error = cache_fpl_aborted(fpl); 5587 } else { 5588 cache_fpl_smr_exit(fpl); 5589 cache_fpl_handled_error(fpl, error); 5590 } 5591 break; 5592 } 5593 return (error); 5594 } 5595 5596 static int 5597 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 5598 { 5599 struct nameidata *ndp; 5600 struct componentname *cnp; 5601 struct mount *mp; 5602 int error; 5603 5604 ndp = fpl->ndp; 5605 cnp = fpl->cnp; 5606 5607 cache_fpl_checkpoint(fpl); 5608 5609 /* 5610 * The vnode at hand is almost always stable, skip checking for it. 5611 * Worst case this postpones the check towards the end of the iteration 5612 * of the main loop. 
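 * Hence vn_seqc_read_notmodify() is used below: a vnode which happens to
 * be undergoing modification is not rejected right away, the
 * vn_seqc_consistent() checks in the loop will catch it later.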
5613 */ 5614 fpl->dvp = dvp; 5615 fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp); 5616 5617 mp = atomic_load_ptr(&dvp->v_mount); 5618 if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) { 5619 return (cache_fpl_aborted(fpl)); 5620 } 5621 5622 MPASS(fpl->tvp == NULL); 5623 5624 for (;;) { 5625 cache_fplookup_parse(fpl); 5626 5627 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); 5628 if (__predict_false(error != 0)) { 5629 error = cache_fplookup_failed_vexec(fpl, error); 5630 break; 5631 } 5632 5633 error = cache_fplookup_next(fpl); 5634 if (__predict_false(cache_fpl_terminated(fpl))) { 5635 break; 5636 } 5637 5638 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 5639 5640 if (fpl->tvp->v_type == VLNK) { 5641 error = cache_fplookup_symlink(fpl); 5642 if (cache_fpl_terminated(fpl)) { 5643 break; 5644 } 5645 } else { 5646 if (cache_fpl_islastcn(ndp)) { 5647 error = cache_fplookup_final(fpl); 5648 break; 5649 } 5650 5651 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 5652 error = cache_fpl_aborted(fpl); 5653 break; 5654 } 5655 5656 fpl->dvp = fpl->tvp; 5657 fpl->dvp_seqc = fpl->tvp_seqc; 5658 cache_fplookup_parse_advance(fpl); 5659 } 5660 5661 cache_fpl_checkpoint(fpl); 5662 } 5663 5664 return (error); 5665 } 5666 5667 /* 5668 * Fast path lookup protected with SMR and sequence counters. 5669 * 5670 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 5671 * 5672 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria 5673 * outlined below. 5674 * 5675 * Traditional vnode lookup conceptually looks like this: 5676 * 5677 * vn_lock(current); 5678 * for (;;) { 5679 * next = find(); 5680 * vn_lock(next); 5681 * vn_unlock(current); 5682 * current = next; 5683 * if (last) 5684 * break; 5685 * } 5686 * return (current); 5687 * 5688 * Each jump to the next vnode is safe memory-wise and atomic with respect to 5689 * any modifications thanks to holding respective locks. 5690 * 5691 * The same guarantee can be provided with a combination of safe memory 5692 * reclamation and sequence counters instead. If all operations which affect 5693 * the relationship between the current vnode and the one we are looking for 5694 * also modify the counter, we can verify whether all the conditions held as 5695 * we made the jump. This includes things like permissions, mount points etc. 5696 * Counter modification is provided by enclosing relevant places in 5697 * vn_seqc_write_begin()/end() calls. 
5698 * 5699 * Thus this translates to: 5700 * 5701 * vfs_smr_enter(); 5702 * dvp_seqc = seqc_read_any(dvp); 5703 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode 5704 * abort(); 5705 * for (;;) { 5706 * tvp = find(); 5707 * tvp_seqc = seqc_read_any(tvp); 5708 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode 5709 * abort(); 5710 * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode 5711 * abort(); 5712 * dvp = tvp; // we know nothing of importance has changed 5713 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration 5714 * if (last) 5715 * break; 5716 * } 5717 * vget(); // secure the vnode 5718 * if (!seqc_consistent(tvp, tvp_seqc) // final check 5719 * abort(); 5720 * // at this point we know nothing has changed for any parent<->child pair 5721 * // as they were crossed during the lookup, meaning we matched the guarantee 5722 * // of the locked variant 5723 * return (tvp); 5724 * 5725 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: 5726 * - they are called while within vfs_smr protection which they must never exit 5727 * - EAGAIN can be returned to denote checking could not be performed, it is 5728 * always valid to return it 5729 * - if the sequence counter has not changed the result must be valid 5730 * - if the sequence counter has changed both false positives and false negatives 5731 * are permitted (since the result will be rejected later) 5732 * - for simple cases of unix permission checks vaccess_vexec_smr can be used 5733 * 5734 * Caveats to watch out for: 5735 * - vnodes are passed unlocked and unreferenced with nothing stopping 5736 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised 5737 * to use atomic_load_ptr to fetch it. 5738 * - the aforementioned object can also get freed, meaning absent other means it 5739 * should be protected with vfs_smr 5740 * - either safely checking permissions as they are modified or guaranteeing 5741 * their stability is left to the routine 5742 */ 5743 int 5744 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, 5745 struct pwd **pwdp) 5746 { 5747 struct cache_fpl fpl; 5748 struct pwd *pwd; 5749 struct vnode *dvp; 5750 struct componentname *cnp; 5751 int error; 5752 5753 fpl.status = CACHE_FPL_STATUS_UNSET; 5754 fpl.in_smr = false; 5755 fpl.ndp = ndp; 5756 fpl.cnp = cnp = &ndp->ni_cnd; 5757 MPASS(ndp->ni_lcf == 0); 5758 MPASS(curthread == cnp->cn_thread); 5759 KASSERT ((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 5760 ("%s: internal flags found in cn_flags %" PRIx64, __func__, 5761 cnp->cn_flags)); 5762 if ((cnp->cn_flags & SAVESTART) != 0) { 5763 MPASS(cnp->cn_nameiop != LOOKUP); 5764 } 5765 MPASS(cnp->cn_nameptr == cnp->cn_pnbuf); 5766 5767 if (__predict_false(!cache_can_fplookup(&fpl))) { 5768 *status = fpl.status; 5769 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 5770 return (EOPNOTSUPP); 5771 } 5772 5773 cache_fpl_checkpoint_outer(&fpl); 5774 5775 cache_fpl_smr_enter_initial(&fpl); 5776 #ifdef INVARIANTS 5777 fpl.debug.ni_pathlen = ndp->ni_pathlen; 5778 #endif 5779 fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; 5780 fpl.fsearch = false; 5781 fpl.savename = (cnp->cn_flags & SAVENAME) != 0; 5782 fpl.tvp = NULL; /* for degenerate path handling */ 5783 fpl.pwd = pwdp; 5784 pwd = pwd_get_smr(); 5785 *(fpl.pwd) = pwd; 5786 ndp->ni_rootdir = pwd->pwd_rdir; 5787 ndp->ni_topdir = pwd->pwd_jdir; 5788 5789 if (cnp->cn_pnbuf[0] == '/') { 5790 dvp = cache_fpl_handle_root(&fpl); 5791 
MPASS(ndp->ni_resflags == 0); 5792 ndp->ni_resflags = NIRES_ABS; 5793 } else { 5794 if (ndp->ni_dirfd == AT_FDCWD) { 5795 dvp = pwd->pwd_cdir; 5796 } else { 5797 error = cache_fplookup_dirfd(&fpl, &dvp); 5798 if (__predict_false(error != 0)) { 5799 goto out; 5800 } 5801 } 5802 } 5803 5804 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); 5805 error = cache_fplookup_impl(dvp, &fpl); 5806 out: 5807 cache_fpl_smr_assert_not_entered(&fpl); 5808 cache_fpl_assert_status(&fpl); 5809 *status = fpl.status; 5810 if (SDT_PROBES_ENABLED()) { 5811 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 5812 if (fpl.status == CACHE_FPL_STATUS_HANDLED) 5813 SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true, 5814 ndp); 5815 } 5816 5817 if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) { 5818 MPASS(error != CACHE_FPL_FAILED); 5819 if (error != 0) { 5820 MPASS(fpl.dvp == NULL); 5821 MPASS(fpl.tvp == NULL); 5822 MPASS(fpl.savename == false); 5823 } 5824 ndp->ni_dvp = fpl.dvp; 5825 ndp->ni_vp = fpl.tvp; 5826 if (fpl.savename) { 5827 cnp->cn_flags |= HASBUF; 5828 } else { 5829 cache_fpl_cleanup_cnp(cnp); 5830 } 5831 } 5832 return (error); 5833 } 5834
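
/*
 * Illustrative sketch (not a routine used by the cache itself): a minimal
 * VOP_FPLOOKUP_VEXEC implementation for a filesystem opting into
 * MNTK_FPLOOKUP. "xxx_node" and its fields are hypothetical names; the
 * assumption is that the per-vnode data is vfs_smr-protected as described
 * in the caveats above.
 *
 *	static int
 *	xxx_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct vnode *vp;
 *		struct xxx_node *node;
 *
 *		vp = v->a_vp;
 *		node = atomic_load_ptr(&vp->v_data);	// may be NULL, see caveats
 *		if (__predict_false(node == NULL))
 *			return (EAGAIN);		// defer to the slow path
 *		return (vaccess_vexec_smr(node->xn_mode, node->xn_uid,
 *		    node->xn_gid, v->a_cred));
 *	}
 */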