/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#ifdef INVARIANTS
#include <machine/_inttypes.h>
#endif

#include <sys/capsicum.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache");

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
	u_char neg_hit;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct	vnode *nu_vp;	/* vnode the name refers to */
		struct	negstate nu_neg;/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct	timespec nc_time;	/* timespec provided by fs */
	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	int	nc_pad;
	struct namecache nc_nc;
};

TAILQ_HEAD(cache_freebatch, namecache);

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned.  Since others
 * may be in the same spot suffer a little bit and enforce the
 * alignment for everyone.  Note this is a nop for 64-bit platforms.
 */
#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)

/*
 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
 * 4.4 BSD codebase.  Later on struct namecache was tweaked to become
 * smaller and the value was bumped to retain the total size, but it
 * was never re-evaluated for suitability.  A simple test counting
 * lengths during package building shows that the value of 45 covers
 * about 86% of all added entries, reaching 99% at 65.
 *
 * Regardless of the above, use of dedicated zones instead of malloc may be
 * inducing additional waste.  This may be hard to address as said zones are
 * tied to VFS SMR.  Even if retaining them, the current split should be
 * re-evaluated.
 */
#ifdef __LP64__
#define	CACHE_PATH_CUTOFF	45
#define	CACHE_LARGE_PAD		6
#else
#define	CACHE_PATH_CUTOFF	41
#define	CACHE_LARGE_PAD		2
#endif

#define CACHE_ZONE_SMALL_SIZE		(offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
#define CACHE_ZONE_LARGE_SIZE		(offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
#define CACHE_ZONE_LARGE_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
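/*
 * Illustrative arithmetic, assuming a typical LP64 layout with 8-byte
 * pointers and no padding before nc_name: offsetof(struct namecache, nc_name)
 * works out to 58 bytes (5 list pointers + nc_dvp + the union = 56, plus
 * nc_flag and nc_nlen), offsetof(struct namecache_ts, nc_nc) to 40, and
 * CACHE_ZONE_ALIGNMENT + 1 to 8.  The asserts above then check:
 *
 *	CACHE_ZONE_SMALL_SIZE    = 58 + 45 + 1      = 104 = 13 * 8
 *	CACHE_ZONE_SMALL_TS_SIZE = 40 + 104         = 144 = 18 * 8
 *	CACHE_ZONE_LARGE_SIZE    = 58 + 255 + 1 + 6 = 320 = 40 * 8
 *	CACHE_ZONE_LARGE_TS_SIZE = 40 + 320         = 360 = 45 * 8
 *
 * i.e. all zone sizes are multiples of the required alignment.  Exact numbers
 * may differ per platform/ABI; only the divisibility is relied upon.
 */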
#define	nc_vp	n_un.nu_vp
#define	nc_neg	n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	0x01
#define NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define NEG_HOT		0x01

static bool	cache_neg_evict_cond(u_long lnumcache);

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
#define cache_ncp_canuse(ncp)	({					\
	struct namecache *_ncp = (ncp);					\
	u_char _nc_flag;						\
									\
	atomic_thread_fence_acq();					\
	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);	\
})

/*
 * Like the above but also checks NCF_WHITE.
 */
#define cache_fpl_neg_ncp_canuse(ncp)	({				\
	struct namecache *_ncp = (ncp);					\
	u_char _nc_flag;						\
									\
	atomic_thread_fence_acq();					\
	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0);	\
})
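/*
 * A minimal sketch of the lockless usage pattern these helpers support,
 * mirroring what cache_lookup() below does: the entry's fields are copied
 * out first and only then validated.  The intent (see the comments above) is
 * that a check performed after the reads confirms the entry was neither
 * still under construction (NCF_WIP) nor already invalidated for teardown
 * (NCF_INVALID), so the copied fields may be used.
 *
 *	vfs_smr_enter();
 *	ncp = <entry found on the hash chain>;
 *	<copy out nc_vp, nc_flag, timestamps as needed>;
 *	if (!cache_ncp_canuse(ncp)) {
 *		vfs_smr_exit();
 *		<discard the copies and take the locked fallback path>;
 *	}
 *	vfs_smr_exit();
 */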
VFS_SMR_DECLARE;

static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache parameters");

static u_int __read_mostly	ncsize;	/* the size as computed on creation or resizing */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
    "Total namecache capacity");

u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");

static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");

/*
 * Negative entry % of namecache capacity above which automatic eviction is allowed.
 *
 * Check cache_neg_evict_cond for details.
 */
static u_int ncnegminpct = 3;

static u_int __read_mostly	neg_min;	/* the above recomputed against ncsize */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
    "Negative entry count above which automatic eviction is allowed");

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");

static bool __read_mostly cache_rename_add = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_rename_add, CTLFLAG_RW,
    &cache_rename_add, 0, "");

static u_int __exclusive_cache_line neg_cycle;

#define ncneghash	3
#define	numneglists	(ncneghash + 1)

struct neglist {
	struct mtx		nl_evict_lock;
	struct mtx		nl_lock __aligned(CACHE_LINE_SIZE);
	TAILQ_HEAD(, namecache) nl_list;
	TAILQ_HEAD(, namecache) nl_hotlist;
	u_long			nl_hotnum;
} __aligned(CACHE_LINE_SIZE);

static struct neglist neglists[numneglists];

static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct mtx_padalign __read_mostly  *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	*tsp = ncp_ts->nc_time;
	*ticksp = ncp_ts->nc_ticks;
}

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");
#define STATNODE_ULONG(name, varname, descr)					\
	SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define STATNODE_COUNTER(name, varname, descr)					\
	static COUNTER_U64_DEFINE_EARLY(varname);				\
	SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
	    descr);
STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
STATNODE_ULONG(count, numcache, "Number of cache entries");
STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits");
STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(posszaps, numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(negzaps, numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
/* These count for vn_getcwd(), too. */
STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");

/*
 * Debug or developer statistics.
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache debugging");
#define DEBUGNODE_ULONG(name, varname, descr)					\
	SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define DEBUGNODE_COUNTER(name, varname, descr)					\
	static COUNTER_U64_DEFINE_EARLY(varname);				\
	SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
	    descr);
DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
    "Number of successful removals after relocking");
static long zap_bucket_fail;
DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
static long zap_bucket_fail2;
DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
static long cache_lock_vnodes_cel_3_failures;
DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");

static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
    char **freebuf, size_t *buflen);
static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, size_t addend);
static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen);
static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *len, size_t addend);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

static inline void
cache_assert_vlp_locked(struct mtx *vlp)
{

	if (vlp != NULL)
		mtx_assert(vlp, MA_OWNED);
}

static inline void
cache_assert_vnode_locked(struct vnode *vp)
{
	struct mtx *vlp;

	vlp = VP2VNODELOCK(vp);
	cache_assert_vlp_locked(vlp);
}

/*
 * Directory vnodes with entries are held for two reasons:
 * 1. make them less of a target for reclamation in vnlru
 * 2. suffer smaller performance penalty in locked lookup as requeueing is avoided
 *
 * It will be feasible to stop doing it altogether if all filesystems start
 * supporting lockless lookup.
 */
static void
cache_hold_vnode(struct vnode *vp)
{

	cache_assert_vnode_locked(vp);
	VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
	vhold(vp);
	counter_u64_add(numcachehv, 1);
}

static void
cache_drop_vnode(struct vnode *vp)
{

	/*
	 * Called after all locks are dropped, meaning we can't assert
	 * on the state of v_cache_src.
	 */
	vdrop(vp);
	counter_u64_add(numcachehv, -1);
}

/*
 * UMA zones.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

char *
cache_symlink_alloc(size_t size, int flags)
{

	if (size < CACHE_ZONE_SMALL_SIZE) {
		return (uma_zalloc_smr(cache_zone_small, flags));
	}
	if (size < CACHE_ZONE_LARGE_SIZE) {
		return (uma_zalloc_smr(cache_zone_large, flags));
	}
	counter_u64_add(symlinktoobig, 1);
	SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
	return (NULL);
}

void
cache_symlink_free(char *string, size_t size)
{

	MPASS(string != NULL);
	KASSERT(size < CACHE_ZONE_LARGE_SIZE,
	    ("%s: size %zu too big", __func__, size));

	if (size < CACHE_ZONE_SMALL_SIZE) {
		uma_zfree_smr(cache_zone_small, string);
		return;
	}
	if (size < CACHE_ZONE_LARGE_SIZE) {
		uma_zfree_smr(cache_zone_large, string);
		return;
	}
	__assert_unreachable();
}

static struct namecache *
cache_alloc_uma(int len, bool ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free_uma(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static struct namecache *
cache_alloc(int len, bool ts)
{
	u_long lnumcache;

	/*
	 * Avoid blowout in namecache entries.
	 *
	 * Bugs:
	 * 1. filesystems may end up trying to add an already existing entry
	 * (for example this can happen after a cache miss during concurrent
	 * lookup), in which case we will call cache_neg_evict despite not
	 * adding anything.
	 * 2. the routine may fail to free anything and no provisions are made
	 * to make it try harder (see the inside for failure modes)
	 * 3. it only ever looks at negative entries.
	 */
	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
	if (cache_neg_evict_cond(lnumcache)) {
		lnumcache = atomic_load_long(&numcache);
	}
	if (__predict_false(lnumcache >= ncsize)) {
		atomic_subtract_long(&numcache, 1);
		counter_u64_add(numdrops, 1);
		return (NULL);
	}
	return (cache_alloc_uma(len, ts));
}

static void
cache_free(struct namecache *ncp)
{

	MPASS(ncp != NULL);
	if ((ncp->nc_flag & NCF_DVDROP) != 0) {
		cache_drop_vnode(ncp->nc_dvp);
	}
	cache_free_uma(ncp);
	atomic_subtract_long(&numcache, 1);
}

static void
cache_free_batch(struct cache_freebatch *batch)
{
	struct namecache *ncp, *nnp;
	int i;

	i = 0;
	if (TAILQ_EMPTY(batch))
		goto out;
	TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
		if ((ncp->nc_flag & NCF_DVDROP) != 0) {
			cache_drop_vnode(ncp->nc_dvp);
		}
		cache_free_uma(ncp);
		i++;
	}
	atomic_subtract_long(&numcache, i);
out:
	SDT_PROBE1(vfs, namecache, purge, batch, i);
}

/*
 * Hashing.
 *
 * The code was made to use FNV in 2001 and this choice needs to be revisited.
 *
 * Short summary of the difficulty:
 * The longest name which can be inserted is NAME_MAX characters in length (or
 * 255 at the time of writing this comment), while the majority of names used
 * in practice are significantly shorter (mostly below 10).  More importantly,
 * the majority of lookups are performed on names even shorter than that.
 *
 * This poses a problem where hashes which do better than FNV past word size
 * (or so) tend to come with additional overhead when finalizing the result,
 * making them noticeably slower for the most commonly used range.
 *
 * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c
 *
 * When looking it up the most time consuming part by a large margin (at least
 * on amd64) is hashing.  Replacing FNV with something which pessimizes short
 * input would make the slowest part stand out even more.
 */

/*
 * TODO: With the value stored we can do better than computing the hash based
 * on the address.
 */
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}

static uint32_t
cache_get_hash_iter_start(struct vnode *dvp)
{

	return (dvp->v_nchash);
}

static uint32_t
cache_get_hash_iter(char c, uint32_t hash)
{

	return (fnv_32_buf(&c, 1, hash));
}

static uint32_t
cache_get_hash_iter_finish(uint32_t hash)
{

	return (hash);
}
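/*
 * A small usage sketch of the iterator variant above.  Since FNV consumes
 * its input one byte at a time with the running hash as the basis, feeding a
 * name character by character is expected to produce the same value as the
 * one-shot helper, e.g. for "lib":
 *
 *	hash = cache_get_hash_iter_start(dvp);
 *	hash = cache_get_hash_iter('l', hash);
 *	hash = cache_get_hash_iter('i', hash);
 *	hash = cache_get_hash_iter('b', hash);
 *	hash = cache_get_hash_iter_finish(hash);
 *
 * is equivalent to cache_get_hash("lib", 3, dvp).
 */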
static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct mtx *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_OWNED);
}

static void
cache_assert_bucket_unlocked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_NOTOWNED);
}
#else
#define cache_assert_bucket_locked(x) do { } while (0)
#define cache_assert_bucket_unlocked(x) do { } while (0)
#endif

#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_lock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_unlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

static void
cache_recalc_neg_min(u_int val)
{

	neg_min = (ncsize * val) / 100;
}

static int
sysctl_negminpct(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int error;

	val = ncnegminpct;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (val == ncnegminpct)
		return (0);
	if (val < 0 || val > 99)
		return (EINVAL);
	ncnegminpct = val;
	cache_recalc_neg_min(val);
	return (0);
}

SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
    "I", "Negative entry % of namecache capacity above which automatic eviction is allowed");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif
/*
 * Negative entries management
 *
 * Various workloads create plenty of negative entries and barely use them
 * afterwards.  Moreover malicious users can keep performing bogus lookups
 * adding even more entries.  For example "make tinderbox" as of writing this
 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 * negative.
 *
 * As such, a rather aggressive eviction method is needed.  The currently
 * employed method is a placeholder.
 *
 * Entries are split over numneglists separate lists, each of which is further
 * split into hot and cold entries.  Entries get promoted after getting a hit.
 * Eviction happens on addition of new entry.
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache negative entry statistics");

SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
    "Number of negative cache entries");

static COUNTER_U64_DEFINE_EARLY(neg_created);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
    "Number of created negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evicted);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
    "Number of evicted negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
    &neg_evict_skipped_empty,
    "Number of times evicting failed due to lack of entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
    &neg_evict_skipped_missed,
    "Number of times evicting failed due to target entry disappearing");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
    &neg_evict_skipped_contended,
    "Number of times evicting failed due to contention");

SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
    "Number of cache hits (negative)");

static int
sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
{
	int i, out;

	out = 0;
	for (i = 0; i < numneglists; i++)
		out += neglists[i].nl_hotnum;

	return (SYSCTL_OUT(req, &out, sizeof(out)));
}
SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
    "Number of hot negative entries");

static void
cache_neg_init(struct namecache *ncp)
{
	struct negstate *ns;

	ncp->nc_flag |= NCF_NEGATIVE;
	ns = NCP2NEGSTATE(ncp);
	ns->neg_flag = 0;
	ns->neg_hit = 0;
	counter_u64_add(neg_created, 1);
}

#define CACHE_NEG_PROMOTION_THRESH 2

static bool
cache_neg_hit_prep(struct namecache *ncp)
{
	struct negstate *ns;
	u_char n;

	ns = NCP2NEGSTATE(ncp);
	n = atomic_load_char(&ns->neg_hit);
	for (;;) {
		if (n >= CACHE_NEG_PROMOTION_THRESH)
			return (false);
		if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
			break;
	}
	return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
}
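/*
 * A worked example of the counter logic above, assuming a fresh entry
 * (neg_hit == 0): the first hit bumps the counter to 1 and returns false,
 * the second bumps it to 2 and returns true, i.e. only that hit attempts
 * promotion.  Once at the threshold all further hits take the early return,
 * and cache_neg_demote_locked() resetting the counter restarts the cycle.
 */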
/*
 * Nothing to do here but it is provided for completeness as some
 * cache_neg_hit_prep callers may end up returning without even
 * trying to promote.
 */
#define cache_neg_hit_abort(ncp)	do { } while (0)

static void
cache_neg_hit_finish(struct namecache *ncp)
{

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
	counter_u64_add(numneghits, 1);
}

/*
 * Move a negative entry to the hot list.
 */
static void
cache_neg_promote_locked(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	if ((ns->neg_flag & NEG_HOT) == 0) {
		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
		nl->nl_hotnum++;
		ns->neg_flag |= NEG_HOT;
	}
}

/*
 * Move a hot negative entry to the cold list.
 */
static void
cache_neg_demote_locked(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	ns = NCP2NEGSTATE(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	MPASS(ns->neg_flag & NEG_HOT);
	TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	nl->nl_hotnum--;
	ns->neg_flag &= ~NEG_HOT;
	atomic_store_char(&ns->neg_hit, 0);
}

/*
 * Move a negative entry to the hot list if it matches the lookup.
 *
 * We have to take locks, but they may be contended and in the worst
 * case we may need to go off CPU.  We don't want to spin within the
 * smr section and we can't block with it.  Exiting the section means
 * the found entry could have been evicted.  We are going to look it
 * up again.
 */
static bool
cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
    struct namecache *oncp, uint32_t hash)
{
	struct namecache *ncp;
	struct neglist *nl;
	u_char nc_flag;

	nl = NCP2NEGLIST(oncp);

	mtx_lock(&nl->nl_lock);
	/*
	 * For hash iteration.
	 */
	vfs_smr_enter();

	/*
	 * Avoid all surprises by only succeeding if we got the same entry and
	 * bailing completely otherwise.
	 * XXX There are no provisions to keep the vnode around, meaning we may
	 * end up promoting a negative entry for a *new* vnode and returning
	 * ENOENT on its account.  This is the error we want to return anyway
	 * and promotion is harmless.
	 *
	 * In particular at this point there can be a new ncp which matches the
	 * search but hashes to a different neglist.
	 */
	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp == oncp)
			break;
	}

	/*
	 * No match to begin with.
	 */
	if (__predict_false(ncp == NULL)) {
		goto out_abort;
	}

	/*
	 * The newly found entry may be something different...
	 */
	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
		goto out_abort;
	}

	/*
	 * ... and not even negative.
	 */
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_NEGATIVE) == 0) {
		goto out_abort;
	}

	if (!cache_ncp_canuse(ncp)) {
		goto out_abort;
	}

	cache_neg_promote_locked(ncp);
	cache_neg_hit_finish(ncp);
	vfs_smr_exit();
	mtx_unlock(&nl->nl_lock);
	return (true);
out_abort:
	vfs_smr_exit();
	mtx_unlock(&nl->nl_lock);
	return (false);
}

static void
cache_neg_promote(struct namecache *ncp)
{
	struct neglist *nl;

	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	cache_neg_promote_locked(ncp);
	mtx_unlock(&nl->nl_lock);
}

static void
cache_neg_insert(struct namecache *ncp)
{
	struct neglist *nl;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	mtx_lock(&nl->nl_lock);
	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
	mtx_unlock(&nl->nl_lock);
	atomic_add_long(&numneg, 1);
}

static void
cache_neg_remove(struct namecache *ncp)
{
	struct neglist *nl;
	struct negstate *ns;

	cache_assert_bucket_locked(ncp);
	nl = NCP2NEGLIST(ncp);
	ns = NCP2NEGSTATE(ncp);
	mtx_lock(&nl->nl_lock);
	if ((ns->neg_flag & NEG_HOT) != 0) {
		TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
		nl->nl_hotnum--;
	} else {
		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
	}
	mtx_unlock(&nl->nl_lock);
	atomic_subtract_long(&numneg, 1);
}

static struct neglist *
cache_neg_evict_select_list(void)
{
	struct neglist *nl;
	u_int c;

	c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
	nl = &neglists[c % numneglists];
	if (!mtx_trylock(&nl->nl_evict_lock)) {
		counter_u64_add(neg_evict_skipped_contended, 1);
		return (NULL);
	}
	return (nl);
}

static struct namecache *
cache_neg_evict_select_entry(struct neglist *nl)
{
	struct namecache *ncp, *lncp;
	struct negstate *ns, *lns;
	int i;

	mtx_assert(&nl->nl_evict_lock, MA_OWNED);
	mtx_assert(&nl->nl_lock, MA_OWNED);
	ncp = TAILQ_FIRST(&nl->nl_list);
	if (ncp == NULL)
		return (NULL);
	lncp = ncp;
	lns = NCP2NEGSTATE(lncp);
	for (i = 1; i < 4; i++) {
		ncp = TAILQ_NEXT(ncp, nc_dst);
		if (ncp == NULL)
			break;
		ns = NCP2NEGSTATE(ncp);
		if (ns->neg_hit < lns->neg_hit) {
			lncp = ncp;
			lns = ns;
		}
	}
	return (lncp);
}

static bool
cache_neg_evict(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *nl;
	struct vnode *dvp;
	struct mtx *dvlp;
	struct mtx *blp;
	uint32_t hash;
	u_char nlen;
	bool evicted;

	nl = cache_neg_evict_select_list();
	if (nl == NULL) {
		return (false);
	}

	mtx_lock(&nl->nl_lock);
	ncp = TAILQ_FIRST(&nl->nl_hotlist);
	if (ncp != NULL) {
		cache_neg_demote_locked(ncp);
	}
	ncp = cache_neg_evict_select_entry(nl);
	if (ncp == NULL) {
		counter_u64_add(neg_evict_skipped_empty, 1);
		mtx_unlock(&nl->nl_lock);
		mtx_unlock(&nl->nl_evict_lock);
		return (false);
	}
	nlen = ncp->nc_nlen;
	dvp = ncp->nc_dvp;
	hash = cache_get_hash(ncp->nc_name, nlen, dvp);
	dvlp = VP2VNODELOCK(dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_unlock(&nl->nl_lock);
	mtx_unlock(&nl->nl_evict_lock);
	mtx_lock(dvlp);
	mtx_lock(blp);
	/*
	 * Note that since all locks were dropped above, the entry may be
	 * gone or reallocated to be something else.
	 */
	CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
		if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
		    ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
			break;
	}
	if (ncp2 == NULL) {
		counter_u64_add(neg_evict_skipped_missed, 1);
		ncp = NULL;
		evicted = false;
	} else {
		MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
		MPASS(blp == NCP2BUCKETLOCK(ncp));
		SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(neg_evicted, 1);
		evicted = true;
	}
	mtx_unlock(blp);
	mtx_unlock(dvlp);
	if (ncp != NULL)
		cache_free(ncp);
	return (evicted);
}

/*
 * Maybe evict a negative entry to create more room.
 *
 * The ncnegfactor parameter limits what fraction of the total count
 * can consist of negative entries.  However, if the cache is just
 * warming up this leads to excessive evictions.  As such, ncnegminpct
 * (recomputed to neg_min) dictates whether the above should be
 * applied.
 *
 * Try evicting if the cache is close to full capacity regardless of
 * other considerations.
 */
static bool
cache_neg_evict_cond(u_long lnumcache)
{
	u_long lnumneg;

	if (ncsize - 1000 < lnumcache)
		goto out_evict;
	lnumneg = atomic_load_long(&numneg);
	if (lnumneg < neg_min)
		return (false);
	if (lnumneg * ncnegfactor < lnumcache)
		return (false);
out_evict:
	return (cache_neg_evict());
}
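/*
 * A worked example with the defaults (ncnegfactor = 5, ncnegminpct = 3) and a
 * hypothetical ncsize of 100000, giving neg_min = 3000: eviction is attempted
 * once the total entry count comes within 1000 of capacity (99001 or more),
 * or once there are at least 3000 negative entries and negatives make up at
 * least a fifth of the total.  Below either threshold the new entry is
 * admitted without evicting anything.
 */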
/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;
	struct vnode *dvp, *vp;

	dvp = ncp->nc_dvp;
	vp = ncp->nc_vp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(vp);
	cache_assert_vnode_locked(dvp);
	cache_assert_bucket_locked(ncp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
		TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
		if (ncp == vp->v_cache_dd) {
			atomic_store_ptr(&vp->v_cache_dd, NULL);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
		cache_neg_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == dvp->v_cache_dd) {
			atomic_store_ptr(&dvp->v_cache_dd, NULL);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
		}
	}
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct mtx *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

/*
 * If trylocking failed we can get here.  We know enough to take all needed
 * locks in the right order and re-lookup the entry.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct mtx *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_unlocked(ncp);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	mtx_lock(blp);
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_bucket_relock_success, 1);
		return (0);
	}

	mtx_unlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct mtx *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	mtx_unlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct mtx *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			atomic_store_ptr(&dvp->v_cache_dd, NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
		return (1);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (ncp == NULL) {
		mtx_unlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_bucket_fail++;
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
	cache_free(ncp);
	return (1);
out_no_entry:
	counter_u64_add(nummisszap, 1);
	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
	return (0);
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static int __noinline
cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct mtx *dvlp;
	enum vgetstate vs;
	int error, ltype;
	bool whiteout;

	MPASS((cnp->cn_flags & ISDOTDOT) != 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	counter_u64_add(dotdothits, 1);
retry:
	dvlp = VP2VNODELOCK(dvp);
	mtx_lock(dvlp);
	ncp = dvp->v_cache_dd;
	if (ncp == NULL) {
		SDT_PROBE2(vfs, namecache, lookup, miss, dvp, "..");
		mtx_unlock(dvlp);
		return (0);
	}
	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
		if (ncp->nc_flag & NCF_NEGATIVE)
			*vpp = NULL;
		else
			*vpp = ncp->nc_vp;
	} else
		*vpp = ncp->nc_dvp;
	if (*vpp == NULL)
		goto negative_success;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
	    NCF_DTS && tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		*tsp = ncp_ts->nc_dotdottime;
	}

	MPASS(dvp != *vpp);
	ltype = VOP_ISLOCKED(dvp);
	VOP_UNLOCK(dvp);
	vs = vget_prep(*vpp);
	mtx_unlock(dvlp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	vn_lock(dvp, ltype | LK_RETRY);
	if (VN_IS_DOOMED(dvp)) {
		if (error == 0)
			vput(*vpp);
		*vpp = NULL;
		return (ENOENT);
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			cache_zap_negative_locked_vnode_kl(ncp, dvp);
			mtx_unlock(dvlp);
			cache_free(ncp);
			return (0);
		}
	}

	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_out_ts(ncp, tsp, ticksp);
	if (cache_neg_hit_prep(ncp))
		cache_neg_promote(ncp);
	else
		cache_neg_hit_finish(ncp);
	mtx_unlock(dvlp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}
/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it will
 *		be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * Either both tsp and ticks have to be provided or neither of them.
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  vpp will not be modified.  If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
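/*
 * A minimal consumer sketch based on the contract above, roughly the shape
 * of a filesystem's lookup wrapper (cf. vfs_cache_lookup()); the
 * VOP_CACHEDLOOKUP fallback is illustrative and assumes the filesystem
 * provides one:
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	if (error == 0)			<- miss, consult the filesystem
 *		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 *	if (error == -1)		<- positive hit, *vpp locked + ref'd
 *		return (0);
 *	return (error);			<- ENOENT: negative hit or doomed dvp
 */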
static int __noinline
cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct mtx *blp;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout;

	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
	MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);

retry:
	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		mtx_unlock(blp);
		SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	vs = vget_prep(*vpp);
	mtx_unlock(blp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	/*
	 * We don't get here with regular lookup apart from corner cases.
	 */
	if (__predict_true(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
			if (__predict_false(error != 0)) {
				zap_bucket_fail2++;
				goto retry;
			}
			cache_free(ncp);
			return (0);
		}
	}

	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_out_ts(ncp, tsp, ticksp);
	if (cache_neg_hit_prep(ncp))
		cache_neg_promote(ncp);
	else
		cache_neg_hit_finish(ncp);
	mtx_unlock(blp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout, neg_promote;
	u_short nc_flag;

	MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
			return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
	}

	MPASS((cnp->cn_flags & ISDOTDOT) == 0);

	if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	vfs_smr_enter();

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		vfs_smr_exit();
		SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	nc_flag = atomic_load_char(&ncp->nc_flag);
	if (nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	if (!cache_ncp_canuse(ncp)) {
		vfs_smr_exit();
		*vpp = NULL;
		goto out_fallback;
	}
	vs = vget_prep_smr(*vpp);
	vfs_smr_exit();
	if (__predict_false(vs == VGET_NONE)) {
		*vpp = NULL;
		goto out_fallback;
	}
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto out_fallback;
	}
	return (-1);
negative_success:
	if (cnp->cn_nameiop == CREATE) {
		if (cnp->cn_flags & ISLASTCN) {
			vfs_smr_exit();
			goto out_fallback;
		}
	}

	cache_out_ts(ncp, tsp, ticksp);
	whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE);
	neg_promote = cache_neg_hit_prep(ncp);
	if (!cache_ncp_canuse(ncp)) {
		cache_neg_hit_abort(ncp);
		vfs_smr_exit();
		goto out_fallback;
	}
	if (neg_promote) {
		vfs_smr_exit();
		if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
			goto out_fallback;
	} else {
		cache_neg_hit_finish(ncp);
		vfs_smr_exit();
	}
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
out_fallback:
	return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
}
vpp, cnp, tsp, ticksp)); 2006 } 2007 2008 struct celockstate { 2009 struct mtx *vlp[3]; 2010 struct mtx *blp[2]; 2011 }; 2012 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 2013 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 2014 2015 static inline void 2016 cache_celockstate_init(struct celockstate *cel) 2017 { 2018 2019 bzero(cel, sizeof(*cel)); 2020 } 2021 2022 static void 2023 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 2024 struct vnode *dvp) 2025 { 2026 struct mtx *vlp1, *vlp2; 2027 2028 MPASS(cel->vlp[0] == NULL); 2029 MPASS(cel->vlp[1] == NULL); 2030 MPASS(cel->vlp[2] == NULL); 2031 2032 MPASS(vp != NULL || dvp != NULL); 2033 2034 vlp1 = VP2VNODELOCK(vp); 2035 vlp2 = VP2VNODELOCK(dvp); 2036 cache_sort_vnodes(&vlp1, &vlp2); 2037 2038 if (vlp1 != NULL) { 2039 mtx_lock(vlp1); 2040 cel->vlp[0] = vlp1; 2041 } 2042 mtx_lock(vlp2); 2043 cel->vlp[1] = vlp2; 2044 } 2045 2046 static void 2047 cache_unlock_vnodes_cel(struct celockstate *cel) 2048 { 2049 2050 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 2051 2052 if (cel->vlp[0] != NULL) 2053 mtx_unlock(cel->vlp[0]); 2054 if (cel->vlp[1] != NULL) 2055 mtx_unlock(cel->vlp[1]); 2056 if (cel->vlp[2] != NULL) 2057 mtx_unlock(cel->vlp[2]); 2058 } 2059 2060 static bool 2061 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 2062 { 2063 struct mtx *vlp; 2064 bool ret; 2065 2066 cache_assert_vlp_locked(cel->vlp[0]); 2067 cache_assert_vlp_locked(cel->vlp[1]); 2068 MPASS(cel->vlp[2] == NULL); 2069 2070 MPASS(vp != NULL); 2071 vlp = VP2VNODELOCK(vp); 2072 2073 ret = true; 2074 if (vlp >= cel->vlp[1]) { 2075 mtx_lock(vlp); 2076 } else { 2077 if (mtx_trylock(vlp)) 2078 goto out; 2079 cache_lock_vnodes_cel_3_failures++; 2080 cache_unlock_vnodes_cel(cel); 2081 if (vlp < cel->vlp[0]) { 2082 mtx_lock(vlp); 2083 mtx_lock(cel->vlp[0]); 2084 mtx_lock(cel->vlp[1]); 2085 } else { 2086 if (cel->vlp[0] != NULL) 2087 mtx_lock(cel->vlp[0]); 2088 mtx_lock(vlp); 2089 mtx_lock(cel->vlp[1]); 2090 } 2091 ret = false; 2092 } 2093 out: 2094 cel->vlp[2] = vlp; 2095 return (ret); 2096 } 2097 2098 static void 2099 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 2100 struct mtx *blp2) 2101 { 2102 2103 MPASS(cel->blp[0] == NULL); 2104 MPASS(cel->blp[1] == NULL); 2105 2106 cache_sort_vnodes(&blp1, &blp2); 2107 2108 if (blp1 != NULL) { 2109 mtx_lock(blp1); 2110 cel->blp[0] = blp1; 2111 } 2112 mtx_lock(blp2); 2113 cel->blp[1] = blp2; 2114 } 2115 2116 static void 2117 cache_unlock_buckets_cel(struct celockstate *cel) 2118 { 2119 2120 if (cel->blp[0] != NULL) 2121 mtx_unlock(cel->blp[0]); 2122 mtx_unlock(cel->blp[1]); 2123 } 2124 2125 /* 2126 * Lock part of the cache affected by the insertion. 2127 * 2128 * This means vnodelocks for dvp, vp and the relevant bucketlock. 2129 * However, insertion can result in removal of an old entry. In this 2130 * case we have an additional vnode and bucketlock pair to lock. 2131 * 2132 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 2133 * preserving the locking order (smaller address first). 
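 *
 * The order is established by sorting the lock pointers by address, as
 * cache_lock_vnodes_cel() does above. A minimal sketch, assuming both
 * vnodes are non-NULL:
 *
 *	vlp1 = VP2VNODELOCK(vp);
 *	vlp2 = VP2VNODELOCK(dvp);
 *	cache_sort_vnodes(&vlp1, &vlp2);
 *	mtx_lock(vlp1);		(lower address taken first)
 *	mtx_lock(vlp2);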
2134 */ 2135 static void 2136 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2137 uint32_t hash) 2138 { 2139 struct namecache *ncp; 2140 struct mtx *blps[2]; 2141 u_char nc_flag; 2142 2143 blps[0] = HASH2BUCKETLOCK(hash); 2144 for (;;) { 2145 blps[1] = NULL; 2146 cache_lock_vnodes_cel(cel, dvp, vp); 2147 if (vp == NULL || vp->v_type != VDIR) 2148 break; 2149 ncp = atomic_load_consume_ptr(&vp->v_cache_dd); 2150 if (ncp == NULL) 2151 break; 2152 nc_flag = atomic_load_char(&ncp->nc_flag); 2153 if ((nc_flag & NCF_ISDOTDOT) == 0) 2154 break; 2155 MPASS(ncp->nc_dvp == vp); 2156 blps[1] = NCP2BUCKETLOCK(ncp); 2157 if ((nc_flag & NCF_NEGATIVE) != 0) 2158 break; 2159 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2160 break; 2161 /* 2162 * All vnodes got re-locked. Re-validate the state and if 2163 * nothing changed we are done. Otherwise restart. 2164 */ 2165 if (ncp == vp->v_cache_dd && 2166 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2167 blps[1] == NCP2BUCKETLOCK(ncp) && 2168 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2169 break; 2170 cache_unlock_vnodes_cel(cel); 2171 cel->vlp[0] = NULL; 2172 cel->vlp[1] = NULL; 2173 cel->vlp[2] = NULL; 2174 } 2175 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2176 } 2177 2178 static void 2179 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2180 uint32_t hash) 2181 { 2182 struct namecache *ncp; 2183 struct mtx *blps[2]; 2184 u_char nc_flag; 2185 2186 blps[0] = HASH2BUCKETLOCK(hash); 2187 for (;;) { 2188 blps[1] = NULL; 2189 cache_lock_vnodes_cel(cel, dvp, vp); 2190 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); 2191 if (ncp == NULL) 2192 break; 2193 nc_flag = atomic_load_char(&ncp->nc_flag); 2194 if ((nc_flag & NCF_ISDOTDOT) == 0) 2195 break; 2196 MPASS(ncp->nc_dvp == dvp); 2197 blps[1] = NCP2BUCKETLOCK(ncp); 2198 if ((nc_flag & NCF_NEGATIVE) != 0) 2199 break; 2200 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2201 break; 2202 if (ncp == dvp->v_cache_dd && 2203 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2204 blps[1] == NCP2BUCKETLOCK(ncp) && 2205 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2206 break; 2207 cache_unlock_vnodes_cel(cel); 2208 cel->vlp[0] = NULL; 2209 cel->vlp[1] = NULL; 2210 cel->vlp[2] = NULL; 2211 } 2212 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2213 } 2214 2215 static void 2216 cache_enter_unlock(struct celockstate *cel) 2217 { 2218 2219 cache_unlock_buckets_cel(cel); 2220 cache_unlock_vnodes_cel(cel); 2221 } 2222 2223 static void __noinline 2224 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 2225 struct componentname *cnp) 2226 { 2227 struct celockstate cel; 2228 struct namecache *ncp; 2229 uint32_t hash; 2230 int len; 2231 2232 if (atomic_load_ptr(&dvp->v_cache_dd) == NULL) 2233 return; 2234 len = cnp->cn_namelen; 2235 cache_celockstate_init(&cel); 2236 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2237 cache_enter_lock_dd(&cel, dvp, vp, hash); 2238 ncp = dvp->v_cache_dd; 2239 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 2240 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 2241 cache_zap_locked(ncp); 2242 } else { 2243 ncp = NULL; 2244 } 2245 atomic_store_ptr(&dvp->v_cache_dd, NULL); 2246 cache_enter_unlock(&cel); 2247 if (ncp != NULL) 2248 cache_free(ncp); 2249 } 2250 2251 /* 2252 * Add an entry to the cache. 
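 *
 * Filesystems normally do not call this directly; they go through the
 * cache_enter() wrapper once a lookup succeeded and MAKEENTRY was requested.
 * A minimal sketch of the usual pattern in a VOP_CACHEDLOOKUP routine
 * (locking and error handling omitted):
 *
 *	if ((cnp->cn_flags & MAKEENTRY) != 0)
 *		cache_enter(dvp, *vpp, cnp);
 *
 * Passing a NULL vp creates a negative entry instead.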
2253 */ 2254 void 2255 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2256 struct timespec *tsp, struct timespec *dtsp) 2257 { 2258 struct celockstate cel; 2259 struct namecache *ncp, *n2, *ndd; 2260 struct namecache_ts *ncp_ts; 2261 struct nchashhead *ncpp; 2262 uint32_t hash; 2263 int flag; 2264 int len; 2265 2266 KASSERT(cnp->cn_namelen <= NAME_MAX, 2267 ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen, 2268 NAME_MAX)); 2269 VNPASS(dvp != vp, dvp); 2270 VNPASS(!VN_IS_DOOMED(dvp), dvp); 2271 VNPASS(dvp->v_type != VNON, dvp); 2272 if (vp != NULL) { 2273 VNPASS(!VN_IS_DOOMED(vp), vp); 2274 VNPASS(vp->v_type != VNON, vp); 2275 } 2276 2277 #ifdef DEBUG_CACHE 2278 if (__predict_false(!doingcache)) 2279 return; 2280 #endif 2281 2282 flag = 0; 2283 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2284 if (cnp->cn_namelen == 1) 2285 return; 2286 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 2287 cache_enter_dotdot_prep(dvp, vp, cnp); 2288 flag = NCF_ISDOTDOT; 2289 } 2290 } 2291 2292 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 2293 if (ncp == NULL) 2294 return; 2295 2296 cache_celockstate_init(&cel); 2297 ndd = NULL; 2298 ncp_ts = NULL; 2299 2300 /* 2301 * Calculate the hash key and setup as much of the new 2302 * namecache entry as possible before acquiring the lock. 2303 */ 2304 ncp->nc_flag = flag | NCF_WIP; 2305 ncp->nc_vp = vp; 2306 if (vp == NULL) 2307 cache_neg_init(ncp); 2308 ncp->nc_dvp = dvp; 2309 if (tsp != NULL) { 2310 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 2311 ncp_ts->nc_time = *tsp; 2312 ncp_ts->nc_ticks = ticks; 2313 ncp_ts->nc_nc.nc_flag |= NCF_TS; 2314 if (dtsp != NULL) { 2315 ncp_ts->nc_dotdottime = *dtsp; 2316 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 2317 } 2318 } 2319 len = ncp->nc_nlen = cnp->cn_namelen; 2320 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2321 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 2322 ncp->nc_name[len] = '\0'; 2323 cache_enter_lock(&cel, dvp, vp, hash); 2324 2325 /* 2326 * See if this vnode or negative entry is already in the cache 2327 * with this name. This can happen with concurrent lookups of 2328 * the same path name. 2329 */ 2330 ncpp = NCHHASH(hash); 2331 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 2332 if (n2->nc_dvp == dvp && 2333 n2->nc_nlen == cnp->cn_namelen && 2334 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 2335 MPASS(cache_ncp_canuse(n2)); 2336 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 2337 KASSERT(vp == NULL, 2338 ("%s: found entry pointing to a different vnode (%p != %p)", 2339 __func__, NULL, vp)); 2340 else 2341 KASSERT(n2->nc_vp == vp, 2342 ("%s: found entry pointing to a different vnode (%p != %p)", 2343 __func__, n2->nc_vp, vp)); 2344 /* 2345 * Entries are supposed to be immutable unless in the 2346 * process of getting destroyed. Accommodating for 2347 * changing timestamps is possible but not worth it. 2348 * This should be harmless in terms of correctness, in 2349 * the worst case resulting in an earlier expiration. 2350 * Alternatively, the found entry can be replaced 2351 * altogether. 
2352 */ 2353 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 2354 #if 0 2355 if (tsp != NULL) { 2356 KASSERT((n2->nc_flag & NCF_TS) != 0, 2357 ("no NCF_TS")); 2358 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 2359 n2_ts->nc_time = ncp_ts->nc_time; 2360 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2361 if (dtsp != NULL) { 2362 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2363 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2364 } 2365 } 2366 #endif 2367 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, 2368 vp); 2369 goto out_unlock_free; 2370 } 2371 } 2372 2373 if (flag == NCF_ISDOTDOT) { 2374 /* 2375 * See if we are trying to add .. entry, but some other lookup 2376 * has populated v_cache_dd pointer already. 2377 */ 2378 if (dvp->v_cache_dd != NULL) 2379 goto out_unlock_free; 2380 KASSERT(vp == NULL || vp->v_type == VDIR, 2381 ("wrong vnode type %p", vp)); 2382 atomic_thread_fence_rel(); 2383 atomic_store_ptr(&dvp->v_cache_dd, ncp); 2384 } 2385 2386 if (vp != NULL) { 2387 if (flag != NCF_ISDOTDOT) { 2388 /* 2389 * For this case, the cache entry maps both the 2390 * directory name in it and the name ".." for the 2391 * directory's parent. 2392 */ 2393 if ((ndd = vp->v_cache_dd) != NULL) { 2394 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2395 cache_zap_locked(ndd); 2396 else 2397 ndd = NULL; 2398 } 2399 atomic_thread_fence_rel(); 2400 atomic_store_ptr(&vp->v_cache_dd, ncp); 2401 } else if (vp->v_type != VDIR) { 2402 if (vp->v_cache_dd != NULL) { 2403 atomic_store_ptr(&vp->v_cache_dd, NULL); 2404 } 2405 } 2406 } 2407 2408 if (flag != NCF_ISDOTDOT) { 2409 if (LIST_EMPTY(&dvp->v_cache_src)) { 2410 cache_hold_vnode(dvp); 2411 } 2412 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2413 } 2414 2415 /* 2416 * If the entry is "negative", we place it into the 2417 * "negative" cache queue, otherwise, we place it into the 2418 * destination vnode's cache entries queue. 2419 */ 2420 if (vp != NULL) { 2421 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2422 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2423 vp); 2424 } else { 2425 if (cnp->cn_flags & ISWHITEOUT) 2426 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE); 2427 cache_neg_insert(ncp); 2428 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2429 ncp->nc_name); 2430 } 2431 2432 /* 2433 * Insert the new namecache entry into the appropriate chain 2434 * within the cache entries table. 2435 */ 2436 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2437 2438 atomic_thread_fence_rel(); 2439 /* 2440 * Mark the entry as fully constructed. 2441 * It is immutable past this point until its removal. 
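 *
 * The release fence above pairs with the lockless consumers: an SMR-based
 * lookup first finds the entry on its hash chain and only then calls
 * cache_ncp_canuse(), which rejects entries still marked NCF_WIP. A reader
 * which observes the flag cleared is thus guaranteed to also observe the
 * fully initialized entry.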
2442 */ 2443 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2444 2445 cache_enter_unlock(&cel); 2446 if (ndd != NULL) 2447 cache_free(ndd); 2448 return; 2449 out_unlock_free: 2450 cache_enter_unlock(&cel); 2451 cache_free(ncp); 2452 return; 2453 } 2454 2455 static u_int 2456 cache_roundup_2(u_int val) 2457 { 2458 u_int res; 2459 2460 for (res = 1; res <= val; res <<= 1) 2461 continue; 2462 2463 return (res); 2464 } 2465 2466 static struct nchashhead * 2467 nchinittbl(u_long elements, u_long *hashmask) 2468 { 2469 struct nchashhead *hashtbl; 2470 u_long hashsize, i; 2471 2472 hashsize = cache_roundup_2(elements) / 2; 2473 2474 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2475 for (i = 0; i < hashsize; i++) 2476 CK_SLIST_INIT(&hashtbl[i]); 2477 *hashmask = hashsize - 1; 2478 return (hashtbl); 2479 } 2480 2481 static void 2482 ncfreetbl(struct nchashhead *hashtbl) 2483 { 2484 2485 free(hashtbl, M_VFSCACHE); 2486 } 2487 2488 /* 2489 * Name cache initialization, from vfs_init() when we are booting 2490 */ 2491 static void 2492 nchinit(void *dummy __unused) 2493 { 2494 u_int i; 2495 2496 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2497 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2498 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2499 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2500 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2501 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2502 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2503 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2504 2505 VFS_SMR_ZONE_SET(cache_zone_small); 2506 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2507 VFS_SMR_ZONE_SET(cache_zone_large); 2508 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2509 2510 ncsize = desiredvnodes * ncsizefactor; 2511 cache_recalc_neg_min(ncnegminpct); 2512 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2513 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2514 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2515 ncbuckethash = 7; 2516 if (ncbuckethash > nchash) 2517 ncbuckethash = nchash; 2518 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2519 M_WAITOK | M_ZERO); 2520 for (i = 0; i < numbucketlocks; i++) 2521 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2522 ncvnodehash = ncbuckethash; 2523 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2524 M_WAITOK | M_ZERO); 2525 for (i = 0; i < numvnodelocks; i++) 2526 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2527 2528 for (i = 0; i < numneglists; i++) { 2529 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); 2530 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2531 TAILQ_INIT(&neglists[i].nl_list); 2532 TAILQ_INIT(&neglists[i].nl_hotlist); 2533 } 2534 } 2535 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2536 2537 void 2538 cache_vnode_init(struct vnode *vp) 2539 { 2540 2541 LIST_INIT(&vp->v_cache_src); 2542 TAILQ_INIT(&vp->v_cache_dst); 2543 vp->v_cache_dd = NULL; 2544 cache_prehash(vp); 2545 } 2546 2547 void 2548 cache_changesize(u_long newmaxvnodes) 2549 { 2550 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2551 u_long new_nchash, old_nchash; 2552 struct namecache *ncp; 2553 uint32_t hash; 2554 u_long newncsize; 2555 int i; 2556 2557 newncsize = newmaxvnodes * ncsizefactor; 2558 
newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2559 if (newmaxvnodes < numbucketlocks) 2560 newmaxvnodes = numbucketlocks; 2561 2562 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2563 /* If same hash table size, nothing to do */ 2564 if (nchash == new_nchash) { 2565 ncfreetbl(new_nchashtbl); 2566 return; 2567 } 2568 /* 2569 * Move everything from the old hash table to the new table. 2570 * None of the namecache entries in the table can be removed 2571 * because to do so, they have to be removed from the hash table. 2572 */ 2573 cache_lock_all_vnodes(); 2574 cache_lock_all_buckets(); 2575 old_nchashtbl = nchashtbl; 2576 old_nchash = nchash; 2577 nchashtbl = new_nchashtbl; 2578 nchash = new_nchash; 2579 for (i = 0; i <= old_nchash; i++) { 2580 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2581 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2582 ncp->nc_dvp); 2583 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2584 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2585 } 2586 } 2587 ncsize = newncsize; 2588 cache_recalc_neg_min(ncnegminpct); 2589 cache_unlock_all_buckets(); 2590 cache_unlock_all_vnodes(); 2591 ncfreetbl(old_nchashtbl); 2592 } 2593 2594 /* 2595 * Remove all entries from and to a particular vnode. 2596 */ 2597 static void 2598 cache_purge_impl(struct vnode *vp) 2599 { 2600 struct cache_freebatch batch; 2601 struct namecache *ncp; 2602 struct mtx *vlp, *vlp2; 2603 2604 TAILQ_INIT(&batch); 2605 vlp = VP2VNODELOCK(vp); 2606 vlp2 = NULL; 2607 mtx_lock(vlp); 2608 retry: 2609 while (!LIST_EMPTY(&vp->v_cache_src)) { 2610 ncp = LIST_FIRST(&vp->v_cache_src); 2611 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2612 goto retry; 2613 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2614 } 2615 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2616 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2617 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2618 goto retry; 2619 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2620 } 2621 ncp = vp->v_cache_dd; 2622 if (ncp != NULL) { 2623 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2624 ("lost dotdot link")); 2625 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2626 goto retry; 2627 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2628 } 2629 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2630 mtx_unlock(vlp); 2631 if (vlp2 != NULL) 2632 mtx_unlock(vlp2); 2633 cache_free_batch(&batch); 2634 } 2635 2636 /* 2637 * Opportunistic check to see if there is anything to do. 2638 */ 2639 static bool 2640 cache_has_entries(struct vnode *vp) 2641 { 2642 2643 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2644 atomic_load_ptr(&vp->v_cache_dd) == NULL) 2645 return (false); 2646 return (true); 2647 } 2648 2649 void 2650 cache_purge(struct vnode *vp) 2651 { 2652 2653 SDT_PROBE1(vfs, namecache, purge, done, vp); 2654 if (!cache_has_entries(vp)) 2655 return; 2656 cache_purge_impl(vp); 2657 } 2658 2659 /* 2660 * Only to be used by vgone. 2661 */ 2662 void 2663 cache_purge_vgone(struct vnode *vp) 2664 { 2665 struct mtx *vlp; 2666 2667 VNPASS(VN_IS_DOOMED(vp), vp); 2668 if (cache_has_entries(vp)) { 2669 cache_purge_impl(vp); 2670 return; 2671 } 2672 2673 /* 2674 * Serialize against a potential thread doing cache_purge. 2675 */ 2676 vlp = VP2VNODELOCK(vp); 2677 mtx_wait_unlocked(vlp); 2678 if (cache_has_entries(vp)) { 2679 cache_purge_impl(vp); 2680 return; 2681 } 2682 return; 2683 } 2684 2685 /* 2686 * Remove all negative entries for a particular directory vnode. 
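 *
 * Typically used by filesystems after a directory-modifying operation
 * (e.g. a rename into the directory) which may have rendered previously
 * cached negative entries stale.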
2687 */ 2688 void 2689 cache_purge_negative(struct vnode *vp) 2690 { 2691 struct cache_freebatch batch; 2692 struct namecache *ncp, *nnp; 2693 struct mtx *vlp; 2694 2695 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2696 if (LIST_EMPTY(&vp->v_cache_src)) 2697 return; 2698 TAILQ_INIT(&batch); 2699 vlp = VP2VNODELOCK(vp); 2700 mtx_lock(vlp); 2701 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2702 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2703 continue; 2704 cache_zap_negative_locked_vnode_kl(ncp, vp); 2705 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2706 } 2707 mtx_unlock(vlp); 2708 cache_free_batch(&batch); 2709 } 2710 2711 /* 2712 * Entry points for modifying VOP operations. 2713 */ 2714 void 2715 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2716 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2717 { 2718 2719 ASSERT_VOP_IN_SEQC(fdvp); 2720 ASSERT_VOP_IN_SEQC(fvp); 2721 ASSERT_VOP_IN_SEQC(tdvp); 2722 if (tvp != NULL) 2723 ASSERT_VOP_IN_SEQC(tvp); 2724 2725 cache_purge(fvp); 2726 if (tvp != NULL) { 2727 cache_purge(tvp); 2728 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2729 ("%s: lingering negative entry", __func__)); 2730 } else { 2731 cache_remove_cnp(tdvp, tcnp); 2732 } 2733 2734 /* 2735 * TODO 2736 * 2737 * Historically renaming was always purging all revelang entries, 2738 * but that's quite wasteful. In particular turns out that in many cases 2739 * the target file is immediately accessed after rename, inducing a cache 2740 * miss. 2741 * 2742 * Recode this to reduce relocking and reuse the existing entry (if any) 2743 * instead of just removing it above and allocating a new one here. 2744 */ 2745 if (cache_rename_add) { 2746 cache_enter(tdvp, fvp, tcnp); 2747 } 2748 } 2749 2750 void 2751 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) 2752 { 2753 2754 ASSERT_VOP_IN_SEQC(dvp); 2755 ASSERT_VOP_IN_SEQC(vp); 2756 cache_purge(vp); 2757 } 2758 2759 #ifdef INVARIANTS 2760 /* 2761 * Validate that if an entry exists it matches. 2762 */ 2763 void 2764 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) 2765 { 2766 struct namecache *ncp; 2767 struct mtx *blp; 2768 uint32_t hash; 2769 2770 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2771 if (CK_SLIST_EMPTY(NCHHASH(hash))) 2772 return; 2773 blp = HASH2BUCKETLOCK(hash); 2774 mtx_lock(blp); 2775 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 2776 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 2777 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) { 2778 if (ncp->nc_vp != vp) 2779 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n", 2780 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp); 2781 } 2782 } 2783 mtx_unlock(blp); 2784 } 2785 #endif 2786 2787 /* 2788 * Flush all entries referencing a particular filesystem. 2789 */ 2790 void 2791 cache_purgevfs(struct mount *mp) 2792 { 2793 struct vnode *vp, *mvp; 2794 2795 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2796 /* 2797 * Somewhat wasteful iteration over all vnodes. Would be better to 2798 * support filtering and avoid the interlock to begin with. 2799 */ 2800 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2801 if (!cache_has_entries(vp)) { 2802 VI_UNLOCK(vp); 2803 continue; 2804 } 2805 vholdl(vp); 2806 VI_UNLOCK(vp); 2807 cache_purge(vp); 2808 vdrop(vp); 2809 } 2810 } 2811 2812 /* 2813 * Perform canonical checks and cache lookup and pass on to filesystem 2814 * through the vop_cachedlookup only if needed. 
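 *
 * Filesystems opt in by routing VOP_LOOKUP here and supplying their real
 * lookup routine as VOP_CACHEDLOOKUP. A minimal sketch of such a vop
 * vector ("foofs" is purely illustrative):
 *
 *	struct vop_vector foofs_vnodeops = {
 *		...
 *		.vop_lookup =		vfs_cache_lookup,
 *		.vop_cachedlookup =	foofs_lookup,
 *	};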
2815 */ 2816 2817 int 2818 vfs_cache_lookup(struct vop_lookup_args *ap) 2819 { 2820 struct vnode *dvp; 2821 int error; 2822 struct vnode **vpp = ap->a_vpp; 2823 struct componentname *cnp = ap->a_cnp; 2824 int flags = cnp->cn_flags; 2825 2826 *vpp = NULL; 2827 dvp = ap->a_dvp; 2828 2829 if (dvp->v_type != VDIR) 2830 return (ENOTDIR); 2831 2832 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2833 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2834 return (EROFS); 2835 2836 error = vn_dir_check_exec(dvp, cnp); 2837 if (error != 0) 2838 return (error); 2839 2840 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2841 if (error == 0) 2842 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2843 if (error == -1) 2844 return (0); 2845 return (error); 2846 } 2847 2848 /* Implementation of the getcwd syscall. */ 2849 int 2850 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2851 { 2852 char *buf, *retbuf; 2853 size_t buflen; 2854 int error; 2855 2856 buflen = uap->buflen; 2857 if (__predict_false(buflen < 2)) 2858 return (EINVAL); 2859 if (buflen > MAXPATHLEN) 2860 buflen = MAXPATHLEN; 2861 2862 buf = uma_zalloc(namei_zone, M_WAITOK); 2863 error = vn_getcwd(buf, &retbuf, &buflen); 2864 if (error == 0) 2865 error = copyout(retbuf, uap->buf, buflen); 2866 uma_zfree(namei_zone, buf); 2867 return (error); 2868 } 2869 2870 int 2871 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2872 { 2873 struct pwd *pwd; 2874 int error; 2875 2876 vfs_smr_enter(); 2877 pwd = pwd_get_smr(); 2878 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2879 buflen, 0); 2880 VFS_SMR_ASSERT_NOT_ENTERED(); 2881 if (error < 0) { 2882 pwd = pwd_hold(curthread); 2883 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2884 retbuf, buflen); 2885 pwd_drop(pwd); 2886 } 2887 2888 #ifdef KTRACE 2889 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2890 ktrnamei(*retbuf); 2891 #endif 2892 return (error); 2893 } 2894 2895 static int 2896 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2897 size_t size, int flags, enum uio_seg pathseg) 2898 { 2899 struct nameidata nd; 2900 char *retbuf, *freebuf; 2901 int error; 2902 2903 if (flags != 0) 2904 return (EINVAL); 2905 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2906 pathseg, path, fd, &cap_fstat_rights, td); 2907 if ((error = namei(&nd)) != 0) 2908 return (error); 2909 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2910 if (error == 0) { 2911 error = copyout(retbuf, buf, size); 2912 free(freebuf, M_TEMP); 2913 } 2914 NDFREE(&nd, 0); 2915 return (error); 2916 } 2917 2918 int 2919 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2920 { 2921 2922 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2923 uap->flags, UIO_USERSPACE)); 2924 } 2925 2926 /* 2927 * Retrieve the full filesystem path that correspond to a vnode from the name 2928 * cache (if available) 2929 */ 2930 int 2931 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2932 { 2933 struct pwd *pwd; 2934 char *buf; 2935 size_t buflen; 2936 int error; 2937 2938 if (__predict_false(vp == NULL)) 2939 return (EINVAL); 2940 2941 buflen = MAXPATHLEN; 2942 buf = malloc(buflen, M_TEMP, M_WAITOK); 2943 vfs_smr_enter(); 2944 pwd = pwd_get_smr(); 2945 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); 2946 VFS_SMR_ASSERT_NOT_ENTERED(); 2947 if (error < 0) { 2948 pwd = pwd_hold(curthread); 2949 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, 
&buflen); 2950 pwd_drop(pwd); 2951 } 2952 if (error == 0) 2953 *freebuf = buf; 2954 else 2955 free(buf, M_TEMP); 2956 return (error); 2957 } 2958 2959 /* 2960 * This function is similar to vn_fullpath, but it attempts to lookup the 2961 * pathname relative to the global root mount point. This is required for the 2962 * auditing sub-system, as audited pathnames must be absolute, relative to the 2963 * global root mount point. 2964 */ 2965 int 2966 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2967 { 2968 char *buf; 2969 size_t buflen; 2970 int error; 2971 2972 if (__predict_false(vp == NULL)) 2973 return (EINVAL); 2974 buflen = MAXPATHLEN; 2975 buf = malloc(buflen, M_TEMP, M_WAITOK); 2976 vfs_smr_enter(); 2977 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); 2978 VFS_SMR_ASSERT_NOT_ENTERED(); 2979 if (error < 0) { 2980 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2981 } 2982 if (error == 0) 2983 *freebuf = buf; 2984 else 2985 free(buf, M_TEMP); 2986 return (error); 2987 } 2988 2989 static struct namecache * 2990 vn_dd_from_dst(struct vnode *vp) 2991 { 2992 struct namecache *ncp; 2993 2994 cache_assert_vnode_locked(vp); 2995 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2996 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2997 return (ncp); 2998 } 2999 return (NULL); 3000 } 3001 3002 int 3003 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) 3004 { 3005 struct vnode *dvp; 3006 struct namecache *ncp; 3007 struct mtx *vlp; 3008 int error; 3009 3010 vlp = VP2VNODELOCK(*vp); 3011 mtx_lock(vlp); 3012 ncp = (*vp)->v_cache_dd; 3013 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 3014 KASSERT(ncp == vn_dd_from_dst(*vp), 3015 ("%s: mismatch for dd entry (%p != %p)", __func__, 3016 ncp, vn_dd_from_dst(*vp))); 3017 } else { 3018 ncp = vn_dd_from_dst(*vp); 3019 } 3020 if (ncp != NULL) { 3021 if (*buflen < ncp->nc_nlen) { 3022 mtx_unlock(vlp); 3023 vrele(*vp); 3024 counter_u64_add(numfullpathfail4, 1); 3025 error = ENOMEM; 3026 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3027 vp, NULL); 3028 return (error); 3029 } 3030 *buflen -= ncp->nc_nlen; 3031 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3032 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 3033 ncp->nc_name, vp); 3034 dvp = *vp; 3035 *vp = ncp->nc_dvp; 3036 vref(*vp); 3037 mtx_unlock(vlp); 3038 vrele(dvp); 3039 return (0); 3040 } 3041 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 3042 3043 mtx_unlock(vlp); 3044 vn_lock(*vp, LK_SHARED | LK_RETRY); 3045 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); 3046 vput(*vp); 3047 if (error) { 3048 counter_u64_add(numfullpathfail2, 1); 3049 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3050 return (error); 3051 } 3052 3053 *vp = dvp; 3054 if (VN_IS_DOOMED(dvp)) { 3055 /* forced unmount */ 3056 vrele(dvp); 3057 error = ENOENT; 3058 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3059 return (error); 3060 } 3061 /* 3062 * *vp has its use count incremented still. 3063 */ 3064 3065 return (0); 3066 } 3067 3068 /* 3069 * Resolve a directory to a pathname. 3070 * 3071 * The name of the directory can always be found in the namecache or fetched 3072 * from the filesystem. There is also guaranteed to be only one parent, meaning 3073 * we can just follow vnodes up until we find the root. 3074 * 3075 * The vnode must be referenced. 
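 *
 * The buffer is filled backwards, one component at a time, so for a
 * directory reachable as "/usr/src" the layout on return is roughly:
 *
 *	buf: [ <unused>              /usr/src\0 ]
 *	                             ^ *retbuf
 *
 * with *len updated to reflect the number of bytes actually consumed.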
3076 */ 3077 static int 3078 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3079 size_t *len, size_t addend) 3080 { 3081 #ifdef KDTRACE_HOOKS 3082 struct vnode *startvp = vp; 3083 #endif 3084 struct vnode *vp1; 3085 size_t buflen; 3086 int error; 3087 bool slash_prefixed; 3088 3089 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 3090 VNPASS(vp->v_usecount > 0, vp); 3091 3092 buflen = *len; 3093 3094 slash_prefixed = true; 3095 if (addend == 0) { 3096 MPASS(*len >= 2); 3097 buflen--; 3098 buf[buflen] = '\0'; 3099 slash_prefixed = false; 3100 } 3101 3102 error = 0; 3103 3104 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 3105 counter_u64_add(numfullpathcalls, 1); 3106 while (vp != rdir && vp != rootvnode) { 3107 /* 3108 * The vp vnode must be already fully constructed, 3109 * since it is either found in namecache or obtained 3110 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 3111 * without obtaining the vnode lock. 3112 */ 3113 if ((vp->v_vflag & VV_ROOT) != 0) { 3114 vn_lock(vp, LK_RETRY | LK_SHARED); 3115 3116 /* 3117 * With the vnode locked, check for races with 3118 * unmount, forced or not. Note that we 3119 * already verified that vp is not equal to 3120 * the root vnode, which means that 3121 * mnt_vnodecovered can be NULL only for the 3122 * case of unmount. 3123 */ 3124 if (VN_IS_DOOMED(vp) || 3125 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 3126 vp1->v_mountedhere != vp->v_mount) { 3127 vput(vp); 3128 error = ENOENT; 3129 SDT_PROBE3(vfs, namecache, fullpath, return, 3130 error, vp, NULL); 3131 break; 3132 } 3133 3134 vref(vp1); 3135 vput(vp); 3136 vp = vp1; 3137 continue; 3138 } 3139 if (vp->v_type != VDIR) { 3140 vrele(vp); 3141 counter_u64_add(numfullpathfail1, 1); 3142 error = ENOTDIR; 3143 SDT_PROBE3(vfs, namecache, fullpath, return, 3144 error, vp, NULL); 3145 break; 3146 } 3147 error = vn_vptocnp(&vp, buf, &buflen); 3148 if (error) 3149 break; 3150 if (buflen == 0) { 3151 vrele(vp); 3152 error = ENOMEM; 3153 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3154 startvp, NULL); 3155 break; 3156 } 3157 buf[--buflen] = '/'; 3158 slash_prefixed = true; 3159 } 3160 if (error) 3161 return (error); 3162 if (!slash_prefixed) { 3163 if (buflen == 0) { 3164 vrele(vp); 3165 counter_u64_add(numfullpathfail4, 1); 3166 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 3167 startvp, NULL); 3168 return (ENOMEM); 3169 } 3170 buf[--buflen] = '/'; 3171 } 3172 counter_u64_add(numfullpathfound, 1); 3173 vrele(vp); 3174 3175 *retbuf = buf + buflen; 3176 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 3177 *len -= buflen; 3178 *len += addend; 3179 return (0); 3180 } 3181 3182 /* 3183 * Resolve an arbitrary vnode to a pathname. 
3184 * 3185 * Note 2 caveats: 3186 * - hardlinks are not tracked, thus if the vnode is not a directory this can 3187 * resolve to a different path than the one used to find it 3188 * - namecache is not mandatory, meaning names are not guaranteed to be added 3189 * (in which case resolving fails) 3190 */ 3191 static void __inline 3192 cache_rev_failed_impl(int *reason, int line) 3193 { 3194 3195 *reason = line; 3196 } 3197 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 3198 3199 static int 3200 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 3201 char **retbuf, size_t *buflen, size_t addend) 3202 { 3203 #ifdef KDTRACE_HOOKS 3204 struct vnode *startvp = vp; 3205 #endif 3206 struct vnode *tvp; 3207 struct mount *mp; 3208 struct namecache *ncp; 3209 size_t orig_buflen; 3210 int reason; 3211 int error; 3212 #ifdef KDTRACE_HOOKS 3213 int i; 3214 #endif 3215 seqc_t vp_seqc, tvp_seqc; 3216 u_char nc_flag; 3217 3218 VFS_SMR_ASSERT_ENTERED(); 3219 3220 if (!cache_fast_revlookup) { 3221 vfs_smr_exit(); 3222 return (-1); 3223 } 3224 3225 orig_buflen = *buflen; 3226 3227 if (addend == 0) { 3228 MPASS(*buflen >= 2); 3229 *buflen -= 1; 3230 buf[*buflen] = '\0'; 3231 } 3232 3233 if (vp == rdir || vp == rootvnode) { 3234 if (addend == 0) { 3235 *buflen -= 1; 3236 buf[*buflen] = '/'; 3237 } 3238 goto out_ok; 3239 } 3240 3241 #ifdef KDTRACE_HOOKS 3242 i = 0; 3243 #endif 3244 error = -1; 3245 ncp = NULL; /* for sdt probe down below */ 3246 vp_seqc = vn_seqc_read_any(vp); 3247 if (seqc_in_modify(vp_seqc)) { 3248 cache_rev_failed(&reason); 3249 goto out_abort; 3250 } 3251 3252 for (;;) { 3253 #ifdef KDTRACE_HOOKS 3254 i++; 3255 #endif 3256 if ((vp->v_vflag & VV_ROOT) != 0) { 3257 mp = atomic_load_ptr(&vp->v_mount); 3258 if (mp == NULL) { 3259 cache_rev_failed(&reason); 3260 goto out_abort; 3261 } 3262 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 3263 tvp_seqc = vn_seqc_read_any(tvp); 3264 if (seqc_in_modify(tvp_seqc)) { 3265 cache_rev_failed(&reason); 3266 goto out_abort; 3267 } 3268 if (!vn_seqc_consistent(vp, vp_seqc)) { 3269 cache_rev_failed(&reason); 3270 goto out_abort; 3271 } 3272 vp = tvp; 3273 vp_seqc = tvp_seqc; 3274 continue; 3275 } 3276 ncp = atomic_load_consume_ptr(&vp->v_cache_dd); 3277 if (ncp == NULL) { 3278 cache_rev_failed(&reason); 3279 goto out_abort; 3280 } 3281 nc_flag = atomic_load_char(&ncp->nc_flag); 3282 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3283 cache_rev_failed(&reason); 3284 goto out_abort; 3285 } 3286 if (ncp->nc_nlen >= *buflen) { 3287 cache_rev_failed(&reason); 3288 error = ENOMEM; 3289 goto out_abort; 3290 } 3291 *buflen -= ncp->nc_nlen; 3292 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3293 *buflen -= 1; 3294 buf[*buflen] = '/'; 3295 tvp = ncp->nc_dvp; 3296 tvp_seqc = vn_seqc_read_any(tvp); 3297 if (seqc_in_modify(tvp_seqc)) { 3298 cache_rev_failed(&reason); 3299 goto out_abort; 3300 } 3301 if (!vn_seqc_consistent(vp, vp_seqc)) { 3302 cache_rev_failed(&reason); 3303 goto out_abort; 3304 } 3305 /* 3306 * Acquire fence provided by vn_seqc_read_any above. 
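 *
 * With that fence in place the re-checks below cannot be satisfied by
 * stale values: if v_cache_dd still points at ncp and the entry is still
 * usable, the name copied out above was valid when the sequence counter
 * was sampled.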
3307 */ 3308 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) { 3309 cache_rev_failed(&reason); 3310 goto out_abort; 3311 } 3312 if (!cache_ncp_canuse(ncp)) { 3313 cache_rev_failed(&reason); 3314 goto out_abort; 3315 } 3316 vp = tvp; 3317 vp_seqc = tvp_seqc; 3318 if (vp == rdir || vp == rootvnode) 3319 break; 3320 } 3321 out_ok: 3322 vfs_smr_exit(); 3323 *retbuf = buf + *buflen; 3324 *buflen = orig_buflen - *buflen + addend; 3325 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 3326 return (0); 3327 3328 out_abort: 3329 *buflen = orig_buflen; 3330 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 3331 vfs_smr_exit(); 3332 return (error); 3333 } 3334 3335 static int 3336 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3337 size_t *buflen) 3338 { 3339 size_t orig_buflen, addend; 3340 int error; 3341 3342 if (*buflen < 2) 3343 return (EINVAL); 3344 3345 orig_buflen = *buflen; 3346 3347 vref(vp); 3348 addend = 0; 3349 if (vp->v_type != VDIR) { 3350 *buflen -= 1; 3351 buf[*buflen] = '\0'; 3352 error = vn_vptocnp(&vp, buf, buflen); 3353 if (error) 3354 return (error); 3355 if (*buflen == 0) { 3356 vrele(vp); 3357 return (ENOMEM); 3358 } 3359 *buflen -= 1; 3360 buf[*buflen] = '/'; 3361 addend = orig_buflen - *buflen; 3362 } 3363 3364 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); 3365 } 3366 3367 /* 3368 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 3369 * 3370 * Since the namecache does not track hardlinks, the caller is expected to first 3371 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 3372 * 3373 * Then we have 2 cases: 3374 * - if the found vnode is a directory, the path can be constructed just by 3375 * following names up the chain 3376 * - otherwise we populate the buffer with the saved name and start resolving 3377 * from the parent 3378 */ 3379 static int 3380 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 3381 size_t *buflen) 3382 { 3383 char *buf, *tmpbuf; 3384 struct pwd *pwd; 3385 struct componentname *cnp; 3386 struct vnode *vp; 3387 size_t addend; 3388 int error; 3389 enum vtype type; 3390 3391 if (*buflen < 2) 3392 return (EINVAL); 3393 if (*buflen > MAXPATHLEN) 3394 *buflen = MAXPATHLEN; 3395 3396 buf = malloc(*buflen, M_TEMP, M_WAITOK); 3397 3398 addend = 0; 3399 vp = ndp->ni_vp; 3400 /* 3401 * Check for VBAD to work around the vp_crossmp bug in lookup(). 3402 * 3403 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 3404 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 3405 * If the type is VDIR (like in this very case) we can skip looking 3406 * at ni_dvp in the first place. However, since vnodes get passed here 3407 * unlocked the target may transition to doomed state (type == VBAD) 3408 * before we get to evaluate the condition. If this happens, we will 3409 * populate part of the buffer and descend to vn_fullpath_dir with 3410 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 3411 * 3412 * This should be atomic_load(&vp->v_type) but it is illegal to take 3413 * an address of a bit field, even if said field is sized to char. 3414 * Work around the problem by reading the value into a full-sized enum 3415 * and then re-reading it with atomic_load which will still prevent 3416 * the compiler from re-reading down the road. 
3417 */ 3418 type = vp->v_type; 3419 type = atomic_load_int(&type); 3420 if (type == VBAD) { 3421 error = ENOENT; 3422 goto out_bad; 3423 } 3424 if (type != VDIR) { 3425 cnp = &ndp->ni_cnd; 3426 addend = cnp->cn_namelen + 2; 3427 if (*buflen < addend) { 3428 error = ENOMEM; 3429 goto out_bad; 3430 } 3431 *buflen -= addend; 3432 tmpbuf = buf + *buflen; 3433 tmpbuf[0] = '/'; 3434 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 3435 tmpbuf[addend - 1] = '\0'; 3436 vp = ndp->ni_dvp; 3437 } 3438 3439 vfs_smr_enter(); 3440 pwd = pwd_get_smr(); 3441 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3442 addend); 3443 VFS_SMR_ASSERT_NOT_ENTERED(); 3444 if (error < 0) { 3445 pwd = pwd_hold(curthread); 3446 vref(vp); 3447 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3448 addend); 3449 pwd_drop(pwd); 3450 if (error != 0) 3451 goto out_bad; 3452 } 3453 3454 *freebuf = buf; 3455 3456 return (0); 3457 out_bad: 3458 free(buf, M_TEMP); 3459 return (error); 3460 } 3461 3462 struct vnode * 3463 vn_dir_dd_ino(struct vnode *vp) 3464 { 3465 struct namecache *ncp; 3466 struct vnode *ddvp; 3467 struct mtx *vlp; 3468 enum vgetstate vs; 3469 3470 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3471 vlp = VP2VNODELOCK(vp); 3472 mtx_lock(vlp); 3473 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3474 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3475 continue; 3476 ddvp = ncp->nc_dvp; 3477 vs = vget_prep(ddvp); 3478 mtx_unlock(vlp); 3479 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3480 return (NULL); 3481 return (ddvp); 3482 } 3483 mtx_unlock(vlp); 3484 return (NULL); 3485 } 3486 3487 int 3488 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3489 { 3490 struct namecache *ncp; 3491 struct mtx *vlp; 3492 int l; 3493 3494 vlp = VP2VNODELOCK(vp); 3495 mtx_lock(vlp); 3496 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3497 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3498 break; 3499 if (ncp == NULL) { 3500 mtx_unlock(vlp); 3501 return (ENOENT); 3502 } 3503 l = min(ncp->nc_nlen, buflen - 1); 3504 memcpy(buf, ncp->nc_name, l); 3505 mtx_unlock(vlp); 3506 buf[l] = '\0'; 3507 return (0); 3508 } 3509 3510 /* 3511 * This function updates path string to vnode's full global path 3512 * and checks the size of the new path string against the pathlen argument. 3513 * 3514 * Requires a locked, referenced vnode. 3515 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3516 * 3517 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3518 * because it falls back to the ".." lookup if the namecache lookup fails. 3519 */ 3520 int 3521 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3522 u_int pathlen) 3523 { 3524 struct nameidata nd; 3525 struct vnode *vp1; 3526 char *rpath, *fbuf; 3527 int error; 3528 3529 ASSERT_VOP_ELOCKED(vp, __func__); 3530 3531 /* Construct global filesystem path from vp. */ 3532 VOP_UNLOCK(vp); 3533 error = vn_fullpath_global(vp, &rpath, &fbuf); 3534 3535 if (error != 0) { 3536 vrele(vp); 3537 return (error); 3538 } 3539 3540 if (strlen(rpath) >= pathlen) { 3541 vrele(vp); 3542 error = ENAMETOOLONG; 3543 goto out; 3544 } 3545 3546 /* 3547 * Re-lookup the vnode by path to detect a possible rename. 3548 * As a side effect, the vnode is relocked. 3549 * If vnode was renamed, return ENOENT. 
3550 */ 3551 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3552 UIO_SYSSPACE, path, td); 3553 error = namei(&nd); 3554 if (error != 0) { 3555 vrele(vp); 3556 goto out; 3557 } 3558 NDFREE(&nd, NDF_ONLY_PNBUF); 3559 vp1 = nd.ni_vp; 3560 vrele(vp); 3561 if (vp1 == vp) 3562 strcpy(path, rpath); 3563 else { 3564 vput(vp1); 3565 error = ENOENT; 3566 } 3567 3568 out: 3569 free(fbuf, M_TEMP); 3570 return (error); 3571 } 3572 3573 #ifdef DDB 3574 static void 3575 db_print_vpath(struct vnode *vp) 3576 { 3577 3578 while (vp != NULL) { 3579 db_printf("%p: ", vp); 3580 if (vp == rootvnode) { 3581 db_printf("/"); 3582 vp = NULL; 3583 } else { 3584 if (vp->v_vflag & VV_ROOT) { 3585 db_printf("<mount point>"); 3586 vp = vp->v_mount->mnt_vnodecovered; 3587 } else { 3588 struct namecache *ncp; 3589 char *ncn; 3590 int i; 3591 3592 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3593 if (ncp != NULL) { 3594 ncn = ncp->nc_name; 3595 for (i = 0; i < ncp->nc_nlen; i++) 3596 db_printf("%c", *ncn++); 3597 vp = ncp->nc_dvp; 3598 } else { 3599 vp = NULL; 3600 } 3601 } 3602 } 3603 db_printf("\n"); 3604 } 3605 3606 return; 3607 } 3608 3609 DB_SHOW_COMMAND(vpath, db_show_vpath) 3610 { 3611 struct vnode *vp; 3612 3613 if (!have_addr) { 3614 db_printf("usage: show vpath <struct vnode *>\n"); 3615 return; 3616 } 3617 3618 vp = (struct vnode *)addr; 3619 db_print_vpath(vp); 3620 } 3621 3622 #endif 3623 3624 static int cache_fast_lookup = 1; 3625 static char __read_frequently cache_fast_lookup_enabled = true; 3626 3627 #define CACHE_FPL_FAILED -2020 3628 3629 void 3630 cache_fast_lookup_enabled_recalc(void) 3631 { 3632 int lookup_flag; 3633 int mac_on; 3634 3635 #ifdef MAC 3636 mac_on = mac_vnode_check_lookup_enabled(); 3637 mac_on |= mac_vnode_check_readlink_enabled(); 3638 #else 3639 mac_on = 0; 3640 #endif 3641 3642 lookup_flag = atomic_load_int(&cache_fast_lookup); 3643 if (lookup_flag && !mac_on) { 3644 atomic_store_char(&cache_fast_lookup_enabled, true); 3645 } else { 3646 atomic_store_char(&cache_fast_lookup_enabled, false); 3647 } 3648 } 3649 3650 static int 3651 syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS) 3652 { 3653 int error, old; 3654 3655 old = atomic_load_int(&cache_fast_lookup); 3656 error = sysctl_handle_int(oidp, arg1, arg2, req); 3657 if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup)) 3658 cache_fast_lookup_enabled_recalc(); 3659 return (error); 3660 } 3661 SYSCTL_PROC(_vfs, OID_AUTO, cache_fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE, 3662 &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", ""); 3663 3664 /* 3665 * Components of nameidata (or objects it can point to) which may 3666 * need restoring in case fast path lookup fails. 
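 *
 * These are captured up front by cache_fpl_checkpoint_outer() and put back
 * by cache_fpl_restore_abort(), so that an aborted fast path attempt hands
 * the nameidata to the regular lookup in its original state.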
3667 */ 3668 struct nameidata_outer { 3669 size_t ni_pathlen; 3670 int cn_flags; 3671 }; 3672 3673 struct nameidata_saved { 3674 #ifdef INVARIANTS 3675 char *cn_nameptr; 3676 size_t ni_pathlen; 3677 #endif 3678 }; 3679 3680 #ifdef INVARIANTS 3681 struct cache_fpl_debug { 3682 size_t ni_pathlen; 3683 }; 3684 #endif 3685 3686 struct cache_fpl { 3687 struct nameidata *ndp; 3688 struct componentname *cnp; 3689 char *nulchar; 3690 struct vnode *dvp; 3691 struct vnode *tvp; 3692 seqc_t dvp_seqc; 3693 seqc_t tvp_seqc; 3694 uint32_t hash; 3695 struct nameidata_saved snd; 3696 struct nameidata_outer snd_outer; 3697 int line; 3698 enum cache_fpl_status status:8; 3699 bool in_smr; 3700 bool fsearch; 3701 bool savename; 3702 struct pwd **pwd; 3703 #ifdef INVARIANTS 3704 struct cache_fpl_debug debug; 3705 #endif 3706 }; 3707 3708 static bool cache_fplookup_is_mp(struct cache_fpl *fpl); 3709 static int cache_fplookup_cross_mount(struct cache_fpl *fpl); 3710 static int cache_fplookup_partial_setup(struct cache_fpl *fpl); 3711 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl); 3712 static int cache_fplookup_trailingslash(struct cache_fpl *fpl); 3713 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl); 3714 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl); 3715 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n); 3716 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n); 3717 3718 static void 3719 cache_fpl_cleanup_cnp(struct componentname *cnp) 3720 { 3721 3722 uma_zfree(namei_zone, cnp->cn_pnbuf); 3723 #ifdef DIAGNOSTIC 3724 cnp->cn_pnbuf = NULL; 3725 cnp->cn_nameptr = NULL; 3726 #endif 3727 } 3728 3729 static struct vnode * 3730 cache_fpl_handle_root(struct cache_fpl *fpl) 3731 { 3732 struct nameidata *ndp; 3733 struct componentname *cnp; 3734 3735 ndp = fpl->ndp; 3736 cnp = fpl->cnp; 3737 3738 MPASS(*(cnp->cn_nameptr) == '/'); 3739 cnp->cn_nameptr++; 3740 cache_fpl_pathlen_dec(fpl); 3741 3742 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 3743 do { 3744 cnp->cn_nameptr++; 3745 cache_fpl_pathlen_dec(fpl); 3746 } while (*(cnp->cn_nameptr) == '/'); 3747 } 3748 3749 return (ndp->ni_rootdir); 3750 } 3751 3752 static void 3753 cache_fpl_checkpoint_outer(struct cache_fpl *fpl) 3754 { 3755 3756 fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen; 3757 fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags; 3758 } 3759 3760 static void 3761 cache_fpl_checkpoint(struct cache_fpl *fpl) 3762 { 3763 3764 #ifdef INVARIANTS 3765 fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3766 fpl->snd.ni_pathlen = fpl->debug.ni_pathlen; 3767 #endif 3768 } 3769 3770 static void 3771 cache_fpl_restore_partial(struct cache_fpl *fpl) 3772 { 3773 3774 fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags; 3775 #ifdef INVARIANTS 3776 fpl->debug.ni_pathlen = fpl->snd.ni_pathlen; 3777 #endif 3778 } 3779 3780 static void 3781 cache_fpl_restore_abort(struct cache_fpl *fpl) 3782 { 3783 3784 cache_fpl_restore_partial(fpl); 3785 /* 3786 * It is 0 on entry by API contract. 
3787 */ 3788 fpl->ndp->ni_resflags = 0; 3789 fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf; 3790 fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen; 3791 } 3792 3793 #ifdef INVARIANTS 3794 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3795 struct cache_fpl *_fpl = (fpl); \ 3796 MPASS(_fpl->in_smr == true); \ 3797 VFS_SMR_ASSERT_ENTERED(); \ 3798 }) 3799 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3800 struct cache_fpl *_fpl = (fpl); \ 3801 MPASS(_fpl->in_smr == false); \ 3802 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3803 }) 3804 static void 3805 cache_fpl_assert_status(struct cache_fpl *fpl) 3806 { 3807 3808 switch (fpl->status) { 3809 case CACHE_FPL_STATUS_UNSET: 3810 __assert_unreachable(); 3811 break; 3812 case CACHE_FPL_STATUS_DESTROYED: 3813 case CACHE_FPL_STATUS_ABORTED: 3814 case CACHE_FPL_STATUS_PARTIAL: 3815 case CACHE_FPL_STATUS_HANDLED: 3816 break; 3817 } 3818 } 3819 #else 3820 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3821 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3822 #define cache_fpl_assert_status(fpl) do { } while (0) 3823 #endif 3824 3825 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3826 struct cache_fpl *_fpl = (fpl); \ 3827 vfs_smr_enter(); \ 3828 _fpl->in_smr = true; \ 3829 }) 3830 3831 #define cache_fpl_smr_enter(fpl) ({ \ 3832 struct cache_fpl *_fpl = (fpl); \ 3833 MPASS(_fpl->in_smr == false); \ 3834 vfs_smr_enter(); \ 3835 _fpl->in_smr = true; \ 3836 }) 3837 3838 #define cache_fpl_smr_exit(fpl) ({ \ 3839 struct cache_fpl *_fpl = (fpl); \ 3840 MPASS(_fpl->in_smr == true); \ 3841 vfs_smr_exit(); \ 3842 _fpl->in_smr = false; \ 3843 }) 3844 3845 static int 3846 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line) 3847 { 3848 3849 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3850 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3851 ("%s: converting to abort from %d at %d, set at %d\n", 3852 __func__, fpl->status, line, fpl->line)); 3853 } 3854 cache_fpl_smr_assert_not_entered(fpl); 3855 fpl->status = CACHE_FPL_STATUS_ABORTED; 3856 fpl->line = line; 3857 return (CACHE_FPL_FAILED); 3858 } 3859 3860 #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__) 3861 3862 static int __noinline 3863 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3864 { 3865 struct nameidata *ndp; 3866 struct componentname *cnp; 3867 3868 ndp = fpl->ndp; 3869 cnp = fpl->cnp; 3870 3871 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3872 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3873 ("%s: converting to abort from %d at %d, set at %d\n", 3874 __func__, fpl->status, line, fpl->line)); 3875 } 3876 fpl->status = CACHE_FPL_STATUS_ABORTED; 3877 fpl->line = line; 3878 if (fpl->in_smr) 3879 cache_fpl_smr_exit(fpl); 3880 cache_fpl_restore_abort(fpl); 3881 /* 3882 * Resolving symlinks overwrites data passed by the caller. 3883 * Let namei know. 
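 *
 * In particular the pathname buffer is freed below, so the lookup cannot
 * simply be restarted by the slow path with the original arguments; hence
 * the dedicated CACHE_FPL_STATUS_DESTROYED status.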
3884 */ 3885 if (ndp->ni_loopcnt > 0) { 3886 fpl->status = CACHE_FPL_STATUS_DESTROYED; 3887 cache_fpl_cleanup_cnp(cnp); 3888 } 3889 return (CACHE_FPL_FAILED); 3890 } 3891 3892 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3893 3894 static int __noinline 3895 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3896 { 3897 3898 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3899 ("%s: setting to partial at %d, but already set to %d at %d\n", 3900 __func__, line, fpl->status, fpl->line)); 3901 cache_fpl_smr_assert_entered(fpl); 3902 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3903 fpl->line = line; 3904 return (cache_fplookup_partial_setup(fpl)); 3905 } 3906 3907 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3908 3909 static int 3910 cache_fpl_handled_impl(struct cache_fpl *fpl, int line) 3911 { 3912 3913 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3914 ("%s: setting to handled at %d, but already set to %d at %d\n", 3915 __func__, line, fpl->status, fpl->line)); 3916 cache_fpl_smr_assert_not_entered(fpl); 3917 fpl->status = CACHE_FPL_STATUS_HANDLED; 3918 fpl->line = line; 3919 return (0); 3920 } 3921 3922 #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__) 3923 3924 static int 3925 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line) 3926 { 3927 3928 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3929 ("%s: setting to handled at %d, but already set to %d at %d\n", 3930 __func__, line, fpl->status, fpl->line)); 3931 MPASS(error != 0); 3932 MPASS(error != CACHE_FPL_FAILED); 3933 cache_fpl_smr_assert_not_entered(fpl); 3934 fpl->status = CACHE_FPL_STATUS_HANDLED; 3935 fpl->line = line; 3936 fpl->dvp = NULL; 3937 fpl->tvp = NULL; 3938 fpl->savename = false; 3939 return (error); 3940 } 3941 3942 #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__) 3943 3944 static bool 3945 cache_fpl_terminated(struct cache_fpl *fpl) 3946 { 3947 3948 return (fpl->status != CACHE_FPL_STATUS_UNSET); 3949 } 3950 3951 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3952 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ 3953 FAILIFEXISTS | FOLLOW | LOCKSHARED | SAVENAME | SAVESTART | WILLBEDIR | \ 3954 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3955 3956 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3957 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3958 3959 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3960 "supported and internal flags overlap"); 3961 3962 static bool 3963 cache_fpl_islastcn(struct nameidata *ndp) 3964 { 3965 3966 return (*ndp->ni_next == 0); 3967 } 3968 3969 static bool 3970 cache_fpl_istrailingslash(struct cache_fpl *fpl) 3971 { 3972 3973 return (*(fpl->nulchar - 1) == '/'); 3974 } 3975 3976 static bool 3977 cache_fpl_isdotdot(struct componentname *cnp) 3978 { 3979 3980 if (cnp->cn_namelen == 2 && 3981 cnp->cn_nameptr[1] == '.' 
&& cnp->cn_nameptr[0] == '.') 3982 return (true); 3983 return (false); 3984 } 3985 3986 static bool 3987 cache_can_fplookup(struct cache_fpl *fpl) 3988 { 3989 struct nameidata *ndp; 3990 struct componentname *cnp; 3991 struct thread *td; 3992 3993 ndp = fpl->ndp; 3994 cnp = fpl->cnp; 3995 td = cnp->cn_thread; 3996 3997 if (!atomic_load_char(&cache_fast_lookup_enabled)) { 3998 cache_fpl_aborted_early(fpl); 3999 return (false); 4000 } 4001 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 4002 cache_fpl_aborted_early(fpl); 4003 return (false); 4004 } 4005 if (IN_CAPABILITY_MODE(td)) { 4006 cache_fpl_aborted_early(fpl); 4007 return (false); 4008 } 4009 if (AUDITING_TD(td)) { 4010 cache_fpl_aborted_early(fpl); 4011 return (false); 4012 } 4013 if (ndp->ni_startdir != NULL) { 4014 cache_fpl_aborted_early(fpl); 4015 return (false); 4016 } 4017 return (true); 4018 } 4019 4020 static int 4021 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 4022 { 4023 struct nameidata *ndp; 4024 int error; 4025 bool fsearch; 4026 4027 ndp = fpl->ndp; 4028 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); 4029 if (__predict_false(error != 0)) { 4030 return (cache_fpl_aborted(fpl)); 4031 } 4032 fpl->fsearch = fsearch; 4033 return (0); 4034 } 4035 4036 static int __noinline 4037 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 4038 uint32_t hash) 4039 { 4040 struct componentname *cnp; 4041 struct vnode *dvp; 4042 4043 cnp = fpl->cnp; 4044 dvp = fpl->dvp; 4045 4046 cache_fpl_smr_exit(fpl); 4047 if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) 4048 return (cache_fpl_handled_error(fpl, ENOENT)); 4049 else 4050 return (cache_fpl_aborted(fpl)); 4051 } 4052 4053 /* 4054 * The target vnode is not supported, prepare for the slow path to take over. 4055 */ 4056 static int __noinline 4057 cache_fplookup_partial_setup(struct cache_fpl *fpl) 4058 { 4059 struct nameidata *ndp; 4060 struct componentname *cnp; 4061 enum vgetstate dvs; 4062 struct vnode *dvp; 4063 struct pwd *pwd; 4064 seqc_t dvp_seqc; 4065 4066 ndp = fpl->ndp; 4067 cnp = fpl->cnp; 4068 pwd = *(fpl->pwd); 4069 dvp = fpl->dvp; 4070 dvp_seqc = fpl->dvp_seqc; 4071 4072 if (!pwd_hold_smr(pwd)) { 4073 return (cache_fpl_aborted(fpl)); 4074 } 4075 4076 /* 4077 * Note that seqc is checked before the vnode is locked, so by 4078 * the time regular lookup gets to it it may have moved. 4079 * 4080 * Ultimately this does not affect correctness, any lookup errors 4081 * are userspace racing with itself. It is guaranteed that any 4082 * path which ultimately gets found could also have been found 4083 * by regular lookup going all the way in absence of concurrent 4084 * modifications. 
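 *
 * For example, if a rename races with this lookup, the name may resolve
 * against either the old or the new state; both outcomes correspond to
 * some legal ordering of the two operations, which is all that is
 * promised.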
4085 */ 4086 dvs = vget_prep_smr(dvp); 4087 cache_fpl_smr_exit(fpl); 4088 if (__predict_false(dvs == VGET_NONE)) { 4089 pwd_drop(pwd); 4090 return (cache_fpl_aborted(fpl)); 4091 } 4092 4093 vget_finish_ref(dvp, dvs); 4094 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4095 vrele(dvp); 4096 pwd_drop(pwd); 4097 return (cache_fpl_aborted(fpl)); 4098 } 4099 4100 cache_fpl_restore_partial(fpl); 4101 #ifdef INVARIANTS 4102 if (cnp->cn_nameptr != fpl->snd.cn_nameptr) { 4103 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__, 4104 cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf); 4105 } 4106 #endif 4107 4108 ndp->ni_startdir = dvp; 4109 cnp->cn_flags |= MAKEENTRY; 4110 if (cache_fpl_islastcn(ndp)) 4111 cnp->cn_flags |= ISLASTCN; 4112 if (cache_fpl_isdotdot(cnp)) 4113 cnp->cn_flags |= ISDOTDOT; 4114 4115 /* 4116 * Skip potential extra slashes parsing did not take care of. 4117 * cache_fplookup_skip_slashes explains the mechanism. 4118 */ 4119 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 4120 do { 4121 cnp->cn_nameptr++; 4122 cache_fpl_pathlen_dec(fpl); 4123 } while (*(cnp->cn_nameptr) == '/'); 4124 } 4125 4126 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; 4127 #ifdef INVARIANTS 4128 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 4129 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 4130 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 4131 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 4132 } 4133 #endif 4134 return (0); 4135 } 4136 4137 static int 4138 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 4139 { 4140 struct componentname *cnp; 4141 struct vnode *tvp; 4142 seqc_t tvp_seqc; 4143 int error, lkflags; 4144 4145 cnp = fpl->cnp; 4146 tvp = fpl->tvp; 4147 tvp_seqc = fpl->tvp_seqc; 4148 4149 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4150 lkflags = LK_SHARED; 4151 if ((cnp->cn_flags & LOCKSHARED) == 0) 4152 lkflags = LK_EXCLUSIVE; 4153 error = vget_finish(tvp, lkflags, tvs); 4154 if (__predict_false(error != 0)) { 4155 return (cache_fpl_aborted(fpl)); 4156 } 4157 } else { 4158 vget_finish_ref(tvp, tvs); 4159 } 4160 4161 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 4162 if ((cnp->cn_flags & LOCKLEAF) != 0) 4163 vput(tvp); 4164 else 4165 vrele(tvp); 4166 return (cache_fpl_aborted(fpl)); 4167 } 4168 4169 return (cache_fpl_handled(fpl)); 4170 } 4171 4172 /* 4173 * They want to possibly modify the state of the namecache. 4174 */ 4175 static int __noinline 4176 cache_fplookup_final_modifying(struct cache_fpl *fpl) 4177 { 4178 struct nameidata *ndp; 4179 struct componentname *cnp; 4180 enum vgetstate dvs; 4181 struct vnode *dvp, *tvp; 4182 struct mount *mp; 4183 seqc_t dvp_seqc; 4184 int error; 4185 bool docache; 4186 4187 ndp = fpl->ndp; 4188 cnp = fpl->cnp; 4189 dvp = fpl->dvp; 4190 dvp_seqc = fpl->dvp_seqc; 4191 4192 MPASS(*(cnp->cn_nameptr) != '/'); 4193 MPASS(cache_fpl_islastcn(ndp)); 4194 if ((cnp->cn_flags & LOCKPARENT) == 0) 4195 MPASS((cnp->cn_flags & WANTPARENT) != 0); 4196 MPASS((cnp->cn_flags & TRAILINGSLASH) == 0); 4197 MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE || 4198 cnp->cn_nameiop == RENAME); 4199 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 4200 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 4201 4202 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; 4203 if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) 4204 docache = false; 4205 4206 /* 4207 * Regular lookup nulifies the slash, which we don't do here. 
4208 * Don't take chances with filesystem routines seeing it for 4209 * the last entry. 4210 */ 4211 if (cache_fpl_istrailingslash(fpl)) { 4212 return (cache_fpl_partial(fpl)); 4213 } 4214 4215 mp = atomic_load_ptr(&dvp->v_mount); 4216 if (__predict_false(mp == NULL)) { 4217 return (cache_fpl_aborted(fpl)); 4218 } 4219 4220 if (__predict_false(mp->mnt_flag & MNT_RDONLY)) { 4221 cache_fpl_smr_exit(fpl); 4222 /* 4223 * Original code keeps not checking for CREATE which 4224 * might be a bug. For now let the old lookup decide. 4225 */ 4226 if (cnp->cn_nameiop == CREATE) { 4227 return (cache_fpl_aborted(fpl)); 4228 } 4229 return (cache_fpl_handled_error(fpl, EROFS)); 4230 } 4231 4232 if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) { 4233 cache_fpl_smr_exit(fpl); 4234 return (cache_fpl_handled_error(fpl, EEXIST)); 4235 } 4236 4237 /* 4238 * Secure access to dvp; check cache_fplookup_partial_setup for 4239 * reasoning. 4240 * 4241 * XXX At least UFS requires its lookup routine to be called for 4242 * the last path component, which leads to some level of complication 4243 * and inefficiency: 4244 * - the target routine always locks the target vnode, but our caller 4245 * may not need it locked 4246 * - some of the VOP machinery asserts that the parent is locked, which 4247 * once more may be not required 4248 * 4249 * TODO: add a flag for filesystems which don't need this. 4250 */ 4251 dvs = vget_prep_smr(dvp); 4252 cache_fpl_smr_exit(fpl); 4253 if (__predict_false(dvs == VGET_NONE)) { 4254 return (cache_fpl_aborted(fpl)); 4255 } 4256 4257 vget_finish_ref(dvp, dvs); 4258 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4259 vrele(dvp); 4260 return (cache_fpl_aborted(fpl)); 4261 } 4262 4263 error = vn_lock(dvp, LK_EXCLUSIVE); 4264 if (__predict_false(error != 0)) { 4265 vrele(dvp); 4266 return (cache_fpl_aborted(fpl)); 4267 } 4268 4269 tvp = NULL; 4270 cnp->cn_flags |= ISLASTCN; 4271 if (docache) 4272 cnp->cn_flags |= MAKEENTRY; 4273 if (cache_fpl_isdotdot(cnp)) 4274 cnp->cn_flags |= ISDOTDOT; 4275 cnp->cn_lkflags = LK_EXCLUSIVE; 4276 error = VOP_LOOKUP(dvp, &tvp, cnp); 4277 switch (error) { 4278 case EJUSTRETURN: 4279 case 0: 4280 break; 4281 case ENOTDIR: 4282 case ENOENT: 4283 vput(dvp); 4284 return (cache_fpl_handled_error(fpl, error)); 4285 default: 4286 vput(dvp); 4287 return (cache_fpl_aborted(fpl)); 4288 } 4289 4290 fpl->tvp = tvp; 4291 fpl->savename = (cnp->cn_flags & SAVENAME) != 0; 4292 4293 if (tvp == NULL) { 4294 if ((cnp->cn_flags & SAVESTART) != 0) { 4295 ndp->ni_startdir = dvp; 4296 vrefact(ndp->ni_startdir); 4297 cnp->cn_flags |= SAVENAME; 4298 fpl->savename = true; 4299 } 4300 MPASS(error == EJUSTRETURN); 4301 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4302 VOP_UNLOCK(dvp); 4303 } 4304 return (cache_fpl_handled(fpl)); 4305 } 4306 4307 /* 4308 * There are very hairy corner cases concerning various flag combinations 4309 * and locking state. In particular here we only hold one lock instead of 4310 * two. 4311 * 4312 * Skip the complexity as it is of no significance for normal workloads. 4313 */ 4314 if (__predict_false(tvp == dvp)) { 4315 vput(dvp); 4316 vrele(tvp); 4317 return (cache_fpl_aborted(fpl)); 4318 } 4319 4320 /* 4321 * If they want the symlink itself we are fine, but if they want to 4322 * follow it regular lookup has to be engaged. 
4323 */ 4324 if (tvp->v_type == VLNK) { 4325 if ((cnp->cn_flags & FOLLOW) != 0) { 4326 vput(dvp); 4327 vput(tvp); 4328 return (cache_fpl_aborted(fpl)); 4329 } 4330 } 4331 4332 /* 4333 * Since we expect this to be the terminal vnode it should almost never 4334 * be a mount point. 4335 */ 4336 if (__predict_false(cache_fplookup_is_mp(fpl))) { 4337 vput(dvp); 4338 vput(tvp); 4339 return (cache_fpl_aborted(fpl)); 4340 } 4341 4342 if ((cnp->cn_flags & FAILIFEXISTS) != 0) { 4343 vput(dvp); 4344 vput(tvp); 4345 return (cache_fpl_handled_error(fpl, EEXIST)); 4346 } 4347 4348 if ((cnp->cn_flags & LOCKLEAF) == 0) { 4349 VOP_UNLOCK(tvp); 4350 } 4351 4352 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4353 VOP_UNLOCK(dvp); 4354 } 4355 4356 if ((cnp->cn_flags & SAVESTART) != 0) { 4357 ndp->ni_startdir = dvp; 4358 vrefact(ndp->ni_startdir); 4359 cnp->cn_flags |= SAVENAME; 4360 fpl->savename = true; 4361 } 4362 4363 return (cache_fpl_handled(fpl)); 4364 } 4365 4366 static int __noinline 4367 cache_fplookup_modifying(struct cache_fpl *fpl) 4368 { 4369 struct nameidata *ndp; 4370 4371 ndp = fpl->ndp; 4372 4373 if (!cache_fpl_islastcn(ndp)) { 4374 return (cache_fpl_partial(fpl)); 4375 } 4376 return (cache_fplookup_final_modifying(fpl)); 4377 } 4378 4379 static int __noinline 4380 cache_fplookup_final_withparent(struct cache_fpl *fpl) 4381 { 4382 struct componentname *cnp; 4383 enum vgetstate dvs, tvs; 4384 struct vnode *dvp, *tvp; 4385 seqc_t dvp_seqc; 4386 int error; 4387 4388 cnp = fpl->cnp; 4389 dvp = fpl->dvp; 4390 dvp_seqc = fpl->dvp_seqc; 4391 tvp = fpl->tvp; 4392 4393 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 4394 4395 /* 4396 * This is less efficient than it can be for simplicity. 4397 */ 4398 dvs = vget_prep_smr(dvp); 4399 if (__predict_false(dvs == VGET_NONE)) { 4400 return (cache_fpl_aborted(fpl)); 4401 } 4402 tvs = vget_prep_smr(tvp); 4403 if (__predict_false(tvs == VGET_NONE)) { 4404 cache_fpl_smr_exit(fpl); 4405 vget_abort(dvp, dvs); 4406 return (cache_fpl_aborted(fpl)); 4407 } 4408 4409 cache_fpl_smr_exit(fpl); 4410 4411 if ((cnp->cn_flags & LOCKPARENT) != 0) { 4412 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 4413 if (__predict_false(error != 0)) { 4414 vget_abort(tvp, tvs); 4415 return (cache_fpl_aborted(fpl)); 4416 } 4417 } else { 4418 vget_finish_ref(dvp, dvs); 4419 } 4420 4421 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4422 vget_abort(tvp, tvs); 4423 if ((cnp->cn_flags & LOCKPARENT) != 0) 4424 vput(dvp); 4425 else 4426 vrele(dvp); 4427 return (cache_fpl_aborted(fpl)); 4428 } 4429 4430 error = cache_fplookup_final_child(fpl, tvs); 4431 if (__predict_false(error != 0)) { 4432 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 4433 if ((cnp->cn_flags & LOCKPARENT) != 0) 4434 vput(dvp); 4435 else 4436 vrele(dvp); 4437 return (error); 4438 } 4439 4440 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 4441 return (0); 4442 } 4443 4444 static int 4445 cache_fplookup_final(struct cache_fpl *fpl) 4446 { 4447 struct componentname *cnp; 4448 enum vgetstate tvs; 4449 struct vnode *dvp, *tvp; 4450 seqc_t dvp_seqc; 4451 4452 cnp = fpl->cnp; 4453 dvp = fpl->dvp; 4454 dvp_seqc = fpl->dvp_seqc; 4455 tvp = fpl->tvp; 4456 4457 MPASS(*(cnp->cn_nameptr) != '/'); 4458 4459 if (cnp->cn_nameiop != LOOKUP) { 4460 return (cache_fplookup_final_modifying(fpl)); 4461 } 4462 4463 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 4464 return (cache_fplookup_final_withparent(fpl)); 4465 4466 tvs = vget_prep_smr(tvp); 4467 if (__predict_false(tvs == VGET_NONE)) { 4468 return (cache_fpl_partial(fpl)); 4469 } 4470 4471 
if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4472 cache_fpl_smr_exit(fpl); 4473 vget_abort(tvp, tvs); 4474 return (cache_fpl_aborted(fpl)); 4475 } 4476 4477 cache_fpl_smr_exit(fpl); 4478 return (cache_fplookup_final_child(fpl, tvs)); 4479 } 4480 4481 /* 4482 * Comment from locked lookup: 4483 * Check for degenerate name (e.g. / or "") which is a way of talking about a 4484 * directory, e.g. like "/." or ".". 4485 */ 4486 static int __noinline 4487 cache_fplookup_degenerate(struct cache_fpl *fpl) 4488 { 4489 struct componentname *cnp; 4490 struct vnode *dvp; 4491 enum vgetstate dvs; 4492 int error, lkflags; 4493 #ifdef INVARIANTS 4494 char *cp; 4495 #endif 4496 4497 fpl->tvp = fpl->dvp; 4498 fpl->tvp_seqc = fpl->dvp_seqc; 4499 4500 cnp = fpl->cnp; 4501 dvp = fpl->dvp; 4502 4503 #ifdef INVARIANTS 4504 for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) { 4505 KASSERT(*cp == '/', 4506 ("%s: encountered non-slash; string [%s]\n", __func__, 4507 cnp->cn_pnbuf)); 4508 } 4509 #endif 4510 4511 if (__predict_false(cnp->cn_nameiop != LOOKUP)) { 4512 cache_fpl_smr_exit(fpl); 4513 return (cache_fpl_handled_error(fpl, EISDIR)); 4514 } 4515 4516 MPASS((cnp->cn_flags & SAVESTART) == 0); 4517 4518 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) { 4519 return (cache_fplookup_final_withparent(fpl)); 4520 } 4521 4522 dvs = vget_prep_smr(dvp); 4523 cache_fpl_smr_exit(fpl); 4524 if (__predict_false(dvs == VGET_NONE)) { 4525 return (cache_fpl_aborted(fpl)); 4526 } 4527 4528 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4529 lkflags = LK_SHARED; 4530 if ((cnp->cn_flags & LOCKSHARED) == 0) 4531 lkflags = LK_EXCLUSIVE; 4532 error = vget_finish(dvp, lkflags, dvs); 4533 if (__predict_false(error != 0)) { 4534 return (cache_fpl_aborted(fpl)); 4535 } 4536 } else { 4537 vget_finish_ref(dvp, dvs); 4538 } 4539 return (cache_fpl_handled(fpl)); 4540 } 4541 4542 static int __noinline 4543 cache_fplookup_noentry(struct cache_fpl *fpl) 4544 { 4545 struct nameidata *ndp; 4546 struct componentname *cnp; 4547 enum vgetstate dvs; 4548 struct vnode *dvp, *tvp; 4549 seqc_t dvp_seqc; 4550 int error; 4551 bool docache; 4552 4553 ndp = fpl->ndp; 4554 cnp = fpl->cnp; 4555 dvp = fpl->dvp; 4556 dvp_seqc = fpl->dvp_seqc; 4557 4558 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 4559 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 4560 MPASS(!cache_fpl_isdotdot(cnp)); 4561 4562 /* 4563 * Hack: delayed name len checking. 4564 */ 4565 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 4566 cache_fpl_smr_exit(fpl); 4567 return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); 4568 } 4569 4570 if (cnp->cn_nameptr[0] == '/') { 4571 return (cache_fplookup_skip_slashes(fpl)); 4572 } 4573 4574 if (cnp->cn_nameptr[0] == '\0') { 4575 if (fpl->tvp == NULL) { 4576 return (cache_fplookup_degenerate(fpl)); 4577 } 4578 return (cache_fplookup_trailingslash(fpl)); 4579 } 4580 4581 if (cnp->cn_nameiop != LOOKUP) { 4582 fpl->tvp = NULL; 4583 return (cache_fplookup_modifying(fpl)); 4584 } 4585 4586 MPASS((cnp->cn_flags & SAVESTART) == 0); 4587 4588 /* 4589 * Only try to fill in the component if it is the last one, 4590 * otherwise not only there may be several to handle but the 4591 * walk may be complicated. 4592 */ 4593 if (!cache_fpl_islastcn(ndp)) { 4594 return (cache_fpl_partial(fpl)); 4595 } 4596 4597 /* 4598 * Regular lookup nulifies the slash, which we don't do here. 4599 * Don't take chances with filesystem routines seeing it for 4600 * the last entry. 
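 *
 * The same treatment is applied in cache_fplookup_final_modifying.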
4601 */ 4602 if (cache_fpl_istrailingslash(fpl)) { 4603 return (cache_fpl_partial(fpl)); 4604 } 4605 4606 /* 4607 * Secure access to dvp; check cache_fplookup_partial_setup for 4608 * reasoning. 4609 */ 4610 dvs = vget_prep_smr(dvp); 4611 cache_fpl_smr_exit(fpl); 4612 if (__predict_false(dvs == VGET_NONE)) { 4613 return (cache_fpl_aborted(fpl)); 4614 } 4615 4616 vget_finish_ref(dvp, dvs); 4617 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4618 vrele(dvp); 4619 return (cache_fpl_aborted(fpl)); 4620 } 4621 4622 error = vn_lock(dvp, LK_SHARED); 4623 if (__predict_false(error != 0)) { 4624 vrele(dvp); 4625 return (cache_fpl_aborted(fpl)); 4626 } 4627 4628 tvp = NULL; 4629 /* 4630 * TODO: provide variants which don't require locking either vnode. 4631 */ 4632 cnp->cn_flags |= ISLASTCN; 4633 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; 4634 if (docache) 4635 cnp->cn_flags |= MAKEENTRY; 4636 cnp->cn_lkflags = LK_SHARED; 4637 if ((cnp->cn_flags & LOCKSHARED) == 0) { 4638 cnp->cn_lkflags = LK_EXCLUSIVE; 4639 } 4640 error = VOP_LOOKUP(dvp, &tvp, cnp); 4641 switch (error) { 4642 case EJUSTRETURN: 4643 case 0: 4644 break; 4645 case ENOTDIR: 4646 case ENOENT: 4647 vput(dvp); 4648 return (cache_fpl_handled_error(fpl, error)); 4649 default: 4650 vput(dvp); 4651 return (cache_fpl_aborted(fpl)); 4652 } 4653 4654 fpl->tvp = tvp; 4655 if (!fpl->savename) { 4656 MPASS((cnp->cn_flags & SAVENAME) == 0); 4657 } 4658 4659 if (tvp == NULL) { 4660 MPASS(error == EJUSTRETURN); 4661 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 4662 vput(dvp); 4663 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 4664 VOP_UNLOCK(dvp); 4665 } 4666 return (cache_fpl_handled(fpl)); 4667 } 4668 4669 if (tvp->v_type == VLNK) { 4670 if ((cnp->cn_flags & FOLLOW) != 0) { 4671 vput(dvp); 4672 vput(tvp); 4673 return (cache_fpl_aborted(fpl)); 4674 } 4675 } 4676 4677 if (__predict_false(cache_fplookup_is_mp(fpl))) { 4678 vput(dvp); 4679 vput(tvp); 4680 return (cache_fpl_aborted(fpl)); 4681 } 4682 4683 if ((cnp->cn_flags & LOCKLEAF) == 0) { 4684 VOP_UNLOCK(tvp); 4685 } 4686 4687 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 4688 vput(dvp); 4689 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 4690 VOP_UNLOCK(dvp); 4691 } 4692 return (cache_fpl_handled(fpl)); 4693 } 4694 4695 static int __noinline 4696 cache_fplookup_dot(struct cache_fpl *fpl) 4697 { 4698 int error; 4699 4700 MPASS(!seqc_in_modify(fpl->dvp_seqc)); 4701 /* 4702 * Just re-assign the value. seqc will be checked later for the first 4703 * non-dot path component in line and/or before deciding to return the 4704 * vnode. 
4705 */ 4706 fpl->tvp = fpl->dvp; 4707 fpl->tvp_seqc = fpl->dvp_seqc; 4708 4709 counter_u64_add(dothits, 1); 4710 SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp); 4711 4712 error = 0; 4713 if (cache_fplookup_is_mp(fpl)) { 4714 error = cache_fplookup_cross_mount(fpl); 4715 } 4716 return (error); 4717 } 4718 4719 static int __noinline 4720 cache_fplookup_dotdot(struct cache_fpl *fpl) 4721 { 4722 struct nameidata *ndp; 4723 struct componentname *cnp; 4724 struct namecache *ncp; 4725 struct vnode *dvp; 4726 struct prison *pr; 4727 u_char nc_flag; 4728 4729 ndp = fpl->ndp; 4730 cnp = fpl->cnp; 4731 dvp = fpl->dvp; 4732 4733 MPASS(cache_fpl_isdotdot(cnp)); 4734 4735 /* 4736 * XXX this is racy the same way regular lookup is 4737 */ 4738 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 4739 pr = pr->pr_parent) 4740 if (dvp == pr->pr_root) 4741 break; 4742 4743 if (dvp == ndp->ni_rootdir || 4744 dvp == ndp->ni_topdir || 4745 dvp == rootvnode || 4746 pr != NULL) { 4747 fpl->tvp = dvp; 4748 fpl->tvp_seqc = vn_seqc_read_any(dvp); 4749 if (seqc_in_modify(fpl->tvp_seqc)) { 4750 return (cache_fpl_aborted(fpl)); 4751 } 4752 return (0); 4753 } 4754 4755 if ((dvp->v_vflag & VV_ROOT) != 0) { 4756 /* 4757 * TODO 4758 * The opposite of climb mount is needed here. 4759 */ 4760 return (cache_fpl_partial(fpl)); 4761 } 4762 4763 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); 4764 if (ncp == NULL) { 4765 return (cache_fpl_aborted(fpl)); 4766 } 4767 4768 nc_flag = atomic_load_char(&ncp->nc_flag); 4769 if ((nc_flag & NCF_ISDOTDOT) != 0) { 4770 if ((nc_flag & NCF_NEGATIVE) != 0) 4771 return (cache_fpl_aborted(fpl)); 4772 fpl->tvp = ncp->nc_vp; 4773 } else { 4774 fpl->tvp = ncp->nc_dvp; 4775 } 4776 4777 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 4778 if (seqc_in_modify(fpl->tvp_seqc)) { 4779 return (cache_fpl_partial(fpl)); 4780 } 4781 4782 /* 4783 * Acquire fence provided by vn_seqc_read_any above. 4784 */ 4785 if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) { 4786 return (cache_fpl_aborted(fpl)); 4787 } 4788 4789 if (!cache_ncp_canuse(ncp)) { 4790 return (cache_fpl_aborted(fpl)); 4791 } 4792 4793 counter_u64_add(dotdothits, 1); 4794 return (0); 4795 } 4796 4797 static int __noinline 4798 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) 4799 { 4800 u_char nc_flag; 4801 bool neg_promote; 4802 4803 nc_flag = atomic_load_char(&ncp->nc_flag); 4804 MPASS((nc_flag & NCF_NEGATIVE) != 0); 4805 /* 4806 * If they want to create an entry we need to replace this one. 4807 */ 4808 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 4809 fpl->tvp = NULL; 4810 return (cache_fplookup_modifying(fpl)); 4811 } 4812 neg_promote = cache_neg_hit_prep(ncp); 4813 if (!cache_fpl_neg_ncp_canuse(ncp)) { 4814 cache_neg_hit_abort(ncp); 4815 return (cache_fpl_partial(fpl)); 4816 } 4817 if (neg_promote) { 4818 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 4819 } 4820 cache_neg_hit_finish(ncp); 4821 cache_fpl_smr_exit(fpl); 4822 return (cache_fpl_handled_error(fpl, ENOENT)); 4823 } 4824 4825 /* 4826 * Resolve a symlink. Called by filesystem-specific routines. 4827 * 4828 * Code flow is: 4829 * ... 
-> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve 4830 */ 4831 int 4832 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len) 4833 { 4834 struct nameidata *ndp; 4835 struct componentname *cnp; 4836 size_t adjust; 4837 4838 ndp = fpl->ndp; 4839 cnp = fpl->cnp; 4840 4841 if (__predict_false(len == 0)) { 4842 return (ENOENT); 4843 } 4844 4845 if (__predict_false(len > MAXPATHLEN - 2)) { 4846 if (cache_fpl_istrailingslash(fpl)) { 4847 return (EAGAIN); 4848 } 4849 } 4850 4851 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1; 4852 #ifdef INVARIANTS 4853 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 4854 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 4855 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 4856 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 4857 } 4858 #endif 4859 4860 if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) { 4861 return (ENAMETOOLONG); 4862 } 4863 4864 if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) { 4865 return (ELOOP); 4866 } 4867 4868 adjust = len; 4869 if (ndp->ni_pathlen > 1) { 4870 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen); 4871 } else { 4872 if (cache_fpl_istrailingslash(fpl)) { 4873 adjust = len + 1; 4874 cnp->cn_pnbuf[len] = '/'; 4875 cnp->cn_pnbuf[len + 1] = '\0'; 4876 } else { 4877 cnp->cn_pnbuf[len] = '\0'; 4878 } 4879 } 4880 bcopy(string, cnp->cn_pnbuf, len); 4881 4882 ndp->ni_pathlen += adjust; 4883 cache_fpl_pathlen_add(fpl, adjust); 4884 cnp->cn_nameptr = cnp->cn_pnbuf; 4885 fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; 4886 fpl->tvp = NULL; 4887 return (0); 4888 } 4889 4890 static int __noinline 4891 cache_fplookup_symlink(struct cache_fpl *fpl) 4892 { 4893 struct mount *mp; 4894 struct nameidata *ndp; 4895 struct componentname *cnp; 4896 struct vnode *dvp, *tvp; 4897 int error; 4898 4899 ndp = fpl->ndp; 4900 cnp = fpl->cnp; 4901 dvp = fpl->dvp; 4902 tvp = fpl->tvp; 4903 4904 if (cache_fpl_islastcn(ndp)) { 4905 if ((cnp->cn_flags & FOLLOW) == 0) { 4906 return (cache_fplookup_final(fpl)); 4907 } 4908 } 4909 4910 mp = atomic_load_ptr(&dvp->v_mount); 4911 if (__predict_false(mp == NULL)) { 4912 return (cache_fpl_aborted(fpl)); 4913 } 4914 4915 /* 4916 * Note this check races against setting the flag just like regular 4917 * lookup. 
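 *
 * The race is harmless: at worst the lookup acts on a value of the flag
 * which was valid at some point while the lookup was in progress.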
4918 */ 4919 if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) { 4920 cache_fpl_smr_exit(fpl); 4921 return (cache_fpl_handled_error(fpl, EACCES)); 4922 } 4923 4924 error = VOP_FPLOOKUP_SYMLINK(tvp, fpl); 4925 if (__predict_false(error != 0)) { 4926 switch (error) { 4927 case EAGAIN: 4928 return (cache_fpl_partial(fpl)); 4929 case ENOENT: 4930 case ENAMETOOLONG: 4931 case ELOOP: 4932 cache_fpl_smr_exit(fpl); 4933 return (cache_fpl_handled_error(fpl, error)); 4934 default: 4935 return (cache_fpl_aborted(fpl)); 4936 } 4937 } 4938 4939 if (*(cnp->cn_nameptr) == '/') { 4940 fpl->dvp = cache_fpl_handle_root(fpl); 4941 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 4942 if (seqc_in_modify(fpl->dvp_seqc)) { 4943 return (cache_fpl_aborted(fpl)); 4944 } 4945 } 4946 return (0); 4947 } 4948 4949 static int 4950 cache_fplookup_next(struct cache_fpl *fpl) 4951 { 4952 struct componentname *cnp; 4953 struct namecache *ncp; 4954 struct vnode *dvp, *tvp; 4955 u_char nc_flag; 4956 uint32_t hash; 4957 int error; 4958 4959 cnp = fpl->cnp; 4960 dvp = fpl->dvp; 4961 hash = fpl->hash; 4962 4963 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 4964 if (cnp->cn_namelen == 1) { 4965 return (cache_fplookup_dot(fpl)); 4966 } 4967 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 4968 return (cache_fplookup_dotdot(fpl)); 4969 } 4970 } 4971 4972 MPASS(!cache_fpl_isdotdot(cnp)); 4973 4974 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 4975 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 4976 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 4977 break; 4978 } 4979 4980 if (__predict_false(ncp == NULL)) { 4981 return (cache_fplookup_noentry(fpl)); 4982 } 4983 4984 tvp = atomic_load_ptr(&ncp->nc_vp); 4985 nc_flag = atomic_load_char(&ncp->nc_flag); 4986 if ((nc_flag & NCF_NEGATIVE) != 0) { 4987 return (cache_fplookup_neg(fpl, ncp, hash)); 4988 } 4989 4990 if (!cache_ncp_canuse(ncp)) { 4991 return (cache_fpl_partial(fpl)); 4992 } 4993 4994 fpl->tvp = tvp; 4995 fpl->tvp_seqc = vn_seqc_read_any(tvp); 4996 if (seqc_in_modify(fpl->tvp_seqc)) { 4997 return (cache_fpl_partial(fpl)); 4998 } 4999 5000 counter_u64_add(numposhits, 1); 5001 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 5002 5003 error = 0; 5004 if (cache_fplookup_is_mp(fpl)) { 5005 error = cache_fplookup_cross_mount(fpl); 5006 } 5007 return (error); 5008 } 5009 5010 static bool 5011 cache_fplookup_mp_supported(struct mount *mp) 5012 { 5013 5014 MPASS(mp != NULL); 5015 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 5016 return (false); 5017 return (true); 5018 } 5019 5020 /* 5021 * Walk up the mount stack (if any). 5022 * 5023 * Correctness is provided in the following ways: 5024 * - all vnodes are protected from freeing with SMR 5025 * - struct mount objects are type stable making them always safe to access 5026 * - stability of the particular mount is provided by busying it 5027 * - relationship between the vnode which is mounted on and the mount is 5028 * verified with the vnode sequence counter after busying 5029 * - association between root vnode of the mount and the mount is protected 5030 * by busy 5031 * 5032 * From that point on we can read the sequence counter of the root vnode 5033 * and get the next mount on the stack (if any) using the same protection. 5034 * 5035 * By the end of successful walk we are guaranteed the reached state was 5036 * indeed present at least at some point which matches the regular lookup. 
5037 */ 5038 static int __noinline 5039 cache_fplookup_climb_mount(struct cache_fpl *fpl) 5040 { 5041 struct mount *mp, *prev_mp; 5042 struct mount_pcpu *mpcpu, *prev_mpcpu; 5043 struct vnode *vp; 5044 seqc_t vp_seqc; 5045 5046 vp = fpl->tvp; 5047 vp_seqc = fpl->tvp_seqc; 5048 5049 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 5050 mp = atomic_load_ptr(&vp->v_mountedhere); 5051 if (__predict_false(mp == NULL)) { 5052 return (0); 5053 } 5054 5055 prev_mp = NULL; 5056 for (;;) { 5057 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 5058 if (prev_mp != NULL) 5059 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5060 return (cache_fpl_partial(fpl)); 5061 } 5062 if (prev_mp != NULL) 5063 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5064 if (!vn_seqc_consistent(vp, vp_seqc)) { 5065 vfs_op_thread_exit_crit(mp, mpcpu); 5066 return (cache_fpl_partial(fpl)); 5067 } 5068 if (!cache_fplookup_mp_supported(mp)) { 5069 vfs_op_thread_exit_crit(mp, mpcpu); 5070 return (cache_fpl_partial(fpl)); 5071 } 5072 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5073 if (vp == NULL) { 5074 vfs_op_thread_exit_crit(mp, mpcpu); 5075 return (cache_fpl_partial(fpl)); 5076 } 5077 vp_seqc = vn_seqc_read_any(vp); 5078 if (seqc_in_modify(vp_seqc)) { 5079 vfs_op_thread_exit_crit(mp, mpcpu); 5080 return (cache_fpl_partial(fpl)); 5081 } 5082 prev_mp = mp; 5083 prev_mpcpu = mpcpu; 5084 mp = atomic_load_ptr(&vp->v_mountedhere); 5085 if (mp == NULL) 5086 break; 5087 } 5088 5089 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5090 fpl->tvp = vp; 5091 fpl->tvp_seqc = vp_seqc; 5092 return (0); 5093 } 5094 5095 static int __noinline 5096 cache_fplookup_cross_mount(struct cache_fpl *fpl) 5097 { 5098 struct mount *mp; 5099 struct mount_pcpu *mpcpu; 5100 struct vnode *vp; 5101 seqc_t vp_seqc; 5102 5103 vp = fpl->tvp; 5104 vp_seqc = fpl->tvp_seqc; 5105 5106 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 5107 mp = atomic_load_ptr(&vp->v_mountedhere); 5108 if (__predict_false(mp == NULL)) { 5109 return (0); 5110 } 5111 5112 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 5113 return (cache_fpl_partial(fpl)); 5114 } 5115 if (!vn_seqc_consistent(vp, vp_seqc)) { 5116 vfs_op_thread_exit_crit(mp, mpcpu); 5117 return (cache_fpl_partial(fpl)); 5118 } 5119 if (!cache_fplookup_mp_supported(mp)) { 5120 vfs_op_thread_exit_crit(mp, mpcpu); 5121 return (cache_fpl_partial(fpl)); 5122 } 5123 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5124 if (__predict_false(vp == NULL)) { 5125 vfs_op_thread_exit_crit(mp, mpcpu); 5126 return (cache_fpl_partial(fpl)); 5127 } 5128 vp_seqc = vn_seqc_read_any(vp); 5129 vfs_op_thread_exit_crit(mp, mpcpu); 5130 if (seqc_in_modify(vp_seqc)) { 5131 return (cache_fpl_partial(fpl)); 5132 } 5133 mp = atomic_load_ptr(&vp->v_mountedhere); 5134 if (__predict_false(mp != NULL)) { 5135 /* 5136 * There are possibly more mount points on top. 5137 * Normally this does not happen so for simplicity just start 5138 * over. 5139 */ 5140 return (cache_fplookup_climb_mount(fpl)); 5141 } 5142 5143 fpl->tvp = vp; 5144 fpl->tvp_seqc = vp_seqc; 5145 return (0); 5146 } 5147 5148 /* 5149 * Check if a vnode is mounted on. 5150 */ 5151 static bool 5152 cache_fplookup_is_mp(struct cache_fpl *fpl) 5153 { 5154 struct vnode *vp; 5155 5156 vp = fpl->tvp; 5157 return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0); 5158 } 5159 5160 /* 5161 * Parse the path. 5162 * 5163 * The code was originally copy-pasted from regular lookup and despite 5164 * clean ups leaves performance on the table. 
Any modifications here
5165 * must take into account that in case of fallback the resulting
5166 * nameidata state has to be compatible with the original.
5167 */
5168
5169 /*
5170 * Debug ni_pathlen tracking.
5171 */
5172 #ifdef INVARIANTS
5173 static void
5174 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5175 {
5176
5177 fpl->debug.ni_pathlen += n;
5178 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5179 ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5180 }
5181
5182 static void
5183 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5184 {
5185
5186 fpl->debug.ni_pathlen -= n;
5187 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5188 ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5189 }
5190
5191 static void
5192 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5193 {
5194
5195 cache_fpl_pathlen_add(fpl, 1);
5196 }
5197
5198 static void
5199 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5200 {
5201
5202 cache_fpl_pathlen_sub(fpl, 1);
5203 }
5204 #else
5205 static void
5206 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5207 {
5208 }
5209
5210 static void
5211 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5212 {
5213 }
5214
5215 static void
5216 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5217 {
5218 }
5219
5220 static void
5221 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5222 {
5223 }
5224 #endif
5225
5226 static void
5227 cache_fplookup_parse(struct cache_fpl *fpl)
5228 {
5229 struct nameidata *ndp;
5230 struct componentname *cnp;
5231 struct vnode *dvp;
5232 char *cp;
5233 uint32_t hash;
5234
5235 ndp = fpl->ndp;
5236 cnp = fpl->cnp;
5237 dvp = fpl->dvp;
5238
5239 /*
5240 * Find the end of this path component; it is either / or nul.
5241 *
5242 * Store / as a temporary sentinel so that we only have one character
5243 * to test for. Pathnames tend to be short, so this should not result
5244 * in cache misses.
5245 *
5246 * TODO: fix this to be word-sized.
5247 */
5248 KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
5249 ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
5250 __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
5251 fpl->nulchar, cnp->cn_pnbuf));
5252 KASSERT(*fpl->nulchar == '\0',
5253 ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
5254 cnp->cn_pnbuf));
5255 hash = cache_get_hash_iter_start(dvp);
5256 *fpl->nulchar = '/';
5257 for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
5258 KASSERT(*cp != '\0',
5259 ("%s: encountered unexpected nul; string [%s]\n", __func__,
5260 cnp->cn_nameptr));
5261 hash = cache_get_hash_iter(*cp, hash);
5262 continue;
5263 }
5264 *fpl->nulchar = '\0';
5265 fpl->hash = cache_get_hash_iter_finish(hash);
5266
5267 cnp->cn_namelen = cp - cnp->cn_nameptr;
5268 cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
5269
5270 #ifdef INVARIANTS
5271 /*
5272 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since
5273 * we are going to fail this lookup with ENAMETOOLONG (see below).
5274 */
5275 if (cnp->cn_namelen <= NAME_MAX) {
5276 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
5277 panic("%s: mismatched hash for [%s] len %ld", __func__,
5278 cnp->cn_nameptr, cnp->cn_namelen);
5279 }
5280 }
5281 #endif
5282
5283 /*
5284 * Hack: we have to check if the found path component's length exceeds
5285 * NAME_MAX. However, the condition is very rarely true and the check can
5286 * be elided in the common case -- if an entry was found in the cache,
5287 * then it could not have been too long to begin with.
5288 */
5289 ndp->ni_next = cp;
5290 }
5291
5292 static void
5293 cache_fplookup_parse_advance(struct cache_fpl *fpl)
5294 {
5295 struct nameidata *ndp;
5296 struct componentname *cnp;
5297
5298 ndp = fpl->ndp;
5299 cnp = fpl->cnp;
5300
5301 cnp->cn_nameptr = ndp->ni_next;
5302 KASSERT(*(cnp->cn_nameptr) == '/',
5303 ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
5304 cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
5305 cnp->cn_nameptr++;
5306 cache_fpl_pathlen_dec(fpl);
5307 }
5308
5309 /*
5310 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
5311 *
5312 * Lockless lookup avoids checking for spurious slashes up front; if any are
5313 * present, it is guaranteed to fail to find an entry. In that case the caller
5314 * must check whether the name starts with a slash and call this routine, which
5315 * fast forwards across the spurious slashes and sets the state up for a
5316 * retry.
5317 */
5318 static int __noinline
5319 cache_fplookup_skip_slashes(struct cache_fpl *fpl)
5320 {
5321 struct nameidata *ndp;
5322 struct componentname *cnp;
5323
5324 ndp = fpl->ndp;
5325 cnp = fpl->cnp;
5326
5327 MPASS(*(cnp->cn_nameptr) == '/');
5328 do {
5329 cnp->cn_nameptr++;
5330 cache_fpl_pathlen_dec(fpl);
5331 } while (*(cnp->cn_nameptr) == '/');
5332
5333 /*
5334 * Go back to one slash so that cache_fplookup_parse_advance has
5335 * something to skip.
5336 */
5337 cnp->cn_nameptr--;
5338 cache_fpl_pathlen_inc(fpl);
5339
5340 /*
5341 * cache_fplookup_parse_advance starts from ndp->ni_next.
5342 */
5343 ndp->ni_next = cnp->cn_nameptr;
5344
5345 /*
5346 * See cache_fplookup_dot.
5347 */
5348 fpl->tvp = fpl->dvp;
5349 fpl->tvp_seqc = fpl->dvp_seqc;
5350
5351 return (0);
5352 }
5353
5354 /*
5355 * Handle trailing slashes (e.g., "foo/").
5356 *
5357 * If a trailing slash is found the terminal vnode must be a directory.
5358 * Regular lookup shortens the path by nullifying the first trailing slash and
5359 * sets the TRAILINGSLASH flag to denote this took place. Several checks on
5360 * it are performed later.
5361 *
5362 * Similarly to spurious slashes, lockless lookup handles this in a speculative
5363 * manner, relying on the invariant that a non-directory vnode will get a miss.
5364 * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
5365 *
5366 * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
5367 * and denotes this is the last path component, which avoids looping back.
5368 *
5369 * Only plain lookups are supported for now, to limit the corner cases which need to be handled.
5370 */ 5371 static int __noinline 5372 cache_fplookup_trailingslash(struct cache_fpl *fpl) 5373 { 5374 #ifdef INVARIANTS 5375 size_t ni_pathlen; 5376 #endif 5377 struct nameidata *ndp; 5378 struct componentname *cnp; 5379 struct namecache *ncp; 5380 struct vnode *tvp; 5381 char *cn_nameptr_orig, *cn_nameptr_slash; 5382 seqc_t tvp_seqc; 5383 u_char nc_flag; 5384 5385 ndp = fpl->ndp; 5386 cnp = fpl->cnp; 5387 tvp = fpl->tvp; 5388 tvp_seqc = fpl->tvp_seqc; 5389 5390 MPASS(fpl->dvp == fpl->tvp); 5391 KASSERT(cache_fpl_istrailingslash(fpl), 5392 ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1, 5393 cnp->cn_pnbuf)); 5394 KASSERT(cnp->cn_nameptr[0] == '\0', 5395 ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0], 5396 cnp->cn_pnbuf)); 5397 KASSERT(cnp->cn_namelen == 0, 5398 ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen, 5399 cnp->cn_pnbuf)); 5400 MPASS(cnp->cn_nameptr > cnp->cn_pnbuf); 5401 5402 if (cnp->cn_nameiop != LOOKUP) { 5403 return (cache_fpl_aborted(fpl)); 5404 } 5405 5406 if (__predict_false(tvp->v_type != VDIR)) { 5407 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 5408 return (cache_fpl_aborted(fpl)); 5409 } 5410 cache_fpl_smr_exit(fpl); 5411 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5412 } 5413 5414 /* 5415 * Denote the last component. 5416 */ 5417 ndp->ni_next = &cnp->cn_nameptr[0]; 5418 MPASS(cache_fpl_islastcn(ndp)); 5419 5420 /* 5421 * Unwind trailing slashes. 5422 */ 5423 cn_nameptr_orig = cnp->cn_nameptr; 5424 while (cnp->cn_nameptr >= cnp->cn_pnbuf) { 5425 cnp->cn_nameptr--; 5426 if (cnp->cn_nameptr[0] != '/') { 5427 break; 5428 } 5429 } 5430 5431 /* 5432 * Unwind to the beginning of the path component. 5433 * 5434 * Note the path may or may not have started with a slash. 5435 */ 5436 cn_nameptr_slash = cnp->cn_nameptr; 5437 while (cnp->cn_nameptr > cnp->cn_pnbuf) { 5438 cnp->cn_nameptr--; 5439 if (cnp->cn_nameptr[0] == '/') { 5440 break; 5441 } 5442 } 5443 if (cnp->cn_nameptr[0] == '/') { 5444 cnp->cn_nameptr++; 5445 } 5446 5447 cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1; 5448 cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr); 5449 cache_fpl_checkpoint(fpl); 5450 5451 #ifdef INVARIANTS 5452 ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; 5453 if (ni_pathlen != fpl->debug.ni_pathlen) { 5454 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 5455 __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 5456 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 5457 } 5458 #endif 5459 5460 /* 5461 * If this was a "./" lookup the parent directory is already correct. 5462 */ 5463 if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) { 5464 return (0); 5465 } 5466 5467 /* 5468 * Otherwise we need to look it up. 5469 */ 5470 tvp = fpl->tvp; 5471 ncp = atomic_load_consume_ptr(&tvp->v_cache_dd); 5472 if (__predict_false(ncp == NULL)) { 5473 return (cache_fpl_aborted(fpl)); 5474 } 5475 nc_flag = atomic_load_char(&ncp->nc_flag); 5476 if ((nc_flag & NCF_ISDOTDOT) != 0) { 5477 return (cache_fpl_aborted(fpl)); 5478 } 5479 fpl->dvp = ncp->nc_dvp; 5480 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 5481 if (seqc_in_modify(fpl->dvp_seqc)) { 5482 return (cache_fpl_aborted(fpl)); 5483 } 5484 return (0); 5485 } 5486 5487 /* 5488 * See the API contract for VOP_FPLOOKUP_VEXEC. 
5489 */ 5490 static int __noinline 5491 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 5492 { 5493 struct componentname *cnp; 5494 struct vnode *dvp; 5495 seqc_t dvp_seqc; 5496 5497 cnp = fpl->cnp; 5498 dvp = fpl->dvp; 5499 dvp_seqc = fpl->dvp_seqc; 5500 5501 /* 5502 * TODO: Due to ignoring trailing slashes lookup will perform a 5503 * permission check on the last dir when it should not be doing it. It 5504 * may fail, but said failure should be ignored. It is possible to fix 5505 * it up fully without resorting to regular lookup, but for now just 5506 * abort. 5507 */ 5508 if (cache_fpl_istrailingslash(fpl)) { 5509 return (cache_fpl_aborted(fpl)); 5510 } 5511 5512 /* 5513 * Hack: delayed degenerate path checking. 5514 */ 5515 if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) { 5516 return (cache_fplookup_degenerate(fpl)); 5517 } 5518 5519 /* 5520 * Hack: delayed name len checking. 5521 */ 5522 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 5523 cache_fpl_smr_exit(fpl); 5524 return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); 5525 } 5526 5527 /* 5528 * Hack: they may be looking up foo/bar, where foo is not a directory. 5529 * In such a case we need to return ENOTDIR, but we may happen to get 5530 * here with a different error. 5531 */ 5532 if (dvp->v_type != VDIR) { 5533 error = ENOTDIR; 5534 } 5535 5536 /* 5537 * Hack: handle O_SEARCH. 5538 * 5539 * Open Group Base Specifications Issue 7, 2018 edition states: 5540 * <quote> 5541 * If the access mode of the open file description associated with the 5542 * file descriptor is not O_SEARCH, the function shall check whether 5543 * directory searches are permitted using the current permissions of 5544 * the directory underlying the file descriptor. If the access mode is 5545 * O_SEARCH, the function shall not perform the check. 5546 * </quote> 5547 * 5548 * Regular lookup tests for the NOEXECCHECK flag for every path 5549 * component to decide whether to do the permission check. However, 5550 * since most lookups never have the flag (and when they do it is only 5551 * present for the first path component), lockless lookup only acts on 5552 * it if there is a permission problem. Here the flag is represented 5553 * with a boolean so that we don't have to clear it on the way out. 5554 * 5555 * For simplicity this always aborts. 5556 * TODO: check if this is the first lookup and ignore the permission 5557 * problem. Note the flag has to survive fallback (if it happens to be 5558 * performed). 5559 */ 5560 if (fpl->fsearch) { 5561 return (cache_fpl_aborted(fpl)); 5562 } 5563 5564 switch (error) { 5565 case EAGAIN: 5566 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 5567 error = cache_fpl_aborted(fpl); 5568 } else { 5569 cache_fpl_partial(fpl); 5570 } 5571 break; 5572 default: 5573 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 5574 error = cache_fpl_aborted(fpl); 5575 } else { 5576 cache_fpl_smr_exit(fpl); 5577 cache_fpl_handled_error(fpl, error); 5578 } 5579 break; 5580 } 5581 return (error); 5582 } 5583 5584 static int 5585 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 5586 { 5587 struct nameidata *ndp; 5588 struct componentname *cnp; 5589 struct mount *mp; 5590 int error; 5591 5592 ndp = fpl->ndp; 5593 cnp = fpl->cnp; 5594 5595 cache_fpl_checkpoint(fpl); 5596 5597 /* 5598 * The vnode at hand is almost always stable, skip checking for it. 5599 * Worst case this postpones the check towards the end of the iteration 5600 * of the main loop. 
5601 */ 5602 fpl->dvp = dvp; 5603 fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp); 5604 5605 mp = atomic_load_ptr(&dvp->v_mount); 5606 if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) { 5607 return (cache_fpl_aborted(fpl)); 5608 } 5609 5610 MPASS(fpl->tvp == NULL); 5611 5612 for (;;) { 5613 cache_fplookup_parse(fpl); 5614 5615 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); 5616 if (__predict_false(error != 0)) { 5617 error = cache_fplookup_failed_vexec(fpl, error); 5618 break; 5619 } 5620 5621 error = cache_fplookup_next(fpl); 5622 if (__predict_false(cache_fpl_terminated(fpl))) { 5623 break; 5624 } 5625 5626 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 5627 5628 if (fpl->tvp->v_type == VLNK) { 5629 error = cache_fplookup_symlink(fpl); 5630 if (cache_fpl_terminated(fpl)) { 5631 break; 5632 } 5633 } else { 5634 if (cache_fpl_islastcn(ndp)) { 5635 error = cache_fplookup_final(fpl); 5636 break; 5637 } 5638 5639 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 5640 error = cache_fpl_aborted(fpl); 5641 break; 5642 } 5643 5644 fpl->dvp = fpl->tvp; 5645 fpl->dvp_seqc = fpl->tvp_seqc; 5646 cache_fplookup_parse_advance(fpl); 5647 } 5648 5649 cache_fpl_checkpoint(fpl); 5650 } 5651 5652 return (error); 5653 } 5654 5655 /* 5656 * Fast path lookup protected with SMR and sequence counters. 5657 * 5658 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 5659 * 5660 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria 5661 * outlined below. 5662 * 5663 * Traditional vnode lookup conceptually looks like this: 5664 * 5665 * vn_lock(current); 5666 * for (;;) { 5667 * next = find(); 5668 * vn_lock(next); 5669 * vn_unlock(current); 5670 * current = next; 5671 * if (last) 5672 * break; 5673 * } 5674 * return (current); 5675 * 5676 * Each jump to the next vnode is safe memory-wise and atomic with respect to 5677 * any modifications thanks to holding respective locks. 5678 * 5679 * The same guarantee can be provided with a combination of safe memory 5680 * reclamation and sequence counters instead. If all operations which affect 5681 * the relationship between the current vnode and the one we are looking for 5682 * also modify the counter, we can verify whether all the conditions held as 5683 * we made the jump. This includes things like permissions, mount points etc. 5684 * Counter modification is provided by enclosing relevant places in 5685 * vn_seqc_write_begin()/end() calls. 
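 *
 * For example, code which changes something relevant for lookup
 * (permissions, mount state, the target of a directory entry)
 * conceptually brackets the modification like so:
 *
 *	vn_seqc_write_begin(vp);
 *	... perform the change ...
 *	vn_seqc_write_end(vp);
 *
 * A racing lockless lookup then either observes the counter in modify
 * or fails the later consistency check.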
5686 * 5687 * Thus this translates to: 5688 * 5689 * vfs_smr_enter(); 5690 * dvp_seqc = seqc_read_any(dvp); 5691 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode 5692 * abort(); 5693 * for (;;) { 5694 * tvp = find(); 5695 * tvp_seqc = seqc_read_any(tvp); 5696 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode 5697 * abort(); 5698 * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode 5699 * abort(); 5700 * dvp = tvp; // we know nothing of importance has changed 5701 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration 5702 * if (last) 5703 * break; 5704 * } 5705 * vget(); // secure the vnode 5706 * if (!seqc_consistent(tvp, tvp_seqc) // final check 5707 * abort(); 5708 * // at this point we know nothing has changed for any parent<->child pair 5709 * // as they were crossed during the lookup, meaning we matched the guarantee 5710 * // of the locked variant 5711 * return (tvp); 5712 * 5713 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: 5714 * - they are called while within vfs_smr protection which they must never exit 5715 * - EAGAIN can be returned to denote checking could not be performed, it is 5716 * always valid to return it 5717 * - if the sequence counter has not changed the result must be valid 5718 * - if the sequence counter has changed both false positives and false negatives 5719 * are permitted (since the result will be rejected later) 5720 * - for simple cases of unix permission checks vaccess_vexec_smr can be used 5721 * 5722 * Caveats to watch out for: 5723 * - vnodes are passed unlocked and unreferenced with nothing stopping 5724 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised 5725 * to use atomic_load_ptr to fetch it. 5726 * - the aforementioned object can also get freed, meaning absent other means it 5727 * should be protected with vfs_smr 5728 * - either safely checking permissions as they are modified or guaranteeing 5729 * their stability is left to the routine 5730 */ 5731 int 5732 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, 5733 struct pwd **pwdp) 5734 { 5735 struct cache_fpl fpl; 5736 struct pwd *pwd; 5737 struct vnode *dvp; 5738 struct componentname *cnp; 5739 int error; 5740 5741 fpl.status = CACHE_FPL_STATUS_UNSET; 5742 fpl.in_smr = false; 5743 fpl.ndp = ndp; 5744 fpl.cnp = cnp = &ndp->ni_cnd; 5745 MPASS(ndp->ni_lcf == 0); 5746 MPASS(curthread == cnp->cn_thread); 5747 KASSERT ((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 5748 ("%s: internal flags found in cn_flags %" PRIx64, __func__, 5749 cnp->cn_flags)); 5750 if ((cnp->cn_flags & SAVESTART) != 0) { 5751 MPASS(cnp->cn_nameiop != LOOKUP); 5752 } 5753 MPASS(cnp->cn_nameptr == cnp->cn_pnbuf); 5754 5755 if (__predict_false(!cache_can_fplookup(&fpl))) { 5756 *status = fpl.status; 5757 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 5758 return (EOPNOTSUPP); 5759 } 5760 5761 cache_fpl_checkpoint_outer(&fpl); 5762 5763 cache_fpl_smr_enter_initial(&fpl); 5764 #ifdef INVARIANTS 5765 fpl.debug.ni_pathlen = ndp->ni_pathlen; 5766 #endif 5767 fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; 5768 fpl.fsearch = false; 5769 fpl.savename = (cnp->cn_flags & SAVENAME) != 0; 5770 fpl.tvp = NULL; /* for degenerate path handling */ 5771 fpl.pwd = pwdp; 5772 pwd = pwd_get_smr(); 5773 *(fpl.pwd) = pwd; 5774 ndp->ni_rootdir = pwd->pwd_rdir; 5775 ndp->ni_topdir = pwd->pwd_jdir; 5776 5777 if (cnp->cn_pnbuf[0] == '/') { 5778 dvp = cache_fpl_handle_root(&fpl); 5779 
MPASS(ndp->ni_resflags == 0); 5780 ndp->ni_resflags = NIRES_ABS; 5781 } else { 5782 if (ndp->ni_dirfd == AT_FDCWD) { 5783 dvp = pwd->pwd_cdir; 5784 } else { 5785 error = cache_fplookup_dirfd(&fpl, &dvp); 5786 if (__predict_false(error != 0)) { 5787 goto out; 5788 } 5789 } 5790 } 5791 5792 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); 5793 error = cache_fplookup_impl(dvp, &fpl); 5794 out: 5795 cache_fpl_smr_assert_not_entered(&fpl); 5796 cache_fpl_assert_status(&fpl); 5797 *status = fpl.status; 5798 if (SDT_PROBES_ENABLED()) { 5799 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 5800 if (fpl.status == CACHE_FPL_STATUS_HANDLED) 5801 SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true, 5802 ndp); 5803 } 5804 5805 if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) { 5806 MPASS(error != CACHE_FPL_FAILED); 5807 if (error != 0) { 5808 MPASS(fpl.dvp == NULL); 5809 MPASS(fpl.tvp == NULL); 5810 MPASS(fpl.savename == false); 5811 } 5812 ndp->ni_dvp = fpl.dvp; 5813 ndp->ni_vp = fpl.tvp; 5814 if (fpl.savename) { 5815 cnp->cn_flags |= HASBUF; 5816 } else { 5817 cache_fpl_cleanup_cnp(cnp); 5818 } 5819 } 5820 return (error); 5821 } 5822