1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ddb.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/capsicum.h> 46 #include <sys/counter.h> 47 #include <sys/filedesc.h> 48 #include <sys/fnv_hash.h> 49 #include <sys/kernel.h> 50 #include <sys/ktr.h> 51 #include <sys/lock.h> 52 #include <sys/malloc.h> 53 #include <sys/fcntl.h> 54 #include <sys/jail.h> 55 #include <sys/mount.h> 56 #include <sys/namei.h> 57 #include <sys/proc.h> 58 #include <sys/seqc.h> 59 #include <sys/sdt.h> 60 #include <sys/smr.h> 61 #include <sys/smp.h> 62 #include <sys/syscallsubr.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysproto.h> 65 #include <sys/vnode.h> 66 #include <ck_queue.h> 67 #ifdef KTRACE 68 #include <sys/ktrace.h> 69 #endif 70 #ifdef INVARIANTS 71 #include <machine/_inttypes.h> 72 #endif 73 74 #include <sys/capsicum.h> 75 76 #include <security/audit/audit.h> 77 #include <security/mac/mac_framework.h> 78 79 #ifdef DDB 80 #include <ddb/ddb.h> 81 #endif 82 83 #include <vm/uma.h> 84 85 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 86 "Name cache"); 87 88 SDT_PROVIDER_DECLARE(vfs); 89 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 90 "struct vnode *"); 91 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *", 92 "struct vnode *"); 93 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 94 "char *"); 95 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *", 96 "const char *"); 97 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *", 98 "struct namecache *", "int", "int"); 99 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 100 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 101 "char *", "struct vnode *"); 102 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 103 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 104 "struct vnode *", "char *"); 105 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 106 "struct vnode *"); 107 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 108 "struct vnode *", "char *"); 109 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 110 "char *"); 111 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", 112 "struct componentname *"); 113 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", 114 "struct componentname *"); 115 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 116 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int"); 117 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 118 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 119 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 120 "struct vnode *"); 121 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 122 "char *"); 123 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *", 124 "char *"); 125 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t"); 126 127 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); 128 SDT_PROBE_DECLARE(vfs, namei, lookup, entry); 129 SDT_PROBE_DECLARE(vfs, namei, lookup, return); 130 131 /* 132 * This structure describes the elements in the cache of recent 133 * names looked up by namei. 134 */ 135 struct negstate { 136 u_char neg_flag; 137 u_char neg_hit; 138 }; 139 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), 140 "the state must fit in a union with a pointer without growing it"); 141 142 struct namecache { 143 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 144 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 145 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */ 146 struct vnode *nc_dvp; /* vnode of parent of name */ 147 union { 148 struct vnode *nu_vp; /* vnode the name refers to */ 149 struct negstate nu_neg;/* negative entry state */ 150 } n_un; 151 u_char nc_flag; /* flag bits */ 152 u_char nc_nlen; /* length of name */ 153 char nc_name[0]; /* segment name + nul */ 154 }; 155 156 /* 157 * struct namecache_ts repeats struct namecache layout up to the 158 * nc_nlen member. 159 * struct namecache_ts is used in place of struct namecache when time(s) need 160 * to be stored. The nc_dotdottime field is used when a cache entry is mapping 161 * both a non-dotdot directory name plus dotdot for the directory's 162 * parent. 163 * 164 * See below for alignment requirement. 165 */ 166 struct namecache_ts { 167 struct timespec nc_time; /* timespec provided by fs */ 168 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 169 int nc_ticks; /* ticks value when entry was added */ 170 int nc_pad; 171 struct namecache nc_nc; 172 }; 173 174 TAILQ_HEAD(cache_freebatch, namecache); 175 176 /* 177 * At least mips n32 performs 64-bit accesses to timespec as found 178 * in namecache_ts and requires them to be aligned. Since others 179 * may be in the same spot suffer a little bit and enforce the 180 * alignment for everyone. Note this is a nop for 64-bit platforms. 181 */ 182 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) 183 184 /* 185 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the 186 * 4.4 BSD codebase. Later on struct namecache was tweaked to become 187 * smaller and the value was bumped to retain the total size, but it 188 * was never re-evaluated for suitability. A simple test counting 189 * lengths during package building shows that the value of 45 covers 190 * about 86% of all added entries, reaching 99% at 65. 191 * 192 * Regardless of the above, use of dedicated zones instead of malloc may be 193 * inducing additional waste. This may be hard to address as said zones are 194 * tied to VFS SMR. Even if retaining them, the current split should be 195 * re-evaluated. 196 */ 197 #ifdef __LP64__ 198 #define CACHE_PATH_CUTOFF 45 199 #define CACHE_LARGE_PAD 6 200 #else 201 #define CACHE_PATH_CUTOFF 41 202 #define CACHE_LARGE_PAD 2 203 #endif 204 205 #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1) 206 #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE) 207 #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD) 208 #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE) 209 210 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 211 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 212 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 213 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 214 215 #define nc_vp n_un.nu_vp 216 #define nc_neg n_un.nu_neg 217 218 /* 219 * Flags in namecache.nc_flag 220 */ 221 #define NCF_WHITE 0x01 222 #define NCF_ISDOTDOT 0x02 223 #define NCF_TS 0x04 224 #define NCF_DTS 0x08 225 #define NCF_DVDROP 0x10 226 #define NCF_NEGATIVE 0x20 227 #define NCF_INVALID 0x40 228 #define NCF_WIP 0x80 229 230 /* 231 * Flags in negstate.neg_flag 232 */ 233 #define NEG_HOT 0x01 234 235 static bool cache_neg_evict_cond(u_long lnumcache); 236 237 /* 238 * Mark an entry as invalid. 239 * 240 * This is called before it starts getting deconstructed. 241 */ 242 static void 243 cache_ncp_invalidate(struct namecache *ncp) 244 { 245 246 KASSERT((ncp->nc_flag & NCF_INVALID) == 0, 247 ("%s: entry %p already invalid", __func__, ncp)); 248 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); 249 atomic_thread_fence_rel(); 250 } 251 252 /* 253 * Check whether the entry can be safely used. 254 * 255 * All places which elide locks are supposed to call this after they are 256 * done with reading from an entry. 257 */ 258 #define cache_ncp_canuse(ncp) ({ \ 259 struct namecache *_ncp = (ncp); \ 260 u_char _nc_flag; \ 261 \ 262 atomic_thread_fence_acq(); \ 263 _nc_flag = atomic_load_char(&_ncp->nc_flag); \ 264 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \ 265 }) 266 267 /* 268 * Like the above but also checks NCF_WHITE. 269 */ 270 #define cache_fpl_neg_ncp_canuse(ncp) ({ \ 271 struct namecache *_ncp = (ncp); \ 272 u_char _nc_flag; \ 273 \ 274 atomic_thread_fence_acq(); \ 275 _nc_flag = atomic_load_char(&_ncp->nc_flag); \ 276 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0); \ 277 }) 278 279 VFS_SMR_DECLARE; 280 281 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 282 "Name cache parameters"); 283 284 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ 285 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0, 286 "Total namecache capacity"); 287 288 u_int ncsizefactor = 2; 289 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0, 290 "Size factor for namecache"); 291 292 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ 293 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0, 294 "Ratio of negative namecache entries"); 295 296 /* 297 * Negative entry % of namecache capacity above which automatic eviction is allowed. 298 * 299 * Check cache_neg_evict_cond for details. 300 */ 301 static u_int ncnegminpct = 3; 302 303 static u_int __read_mostly neg_min; /* the above recomputed against ncsize */ 304 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0, 305 "Negative entry count above which automatic eviction is allowed"); 306 307 /* 308 * Structures associated with name caching. 309 */ 310 #define NCHHASH(hash) \ 311 (&nchashtbl[(hash) & nchash]) 312 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 313 static u_long __read_mostly nchash; /* size of hash table */ 314 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 315 "Size of namecache hash table"); 316 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 317 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 318 319 struct nchstats nchstats; /* cache effectiveness statistics */ 320 321 static bool __read_frequently cache_fast_revlookup = true; 322 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW, 323 &cache_fast_revlookup, 0, ""); 324 325 static bool __read_mostly cache_rename_add = true; 326 SYSCTL_BOOL(_vfs, OID_AUTO, cache_rename_add, CTLFLAG_RW, 327 &cache_rename_add, 0, ""); 328 329 static u_int __exclusive_cache_line neg_cycle; 330 331 #define ncneghash 3 332 #define numneglists (ncneghash + 1) 333 334 struct neglist { 335 struct mtx nl_evict_lock; 336 struct mtx nl_lock __aligned(CACHE_LINE_SIZE); 337 TAILQ_HEAD(, namecache) nl_list; 338 TAILQ_HEAD(, namecache) nl_hotlist; 339 u_long nl_hotnum; 340 } __aligned(CACHE_LINE_SIZE); 341 342 static struct neglist neglists[numneglists]; 343 344 static inline struct neglist * 345 NCP2NEGLIST(struct namecache *ncp) 346 { 347 348 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 349 } 350 351 static inline struct negstate * 352 NCP2NEGSTATE(struct namecache *ncp) 353 { 354 355 MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE); 356 return (&ncp->nc_neg); 357 } 358 359 #define numbucketlocks (ncbuckethash + 1) 360 static u_int __read_mostly ncbuckethash; 361 static struct mtx_padalign __read_mostly *bucketlocks; 362 #define HASH2BUCKETLOCK(hash) \ 363 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)])) 364 365 #define numvnodelocks (ncvnodehash + 1) 366 static u_int __read_mostly ncvnodehash; 367 static struct mtx __read_mostly *vnodelocks; 368 static inline struct mtx * 369 VP2VNODELOCK(struct vnode *vp) 370 { 371 372 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 373 } 374 375 static void 376 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 377 { 378 struct namecache_ts *ncp_ts; 379 380 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 381 (tsp == NULL && ticksp == NULL), 382 ("No NCF_TS")); 383 384 if (tsp == NULL) 385 return; 386 387 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 388 *tsp = ncp_ts->nc_time; 389 *ticksp = ncp_ts->nc_ticks; 390 } 391 392 #ifdef DEBUG_CACHE 393 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 394 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 395 "VFS namecache enabled"); 396 #endif 397 398 /* Export size information to userland */ 399 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 400 sizeof(struct namecache), "sizeof(struct namecache)"); 401 402 /* 403 * The new name cache statistics 404 */ 405 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 406 "Name cache statistics"); 407 408 #define STATNODE_ULONG(name, varname, descr) \ 409 SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 410 #define STATNODE_COUNTER(name, varname, descr) \ 411 static COUNTER_U64_DEFINE_EARLY(varname); \ 412 SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \ 413 descr); 414 STATNODE_ULONG(neg, numneg, "Number of negative cache entries"); 415 STATNODE_ULONG(count, numcache, "Number of cache entries"); 416 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held"); 417 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit"); 418 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits"); 419 STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits"); 420 STATNODE_COUNTER(miss, nummiss, "Number of cache misses"); 421 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache"); 422 STATNODE_COUNTER(posszaps, numposzaps, 423 "Number of cache hits (positive) we do not want to cache"); 424 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)"); 425 STATNODE_COUNTER(negzaps, numnegzaps, 426 "Number of cache hits (negative) we do not want to cache"); 427 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)"); 428 /* These count for vn_getcwd(), too. */ 429 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls"); 430 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 431 STATNODE_COUNTER(fullpathfail2, numfullpathfail2, 432 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 433 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 434 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls"); 435 STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache"); 436 437 /* 438 * Debug or developer statistics. 439 */ 440 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 441 "Name cache debugging"); 442 #define DEBUGNODE_ULONG(name, varname, descr) \ 443 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); 444 #define DEBUGNODE_COUNTER(name, varname, descr) \ 445 static COUNTER_U64_DEFINE_EARLY(varname); \ 446 SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \ 447 descr); 448 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success, 449 "Number of successful removals after relocking"); 450 static long zap_bucket_fail; 451 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, ""); 452 static long zap_bucket_fail2; 453 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, ""); 454 static long cache_lock_vnodes_cel_3_failures; 455 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures, 456 "Number of times 3-way vnode locking failed"); 457 458 static void cache_zap_locked(struct namecache *ncp); 459 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, 460 char **freebuf, size_t *buflen); 461 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 462 char **retbuf, size_t *buflen, size_t addend); 463 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 464 char **retbuf, size_t *buflen); 465 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 466 char **retbuf, size_t *len, size_t addend); 467 468 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 469 470 static inline void 471 cache_assert_vlp_locked(struct mtx *vlp) 472 { 473 474 if (vlp != NULL) 475 mtx_assert(vlp, MA_OWNED); 476 } 477 478 static inline void 479 cache_assert_vnode_locked(struct vnode *vp) 480 { 481 struct mtx *vlp; 482 483 vlp = VP2VNODELOCK(vp); 484 cache_assert_vlp_locked(vlp); 485 } 486 487 /* 488 * Directory vnodes with entries are held for two reasons: 489 * 1. make them less of a target for reclamation in vnlru 490 * 2. suffer smaller performance penalty in locked lookup as requeieing is avoided 491 * 492 * It will be feasible to stop doing it altogether if all filesystems start 493 * supporting lockless lookup. 494 */ 495 static void 496 cache_hold_vnode(struct vnode *vp) 497 { 498 499 cache_assert_vnode_locked(vp); 500 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); 501 vhold(vp); 502 counter_u64_add(numcachehv, 1); 503 } 504 505 static void 506 cache_drop_vnode(struct vnode *vp) 507 { 508 509 /* 510 * Called after all locks are dropped, meaning we can't assert 511 * on the state of v_cache_src. 512 */ 513 vdrop(vp); 514 counter_u64_add(numcachehv, -1); 515 } 516 517 /* 518 * UMA zones. 519 */ 520 static uma_zone_t __read_mostly cache_zone_small; 521 static uma_zone_t __read_mostly cache_zone_small_ts; 522 static uma_zone_t __read_mostly cache_zone_large; 523 static uma_zone_t __read_mostly cache_zone_large_ts; 524 525 char * 526 cache_symlink_alloc(size_t size, int flags) 527 { 528 529 if (size < CACHE_ZONE_SMALL_SIZE) { 530 return (uma_zalloc_smr(cache_zone_small, flags)); 531 } 532 if (size < CACHE_ZONE_LARGE_SIZE) { 533 return (uma_zalloc_smr(cache_zone_large, flags)); 534 } 535 counter_u64_add(symlinktoobig, 1); 536 SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size); 537 return (NULL); 538 } 539 540 void 541 cache_symlink_free(char *string, size_t size) 542 { 543 544 MPASS(string != NULL); 545 KASSERT(size < CACHE_ZONE_LARGE_SIZE, 546 ("%s: size %zu too big", __func__, size)); 547 548 if (size < CACHE_ZONE_SMALL_SIZE) { 549 uma_zfree_smr(cache_zone_small, string); 550 return; 551 } 552 if (size < CACHE_ZONE_LARGE_SIZE) { 553 uma_zfree_smr(cache_zone_large, string); 554 return; 555 } 556 __assert_unreachable(); 557 } 558 559 static struct namecache * 560 cache_alloc_uma(int len, bool ts) 561 { 562 struct namecache_ts *ncp_ts; 563 struct namecache *ncp; 564 565 if (__predict_false(ts)) { 566 if (len <= CACHE_PATH_CUTOFF) 567 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); 568 else 569 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); 570 ncp = &ncp_ts->nc_nc; 571 } else { 572 if (len <= CACHE_PATH_CUTOFF) 573 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); 574 else 575 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); 576 } 577 return (ncp); 578 } 579 580 static void 581 cache_free_uma(struct namecache *ncp) 582 { 583 struct namecache_ts *ncp_ts; 584 585 if (__predict_false(ncp->nc_flag & NCF_TS)) { 586 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 587 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 588 uma_zfree_smr(cache_zone_small_ts, ncp_ts); 589 else 590 uma_zfree_smr(cache_zone_large_ts, ncp_ts); 591 } else { 592 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 593 uma_zfree_smr(cache_zone_small, ncp); 594 else 595 uma_zfree_smr(cache_zone_large, ncp); 596 } 597 } 598 599 static struct namecache * 600 cache_alloc(int len, bool ts) 601 { 602 u_long lnumcache; 603 604 /* 605 * Avoid blowout in namecache entries. 606 * 607 * Bugs: 608 * 1. filesystems may end up trying to add an already existing entry 609 * (for example this can happen after a cache miss during concurrent 610 * lookup), in which case we will call cache_neg_evict despite not 611 * adding anything. 612 * 2. the routine may fail to free anything and no provisions are made 613 * to make it try harder (see the inside for failure modes) 614 * 3. it only ever looks at negative entries. 615 */ 616 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 617 if (cache_neg_evict_cond(lnumcache)) { 618 lnumcache = atomic_load_long(&numcache); 619 } 620 if (__predict_false(lnumcache >= ncsize)) { 621 atomic_subtract_long(&numcache, 1); 622 counter_u64_add(numdrops, 1); 623 return (NULL); 624 } 625 return (cache_alloc_uma(len, ts)); 626 } 627 628 static void 629 cache_free(struct namecache *ncp) 630 { 631 632 MPASS(ncp != NULL); 633 if ((ncp->nc_flag & NCF_DVDROP) != 0) { 634 cache_drop_vnode(ncp->nc_dvp); 635 } 636 cache_free_uma(ncp); 637 atomic_subtract_long(&numcache, 1); 638 } 639 640 static void 641 cache_free_batch(struct cache_freebatch *batch) 642 { 643 struct namecache *ncp, *nnp; 644 int i; 645 646 i = 0; 647 if (TAILQ_EMPTY(batch)) 648 goto out; 649 TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) { 650 if ((ncp->nc_flag & NCF_DVDROP) != 0) { 651 cache_drop_vnode(ncp->nc_dvp); 652 } 653 cache_free_uma(ncp); 654 i++; 655 } 656 atomic_subtract_long(&numcache, i); 657 out: 658 SDT_PROBE1(vfs, namecache, purge, batch, i); 659 } 660 661 /* 662 * Hashing. 663 * 664 * The code was made to use FNV in 2001 and this choice needs to be revisited. 665 * 666 * Short summary of the difficulty: 667 * The longest name which can be inserted is NAME_MAX characters in length (or 668 * 255 at the time of writing this comment), while majority of names used in 669 * practice are significantly shorter (mostly below 10). More importantly 670 * majority of lookups performed find names are even shorter than that. 671 * 672 * This poses a problem where hashes which do better than FNV past word size 673 * (or so) tend to come with additional overhead when finalizing the result, 674 * making them noticeably slower for the most commonly used range. 675 * 676 * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c 677 * 678 * When looking it up the most time consuming part by a large margin (at least 679 * on amd64) is hashing. Replacing FNV with something which pessimizes short 680 * input would make the slowest part stand out even more. 681 */ 682 683 /* 684 * TODO: With the value stored we can do better than computing the hash based 685 * on the address. 686 */ 687 static void 688 cache_prehash(struct vnode *vp) 689 { 690 691 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); 692 } 693 694 static uint32_t 695 cache_get_hash(char *name, u_char len, struct vnode *dvp) 696 { 697 698 return (fnv_32_buf(name, len, dvp->v_nchash)); 699 } 700 701 static uint32_t 702 cache_get_hash_iter_start(struct vnode *dvp) 703 { 704 705 return (dvp->v_nchash); 706 } 707 708 static uint32_t 709 cache_get_hash_iter(char c, uint32_t hash) 710 { 711 712 return (fnv_32_buf(&c, 1, hash)); 713 } 714 715 static uint32_t 716 cache_get_hash_iter_finish(uint32_t hash) 717 { 718 719 return (hash); 720 } 721 722 static inline struct nchashhead * 723 NCP2BUCKET(struct namecache *ncp) 724 { 725 uint32_t hash; 726 727 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 728 return (NCHHASH(hash)); 729 } 730 731 static inline struct mtx * 732 NCP2BUCKETLOCK(struct namecache *ncp) 733 { 734 uint32_t hash; 735 736 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 737 return (HASH2BUCKETLOCK(hash)); 738 } 739 740 #ifdef INVARIANTS 741 static void 742 cache_assert_bucket_locked(struct namecache *ncp) 743 { 744 struct mtx *blp; 745 746 blp = NCP2BUCKETLOCK(ncp); 747 mtx_assert(blp, MA_OWNED); 748 } 749 750 static void 751 cache_assert_bucket_unlocked(struct namecache *ncp) 752 { 753 struct mtx *blp; 754 755 blp = NCP2BUCKETLOCK(ncp); 756 mtx_assert(blp, MA_NOTOWNED); 757 } 758 #else 759 #define cache_assert_bucket_locked(x) do { } while (0) 760 #define cache_assert_bucket_unlocked(x) do { } while (0) 761 #endif 762 763 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 764 static void 765 _cache_sort_vnodes(void **p1, void **p2) 766 { 767 void *tmp; 768 769 MPASS(*p1 != NULL || *p2 != NULL); 770 771 if (*p1 > *p2) { 772 tmp = *p2; 773 *p2 = *p1; 774 *p1 = tmp; 775 } 776 } 777 778 static void 779 cache_lock_all_buckets(void) 780 { 781 u_int i; 782 783 for (i = 0; i < numbucketlocks; i++) 784 mtx_lock(&bucketlocks[i]); 785 } 786 787 static void 788 cache_unlock_all_buckets(void) 789 { 790 u_int i; 791 792 for (i = 0; i < numbucketlocks; i++) 793 mtx_unlock(&bucketlocks[i]); 794 } 795 796 static void 797 cache_lock_all_vnodes(void) 798 { 799 u_int i; 800 801 for (i = 0; i < numvnodelocks; i++) 802 mtx_lock(&vnodelocks[i]); 803 } 804 805 static void 806 cache_unlock_all_vnodes(void) 807 { 808 u_int i; 809 810 for (i = 0; i < numvnodelocks; i++) 811 mtx_unlock(&vnodelocks[i]); 812 } 813 814 static int 815 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 816 { 817 818 cache_sort_vnodes(&vlp1, &vlp2); 819 820 if (vlp1 != NULL) { 821 if (!mtx_trylock(vlp1)) 822 return (EAGAIN); 823 } 824 if (!mtx_trylock(vlp2)) { 825 if (vlp1 != NULL) 826 mtx_unlock(vlp1); 827 return (EAGAIN); 828 } 829 830 return (0); 831 } 832 833 static void 834 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 835 { 836 837 MPASS(vlp1 != NULL || vlp2 != NULL); 838 MPASS(vlp1 <= vlp2); 839 840 if (vlp1 != NULL) 841 mtx_lock(vlp1); 842 if (vlp2 != NULL) 843 mtx_lock(vlp2); 844 } 845 846 static void 847 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 848 { 849 850 MPASS(vlp1 != NULL || vlp2 != NULL); 851 852 if (vlp1 != NULL) 853 mtx_unlock(vlp1); 854 if (vlp2 != NULL) 855 mtx_unlock(vlp2); 856 } 857 858 static int 859 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 860 { 861 struct nchstats snap; 862 863 if (req->oldptr == NULL) 864 return (SYSCTL_OUT(req, 0, sizeof(snap))); 865 866 snap = nchstats; 867 snap.ncs_goodhits = counter_u64_fetch(numposhits); 868 snap.ncs_neghits = counter_u64_fetch(numneghits); 869 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 870 counter_u64_fetch(numnegzaps); 871 snap.ncs_miss = counter_u64_fetch(nummisszap) + 872 counter_u64_fetch(nummiss); 873 874 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 875 } 876 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 877 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 878 "VFS cache effectiveness statistics"); 879 880 static void 881 cache_recalc_neg_min(u_int val) 882 { 883 884 neg_min = (ncsize * val) / 100; 885 } 886 887 static int 888 sysctl_negminpct(SYSCTL_HANDLER_ARGS) 889 { 890 u_int val; 891 int error; 892 893 val = ncnegminpct; 894 error = sysctl_handle_int(oidp, &val, 0, req); 895 if (error != 0 || req->newptr == NULL) 896 return (error); 897 898 if (val == ncnegminpct) 899 return (0); 900 if (val < 0 || val > 99) 901 return (EINVAL); 902 ncnegminpct = val; 903 cache_recalc_neg_min(val); 904 return (0); 905 } 906 907 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct, 908 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct, 909 "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed"); 910 911 #ifdef DIAGNOSTIC 912 /* 913 * Grab an atomic snapshot of the name cache hash chain lengths 914 */ 915 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 916 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 917 "hash table stats"); 918 919 static int 920 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 921 { 922 struct nchashhead *ncpp; 923 struct namecache *ncp; 924 int i, error, n_nchash, *cntbuf; 925 926 retry: 927 n_nchash = nchash + 1; /* nchash is max index, not count */ 928 if (req->oldptr == NULL) 929 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 930 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 931 cache_lock_all_buckets(); 932 if (n_nchash != nchash + 1) { 933 cache_unlock_all_buckets(); 934 free(cntbuf, M_TEMP); 935 goto retry; 936 } 937 /* Scan hash tables counting entries */ 938 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 939 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) 940 cntbuf[i]++; 941 cache_unlock_all_buckets(); 942 for (error = 0, i = 0; i < n_nchash; i++) 943 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 944 break; 945 free(cntbuf, M_TEMP); 946 return (error); 947 } 948 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 949 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 950 "nchash chain lengths"); 951 952 static int 953 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 954 { 955 int error; 956 struct nchashhead *ncpp; 957 struct namecache *ncp; 958 int n_nchash; 959 int count, maxlength, used, pct; 960 961 if (!req->oldptr) 962 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 963 964 cache_lock_all_buckets(); 965 n_nchash = nchash + 1; /* nchash is max index, not count */ 966 used = 0; 967 maxlength = 0; 968 969 /* Scan hash tables for applicable entries */ 970 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 971 count = 0; 972 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { 973 count++; 974 } 975 if (count) 976 used++; 977 if (maxlength < count) 978 maxlength = count; 979 } 980 n_nchash = nchash + 1; 981 cache_unlock_all_buckets(); 982 pct = (used * 100) / (n_nchash / 100); 983 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 984 if (error) 985 return (error); 986 error = SYSCTL_OUT(req, &used, sizeof(used)); 987 if (error) 988 return (error); 989 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 990 if (error) 991 return (error); 992 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 993 if (error) 994 return (error); 995 return (0); 996 } 997 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 998 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 999 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 1000 #endif 1001 1002 /* 1003 * Negative entries management 1004 * 1005 * Various workloads create plenty of negative entries and barely use them 1006 * afterwards. Moreover malicious users can keep performing bogus lookups 1007 * adding even more entries. For example "make tinderbox" as of writing this 1008 * comment ends up with 2.6M namecache entries in total, 1.2M of which are 1009 * negative. 1010 * 1011 * As such, a rather aggressive eviction method is needed. The currently 1012 * employed method is a placeholder. 1013 * 1014 * Entries are split over numneglists separate lists, each of which is further 1015 * split into hot and cold entries. Entries get promoted after getting a hit. 1016 * Eviction happens on addition of new entry. 1017 */ 1018 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1019 "Name cache negative entry statistics"); 1020 1021 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0, 1022 "Number of negative cache entries"); 1023 1024 static COUNTER_U64_DEFINE_EARLY(neg_created); 1025 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created, 1026 "Number of created negative entries"); 1027 1028 static COUNTER_U64_DEFINE_EARLY(neg_evicted); 1029 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted, 1030 "Number of evicted negative entries"); 1031 1032 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty); 1033 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD, 1034 &neg_evict_skipped_empty, 1035 "Number of times evicting failed due to lack of entries"); 1036 1037 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed); 1038 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD, 1039 &neg_evict_skipped_missed, 1040 "Number of times evicting failed due to target entry disappearing"); 1041 1042 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended); 1043 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD, 1044 &neg_evict_skipped_contended, 1045 "Number of times evicting failed due to contention"); 1046 1047 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits, 1048 "Number of cache hits (negative)"); 1049 1050 static int 1051 sysctl_neg_hot(SYSCTL_HANDLER_ARGS) 1052 { 1053 int i, out; 1054 1055 out = 0; 1056 for (i = 0; i < numneglists; i++) 1057 out += neglists[i].nl_hotnum; 1058 1059 return (SYSCTL_OUT(req, &out, sizeof(out))); 1060 } 1061 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD | 1062 CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I", 1063 "Number of hot negative entries"); 1064 1065 static void 1066 cache_neg_init(struct namecache *ncp) 1067 { 1068 struct negstate *ns; 1069 1070 ncp->nc_flag |= NCF_NEGATIVE; 1071 ns = NCP2NEGSTATE(ncp); 1072 ns->neg_flag = 0; 1073 ns->neg_hit = 0; 1074 counter_u64_add(neg_created, 1); 1075 } 1076 1077 #define CACHE_NEG_PROMOTION_THRESH 2 1078 1079 static bool 1080 cache_neg_hit_prep(struct namecache *ncp) 1081 { 1082 struct negstate *ns; 1083 u_char n; 1084 1085 ns = NCP2NEGSTATE(ncp); 1086 n = atomic_load_char(&ns->neg_hit); 1087 for (;;) { 1088 if (n >= CACHE_NEG_PROMOTION_THRESH) 1089 return (false); 1090 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1)) 1091 break; 1092 } 1093 return (n + 1 == CACHE_NEG_PROMOTION_THRESH); 1094 } 1095 1096 /* 1097 * Nothing to do here but it is provided for completeness as some 1098 * cache_neg_hit_prep callers may end up returning without even 1099 * trying to promote. 1100 */ 1101 #define cache_neg_hit_abort(ncp) do { } while (0) 1102 1103 static void 1104 cache_neg_hit_finish(struct namecache *ncp) 1105 { 1106 1107 SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name); 1108 counter_u64_add(numneghits, 1); 1109 } 1110 1111 /* 1112 * Move a negative entry to the hot list. 1113 */ 1114 static void 1115 cache_neg_promote_locked(struct namecache *ncp) 1116 { 1117 struct neglist *nl; 1118 struct negstate *ns; 1119 1120 ns = NCP2NEGSTATE(ncp); 1121 nl = NCP2NEGLIST(ncp); 1122 mtx_assert(&nl->nl_lock, MA_OWNED); 1123 if ((ns->neg_flag & NEG_HOT) == 0) { 1124 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 1125 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst); 1126 nl->nl_hotnum++; 1127 ns->neg_flag |= NEG_HOT; 1128 } 1129 } 1130 1131 /* 1132 * Move a hot negative entry to the cold list. 1133 */ 1134 static void 1135 cache_neg_demote_locked(struct namecache *ncp) 1136 { 1137 struct neglist *nl; 1138 struct negstate *ns; 1139 1140 ns = NCP2NEGSTATE(ncp); 1141 nl = NCP2NEGLIST(ncp); 1142 mtx_assert(&nl->nl_lock, MA_OWNED); 1143 MPASS(ns->neg_flag & NEG_HOT); 1144 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 1145 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 1146 nl->nl_hotnum--; 1147 ns->neg_flag &= ~NEG_HOT; 1148 atomic_store_char(&ns->neg_hit, 0); 1149 } 1150 1151 /* 1152 * Move a negative entry to the hot list if it matches the lookup. 1153 * 1154 * We have to take locks, but they may be contended and in the worst 1155 * case we may need to go off CPU. We don't want to spin within the 1156 * smr section and we can't block with it. Exiting the section means 1157 * the found entry could have been evicted. We are going to look it 1158 * up again. 1159 */ 1160 static bool 1161 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp, 1162 struct namecache *oncp, uint32_t hash) 1163 { 1164 struct namecache *ncp; 1165 struct neglist *nl; 1166 u_char nc_flag; 1167 1168 nl = NCP2NEGLIST(oncp); 1169 1170 mtx_lock(&nl->nl_lock); 1171 /* 1172 * For hash iteration. 1173 */ 1174 vfs_smr_enter(); 1175 1176 /* 1177 * Avoid all surprises by only succeeding if we got the same entry and 1178 * bailing completely otherwise. 1179 * XXX There are no provisions to keep the vnode around, meaning we may 1180 * end up promoting a negative entry for a *new* vnode and returning 1181 * ENOENT on its account. This is the error we want to return anyway 1182 * and promotion is harmless. 1183 * 1184 * In particular at this point there can be a new ncp which matches the 1185 * search but hashes to a different neglist. 1186 */ 1187 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1188 if (ncp == oncp) 1189 break; 1190 } 1191 1192 /* 1193 * No match to begin with. 1194 */ 1195 if (__predict_false(ncp == NULL)) { 1196 goto out_abort; 1197 } 1198 1199 /* 1200 * The newly found entry may be something different... 1201 */ 1202 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1203 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 1204 goto out_abort; 1205 } 1206 1207 /* 1208 * ... and not even negative. 1209 */ 1210 nc_flag = atomic_load_char(&ncp->nc_flag); 1211 if ((nc_flag & NCF_NEGATIVE) == 0) { 1212 goto out_abort; 1213 } 1214 1215 if (!cache_ncp_canuse(ncp)) { 1216 goto out_abort; 1217 } 1218 1219 cache_neg_promote_locked(ncp); 1220 cache_neg_hit_finish(ncp); 1221 vfs_smr_exit(); 1222 mtx_unlock(&nl->nl_lock); 1223 return (true); 1224 out_abort: 1225 vfs_smr_exit(); 1226 mtx_unlock(&nl->nl_lock); 1227 return (false); 1228 } 1229 1230 static void 1231 cache_neg_promote(struct namecache *ncp) 1232 { 1233 struct neglist *nl; 1234 1235 nl = NCP2NEGLIST(ncp); 1236 mtx_lock(&nl->nl_lock); 1237 cache_neg_promote_locked(ncp); 1238 mtx_unlock(&nl->nl_lock); 1239 } 1240 1241 static void 1242 cache_neg_insert(struct namecache *ncp) 1243 { 1244 struct neglist *nl; 1245 1246 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1247 cache_assert_bucket_locked(ncp); 1248 nl = NCP2NEGLIST(ncp); 1249 mtx_lock(&nl->nl_lock); 1250 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 1251 mtx_unlock(&nl->nl_lock); 1252 atomic_add_long(&numneg, 1); 1253 } 1254 1255 static void 1256 cache_neg_remove(struct namecache *ncp) 1257 { 1258 struct neglist *nl; 1259 struct negstate *ns; 1260 1261 cache_assert_bucket_locked(ncp); 1262 nl = NCP2NEGLIST(ncp); 1263 ns = NCP2NEGSTATE(ncp); 1264 mtx_lock(&nl->nl_lock); 1265 if ((ns->neg_flag & NEG_HOT) != 0) { 1266 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 1267 nl->nl_hotnum--; 1268 } else { 1269 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 1270 } 1271 mtx_unlock(&nl->nl_lock); 1272 atomic_subtract_long(&numneg, 1); 1273 } 1274 1275 static struct neglist * 1276 cache_neg_evict_select_list(void) 1277 { 1278 struct neglist *nl; 1279 u_int c; 1280 1281 c = atomic_fetchadd_int(&neg_cycle, 1) + 1; 1282 nl = &neglists[c % numneglists]; 1283 if (!mtx_trylock(&nl->nl_evict_lock)) { 1284 counter_u64_add(neg_evict_skipped_contended, 1); 1285 return (NULL); 1286 } 1287 return (nl); 1288 } 1289 1290 static struct namecache * 1291 cache_neg_evict_select_entry(struct neglist *nl) 1292 { 1293 struct namecache *ncp, *lncp; 1294 struct negstate *ns, *lns; 1295 int i; 1296 1297 mtx_assert(&nl->nl_evict_lock, MA_OWNED); 1298 mtx_assert(&nl->nl_lock, MA_OWNED); 1299 ncp = TAILQ_FIRST(&nl->nl_list); 1300 if (ncp == NULL) 1301 return (NULL); 1302 lncp = ncp; 1303 lns = NCP2NEGSTATE(lncp); 1304 for (i = 1; i < 4; i++) { 1305 ncp = TAILQ_NEXT(ncp, nc_dst); 1306 if (ncp == NULL) 1307 break; 1308 ns = NCP2NEGSTATE(ncp); 1309 if (ns->neg_hit < lns->neg_hit) { 1310 lncp = ncp; 1311 lns = ns; 1312 } 1313 } 1314 return (lncp); 1315 } 1316 1317 static bool 1318 cache_neg_evict(void) 1319 { 1320 struct namecache *ncp, *ncp2; 1321 struct neglist *nl; 1322 struct vnode *dvp; 1323 struct mtx *dvlp; 1324 struct mtx *blp; 1325 uint32_t hash; 1326 u_char nlen; 1327 bool evicted; 1328 1329 nl = cache_neg_evict_select_list(); 1330 if (nl == NULL) { 1331 return (false); 1332 } 1333 1334 mtx_lock(&nl->nl_lock); 1335 ncp = TAILQ_FIRST(&nl->nl_hotlist); 1336 if (ncp != NULL) { 1337 cache_neg_demote_locked(ncp); 1338 } 1339 ncp = cache_neg_evict_select_entry(nl); 1340 if (ncp == NULL) { 1341 counter_u64_add(neg_evict_skipped_empty, 1); 1342 mtx_unlock(&nl->nl_lock); 1343 mtx_unlock(&nl->nl_evict_lock); 1344 return (false); 1345 } 1346 nlen = ncp->nc_nlen; 1347 dvp = ncp->nc_dvp; 1348 hash = cache_get_hash(ncp->nc_name, nlen, dvp); 1349 dvlp = VP2VNODELOCK(dvp); 1350 blp = HASH2BUCKETLOCK(hash); 1351 mtx_unlock(&nl->nl_lock); 1352 mtx_unlock(&nl->nl_evict_lock); 1353 mtx_lock(dvlp); 1354 mtx_lock(blp); 1355 /* 1356 * Note that since all locks were dropped above, the entry may be 1357 * gone or reallocated to be something else. 1358 */ 1359 CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) { 1360 if (ncp2 == ncp && ncp2->nc_dvp == dvp && 1361 ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0) 1362 break; 1363 } 1364 if (ncp2 == NULL) { 1365 counter_u64_add(neg_evict_skipped_missed, 1); 1366 ncp = NULL; 1367 evicted = false; 1368 } else { 1369 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp)); 1370 MPASS(blp == NCP2BUCKETLOCK(ncp)); 1371 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp, 1372 ncp->nc_name); 1373 cache_zap_locked(ncp); 1374 counter_u64_add(neg_evicted, 1); 1375 evicted = true; 1376 } 1377 mtx_unlock(blp); 1378 mtx_unlock(dvlp); 1379 if (ncp != NULL) 1380 cache_free(ncp); 1381 return (evicted); 1382 } 1383 1384 /* 1385 * Maybe evict a negative entry to create more room. 1386 * 1387 * The ncnegfactor parameter limits what fraction of the total count 1388 * can comprise of negative entries. However, if the cache is just 1389 * warming up this leads to excessive evictions. As such, ncnegminpct 1390 * (recomputed to neg_min) dictates whether the above should be 1391 * applied. 1392 * 1393 * Try evicting if the cache is close to full capacity regardless of 1394 * other considerations. 1395 */ 1396 static bool 1397 cache_neg_evict_cond(u_long lnumcache) 1398 { 1399 u_long lnumneg; 1400 1401 if (ncsize - 1000 < lnumcache) 1402 goto out_evict; 1403 lnumneg = atomic_load_long(&numneg); 1404 if (lnumneg < neg_min) 1405 return (false); 1406 if (lnumneg * ncnegfactor < lnumcache) 1407 return (false); 1408 out_evict: 1409 return (cache_neg_evict()); 1410 } 1411 1412 /* 1413 * cache_zap_locked(): 1414 * 1415 * Removes a namecache entry from cache, whether it contains an actual 1416 * pointer to a vnode or if it is just a negative cache entry. 1417 */ 1418 static void 1419 cache_zap_locked(struct namecache *ncp) 1420 { 1421 struct nchashhead *ncpp; 1422 struct vnode *dvp, *vp; 1423 1424 dvp = ncp->nc_dvp; 1425 vp = ncp->nc_vp; 1426 1427 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1428 cache_assert_vnode_locked(vp); 1429 cache_assert_vnode_locked(dvp); 1430 cache_assert_bucket_locked(ncp); 1431 1432 cache_ncp_invalidate(ncp); 1433 1434 ncpp = NCP2BUCKET(ncp); 1435 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); 1436 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 1437 SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp); 1438 TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst); 1439 if (ncp == vp->v_cache_dd) { 1440 atomic_store_ptr(&vp->v_cache_dd, NULL); 1441 } 1442 } else { 1443 SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name); 1444 cache_neg_remove(ncp); 1445 } 1446 if (ncp->nc_flag & NCF_ISDOTDOT) { 1447 if (ncp == dvp->v_cache_dd) { 1448 atomic_store_ptr(&dvp->v_cache_dd, NULL); 1449 } 1450 } else { 1451 LIST_REMOVE(ncp, nc_src); 1452 if (LIST_EMPTY(&dvp->v_cache_src)) { 1453 ncp->nc_flag |= NCF_DVDROP; 1454 } 1455 } 1456 } 1457 1458 static void 1459 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 1460 { 1461 struct mtx *blp; 1462 1463 MPASS(ncp->nc_dvp == vp); 1464 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1465 cache_assert_vnode_locked(vp); 1466 1467 blp = NCP2BUCKETLOCK(ncp); 1468 mtx_lock(blp); 1469 cache_zap_locked(ncp); 1470 mtx_unlock(blp); 1471 } 1472 1473 static bool 1474 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 1475 struct mtx **vlpp) 1476 { 1477 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1478 struct mtx *blp; 1479 1480 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1481 cache_assert_vnode_locked(vp); 1482 1483 if (ncp->nc_flag & NCF_NEGATIVE) { 1484 if (*vlpp != NULL) { 1485 mtx_unlock(*vlpp); 1486 *vlpp = NULL; 1487 } 1488 cache_zap_negative_locked_vnode_kl(ncp, vp); 1489 return (true); 1490 } 1491 1492 pvlp = VP2VNODELOCK(vp); 1493 blp = NCP2BUCKETLOCK(ncp); 1494 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1495 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1496 1497 if (*vlpp == vlp1 || *vlpp == vlp2) { 1498 to_unlock = *vlpp; 1499 *vlpp = NULL; 1500 } else { 1501 if (*vlpp != NULL) { 1502 mtx_unlock(*vlpp); 1503 *vlpp = NULL; 1504 } 1505 cache_sort_vnodes(&vlp1, &vlp2); 1506 if (vlp1 == pvlp) { 1507 mtx_lock(vlp2); 1508 to_unlock = vlp2; 1509 } else { 1510 if (!mtx_trylock(vlp1)) 1511 goto out_relock; 1512 to_unlock = vlp1; 1513 } 1514 } 1515 mtx_lock(blp); 1516 cache_zap_locked(ncp); 1517 mtx_unlock(blp); 1518 if (to_unlock != NULL) 1519 mtx_unlock(to_unlock); 1520 return (true); 1521 1522 out_relock: 1523 mtx_unlock(vlp2); 1524 mtx_lock(vlp1); 1525 mtx_lock(vlp2); 1526 MPASS(*vlpp == NULL); 1527 *vlpp = vlp1; 1528 return (false); 1529 } 1530 1531 /* 1532 * If trylocking failed we can get here. We know enough to take all needed locks 1533 * in the right order and re-lookup the entry. 1534 */ 1535 static int 1536 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1537 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1538 struct mtx *blp) 1539 { 1540 struct namecache *rncp; 1541 1542 cache_assert_bucket_unlocked(ncp); 1543 1544 cache_sort_vnodes(&dvlp, &vlp); 1545 cache_lock_vnodes(dvlp, vlp); 1546 mtx_lock(blp); 1547 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1548 if (rncp == ncp && rncp->nc_dvp == dvp && 1549 rncp->nc_nlen == cnp->cn_namelen && 1550 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1551 break; 1552 } 1553 if (rncp != NULL) { 1554 cache_zap_locked(rncp); 1555 mtx_unlock(blp); 1556 cache_unlock_vnodes(dvlp, vlp); 1557 counter_u64_add(zap_bucket_relock_success, 1); 1558 return (0); 1559 } 1560 1561 mtx_unlock(blp); 1562 cache_unlock_vnodes(dvlp, vlp); 1563 return (EAGAIN); 1564 } 1565 1566 static int __noinline 1567 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp, 1568 uint32_t hash, struct mtx *blp) 1569 { 1570 struct mtx *dvlp, *vlp; 1571 struct vnode *dvp; 1572 1573 cache_assert_bucket_locked(ncp); 1574 1575 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1576 vlp = NULL; 1577 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1578 vlp = VP2VNODELOCK(ncp->nc_vp); 1579 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1580 cache_zap_locked(ncp); 1581 mtx_unlock(blp); 1582 cache_unlock_vnodes(dvlp, vlp); 1583 return (0); 1584 } 1585 1586 dvp = ncp->nc_dvp; 1587 mtx_unlock(blp); 1588 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1589 } 1590 1591 static __noinline int 1592 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) 1593 { 1594 struct namecache *ncp; 1595 struct mtx *blp; 1596 struct mtx *dvlp, *dvlp2; 1597 uint32_t hash; 1598 int error; 1599 1600 if (cnp->cn_namelen == 2 && 1601 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1602 dvlp = VP2VNODELOCK(dvp); 1603 dvlp2 = NULL; 1604 mtx_lock(dvlp); 1605 retry_dotdot: 1606 ncp = dvp->v_cache_dd; 1607 if (ncp == NULL) { 1608 mtx_unlock(dvlp); 1609 if (dvlp2 != NULL) 1610 mtx_unlock(dvlp2); 1611 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1612 return (0); 1613 } 1614 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1615 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) 1616 goto retry_dotdot; 1617 MPASS(dvp->v_cache_dd == NULL); 1618 mtx_unlock(dvlp); 1619 if (dvlp2 != NULL) 1620 mtx_unlock(dvlp2); 1621 cache_free(ncp); 1622 } else { 1623 atomic_store_ptr(&dvp->v_cache_dd, NULL); 1624 mtx_unlock(dvlp); 1625 if (dvlp2 != NULL) 1626 mtx_unlock(dvlp2); 1627 } 1628 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1629 return (1); 1630 } 1631 1632 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1633 blp = HASH2BUCKETLOCK(hash); 1634 retry: 1635 if (CK_SLIST_EMPTY(NCHHASH(hash))) 1636 goto out_no_entry; 1637 1638 mtx_lock(blp); 1639 1640 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1641 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1642 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1643 break; 1644 } 1645 1646 if (ncp == NULL) { 1647 mtx_unlock(blp); 1648 goto out_no_entry; 1649 } 1650 1651 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1652 if (__predict_false(error != 0)) { 1653 zap_bucket_fail++; 1654 goto retry; 1655 } 1656 counter_u64_add(numposzaps, 1); 1657 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1658 cache_free(ncp); 1659 return (1); 1660 out_no_entry: 1661 counter_u64_add(nummisszap, 1); 1662 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1663 return (0); 1664 } 1665 1666 static int __noinline 1667 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1668 struct timespec *tsp, int *ticksp) 1669 { 1670 int ltype; 1671 1672 *vpp = dvp; 1673 counter_u64_add(dothits, 1); 1674 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1675 if (tsp != NULL) 1676 timespecclear(tsp); 1677 if (ticksp != NULL) 1678 *ticksp = ticks; 1679 vrefact(*vpp); 1680 /* 1681 * When we lookup "." we still can be asked to lock it 1682 * differently... 1683 */ 1684 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1685 if (ltype != VOP_ISLOCKED(*vpp)) { 1686 if (ltype == LK_EXCLUSIVE) { 1687 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1688 if (VN_IS_DOOMED((*vpp))) { 1689 /* forced unmount */ 1690 vrele(*vpp); 1691 *vpp = NULL; 1692 return (ENOENT); 1693 } 1694 } else 1695 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1696 } 1697 return (-1); 1698 } 1699 1700 static int __noinline 1701 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1702 struct timespec *tsp, int *ticksp) 1703 { 1704 struct namecache_ts *ncp_ts; 1705 struct namecache *ncp; 1706 struct mtx *dvlp; 1707 enum vgetstate vs; 1708 int error, ltype; 1709 bool whiteout; 1710 1711 MPASS((cnp->cn_flags & ISDOTDOT) != 0); 1712 1713 if ((cnp->cn_flags & MAKEENTRY) == 0) { 1714 cache_remove_cnp(dvp, cnp); 1715 return (0); 1716 } 1717 1718 counter_u64_add(dotdothits, 1); 1719 retry: 1720 dvlp = VP2VNODELOCK(dvp); 1721 mtx_lock(dvlp); 1722 ncp = dvp->v_cache_dd; 1723 if (ncp == NULL) { 1724 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, ".."); 1725 mtx_unlock(dvlp); 1726 return (0); 1727 } 1728 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1729 if (ncp->nc_flag & NCF_NEGATIVE) 1730 *vpp = NULL; 1731 else 1732 *vpp = ncp->nc_vp; 1733 } else 1734 *vpp = ncp->nc_dvp; 1735 if (*vpp == NULL) 1736 goto negative_success; 1737 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); 1738 cache_out_ts(ncp, tsp, ticksp); 1739 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1740 NCF_DTS && tsp != NULL) { 1741 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1742 *tsp = ncp_ts->nc_dotdottime; 1743 } 1744 1745 MPASS(dvp != *vpp); 1746 ltype = VOP_ISLOCKED(dvp); 1747 VOP_UNLOCK(dvp); 1748 vs = vget_prep(*vpp); 1749 mtx_unlock(dvlp); 1750 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1751 vn_lock(dvp, ltype | LK_RETRY); 1752 if (VN_IS_DOOMED(dvp)) { 1753 if (error == 0) 1754 vput(*vpp); 1755 *vpp = NULL; 1756 return (ENOENT); 1757 } 1758 if (error) { 1759 *vpp = NULL; 1760 goto retry; 1761 } 1762 return (-1); 1763 negative_success: 1764 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1765 if (cnp->cn_flags & ISLASTCN) { 1766 counter_u64_add(numnegzaps, 1); 1767 cache_zap_negative_locked_vnode_kl(ncp, dvp); 1768 mtx_unlock(dvlp); 1769 cache_free(ncp); 1770 return (0); 1771 } 1772 } 1773 1774 whiteout = (ncp->nc_flag & NCF_WHITE); 1775 cache_out_ts(ncp, tsp, ticksp); 1776 if (cache_neg_hit_prep(ncp)) 1777 cache_neg_promote(ncp); 1778 else 1779 cache_neg_hit_finish(ncp); 1780 mtx_unlock(dvlp); 1781 if (whiteout) 1782 cnp->cn_flags |= ISWHITEOUT; 1783 return (ENOENT); 1784 } 1785 1786 /** 1787 * Lookup a name in the name cache 1788 * 1789 * # Arguments 1790 * 1791 * - dvp: Parent directory in which to search. 1792 * - vpp: Return argument. Will contain desired vnode on cache hit. 1793 * - cnp: Parameters of the name search. The most interesting bits of 1794 * the cn_flags field have the following meanings: 1795 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1796 * it up. 1797 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1798 * - tsp: Return storage for cache timestamp. On a successful (positive 1799 * or negative) lookup, tsp will be filled with any timespec that 1800 * was stored when this cache entry was created. However, it will 1801 * be clear for "." entries. 1802 * - ticks: Return storage for alternate cache timestamp. On a successful 1803 * (positive or negative) lookup, it will contain the ticks value 1804 * that was current when the cache entry was created, unless cnp 1805 * was ".". 1806 * 1807 * Either both tsp and ticks have to be provided or neither of them. 1808 * 1809 * # Returns 1810 * 1811 * - -1: A positive cache hit. vpp will contain the desired vnode. 1812 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1813 * to a forced unmount. vpp will not be modified. If the entry 1814 * is a whiteout, then the ISWHITEOUT flag will be set in 1815 * cnp->cn_flags. 1816 * - 0: A cache miss. vpp will not be modified. 1817 * 1818 * # Locking 1819 * 1820 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1821 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1822 * lock is not recursively acquired. 1823 */ 1824 static int __noinline 1825 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1826 struct timespec *tsp, int *ticksp) 1827 { 1828 struct namecache *ncp; 1829 struct mtx *blp; 1830 uint32_t hash; 1831 enum vgetstate vs; 1832 int error; 1833 bool whiteout; 1834 1835 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 1836 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0); 1837 1838 retry: 1839 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1840 blp = HASH2BUCKETLOCK(hash); 1841 mtx_lock(blp); 1842 1843 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1844 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1845 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1846 break; 1847 } 1848 1849 if (__predict_false(ncp == NULL)) { 1850 mtx_unlock(blp); 1851 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); 1852 counter_u64_add(nummiss, 1); 1853 return (0); 1854 } 1855 1856 if (ncp->nc_flag & NCF_NEGATIVE) 1857 goto negative_success; 1858 1859 counter_u64_add(numposhits, 1); 1860 *vpp = ncp->nc_vp; 1861 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1862 cache_out_ts(ncp, tsp, ticksp); 1863 MPASS(dvp != *vpp); 1864 vs = vget_prep(*vpp); 1865 mtx_unlock(blp); 1866 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1867 if (error) { 1868 *vpp = NULL; 1869 goto retry; 1870 } 1871 return (-1); 1872 negative_success: 1873 /* 1874 * We don't get here with regular lookup apart from corner cases. 1875 */ 1876 if (__predict_true(cnp->cn_nameiop == CREATE)) { 1877 if (cnp->cn_flags & ISLASTCN) { 1878 counter_u64_add(numnegzaps, 1); 1879 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1880 if (__predict_false(error != 0)) { 1881 zap_bucket_fail2++; 1882 goto retry; 1883 } 1884 cache_free(ncp); 1885 return (0); 1886 } 1887 } 1888 1889 whiteout = (ncp->nc_flag & NCF_WHITE); 1890 cache_out_ts(ncp, tsp, ticksp); 1891 if (cache_neg_hit_prep(ncp)) 1892 cache_neg_promote(ncp); 1893 else 1894 cache_neg_hit_finish(ncp); 1895 mtx_unlock(blp); 1896 if (whiteout) 1897 cnp->cn_flags |= ISWHITEOUT; 1898 return (ENOENT); 1899 } 1900 1901 int 1902 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1903 struct timespec *tsp, int *ticksp) 1904 { 1905 struct namecache *ncp; 1906 uint32_t hash; 1907 enum vgetstate vs; 1908 int error; 1909 bool whiteout, neg_promote; 1910 u_short nc_flag; 1911 1912 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); 1913 1914 #ifdef DEBUG_CACHE 1915 if (__predict_false(!doingcache)) { 1916 cnp->cn_flags &= ~MAKEENTRY; 1917 return (0); 1918 } 1919 #endif 1920 1921 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1922 if (cnp->cn_namelen == 1) 1923 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1924 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') 1925 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); 1926 } 1927 1928 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 1929 1930 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { 1931 cache_remove_cnp(dvp, cnp); 1932 return (0); 1933 } 1934 1935 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1936 vfs_smr_enter(); 1937 1938 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1939 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1940 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1941 break; 1942 } 1943 1944 if (__predict_false(ncp == NULL)) { 1945 vfs_smr_exit(); 1946 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); 1947 counter_u64_add(nummiss, 1); 1948 return (0); 1949 } 1950 1951 nc_flag = atomic_load_char(&ncp->nc_flag); 1952 if (nc_flag & NCF_NEGATIVE) 1953 goto negative_success; 1954 1955 counter_u64_add(numposhits, 1); 1956 *vpp = ncp->nc_vp; 1957 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1958 cache_out_ts(ncp, tsp, ticksp); 1959 MPASS(dvp != *vpp); 1960 if (!cache_ncp_canuse(ncp)) { 1961 vfs_smr_exit(); 1962 *vpp = NULL; 1963 goto out_fallback; 1964 } 1965 vs = vget_prep_smr(*vpp); 1966 vfs_smr_exit(); 1967 if (__predict_false(vs == VGET_NONE)) { 1968 *vpp = NULL; 1969 goto out_fallback; 1970 } 1971 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1972 if (error) { 1973 *vpp = NULL; 1974 goto out_fallback; 1975 } 1976 return (-1); 1977 negative_success: 1978 if (cnp->cn_nameiop == CREATE) { 1979 if (cnp->cn_flags & ISLASTCN) { 1980 vfs_smr_exit(); 1981 goto out_fallback; 1982 } 1983 } 1984 1985 cache_out_ts(ncp, tsp, ticksp); 1986 whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE); 1987 neg_promote = cache_neg_hit_prep(ncp); 1988 if (!cache_ncp_canuse(ncp)) { 1989 cache_neg_hit_abort(ncp); 1990 vfs_smr_exit(); 1991 goto out_fallback; 1992 } 1993 if (neg_promote) { 1994 vfs_smr_exit(); 1995 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) 1996 goto out_fallback; 1997 } else { 1998 cache_neg_hit_finish(ncp); 1999 vfs_smr_exit(); 2000 } 2001 if (whiteout) 2002 cnp->cn_flags |= ISWHITEOUT; 2003 return (ENOENT); 2004 out_fallback: 2005 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp)); 2006 } 2007 2008 struct celockstate { 2009 struct mtx *vlp[3]; 2010 struct mtx *blp[2]; 2011 }; 2012 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 2013 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 2014 2015 static inline void 2016 cache_celockstate_init(struct celockstate *cel) 2017 { 2018 2019 bzero(cel, sizeof(*cel)); 2020 } 2021 2022 static void 2023 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 2024 struct vnode *dvp) 2025 { 2026 struct mtx *vlp1, *vlp2; 2027 2028 MPASS(cel->vlp[0] == NULL); 2029 MPASS(cel->vlp[1] == NULL); 2030 MPASS(cel->vlp[2] == NULL); 2031 2032 MPASS(vp != NULL || dvp != NULL); 2033 2034 vlp1 = VP2VNODELOCK(vp); 2035 vlp2 = VP2VNODELOCK(dvp); 2036 cache_sort_vnodes(&vlp1, &vlp2); 2037 2038 if (vlp1 != NULL) { 2039 mtx_lock(vlp1); 2040 cel->vlp[0] = vlp1; 2041 } 2042 mtx_lock(vlp2); 2043 cel->vlp[1] = vlp2; 2044 } 2045 2046 static void 2047 cache_unlock_vnodes_cel(struct celockstate *cel) 2048 { 2049 2050 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 2051 2052 if (cel->vlp[0] != NULL) 2053 mtx_unlock(cel->vlp[0]); 2054 if (cel->vlp[1] != NULL) 2055 mtx_unlock(cel->vlp[1]); 2056 if (cel->vlp[2] != NULL) 2057 mtx_unlock(cel->vlp[2]); 2058 } 2059 2060 static bool 2061 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 2062 { 2063 struct mtx *vlp; 2064 bool ret; 2065 2066 cache_assert_vlp_locked(cel->vlp[0]); 2067 cache_assert_vlp_locked(cel->vlp[1]); 2068 MPASS(cel->vlp[2] == NULL); 2069 2070 MPASS(vp != NULL); 2071 vlp = VP2VNODELOCK(vp); 2072 2073 ret = true; 2074 if (vlp >= cel->vlp[1]) { 2075 mtx_lock(vlp); 2076 } else { 2077 if (mtx_trylock(vlp)) 2078 goto out; 2079 cache_lock_vnodes_cel_3_failures++; 2080 cache_unlock_vnodes_cel(cel); 2081 if (vlp < cel->vlp[0]) { 2082 mtx_lock(vlp); 2083 mtx_lock(cel->vlp[0]); 2084 mtx_lock(cel->vlp[1]); 2085 } else { 2086 if (cel->vlp[0] != NULL) 2087 mtx_lock(cel->vlp[0]); 2088 mtx_lock(vlp); 2089 mtx_lock(cel->vlp[1]); 2090 } 2091 ret = false; 2092 } 2093 out: 2094 cel->vlp[2] = vlp; 2095 return (ret); 2096 } 2097 2098 static void 2099 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 2100 struct mtx *blp2) 2101 { 2102 2103 MPASS(cel->blp[0] == NULL); 2104 MPASS(cel->blp[1] == NULL); 2105 2106 cache_sort_vnodes(&blp1, &blp2); 2107 2108 if (blp1 != NULL) { 2109 mtx_lock(blp1); 2110 cel->blp[0] = blp1; 2111 } 2112 mtx_lock(blp2); 2113 cel->blp[1] = blp2; 2114 } 2115 2116 static void 2117 cache_unlock_buckets_cel(struct celockstate *cel) 2118 { 2119 2120 if (cel->blp[0] != NULL) 2121 mtx_unlock(cel->blp[0]); 2122 mtx_unlock(cel->blp[1]); 2123 } 2124 2125 /* 2126 * Lock part of the cache affected by the insertion. 2127 * 2128 * This means vnodelocks for dvp, vp and the relevant bucketlock. 2129 * However, insertion can result in removal of an old entry. In this 2130 * case we have an additional vnode and bucketlock pair to lock. 2131 * 2132 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 2133 * preserving the locking order (smaller address first). 2134 */ 2135 static void 2136 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2137 uint32_t hash) 2138 { 2139 struct namecache *ncp; 2140 struct mtx *blps[2]; 2141 u_char nc_flag; 2142 2143 blps[0] = HASH2BUCKETLOCK(hash); 2144 for (;;) { 2145 blps[1] = NULL; 2146 cache_lock_vnodes_cel(cel, dvp, vp); 2147 if (vp == NULL || vp->v_type != VDIR) 2148 break; 2149 ncp = atomic_load_consume_ptr(&vp->v_cache_dd); 2150 if (ncp == NULL) 2151 break; 2152 nc_flag = atomic_load_char(&ncp->nc_flag); 2153 if ((nc_flag & NCF_ISDOTDOT) == 0) 2154 break; 2155 MPASS(ncp->nc_dvp == vp); 2156 blps[1] = NCP2BUCKETLOCK(ncp); 2157 if ((nc_flag & NCF_NEGATIVE) != 0) 2158 break; 2159 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2160 break; 2161 /* 2162 * All vnodes got re-locked. Re-validate the state and if 2163 * nothing changed we are done. Otherwise restart. 2164 */ 2165 if (ncp == vp->v_cache_dd && 2166 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2167 blps[1] == NCP2BUCKETLOCK(ncp) && 2168 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2169 break; 2170 cache_unlock_vnodes_cel(cel); 2171 cel->vlp[0] = NULL; 2172 cel->vlp[1] = NULL; 2173 cel->vlp[2] = NULL; 2174 } 2175 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2176 } 2177 2178 static void 2179 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 2180 uint32_t hash) 2181 { 2182 struct namecache *ncp; 2183 struct mtx *blps[2]; 2184 u_char nc_flag; 2185 2186 blps[0] = HASH2BUCKETLOCK(hash); 2187 for (;;) { 2188 blps[1] = NULL; 2189 cache_lock_vnodes_cel(cel, dvp, vp); 2190 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); 2191 if (ncp == NULL) 2192 break; 2193 nc_flag = atomic_load_char(&ncp->nc_flag); 2194 if ((nc_flag & NCF_ISDOTDOT) == 0) 2195 break; 2196 MPASS(ncp->nc_dvp == dvp); 2197 blps[1] = NCP2BUCKETLOCK(ncp); 2198 if ((nc_flag & NCF_NEGATIVE) != 0) 2199 break; 2200 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 2201 break; 2202 if (ncp == dvp->v_cache_dd && 2203 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 2204 blps[1] == NCP2BUCKETLOCK(ncp) && 2205 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 2206 break; 2207 cache_unlock_vnodes_cel(cel); 2208 cel->vlp[0] = NULL; 2209 cel->vlp[1] = NULL; 2210 cel->vlp[2] = NULL; 2211 } 2212 cache_lock_buckets_cel(cel, blps[0], blps[1]); 2213 } 2214 2215 static void 2216 cache_enter_unlock(struct celockstate *cel) 2217 { 2218 2219 cache_unlock_buckets_cel(cel); 2220 cache_unlock_vnodes_cel(cel); 2221 } 2222 2223 static void __noinline 2224 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 2225 struct componentname *cnp) 2226 { 2227 struct celockstate cel; 2228 struct namecache *ncp; 2229 uint32_t hash; 2230 int len; 2231 2232 if (atomic_load_ptr(&dvp->v_cache_dd) == NULL) 2233 return; 2234 len = cnp->cn_namelen; 2235 cache_celockstate_init(&cel); 2236 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2237 cache_enter_lock_dd(&cel, dvp, vp, hash); 2238 ncp = dvp->v_cache_dd; 2239 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 2240 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 2241 cache_zap_locked(ncp); 2242 } else { 2243 ncp = NULL; 2244 } 2245 atomic_store_ptr(&dvp->v_cache_dd, NULL); 2246 cache_enter_unlock(&cel); 2247 if (ncp != NULL) 2248 cache_free(ncp); 2249 } 2250 2251 /* 2252 * Add an entry to the cache. 2253 */ 2254 void 2255 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 2256 struct timespec *tsp, struct timespec *dtsp) 2257 { 2258 struct celockstate cel; 2259 struct namecache *ncp, *n2, *ndd; 2260 struct namecache_ts *ncp_ts; 2261 struct nchashhead *ncpp; 2262 uint32_t hash; 2263 int flag; 2264 int len; 2265 2266 KASSERT(cnp->cn_namelen <= NAME_MAX, 2267 ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen, 2268 NAME_MAX)); 2269 #ifdef notyet 2270 /* 2271 * Not everything doing this is weeded out yet. 2272 */ 2273 VNPASS(dvp != vp, dvp); 2274 #endif 2275 VNPASS(!VN_IS_DOOMED(dvp), dvp); 2276 VNPASS(dvp->v_type != VNON, dvp); 2277 if (vp != NULL) { 2278 VNPASS(!VN_IS_DOOMED(vp), vp); 2279 VNPASS(vp->v_type != VNON, vp); 2280 } 2281 2282 #ifdef DEBUG_CACHE 2283 if (__predict_false(!doingcache)) 2284 return; 2285 #endif 2286 2287 flag = 0; 2288 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 2289 if (cnp->cn_namelen == 1) 2290 return; 2291 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 2292 cache_enter_dotdot_prep(dvp, vp, cnp); 2293 flag = NCF_ISDOTDOT; 2294 } 2295 } 2296 2297 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 2298 if (ncp == NULL) 2299 return; 2300 2301 cache_celockstate_init(&cel); 2302 ndd = NULL; 2303 ncp_ts = NULL; 2304 2305 /* 2306 * Calculate the hash key and setup as much of the new 2307 * namecache entry as possible before acquiring the lock. 2308 */ 2309 ncp->nc_flag = flag | NCF_WIP; 2310 ncp->nc_vp = vp; 2311 if (vp == NULL) 2312 cache_neg_init(ncp); 2313 ncp->nc_dvp = dvp; 2314 if (tsp != NULL) { 2315 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 2316 ncp_ts->nc_time = *tsp; 2317 ncp_ts->nc_ticks = ticks; 2318 ncp_ts->nc_nc.nc_flag |= NCF_TS; 2319 if (dtsp != NULL) { 2320 ncp_ts->nc_dotdottime = *dtsp; 2321 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 2322 } 2323 } 2324 len = ncp->nc_nlen = cnp->cn_namelen; 2325 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 2326 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 2327 ncp->nc_name[len] = '\0'; 2328 cache_enter_lock(&cel, dvp, vp, hash); 2329 2330 /* 2331 * See if this vnode or negative entry is already in the cache 2332 * with this name. This can happen with concurrent lookups of 2333 * the same path name. 2334 */ 2335 ncpp = NCHHASH(hash); 2336 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 2337 if (n2->nc_dvp == dvp && 2338 n2->nc_nlen == cnp->cn_namelen && 2339 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 2340 MPASS(cache_ncp_canuse(n2)); 2341 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 2342 KASSERT(vp == NULL, 2343 ("%s: found entry pointing to a different vnode (%p != %p)", 2344 __func__, NULL, vp)); 2345 else 2346 KASSERT(n2->nc_vp == vp, 2347 ("%s: found entry pointing to a different vnode (%p != %p)", 2348 __func__, n2->nc_vp, vp)); 2349 /* 2350 * Entries are supposed to be immutable unless in the 2351 * process of getting destroyed. Accommodating for 2352 * changing timestamps is possible but not worth it. 2353 * This should be harmless in terms of correctness, in 2354 * the worst case resulting in an earlier expiration. 2355 * Alternatively, the found entry can be replaced 2356 * altogether. 2357 */ 2358 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 2359 #if 0 2360 if (tsp != NULL) { 2361 KASSERT((n2->nc_flag & NCF_TS) != 0, 2362 ("no NCF_TS")); 2363 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 2364 n2_ts->nc_time = ncp_ts->nc_time; 2365 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2366 if (dtsp != NULL) { 2367 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2368 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2369 } 2370 } 2371 #endif 2372 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, 2373 vp); 2374 goto out_unlock_free; 2375 } 2376 } 2377 2378 if (flag == NCF_ISDOTDOT) { 2379 /* 2380 * See if we are trying to add .. entry, but some other lookup 2381 * has populated v_cache_dd pointer already. 2382 */ 2383 if (dvp->v_cache_dd != NULL) 2384 goto out_unlock_free; 2385 KASSERT(vp == NULL || vp->v_type == VDIR, 2386 ("wrong vnode type %p", vp)); 2387 atomic_thread_fence_rel(); 2388 atomic_store_ptr(&dvp->v_cache_dd, ncp); 2389 } 2390 2391 if (vp != NULL) { 2392 if (flag != NCF_ISDOTDOT) { 2393 /* 2394 * For this case, the cache entry maps both the 2395 * directory name in it and the name ".." for the 2396 * directory's parent. 2397 */ 2398 if ((ndd = vp->v_cache_dd) != NULL) { 2399 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2400 cache_zap_locked(ndd); 2401 else 2402 ndd = NULL; 2403 } 2404 atomic_thread_fence_rel(); 2405 atomic_store_ptr(&vp->v_cache_dd, ncp); 2406 } else if (vp->v_type != VDIR) { 2407 if (vp->v_cache_dd != NULL) { 2408 atomic_store_ptr(&vp->v_cache_dd, NULL); 2409 } 2410 } 2411 } 2412 2413 if (flag != NCF_ISDOTDOT) { 2414 if (LIST_EMPTY(&dvp->v_cache_src)) { 2415 cache_hold_vnode(dvp); 2416 } 2417 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2418 } 2419 2420 /* 2421 * If the entry is "negative", we place it into the 2422 * "negative" cache queue, otherwise, we place it into the 2423 * destination vnode's cache entries queue. 2424 */ 2425 if (vp != NULL) { 2426 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2427 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2428 vp); 2429 } else { 2430 if (cnp->cn_flags & ISWHITEOUT) 2431 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE); 2432 cache_neg_insert(ncp); 2433 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2434 ncp->nc_name); 2435 } 2436 2437 /* 2438 * Insert the new namecache entry into the appropriate chain 2439 * within the cache entries table. 2440 */ 2441 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2442 2443 atomic_thread_fence_rel(); 2444 /* 2445 * Mark the entry as fully constructed. 2446 * It is immutable past this point until its removal. 2447 */ 2448 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2449 2450 cache_enter_unlock(&cel); 2451 if (ndd != NULL) 2452 cache_free(ndd); 2453 return; 2454 out_unlock_free: 2455 cache_enter_unlock(&cel); 2456 cache_free(ncp); 2457 return; 2458 } 2459 2460 static u_int 2461 cache_roundup_2(u_int val) 2462 { 2463 u_int res; 2464 2465 for (res = 1; res <= val; res <<= 1) 2466 continue; 2467 2468 return (res); 2469 } 2470 2471 static struct nchashhead * 2472 nchinittbl(u_long elements, u_long *hashmask) 2473 { 2474 struct nchashhead *hashtbl; 2475 u_long hashsize, i; 2476 2477 hashsize = cache_roundup_2(elements) / 2; 2478 2479 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2480 for (i = 0; i < hashsize; i++) 2481 CK_SLIST_INIT(&hashtbl[i]); 2482 *hashmask = hashsize - 1; 2483 return (hashtbl); 2484 } 2485 2486 static void 2487 ncfreetbl(struct nchashhead *hashtbl) 2488 { 2489 2490 free(hashtbl, M_VFSCACHE); 2491 } 2492 2493 /* 2494 * Name cache initialization, from vfs_init() when we are booting 2495 */ 2496 static void 2497 nchinit(void *dummy __unused) 2498 { 2499 u_int i; 2500 2501 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2502 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2503 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2504 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2505 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2506 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2507 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2508 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2509 2510 VFS_SMR_ZONE_SET(cache_zone_small); 2511 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2512 VFS_SMR_ZONE_SET(cache_zone_large); 2513 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2514 2515 ncsize = desiredvnodes * ncsizefactor; 2516 cache_recalc_neg_min(ncnegminpct); 2517 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2518 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2519 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2520 ncbuckethash = 7; 2521 if (ncbuckethash > nchash) 2522 ncbuckethash = nchash; 2523 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2524 M_WAITOK | M_ZERO); 2525 for (i = 0; i < numbucketlocks; i++) 2526 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2527 ncvnodehash = ncbuckethash; 2528 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2529 M_WAITOK | M_ZERO); 2530 for (i = 0; i < numvnodelocks; i++) 2531 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2532 2533 for (i = 0; i < numneglists; i++) { 2534 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); 2535 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2536 TAILQ_INIT(&neglists[i].nl_list); 2537 TAILQ_INIT(&neglists[i].nl_hotlist); 2538 } 2539 } 2540 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2541 2542 void 2543 cache_vnode_init(struct vnode *vp) 2544 { 2545 2546 LIST_INIT(&vp->v_cache_src); 2547 TAILQ_INIT(&vp->v_cache_dst); 2548 vp->v_cache_dd = NULL; 2549 cache_prehash(vp); 2550 } 2551 2552 void 2553 cache_changesize(u_long newmaxvnodes) 2554 { 2555 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2556 u_long new_nchash, old_nchash; 2557 struct namecache *ncp; 2558 uint32_t hash; 2559 u_long newncsize; 2560 int i; 2561 2562 newncsize = newmaxvnodes * ncsizefactor; 2563 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2564 if (newmaxvnodes < numbucketlocks) 2565 newmaxvnodes = numbucketlocks; 2566 2567 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2568 /* If same hash table size, nothing to do */ 2569 if (nchash == new_nchash) { 2570 ncfreetbl(new_nchashtbl); 2571 return; 2572 } 2573 /* 2574 * Move everything from the old hash table to the new table. 2575 * None of the namecache entries in the table can be removed 2576 * because to do so, they have to be removed from the hash table. 2577 */ 2578 cache_lock_all_vnodes(); 2579 cache_lock_all_buckets(); 2580 old_nchashtbl = nchashtbl; 2581 old_nchash = nchash; 2582 nchashtbl = new_nchashtbl; 2583 nchash = new_nchash; 2584 for (i = 0; i <= old_nchash; i++) { 2585 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2586 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2587 ncp->nc_dvp); 2588 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2589 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2590 } 2591 } 2592 ncsize = newncsize; 2593 cache_recalc_neg_min(ncnegminpct); 2594 cache_unlock_all_buckets(); 2595 cache_unlock_all_vnodes(); 2596 ncfreetbl(old_nchashtbl); 2597 } 2598 2599 /* 2600 * Remove all entries from and to a particular vnode. 2601 */ 2602 static void 2603 cache_purge_impl(struct vnode *vp) 2604 { 2605 struct cache_freebatch batch; 2606 struct namecache *ncp; 2607 struct mtx *vlp, *vlp2; 2608 2609 TAILQ_INIT(&batch); 2610 vlp = VP2VNODELOCK(vp); 2611 vlp2 = NULL; 2612 mtx_lock(vlp); 2613 retry: 2614 while (!LIST_EMPTY(&vp->v_cache_src)) { 2615 ncp = LIST_FIRST(&vp->v_cache_src); 2616 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2617 goto retry; 2618 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2619 } 2620 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2621 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2622 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2623 goto retry; 2624 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2625 } 2626 ncp = vp->v_cache_dd; 2627 if (ncp != NULL) { 2628 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2629 ("lost dotdot link")); 2630 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2631 goto retry; 2632 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2633 } 2634 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2635 mtx_unlock(vlp); 2636 if (vlp2 != NULL) 2637 mtx_unlock(vlp2); 2638 cache_free_batch(&batch); 2639 } 2640 2641 /* 2642 * Opportunistic check to see if there is anything to do. 2643 */ 2644 static bool 2645 cache_has_entries(struct vnode *vp) 2646 { 2647 2648 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2649 atomic_load_ptr(&vp->v_cache_dd) == NULL) 2650 return (false); 2651 return (true); 2652 } 2653 2654 void 2655 cache_purge(struct vnode *vp) 2656 { 2657 2658 SDT_PROBE1(vfs, namecache, purge, done, vp); 2659 if (!cache_has_entries(vp)) 2660 return; 2661 cache_purge_impl(vp); 2662 } 2663 2664 /* 2665 * Only to be used by vgone. 2666 */ 2667 void 2668 cache_purge_vgone(struct vnode *vp) 2669 { 2670 struct mtx *vlp; 2671 2672 VNPASS(VN_IS_DOOMED(vp), vp); 2673 if (cache_has_entries(vp)) { 2674 cache_purge_impl(vp); 2675 return; 2676 } 2677 2678 /* 2679 * Serialize against a potential thread doing cache_purge. 2680 */ 2681 vlp = VP2VNODELOCK(vp); 2682 mtx_wait_unlocked(vlp); 2683 if (cache_has_entries(vp)) { 2684 cache_purge_impl(vp); 2685 return; 2686 } 2687 return; 2688 } 2689 2690 /* 2691 * Remove all negative entries for a particular directory vnode. 2692 */ 2693 void 2694 cache_purge_negative(struct vnode *vp) 2695 { 2696 struct cache_freebatch batch; 2697 struct namecache *ncp, *nnp; 2698 struct mtx *vlp; 2699 2700 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2701 if (LIST_EMPTY(&vp->v_cache_src)) 2702 return; 2703 TAILQ_INIT(&batch); 2704 vlp = VP2VNODELOCK(vp); 2705 mtx_lock(vlp); 2706 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2707 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2708 continue; 2709 cache_zap_negative_locked_vnode_kl(ncp, vp); 2710 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); 2711 } 2712 mtx_unlock(vlp); 2713 cache_free_batch(&batch); 2714 } 2715 2716 /* 2717 * Entry points for modifying VOP operations. 2718 */ 2719 void 2720 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2721 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2722 { 2723 2724 ASSERT_VOP_IN_SEQC(fdvp); 2725 ASSERT_VOP_IN_SEQC(fvp); 2726 ASSERT_VOP_IN_SEQC(tdvp); 2727 if (tvp != NULL) 2728 ASSERT_VOP_IN_SEQC(tvp); 2729 2730 cache_purge(fvp); 2731 if (tvp != NULL) { 2732 cache_purge(tvp); 2733 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2734 ("%s: lingering negative entry", __func__)); 2735 } else { 2736 cache_remove_cnp(tdvp, tcnp); 2737 } 2738 2739 /* 2740 * TODO 2741 * 2742 * Historically renaming was always purging all revelang entries, 2743 * but that's quite wasteful. In particular turns out that in many cases 2744 * the target file is immediately accessed after rename, inducing a cache 2745 * miss. 2746 * 2747 * Recode this to reduce relocking and reuse the existing entry (if any) 2748 * instead of just removing it above and allocating a new one here. 2749 */ 2750 if (cache_rename_add) { 2751 cache_enter(tdvp, fvp, tcnp); 2752 } 2753 } 2754 2755 void 2756 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) 2757 { 2758 2759 ASSERT_VOP_IN_SEQC(dvp); 2760 ASSERT_VOP_IN_SEQC(vp); 2761 cache_purge(vp); 2762 } 2763 2764 #ifdef INVARIANTS 2765 /* 2766 * Validate that if an entry exists it matches. 2767 */ 2768 void 2769 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) 2770 { 2771 struct namecache *ncp; 2772 struct mtx *blp; 2773 uint32_t hash; 2774 2775 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 2776 if (CK_SLIST_EMPTY(NCHHASH(hash))) 2777 return; 2778 blp = HASH2BUCKETLOCK(hash); 2779 mtx_lock(blp); 2780 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 2781 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 2782 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) { 2783 if (ncp->nc_vp != vp) 2784 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n", 2785 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp); 2786 } 2787 } 2788 mtx_unlock(blp); 2789 } 2790 #endif 2791 2792 /* 2793 * Flush all entries referencing a particular filesystem. 2794 */ 2795 void 2796 cache_purgevfs(struct mount *mp) 2797 { 2798 struct vnode *vp, *mvp; 2799 2800 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2801 /* 2802 * Somewhat wasteful iteration over all vnodes. Would be better to 2803 * support filtering and avoid the interlock to begin with. 2804 */ 2805 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2806 if (!cache_has_entries(vp)) { 2807 VI_UNLOCK(vp); 2808 continue; 2809 } 2810 vholdl(vp); 2811 VI_UNLOCK(vp); 2812 cache_purge(vp); 2813 vdrop(vp); 2814 } 2815 } 2816 2817 /* 2818 * Perform canonical checks and cache lookup and pass on to filesystem 2819 * through the vop_cachedlookup only if needed. 2820 */ 2821 2822 int 2823 vfs_cache_lookup(struct vop_lookup_args *ap) 2824 { 2825 struct vnode *dvp; 2826 int error; 2827 struct vnode **vpp = ap->a_vpp; 2828 struct componentname *cnp = ap->a_cnp; 2829 int flags = cnp->cn_flags; 2830 2831 *vpp = NULL; 2832 dvp = ap->a_dvp; 2833 2834 if (dvp->v_type != VDIR) 2835 return (ENOTDIR); 2836 2837 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2838 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2839 return (EROFS); 2840 2841 error = vn_dir_check_exec(dvp, cnp); 2842 if (error != 0) 2843 return (error); 2844 2845 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2846 if (error == 0) 2847 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2848 if (error == -1) 2849 return (0); 2850 return (error); 2851 } 2852 2853 /* Implementation of the getcwd syscall. */ 2854 int 2855 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2856 { 2857 char *buf, *retbuf; 2858 size_t buflen; 2859 int error; 2860 2861 buflen = uap->buflen; 2862 if (__predict_false(buflen < 2)) 2863 return (EINVAL); 2864 if (buflen > MAXPATHLEN) 2865 buflen = MAXPATHLEN; 2866 2867 buf = uma_zalloc(namei_zone, M_WAITOK); 2868 error = vn_getcwd(buf, &retbuf, &buflen); 2869 if (error == 0) 2870 error = copyout(retbuf, uap->buf, buflen); 2871 uma_zfree(namei_zone, buf); 2872 return (error); 2873 } 2874 2875 int 2876 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2877 { 2878 struct pwd *pwd; 2879 int error; 2880 2881 vfs_smr_enter(); 2882 pwd = pwd_get_smr(); 2883 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2884 buflen, 0); 2885 VFS_SMR_ASSERT_NOT_ENTERED(); 2886 if (error < 0) { 2887 pwd = pwd_hold(curthread); 2888 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2889 retbuf, buflen); 2890 pwd_drop(pwd); 2891 } 2892 2893 #ifdef KTRACE 2894 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2895 ktrnamei(*retbuf); 2896 #endif 2897 return (error); 2898 } 2899 2900 static int 2901 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2902 size_t size, int flags, enum uio_seg pathseg) 2903 { 2904 struct nameidata nd; 2905 char *retbuf, *freebuf; 2906 int error; 2907 2908 if (flags != 0) 2909 return (EINVAL); 2910 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2911 pathseg, path, fd, &cap_fstat_rights, td); 2912 if ((error = namei(&nd)) != 0) 2913 return (error); 2914 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2915 if (error == 0) { 2916 error = copyout(retbuf, buf, size); 2917 free(freebuf, M_TEMP); 2918 } 2919 NDFREE(&nd, 0); 2920 return (error); 2921 } 2922 2923 int 2924 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2925 { 2926 2927 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2928 uap->flags, UIO_USERSPACE)); 2929 } 2930 2931 /* 2932 * Retrieve the full filesystem path that correspond to a vnode from the name 2933 * cache (if available) 2934 */ 2935 int 2936 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2937 { 2938 struct pwd *pwd; 2939 char *buf; 2940 size_t buflen; 2941 int error; 2942 2943 if (__predict_false(vp == NULL)) 2944 return (EINVAL); 2945 2946 buflen = MAXPATHLEN; 2947 buf = malloc(buflen, M_TEMP, M_WAITOK); 2948 vfs_smr_enter(); 2949 pwd = pwd_get_smr(); 2950 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); 2951 VFS_SMR_ASSERT_NOT_ENTERED(); 2952 if (error < 0) { 2953 pwd = pwd_hold(curthread); 2954 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); 2955 pwd_drop(pwd); 2956 } 2957 if (error == 0) 2958 *freebuf = buf; 2959 else 2960 free(buf, M_TEMP); 2961 return (error); 2962 } 2963 2964 /* 2965 * This function is similar to vn_fullpath, but it attempts to lookup the 2966 * pathname relative to the global root mount point. This is required for the 2967 * auditing sub-system, as audited pathnames must be absolute, relative to the 2968 * global root mount point. 2969 */ 2970 int 2971 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2972 { 2973 char *buf; 2974 size_t buflen; 2975 int error; 2976 2977 if (__predict_false(vp == NULL)) 2978 return (EINVAL); 2979 buflen = MAXPATHLEN; 2980 buf = malloc(buflen, M_TEMP, M_WAITOK); 2981 vfs_smr_enter(); 2982 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); 2983 VFS_SMR_ASSERT_NOT_ENTERED(); 2984 if (error < 0) { 2985 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2986 } 2987 if (error == 0) 2988 *freebuf = buf; 2989 else 2990 free(buf, M_TEMP); 2991 return (error); 2992 } 2993 2994 static struct namecache * 2995 vn_dd_from_dst(struct vnode *vp) 2996 { 2997 struct namecache *ncp; 2998 2999 cache_assert_vnode_locked(vp); 3000 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 3001 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3002 return (ncp); 3003 } 3004 return (NULL); 3005 } 3006 3007 int 3008 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) 3009 { 3010 struct vnode *dvp; 3011 struct namecache *ncp; 3012 struct mtx *vlp; 3013 int error; 3014 3015 vlp = VP2VNODELOCK(*vp); 3016 mtx_lock(vlp); 3017 ncp = (*vp)->v_cache_dd; 3018 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 3019 KASSERT(ncp == vn_dd_from_dst(*vp), 3020 ("%s: mismatch for dd entry (%p != %p)", __func__, 3021 ncp, vn_dd_from_dst(*vp))); 3022 } else { 3023 ncp = vn_dd_from_dst(*vp); 3024 } 3025 if (ncp != NULL) { 3026 if (*buflen < ncp->nc_nlen) { 3027 mtx_unlock(vlp); 3028 vrele(*vp); 3029 counter_u64_add(numfullpathfail4, 1); 3030 error = ENOMEM; 3031 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3032 vp, NULL); 3033 return (error); 3034 } 3035 *buflen -= ncp->nc_nlen; 3036 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3037 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 3038 ncp->nc_name, vp); 3039 dvp = *vp; 3040 *vp = ncp->nc_dvp; 3041 vref(*vp); 3042 mtx_unlock(vlp); 3043 vrele(dvp); 3044 return (0); 3045 } 3046 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 3047 3048 mtx_unlock(vlp); 3049 vn_lock(*vp, LK_SHARED | LK_RETRY); 3050 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); 3051 vput(*vp); 3052 if (error) { 3053 counter_u64_add(numfullpathfail2, 1); 3054 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3055 return (error); 3056 } 3057 3058 *vp = dvp; 3059 if (VN_IS_DOOMED(dvp)) { 3060 /* forced unmount */ 3061 vrele(dvp); 3062 error = ENOENT; 3063 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 3064 return (error); 3065 } 3066 /* 3067 * *vp has its use count incremented still. 3068 */ 3069 3070 return (0); 3071 } 3072 3073 /* 3074 * Resolve a directory to a pathname. 3075 * 3076 * The name of the directory can always be found in the namecache or fetched 3077 * from the filesystem. There is also guaranteed to be only one parent, meaning 3078 * we can just follow vnodes up until we find the root. 3079 * 3080 * The vnode must be referenced. 3081 */ 3082 static int 3083 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3084 size_t *len, size_t addend) 3085 { 3086 #ifdef KDTRACE_HOOKS 3087 struct vnode *startvp = vp; 3088 #endif 3089 struct vnode *vp1; 3090 size_t buflen; 3091 int error; 3092 bool slash_prefixed; 3093 3094 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 3095 VNPASS(vp->v_usecount > 0, vp); 3096 3097 buflen = *len; 3098 3099 slash_prefixed = true; 3100 if (addend == 0) { 3101 MPASS(*len >= 2); 3102 buflen--; 3103 buf[buflen] = '\0'; 3104 slash_prefixed = false; 3105 } 3106 3107 error = 0; 3108 3109 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 3110 counter_u64_add(numfullpathcalls, 1); 3111 while (vp != rdir && vp != rootvnode) { 3112 /* 3113 * The vp vnode must be already fully constructed, 3114 * since it is either found in namecache or obtained 3115 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 3116 * without obtaining the vnode lock. 3117 */ 3118 if ((vp->v_vflag & VV_ROOT) != 0) { 3119 vn_lock(vp, LK_RETRY | LK_SHARED); 3120 3121 /* 3122 * With the vnode locked, check for races with 3123 * unmount, forced or not. Note that we 3124 * already verified that vp is not equal to 3125 * the root vnode, which means that 3126 * mnt_vnodecovered can be NULL only for the 3127 * case of unmount. 3128 */ 3129 if (VN_IS_DOOMED(vp) || 3130 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 3131 vp1->v_mountedhere != vp->v_mount) { 3132 vput(vp); 3133 error = ENOENT; 3134 SDT_PROBE3(vfs, namecache, fullpath, return, 3135 error, vp, NULL); 3136 break; 3137 } 3138 3139 vref(vp1); 3140 vput(vp); 3141 vp = vp1; 3142 continue; 3143 } 3144 if (vp->v_type != VDIR) { 3145 vrele(vp); 3146 counter_u64_add(numfullpathfail1, 1); 3147 error = ENOTDIR; 3148 SDT_PROBE3(vfs, namecache, fullpath, return, 3149 error, vp, NULL); 3150 break; 3151 } 3152 error = vn_vptocnp(&vp, buf, &buflen); 3153 if (error) 3154 break; 3155 if (buflen == 0) { 3156 vrele(vp); 3157 error = ENOMEM; 3158 SDT_PROBE3(vfs, namecache, fullpath, return, error, 3159 startvp, NULL); 3160 break; 3161 } 3162 buf[--buflen] = '/'; 3163 slash_prefixed = true; 3164 } 3165 if (error) 3166 return (error); 3167 if (!slash_prefixed) { 3168 if (buflen == 0) { 3169 vrele(vp); 3170 counter_u64_add(numfullpathfail4, 1); 3171 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 3172 startvp, NULL); 3173 return (ENOMEM); 3174 } 3175 buf[--buflen] = '/'; 3176 } 3177 counter_u64_add(numfullpathfound, 1); 3178 vrele(vp); 3179 3180 *retbuf = buf + buflen; 3181 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 3182 *len -= buflen; 3183 *len += addend; 3184 return (0); 3185 } 3186 3187 /* 3188 * Resolve an arbitrary vnode to a pathname. 3189 * 3190 * Note 2 caveats: 3191 * - hardlinks are not tracked, thus if the vnode is not a directory this can 3192 * resolve to a different path than the one used to find it 3193 * - namecache is not mandatory, meaning names are not guaranteed to be added 3194 * (in which case resolving fails) 3195 */ 3196 static void __inline 3197 cache_rev_failed_impl(int *reason, int line) 3198 { 3199 3200 *reason = line; 3201 } 3202 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 3203 3204 static int 3205 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 3206 char **retbuf, size_t *buflen, size_t addend) 3207 { 3208 #ifdef KDTRACE_HOOKS 3209 struct vnode *startvp = vp; 3210 #endif 3211 struct vnode *tvp; 3212 struct mount *mp; 3213 struct namecache *ncp; 3214 size_t orig_buflen; 3215 int reason; 3216 int error; 3217 #ifdef KDTRACE_HOOKS 3218 int i; 3219 #endif 3220 seqc_t vp_seqc, tvp_seqc; 3221 u_char nc_flag; 3222 3223 VFS_SMR_ASSERT_ENTERED(); 3224 3225 if (!cache_fast_revlookup) { 3226 vfs_smr_exit(); 3227 return (-1); 3228 } 3229 3230 orig_buflen = *buflen; 3231 3232 if (addend == 0) { 3233 MPASS(*buflen >= 2); 3234 *buflen -= 1; 3235 buf[*buflen] = '\0'; 3236 } 3237 3238 if (vp == rdir || vp == rootvnode) { 3239 if (addend == 0) { 3240 *buflen -= 1; 3241 buf[*buflen] = '/'; 3242 } 3243 goto out_ok; 3244 } 3245 3246 #ifdef KDTRACE_HOOKS 3247 i = 0; 3248 #endif 3249 error = -1; 3250 ncp = NULL; /* for sdt probe down below */ 3251 vp_seqc = vn_seqc_read_any(vp); 3252 if (seqc_in_modify(vp_seqc)) { 3253 cache_rev_failed(&reason); 3254 goto out_abort; 3255 } 3256 3257 for (;;) { 3258 #ifdef KDTRACE_HOOKS 3259 i++; 3260 #endif 3261 if ((vp->v_vflag & VV_ROOT) != 0) { 3262 mp = atomic_load_ptr(&vp->v_mount); 3263 if (mp == NULL) { 3264 cache_rev_failed(&reason); 3265 goto out_abort; 3266 } 3267 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 3268 tvp_seqc = vn_seqc_read_any(tvp); 3269 if (seqc_in_modify(tvp_seqc)) { 3270 cache_rev_failed(&reason); 3271 goto out_abort; 3272 } 3273 if (!vn_seqc_consistent(vp, vp_seqc)) { 3274 cache_rev_failed(&reason); 3275 goto out_abort; 3276 } 3277 vp = tvp; 3278 vp_seqc = tvp_seqc; 3279 continue; 3280 } 3281 ncp = atomic_load_consume_ptr(&vp->v_cache_dd); 3282 if (ncp == NULL) { 3283 cache_rev_failed(&reason); 3284 goto out_abort; 3285 } 3286 nc_flag = atomic_load_char(&ncp->nc_flag); 3287 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3288 cache_rev_failed(&reason); 3289 goto out_abort; 3290 } 3291 if (ncp->nc_nlen >= *buflen) { 3292 cache_rev_failed(&reason); 3293 error = ENOMEM; 3294 goto out_abort; 3295 } 3296 *buflen -= ncp->nc_nlen; 3297 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 3298 *buflen -= 1; 3299 buf[*buflen] = '/'; 3300 tvp = ncp->nc_dvp; 3301 tvp_seqc = vn_seqc_read_any(tvp); 3302 if (seqc_in_modify(tvp_seqc)) { 3303 cache_rev_failed(&reason); 3304 goto out_abort; 3305 } 3306 if (!vn_seqc_consistent(vp, vp_seqc)) { 3307 cache_rev_failed(&reason); 3308 goto out_abort; 3309 } 3310 /* 3311 * Acquire fence provided by vn_seqc_read_any above. 3312 */ 3313 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) { 3314 cache_rev_failed(&reason); 3315 goto out_abort; 3316 } 3317 if (!cache_ncp_canuse(ncp)) { 3318 cache_rev_failed(&reason); 3319 goto out_abort; 3320 } 3321 vp = tvp; 3322 vp_seqc = tvp_seqc; 3323 if (vp == rdir || vp == rootvnode) 3324 break; 3325 } 3326 out_ok: 3327 vfs_smr_exit(); 3328 *retbuf = buf + *buflen; 3329 *buflen = orig_buflen - *buflen + addend; 3330 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 3331 return (0); 3332 3333 out_abort: 3334 *buflen = orig_buflen; 3335 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 3336 vfs_smr_exit(); 3337 return (error); 3338 } 3339 3340 static int 3341 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 3342 size_t *buflen) 3343 { 3344 size_t orig_buflen, addend; 3345 int error; 3346 3347 if (*buflen < 2) 3348 return (EINVAL); 3349 3350 orig_buflen = *buflen; 3351 3352 vref(vp); 3353 addend = 0; 3354 if (vp->v_type != VDIR) { 3355 *buflen -= 1; 3356 buf[*buflen] = '\0'; 3357 error = vn_vptocnp(&vp, buf, buflen); 3358 if (error) 3359 return (error); 3360 if (*buflen == 0) { 3361 vrele(vp); 3362 return (ENOMEM); 3363 } 3364 *buflen -= 1; 3365 buf[*buflen] = '/'; 3366 addend = orig_buflen - *buflen; 3367 } 3368 3369 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); 3370 } 3371 3372 /* 3373 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 3374 * 3375 * Since the namecache does not track hardlinks, the caller is expected to first 3376 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 3377 * 3378 * Then we have 2 cases: 3379 * - if the found vnode is a directory, the path can be constructed just by 3380 * following names up the chain 3381 * - otherwise we populate the buffer with the saved name and start resolving 3382 * from the parent 3383 */ 3384 static int 3385 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 3386 size_t *buflen) 3387 { 3388 char *buf, *tmpbuf; 3389 struct pwd *pwd; 3390 struct componentname *cnp; 3391 struct vnode *vp; 3392 size_t addend; 3393 int error; 3394 enum vtype type; 3395 3396 if (*buflen < 2) 3397 return (EINVAL); 3398 if (*buflen > MAXPATHLEN) 3399 *buflen = MAXPATHLEN; 3400 3401 buf = malloc(*buflen, M_TEMP, M_WAITOK); 3402 3403 addend = 0; 3404 vp = ndp->ni_vp; 3405 /* 3406 * Check for VBAD to work around the vp_crossmp bug in lookup(). 3407 * 3408 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 3409 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 3410 * If the type is VDIR (like in this very case) we can skip looking 3411 * at ni_dvp in the first place. However, since vnodes get passed here 3412 * unlocked the target may transition to doomed state (type == VBAD) 3413 * before we get to evaluate the condition. If this happens, we will 3414 * populate part of the buffer and descend to vn_fullpath_dir with 3415 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 3416 * 3417 * This should be atomic_load(&vp->v_type) but it is illegal to take 3418 * an address of a bit field, even if said field is sized to char. 3419 * Work around the problem by reading the value into a full-sized enum 3420 * and then re-reading it with atomic_load which will still prevent 3421 * the compiler from re-reading down the road. 3422 */ 3423 type = vp->v_type; 3424 type = atomic_load_int(&type); 3425 if (type == VBAD) { 3426 error = ENOENT; 3427 goto out_bad; 3428 } 3429 if (type != VDIR) { 3430 cnp = &ndp->ni_cnd; 3431 addend = cnp->cn_namelen + 2; 3432 if (*buflen < addend) { 3433 error = ENOMEM; 3434 goto out_bad; 3435 } 3436 *buflen -= addend; 3437 tmpbuf = buf + *buflen; 3438 tmpbuf[0] = '/'; 3439 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 3440 tmpbuf[addend - 1] = '\0'; 3441 vp = ndp->ni_dvp; 3442 } 3443 3444 vfs_smr_enter(); 3445 pwd = pwd_get_smr(); 3446 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3447 addend); 3448 VFS_SMR_ASSERT_NOT_ENTERED(); 3449 if (error < 0) { 3450 pwd = pwd_hold(curthread); 3451 vref(vp); 3452 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3453 addend); 3454 pwd_drop(pwd); 3455 if (error != 0) 3456 goto out_bad; 3457 } 3458 3459 *freebuf = buf; 3460 3461 return (0); 3462 out_bad: 3463 free(buf, M_TEMP); 3464 return (error); 3465 } 3466 3467 struct vnode * 3468 vn_dir_dd_ino(struct vnode *vp) 3469 { 3470 struct namecache *ncp; 3471 struct vnode *ddvp; 3472 struct mtx *vlp; 3473 enum vgetstate vs; 3474 3475 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3476 vlp = VP2VNODELOCK(vp); 3477 mtx_lock(vlp); 3478 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3479 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3480 continue; 3481 ddvp = ncp->nc_dvp; 3482 vs = vget_prep(ddvp); 3483 mtx_unlock(vlp); 3484 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3485 return (NULL); 3486 return (ddvp); 3487 } 3488 mtx_unlock(vlp); 3489 return (NULL); 3490 } 3491 3492 int 3493 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3494 { 3495 struct namecache *ncp; 3496 struct mtx *vlp; 3497 int l; 3498 3499 vlp = VP2VNODELOCK(vp); 3500 mtx_lock(vlp); 3501 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3502 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3503 break; 3504 if (ncp == NULL) { 3505 mtx_unlock(vlp); 3506 return (ENOENT); 3507 } 3508 l = min(ncp->nc_nlen, buflen - 1); 3509 memcpy(buf, ncp->nc_name, l); 3510 mtx_unlock(vlp); 3511 buf[l] = '\0'; 3512 return (0); 3513 } 3514 3515 /* 3516 * This function updates path string to vnode's full global path 3517 * and checks the size of the new path string against the pathlen argument. 3518 * 3519 * Requires a locked, referenced vnode. 3520 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3521 * 3522 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3523 * because it falls back to the ".." lookup if the namecache lookup fails. 3524 */ 3525 int 3526 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3527 u_int pathlen) 3528 { 3529 struct nameidata nd; 3530 struct vnode *vp1; 3531 char *rpath, *fbuf; 3532 int error; 3533 3534 ASSERT_VOP_ELOCKED(vp, __func__); 3535 3536 /* Construct global filesystem path from vp. */ 3537 VOP_UNLOCK(vp); 3538 error = vn_fullpath_global(vp, &rpath, &fbuf); 3539 3540 if (error != 0) { 3541 vrele(vp); 3542 return (error); 3543 } 3544 3545 if (strlen(rpath) >= pathlen) { 3546 vrele(vp); 3547 error = ENAMETOOLONG; 3548 goto out; 3549 } 3550 3551 /* 3552 * Re-lookup the vnode by path to detect a possible rename. 3553 * As a side effect, the vnode is relocked. 3554 * If vnode was renamed, return ENOENT. 3555 */ 3556 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3557 UIO_SYSSPACE, path, td); 3558 error = namei(&nd); 3559 if (error != 0) { 3560 vrele(vp); 3561 goto out; 3562 } 3563 NDFREE(&nd, NDF_ONLY_PNBUF); 3564 vp1 = nd.ni_vp; 3565 vrele(vp); 3566 if (vp1 == vp) 3567 strcpy(path, rpath); 3568 else { 3569 vput(vp1); 3570 error = ENOENT; 3571 } 3572 3573 out: 3574 free(fbuf, M_TEMP); 3575 return (error); 3576 } 3577 3578 #ifdef DDB 3579 static void 3580 db_print_vpath(struct vnode *vp) 3581 { 3582 3583 while (vp != NULL) { 3584 db_printf("%p: ", vp); 3585 if (vp == rootvnode) { 3586 db_printf("/"); 3587 vp = NULL; 3588 } else { 3589 if (vp->v_vflag & VV_ROOT) { 3590 db_printf("<mount point>"); 3591 vp = vp->v_mount->mnt_vnodecovered; 3592 } else { 3593 struct namecache *ncp; 3594 char *ncn; 3595 int i; 3596 3597 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3598 if (ncp != NULL) { 3599 ncn = ncp->nc_name; 3600 for (i = 0; i < ncp->nc_nlen; i++) 3601 db_printf("%c", *ncn++); 3602 vp = ncp->nc_dvp; 3603 } else { 3604 vp = NULL; 3605 } 3606 } 3607 } 3608 db_printf("\n"); 3609 } 3610 3611 return; 3612 } 3613 3614 DB_SHOW_COMMAND(vpath, db_show_vpath) 3615 { 3616 struct vnode *vp; 3617 3618 if (!have_addr) { 3619 db_printf("usage: show vpath <struct vnode *>\n"); 3620 return; 3621 } 3622 3623 vp = (struct vnode *)addr; 3624 db_print_vpath(vp); 3625 } 3626 3627 #endif 3628 3629 static int cache_fast_lookup = 1; 3630 static char __read_frequently cache_fast_lookup_enabled = true; 3631 3632 #define CACHE_FPL_FAILED -2020 3633 3634 void 3635 cache_fast_lookup_enabled_recalc(void) 3636 { 3637 int lookup_flag; 3638 int mac_on; 3639 3640 #ifdef MAC 3641 mac_on = mac_vnode_check_lookup_enabled(); 3642 mac_on |= mac_vnode_check_readlink_enabled(); 3643 #else 3644 mac_on = 0; 3645 #endif 3646 3647 lookup_flag = atomic_load_int(&cache_fast_lookup); 3648 if (lookup_flag && !mac_on) { 3649 atomic_store_char(&cache_fast_lookup_enabled, true); 3650 } else { 3651 atomic_store_char(&cache_fast_lookup_enabled, false); 3652 } 3653 } 3654 3655 static int 3656 syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS) 3657 { 3658 int error, old; 3659 3660 old = atomic_load_int(&cache_fast_lookup); 3661 error = sysctl_handle_int(oidp, arg1, arg2, req); 3662 if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup)) 3663 cache_fast_lookup_enabled_recalc(); 3664 return (error); 3665 } 3666 SYSCTL_PROC(_vfs, OID_AUTO, cache_fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE, 3667 &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", ""); 3668 3669 /* 3670 * Components of nameidata (or objects it can point to) which may 3671 * need restoring in case fast path lookup fails. 3672 */ 3673 struct nameidata_outer { 3674 size_t ni_pathlen; 3675 int cn_flags; 3676 }; 3677 3678 struct nameidata_saved { 3679 #ifdef INVARIANTS 3680 char *cn_nameptr; 3681 size_t ni_pathlen; 3682 #endif 3683 }; 3684 3685 #ifdef INVARIANTS 3686 struct cache_fpl_debug { 3687 size_t ni_pathlen; 3688 }; 3689 #endif 3690 3691 struct cache_fpl { 3692 struct nameidata *ndp; 3693 struct componentname *cnp; 3694 char *nulchar; 3695 struct vnode *dvp; 3696 struct vnode *tvp; 3697 seqc_t dvp_seqc; 3698 seqc_t tvp_seqc; 3699 uint32_t hash; 3700 struct nameidata_saved snd; 3701 struct nameidata_outer snd_outer; 3702 int line; 3703 enum cache_fpl_status status:8; 3704 bool in_smr; 3705 bool fsearch; 3706 bool savename; 3707 struct pwd **pwd; 3708 #ifdef INVARIANTS 3709 struct cache_fpl_debug debug; 3710 #endif 3711 }; 3712 3713 static bool cache_fplookup_is_mp(struct cache_fpl *fpl); 3714 static int cache_fplookup_cross_mount(struct cache_fpl *fpl); 3715 static int cache_fplookup_partial_setup(struct cache_fpl *fpl); 3716 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl); 3717 static int cache_fplookup_trailingslash(struct cache_fpl *fpl); 3718 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl); 3719 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl); 3720 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n); 3721 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n); 3722 3723 static void 3724 cache_fpl_cleanup_cnp(struct componentname *cnp) 3725 { 3726 3727 uma_zfree(namei_zone, cnp->cn_pnbuf); 3728 #ifdef DIAGNOSTIC 3729 cnp->cn_pnbuf = NULL; 3730 cnp->cn_nameptr = NULL; 3731 #endif 3732 } 3733 3734 static struct vnode * 3735 cache_fpl_handle_root(struct cache_fpl *fpl) 3736 { 3737 struct nameidata *ndp; 3738 struct componentname *cnp; 3739 3740 ndp = fpl->ndp; 3741 cnp = fpl->cnp; 3742 3743 MPASS(*(cnp->cn_nameptr) == '/'); 3744 cnp->cn_nameptr++; 3745 cache_fpl_pathlen_dec(fpl); 3746 3747 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 3748 do { 3749 cnp->cn_nameptr++; 3750 cache_fpl_pathlen_dec(fpl); 3751 } while (*(cnp->cn_nameptr) == '/'); 3752 } 3753 3754 return (ndp->ni_rootdir); 3755 } 3756 3757 static void 3758 cache_fpl_checkpoint_outer(struct cache_fpl *fpl) 3759 { 3760 3761 fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen; 3762 fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags; 3763 } 3764 3765 static void 3766 cache_fpl_checkpoint(struct cache_fpl *fpl) 3767 { 3768 3769 #ifdef INVARIANTS 3770 fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3771 fpl->snd.ni_pathlen = fpl->debug.ni_pathlen; 3772 #endif 3773 } 3774 3775 static void 3776 cache_fpl_restore_partial(struct cache_fpl *fpl) 3777 { 3778 3779 fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags; 3780 #ifdef INVARIANTS 3781 fpl->debug.ni_pathlen = fpl->snd.ni_pathlen; 3782 #endif 3783 } 3784 3785 static void 3786 cache_fpl_restore_abort(struct cache_fpl *fpl) 3787 { 3788 3789 cache_fpl_restore_partial(fpl); 3790 /* 3791 * It is 0 on entry by API contract. 3792 */ 3793 fpl->ndp->ni_resflags = 0; 3794 fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf; 3795 fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen; 3796 } 3797 3798 #ifdef INVARIANTS 3799 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3800 struct cache_fpl *_fpl = (fpl); \ 3801 MPASS(_fpl->in_smr == true); \ 3802 VFS_SMR_ASSERT_ENTERED(); \ 3803 }) 3804 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3805 struct cache_fpl *_fpl = (fpl); \ 3806 MPASS(_fpl->in_smr == false); \ 3807 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3808 }) 3809 static void 3810 cache_fpl_assert_status(struct cache_fpl *fpl) 3811 { 3812 3813 switch (fpl->status) { 3814 case CACHE_FPL_STATUS_UNSET: 3815 __assert_unreachable(); 3816 break; 3817 case CACHE_FPL_STATUS_DESTROYED: 3818 case CACHE_FPL_STATUS_ABORTED: 3819 case CACHE_FPL_STATUS_PARTIAL: 3820 case CACHE_FPL_STATUS_HANDLED: 3821 break; 3822 } 3823 } 3824 #else 3825 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3826 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3827 #define cache_fpl_assert_status(fpl) do { } while (0) 3828 #endif 3829 3830 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3831 struct cache_fpl *_fpl = (fpl); \ 3832 vfs_smr_enter(); \ 3833 _fpl->in_smr = true; \ 3834 }) 3835 3836 #define cache_fpl_smr_enter(fpl) ({ \ 3837 struct cache_fpl *_fpl = (fpl); \ 3838 MPASS(_fpl->in_smr == false); \ 3839 vfs_smr_enter(); \ 3840 _fpl->in_smr = true; \ 3841 }) 3842 3843 #define cache_fpl_smr_exit(fpl) ({ \ 3844 struct cache_fpl *_fpl = (fpl); \ 3845 MPASS(_fpl->in_smr == true); \ 3846 vfs_smr_exit(); \ 3847 _fpl->in_smr = false; \ 3848 }) 3849 3850 static int 3851 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line) 3852 { 3853 3854 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3855 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3856 ("%s: converting to abort from %d at %d, set at %d\n", 3857 __func__, fpl->status, line, fpl->line)); 3858 } 3859 cache_fpl_smr_assert_not_entered(fpl); 3860 fpl->status = CACHE_FPL_STATUS_ABORTED; 3861 fpl->line = line; 3862 return (CACHE_FPL_FAILED); 3863 } 3864 3865 #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__) 3866 3867 static int __noinline 3868 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3869 { 3870 struct nameidata *ndp; 3871 struct componentname *cnp; 3872 3873 ndp = fpl->ndp; 3874 cnp = fpl->cnp; 3875 3876 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3877 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3878 ("%s: converting to abort from %d at %d, set at %d\n", 3879 __func__, fpl->status, line, fpl->line)); 3880 } 3881 fpl->status = CACHE_FPL_STATUS_ABORTED; 3882 fpl->line = line; 3883 if (fpl->in_smr) 3884 cache_fpl_smr_exit(fpl); 3885 cache_fpl_restore_abort(fpl); 3886 /* 3887 * Resolving symlinks overwrites data passed by the caller. 3888 * Let namei know. 3889 */ 3890 if (ndp->ni_loopcnt > 0) { 3891 fpl->status = CACHE_FPL_STATUS_DESTROYED; 3892 cache_fpl_cleanup_cnp(cnp); 3893 } 3894 return (CACHE_FPL_FAILED); 3895 } 3896 3897 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3898 3899 static int __noinline 3900 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3901 { 3902 3903 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3904 ("%s: setting to partial at %d, but already set to %d at %d\n", 3905 __func__, line, fpl->status, fpl->line)); 3906 cache_fpl_smr_assert_entered(fpl); 3907 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3908 fpl->line = line; 3909 return (cache_fplookup_partial_setup(fpl)); 3910 } 3911 3912 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3913 3914 static int 3915 cache_fpl_handled_impl(struct cache_fpl *fpl, int line) 3916 { 3917 3918 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3919 ("%s: setting to handled at %d, but already set to %d at %d\n", 3920 __func__, line, fpl->status, fpl->line)); 3921 cache_fpl_smr_assert_not_entered(fpl); 3922 fpl->status = CACHE_FPL_STATUS_HANDLED; 3923 fpl->line = line; 3924 return (0); 3925 } 3926 3927 #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__) 3928 3929 static int 3930 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line) 3931 { 3932 3933 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3934 ("%s: setting to handled at %d, but already set to %d at %d\n", 3935 __func__, line, fpl->status, fpl->line)); 3936 MPASS(error != 0); 3937 MPASS(error != CACHE_FPL_FAILED); 3938 cache_fpl_smr_assert_not_entered(fpl); 3939 fpl->status = CACHE_FPL_STATUS_HANDLED; 3940 fpl->line = line; 3941 fpl->dvp = NULL; 3942 fpl->tvp = NULL; 3943 fpl->savename = false; 3944 return (error); 3945 } 3946 3947 #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__) 3948 3949 static bool 3950 cache_fpl_terminated(struct cache_fpl *fpl) 3951 { 3952 3953 return (fpl->status != CACHE_FPL_STATUS_UNSET); 3954 } 3955 3956 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3957 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ 3958 FAILIFEXISTS | FOLLOW | LOCKSHARED | SAVENAME | SAVESTART | WILLBEDIR | \ 3959 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3960 3961 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3962 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3963 3964 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3965 "supported and internal flags overlap"); 3966 3967 static bool 3968 cache_fpl_islastcn(struct nameidata *ndp) 3969 { 3970 3971 return (*ndp->ni_next == 0); 3972 } 3973 3974 static bool 3975 cache_fpl_istrailingslash(struct cache_fpl *fpl) 3976 { 3977 3978 return (*(fpl->nulchar - 1) == '/'); 3979 } 3980 3981 static bool 3982 cache_fpl_isdotdot(struct componentname *cnp) 3983 { 3984 3985 if (cnp->cn_namelen == 2 && 3986 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3987 return (true); 3988 return (false); 3989 } 3990 3991 static bool 3992 cache_can_fplookup(struct cache_fpl *fpl) 3993 { 3994 struct nameidata *ndp; 3995 struct componentname *cnp; 3996 struct thread *td; 3997 3998 ndp = fpl->ndp; 3999 cnp = fpl->cnp; 4000 td = cnp->cn_thread; 4001 4002 if (!atomic_load_char(&cache_fast_lookup_enabled)) { 4003 cache_fpl_aborted_early(fpl); 4004 return (false); 4005 } 4006 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 4007 cache_fpl_aborted_early(fpl); 4008 return (false); 4009 } 4010 if (IN_CAPABILITY_MODE(td)) { 4011 cache_fpl_aborted_early(fpl); 4012 return (false); 4013 } 4014 if (AUDITING_TD(td)) { 4015 cache_fpl_aborted_early(fpl); 4016 return (false); 4017 } 4018 if (ndp->ni_startdir != NULL) { 4019 cache_fpl_aborted_early(fpl); 4020 return (false); 4021 } 4022 return (true); 4023 } 4024 4025 static int 4026 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 4027 { 4028 struct nameidata *ndp; 4029 int error; 4030 bool fsearch; 4031 4032 ndp = fpl->ndp; 4033 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); 4034 if (__predict_false(error != 0)) { 4035 return (cache_fpl_aborted(fpl)); 4036 } 4037 fpl->fsearch = fsearch; 4038 return (0); 4039 } 4040 4041 static int __noinline 4042 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 4043 uint32_t hash) 4044 { 4045 struct componentname *cnp; 4046 struct vnode *dvp; 4047 4048 cnp = fpl->cnp; 4049 dvp = fpl->dvp; 4050 4051 cache_fpl_smr_exit(fpl); 4052 if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) 4053 return (cache_fpl_handled_error(fpl, ENOENT)); 4054 else 4055 return (cache_fpl_aborted(fpl)); 4056 } 4057 4058 /* 4059 * The target vnode is not supported, prepare for the slow path to take over. 4060 */ 4061 static int __noinline 4062 cache_fplookup_partial_setup(struct cache_fpl *fpl) 4063 { 4064 struct nameidata *ndp; 4065 struct componentname *cnp; 4066 enum vgetstate dvs; 4067 struct vnode *dvp; 4068 struct pwd *pwd; 4069 seqc_t dvp_seqc; 4070 4071 ndp = fpl->ndp; 4072 cnp = fpl->cnp; 4073 pwd = *(fpl->pwd); 4074 dvp = fpl->dvp; 4075 dvp_seqc = fpl->dvp_seqc; 4076 4077 if (!pwd_hold_smr(pwd)) { 4078 return (cache_fpl_aborted(fpl)); 4079 } 4080 4081 /* 4082 * Note that seqc is checked before the vnode is locked, so by 4083 * the time regular lookup gets to it it may have moved. 4084 * 4085 * Ultimately this does not affect correctness, any lookup errors 4086 * are userspace racing with itself. It is guaranteed that any 4087 * path which ultimately gets found could also have been found 4088 * by regular lookup going all the way in absence of concurrent 4089 * modifications. 4090 */ 4091 dvs = vget_prep_smr(dvp); 4092 cache_fpl_smr_exit(fpl); 4093 if (__predict_false(dvs == VGET_NONE)) { 4094 pwd_drop(pwd); 4095 return (cache_fpl_aborted(fpl)); 4096 } 4097 4098 vget_finish_ref(dvp, dvs); 4099 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4100 vrele(dvp); 4101 pwd_drop(pwd); 4102 return (cache_fpl_aborted(fpl)); 4103 } 4104 4105 cache_fpl_restore_partial(fpl); 4106 #ifdef INVARIANTS 4107 if (cnp->cn_nameptr != fpl->snd.cn_nameptr) { 4108 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__, 4109 cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf); 4110 } 4111 #endif 4112 4113 ndp->ni_startdir = dvp; 4114 cnp->cn_flags |= MAKEENTRY; 4115 if (cache_fpl_islastcn(ndp)) 4116 cnp->cn_flags |= ISLASTCN; 4117 if (cache_fpl_isdotdot(cnp)) 4118 cnp->cn_flags |= ISDOTDOT; 4119 4120 /* 4121 * Skip potential extra slashes parsing did not take care of. 4122 * cache_fplookup_skip_slashes explains the mechanism. 4123 */ 4124 if (__predict_false(*(cnp->cn_nameptr) == '/')) { 4125 do { 4126 cnp->cn_nameptr++; 4127 cache_fpl_pathlen_dec(fpl); 4128 } while (*(cnp->cn_nameptr) == '/'); 4129 } 4130 4131 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; 4132 #ifdef INVARIANTS 4133 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 4134 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 4135 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 4136 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 4137 } 4138 #endif 4139 return (0); 4140 } 4141 4142 static int 4143 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 4144 { 4145 struct componentname *cnp; 4146 struct vnode *tvp; 4147 seqc_t tvp_seqc; 4148 int error, lkflags; 4149 4150 cnp = fpl->cnp; 4151 tvp = fpl->tvp; 4152 tvp_seqc = fpl->tvp_seqc; 4153 4154 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4155 lkflags = LK_SHARED; 4156 if ((cnp->cn_flags & LOCKSHARED) == 0) 4157 lkflags = LK_EXCLUSIVE; 4158 error = vget_finish(tvp, lkflags, tvs); 4159 if (__predict_false(error != 0)) { 4160 return (cache_fpl_aborted(fpl)); 4161 } 4162 } else { 4163 vget_finish_ref(tvp, tvs); 4164 } 4165 4166 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 4167 if ((cnp->cn_flags & LOCKLEAF) != 0) 4168 vput(tvp); 4169 else 4170 vrele(tvp); 4171 return (cache_fpl_aborted(fpl)); 4172 } 4173 4174 return (cache_fpl_handled(fpl)); 4175 } 4176 4177 /* 4178 * They want to possibly modify the state of the namecache. 4179 */ 4180 static int __noinline 4181 cache_fplookup_final_modifying(struct cache_fpl *fpl) 4182 { 4183 struct nameidata *ndp; 4184 struct componentname *cnp; 4185 enum vgetstate dvs; 4186 struct vnode *dvp, *tvp; 4187 struct mount *mp; 4188 seqc_t dvp_seqc; 4189 int error; 4190 bool docache; 4191 4192 ndp = fpl->ndp; 4193 cnp = fpl->cnp; 4194 dvp = fpl->dvp; 4195 dvp_seqc = fpl->dvp_seqc; 4196 4197 MPASS(*(cnp->cn_nameptr) != '/'); 4198 MPASS(cache_fpl_islastcn(ndp)); 4199 if ((cnp->cn_flags & LOCKPARENT) == 0) 4200 MPASS((cnp->cn_flags & WANTPARENT) != 0); 4201 MPASS((cnp->cn_flags & TRAILINGSLASH) == 0); 4202 MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE || 4203 cnp->cn_nameiop == RENAME); 4204 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 4205 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 4206 4207 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; 4208 if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) 4209 docache = false; 4210 4211 /* 4212 * Regular lookup nulifies the slash, which we don't do here. 4213 * Don't take chances with filesystem routines seeing it for 4214 * the last entry. 4215 */ 4216 if (cache_fpl_istrailingslash(fpl)) { 4217 return (cache_fpl_partial(fpl)); 4218 } 4219 4220 mp = atomic_load_ptr(&dvp->v_mount); 4221 if (__predict_false(mp == NULL)) { 4222 return (cache_fpl_aborted(fpl)); 4223 } 4224 4225 if (__predict_false(mp->mnt_flag & MNT_RDONLY)) { 4226 cache_fpl_smr_exit(fpl); 4227 /* 4228 * Original code keeps not checking for CREATE which 4229 * might be a bug. For now let the old lookup decide. 4230 */ 4231 if (cnp->cn_nameiop == CREATE) { 4232 return (cache_fpl_aborted(fpl)); 4233 } 4234 return (cache_fpl_handled_error(fpl, EROFS)); 4235 } 4236 4237 if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) { 4238 cache_fpl_smr_exit(fpl); 4239 return (cache_fpl_handled_error(fpl, EEXIST)); 4240 } 4241 4242 /* 4243 * Secure access to dvp; check cache_fplookup_partial_setup for 4244 * reasoning. 4245 * 4246 * XXX At least UFS requires its lookup routine to be called for 4247 * the last path component, which leads to some level of complication 4248 * and inefficiency: 4249 * - the target routine always locks the target vnode, but our caller 4250 * may not need it locked 4251 * - some of the VOP machinery asserts that the parent is locked, which 4252 * once more may be not required 4253 * 4254 * TODO: add a flag for filesystems which don't need this. 4255 */ 4256 dvs = vget_prep_smr(dvp); 4257 cache_fpl_smr_exit(fpl); 4258 if (__predict_false(dvs == VGET_NONE)) { 4259 return (cache_fpl_aborted(fpl)); 4260 } 4261 4262 vget_finish_ref(dvp, dvs); 4263 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4264 vrele(dvp); 4265 return (cache_fpl_aborted(fpl)); 4266 } 4267 4268 error = vn_lock(dvp, LK_EXCLUSIVE); 4269 if (__predict_false(error != 0)) { 4270 vrele(dvp); 4271 return (cache_fpl_aborted(fpl)); 4272 } 4273 4274 tvp = NULL; 4275 cnp->cn_flags |= ISLASTCN; 4276 if (docache) 4277 cnp->cn_flags |= MAKEENTRY; 4278 if (cache_fpl_isdotdot(cnp)) 4279 cnp->cn_flags |= ISDOTDOT; 4280 cnp->cn_lkflags = LK_EXCLUSIVE; 4281 error = VOP_LOOKUP(dvp, &tvp, cnp); 4282 switch (error) { 4283 case EJUSTRETURN: 4284 case 0: 4285 break; 4286 case ENOTDIR: 4287 case ENOENT: 4288 vput(dvp); 4289 return (cache_fpl_handled_error(fpl, error)); 4290 default: 4291 vput(dvp); 4292 return (cache_fpl_aborted(fpl)); 4293 } 4294 4295 fpl->tvp = tvp; 4296 fpl->savename = (cnp->cn_flags & SAVENAME) != 0; 4297 4298 if (tvp == NULL) { 4299 if ((cnp->cn_flags & SAVESTART) != 0) { 4300 ndp->ni_startdir = dvp; 4301 vrefact(ndp->ni_startdir); 4302 cnp->cn_flags |= SAVENAME; 4303 fpl->savename = true; 4304 } 4305 MPASS(error == EJUSTRETURN); 4306 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4307 VOP_UNLOCK(dvp); 4308 } 4309 return (cache_fpl_handled(fpl)); 4310 } 4311 4312 /* 4313 * There are very hairy corner cases concerning various flag combinations 4314 * and locking state. In particular here we only hold one lock instead of 4315 * two. 4316 * 4317 * Skip the complexity as it is of no significance for normal workloads. 4318 */ 4319 if (__predict_false(tvp == dvp)) { 4320 vput(dvp); 4321 vrele(tvp); 4322 return (cache_fpl_aborted(fpl)); 4323 } 4324 4325 /* 4326 * If they want the symlink itself we are fine, but if they want to 4327 * follow it regular lookup has to be engaged. 4328 */ 4329 if (tvp->v_type == VLNK) { 4330 if ((cnp->cn_flags & FOLLOW) != 0) { 4331 vput(dvp); 4332 vput(tvp); 4333 return (cache_fpl_aborted(fpl)); 4334 } 4335 } 4336 4337 /* 4338 * Since we expect this to be the terminal vnode it should almost never 4339 * be a mount point. 4340 */ 4341 if (__predict_false(cache_fplookup_is_mp(fpl))) { 4342 vput(dvp); 4343 vput(tvp); 4344 return (cache_fpl_aborted(fpl)); 4345 } 4346 4347 if ((cnp->cn_flags & FAILIFEXISTS) != 0) { 4348 vput(dvp); 4349 vput(tvp); 4350 return (cache_fpl_handled_error(fpl, EEXIST)); 4351 } 4352 4353 if ((cnp->cn_flags & LOCKLEAF) == 0) { 4354 VOP_UNLOCK(tvp); 4355 } 4356 4357 if ((cnp->cn_flags & LOCKPARENT) == 0) { 4358 VOP_UNLOCK(dvp); 4359 } 4360 4361 if ((cnp->cn_flags & SAVESTART) != 0) { 4362 ndp->ni_startdir = dvp; 4363 vrefact(ndp->ni_startdir); 4364 cnp->cn_flags |= SAVENAME; 4365 fpl->savename = true; 4366 } 4367 4368 return (cache_fpl_handled(fpl)); 4369 } 4370 4371 static int __noinline 4372 cache_fplookup_modifying(struct cache_fpl *fpl) 4373 { 4374 struct nameidata *ndp; 4375 4376 ndp = fpl->ndp; 4377 4378 if (!cache_fpl_islastcn(ndp)) { 4379 return (cache_fpl_partial(fpl)); 4380 } 4381 return (cache_fplookup_final_modifying(fpl)); 4382 } 4383 4384 static int __noinline 4385 cache_fplookup_final_withparent(struct cache_fpl *fpl) 4386 { 4387 struct componentname *cnp; 4388 enum vgetstate dvs, tvs; 4389 struct vnode *dvp, *tvp; 4390 seqc_t dvp_seqc; 4391 int error; 4392 4393 cnp = fpl->cnp; 4394 dvp = fpl->dvp; 4395 dvp_seqc = fpl->dvp_seqc; 4396 tvp = fpl->tvp; 4397 4398 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 4399 4400 /* 4401 * This is less efficient than it can be for simplicity. 4402 */ 4403 dvs = vget_prep_smr(dvp); 4404 if (__predict_false(dvs == VGET_NONE)) { 4405 return (cache_fpl_aborted(fpl)); 4406 } 4407 tvs = vget_prep_smr(tvp); 4408 if (__predict_false(tvs == VGET_NONE)) { 4409 cache_fpl_smr_exit(fpl); 4410 vget_abort(dvp, dvs); 4411 return (cache_fpl_aborted(fpl)); 4412 } 4413 4414 cache_fpl_smr_exit(fpl); 4415 4416 if ((cnp->cn_flags & LOCKPARENT) != 0) { 4417 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 4418 if (__predict_false(error != 0)) { 4419 vget_abort(tvp, tvs); 4420 return (cache_fpl_aborted(fpl)); 4421 } 4422 } else { 4423 vget_finish_ref(dvp, dvs); 4424 } 4425 4426 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4427 vget_abort(tvp, tvs); 4428 if ((cnp->cn_flags & LOCKPARENT) != 0) 4429 vput(dvp); 4430 else 4431 vrele(dvp); 4432 return (cache_fpl_aborted(fpl)); 4433 } 4434 4435 error = cache_fplookup_final_child(fpl, tvs); 4436 if (__predict_false(error != 0)) { 4437 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 4438 if ((cnp->cn_flags & LOCKPARENT) != 0) 4439 vput(dvp); 4440 else 4441 vrele(dvp); 4442 return (error); 4443 } 4444 4445 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 4446 return (0); 4447 } 4448 4449 static int 4450 cache_fplookup_final(struct cache_fpl *fpl) 4451 { 4452 struct componentname *cnp; 4453 enum vgetstate tvs; 4454 struct vnode *dvp, *tvp; 4455 seqc_t dvp_seqc; 4456 4457 cnp = fpl->cnp; 4458 dvp = fpl->dvp; 4459 dvp_seqc = fpl->dvp_seqc; 4460 tvp = fpl->tvp; 4461 4462 MPASS(*(cnp->cn_nameptr) != '/'); 4463 4464 if (cnp->cn_nameiop != LOOKUP) { 4465 return (cache_fplookup_final_modifying(fpl)); 4466 } 4467 4468 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 4469 return (cache_fplookup_final_withparent(fpl)); 4470 4471 tvs = vget_prep_smr(tvp); 4472 if (__predict_false(tvs == VGET_NONE)) { 4473 return (cache_fpl_partial(fpl)); 4474 } 4475 4476 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4477 cache_fpl_smr_exit(fpl); 4478 vget_abort(tvp, tvs); 4479 return (cache_fpl_aborted(fpl)); 4480 } 4481 4482 cache_fpl_smr_exit(fpl); 4483 return (cache_fplookup_final_child(fpl, tvs)); 4484 } 4485 4486 /* 4487 * Comment from locked lookup: 4488 * Check for degenerate name (e.g. / or "") which is a way of talking about a 4489 * directory, e.g. like "/." or ".". 4490 */ 4491 static int __noinline 4492 cache_fplookup_degenerate(struct cache_fpl *fpl) 4493 { 4494 struct componentname *cnp; 4495 struct vnode *dvp; 4496 enum vgetstate dvs; 4497 int error, lkflags; 4498 #ifdef INVARIANTS 4499 char *cp; 4500 #endif 4501 4502 fpl->tvp = fpl->dvp; 4503 fpl->tvp_seqc = fpl->dvp_seqc; 4504 4505 cnp = fpl->cnp; 4506 dvp = fpl->dvp; 4507 4508 #ifdef INVARIANTS 4509 for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) { 4510 KASSERT(*cp == '/', 4511 ("%s: encountered non-slash; string [%s]\n", __func__, 4512 cnp->cn_pnbuf)); 4513 } 4514 #endif 4515 4516 if (__predict_false(cnp->cn_nameiop != LOOKUP)) { 4517 cache_fpl_smr_exit(fpl); 4518 return (cache_fpl_handled_error(fpl, EISDIR)); 4519 } 4520 4521 MPASS((cnp->cn_flags & SAVESTART) == 0); 4522 4523 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) { 4524 return (cache_fplookup_final_withparent(fpl)); 4525 } 4526 4527 dvs = vget_prep_smr(dvp); 4528 cache_fpl_smr_exit(fpl); 4529 if (__predict_false(dvs == VGET_NONE)) { 4530 return (cache_fpl_aborted(fpl)); 4531 } 4532 4533 if ((cnp->cn_flags & LOCKLEAF) != 0) { 4534 lkflags = LK_SHARED; 4535 if ((cnp->cn_flags & LOCKSHARED) == 0) 4536 lkflags = LK_EXCLUSIVE; 4537 error = vget_finish(dvp, lkflags, dvs); 4538 if (__predict_false(error != 0)) { 4539 return (cache_fpl_aborted(fpl)); 4540 } 4541 } else { 4542 vget_finish_ref(dvp, dvs); 4543 } 4544 return (cache_fpl_handled(fpl)); 4545 } 4546 4547 static int __noinline 4548 cache_fplookup_noentry(struct cache_fpl *fpl) 4549 { 4550 struct nameidata *ndp; 4551 struct componentname *cnp; 4552 enum vgetstate dvs; 4553 struct vnode *dvp, *tvp; 4554 seqc_t dvp_seqc; 4555 int error; 4556 bool docache; 4557 4558 ndp = fpl->ndp; 4559 cnp = fpl->cnp; 4560 dvp = fpl->dvp; 4561 dvp_seqc = fpl->dvp_seqc; 4562 4563 MPASS((cnp->cn_flags & MAKEENTRY) == 0); 4564 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 4565 MPASS(!cache_fpl_isdotdot(cnp)); 4566 4567 /* 4568 * Hack: delayed name len checking. 4569 */ 4570 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 4571 cache_fpl_smr_exit(fpl); 4572 return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); 4573 } 4574 4575 if (cnp->cn_nameptr[0] == '/') { 4576 return (cache_fplookup_skip_slashes(fpl)); 4577 } 4578 4579 if (cnp->cn_nameptr[0] == '\0') { 4580 if (fpl->tvp == NULL) { 4581 return (cache_fplookup_degenerate(fpl)); 4582 } 4583 return (cache_fplookup_trailingslash(fpl)); 4584 } 4585 4586 if (cnp->cn_nameiop != LOOKUP) { 4587 fpl->tvp = NULL; 4588 return (cache_fplookup_modifying(fpl)); 4589 } 4590 4591 MPASS((cnp->cn_flags & SAVESTART) == 0); 4592 4593 /* 4594 * Only try to fill in the component if it is the last one, 4595 * otherwise not only there may be several to handle but the 4596 * walk may be complicated. 4597 */ 4598 if (!cache_fpl_islastcn(ndp)) { 4599 return (cache_fpl_partial(fpl)); 4600 } 4601 4602 /* 4603 * Regular lookup nulifies the slash, which we don't do here. 4604 * Don't take chances with filesystem routines seeing it for 4605 * the last entry. 4606 */ 4607 if (cache_fpl_istrailingslash(fpl)) { 4608 return (cache_fpl_partial(fpl)); 4609 } 4610 4611 /* 4612 * Secure access to dvp; check cache_fplookup_partial_setup for 4613 * reasoning. 4614 */ 4615 dvs = vget_prep_smr(dvp); 4616 cache_fpl_smr_exit(fpl); 4617 if (__predict_false(dvs == VGET_NONE)) { 4618 return (cache_fpl_aborted(fpl)); 4619 } 4620 4621 vget_finish_ref(dvp, dvs); 4622 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4623 vrele(dvp); 4624 return (cache_fpl_aborted(fpl)); 4625 } 4626 4627 error = vn_lock(dvp, LK_SHARED); 4628 if (__predict_false(error != 0)) { 4629 vrele(dvp); 4630 return (cache_fpl_aborted(fpl)); 4631 } 4632 4633 tvp = NULL; 4634 /* 4635 * TODO: provide variants which don't require locking either vnode. 4636 */ 4637 cnp->cn_flags |= ISLASTCN; 4638 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; 4639 if (docache) 4640 cnp->cn_flags |= MAKEENTRY; 4641 cnp->cn_lkflags = LK_SHARED; 4642 if ((cnp->cn_flags & LOCKSHARED) == 0) { 4643 cnp->cn_lkflags = LK_EXCLUSIVE; 4644 } 4645 error = VOP_LOOKUP(dvp, &tvp, cnp); 4646 switch (error) { 4647 case EJUSTRETURN: 4648 case 0: 4649 break; 4650 case ENOTDIR: 4651 case ENOENT: 4652 vput(dvp); 4653 return (cache_fpl_handled_error(fpl, error)); 4654 default: 4655 vput(dvp); 4656 return (cache_fpl_aborted(fpl)); 4657 } 4658 4659 fpl->tvp = tvp; 4660 if (!fpl->savename) { 4661 MPASS((cnp->cn_flags & SAVENAME) == 0); 4662 } 4663 4664 if (tvp == NULL) { 4665 MPASS(error == EJUSTRETURN); 4666 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 4667 vput(dvp); 4668 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 4669 VOP_UNLOCK(dvp); 4670 } 4671 return (cache_fpl_handled(fpl)); 4672 } 4673 4674 if (tvp->v_type == VLNK) { 4675 if ((cnp->cn_flags & FOLLOW) != 0) { 4676 vput(dvp); 4677 vput(tvp); 4678 return (cache_fpl_aborted(fpl)); 4679 } 4680 } 4681 4682 if (__predict_false(cache_fplookup_is_mp(fpl))) { 4683 vput(dvp); 4684 vput(tvp); 4685 return (cache_fpl_aborted(fpl)); 4686 } 4687 4688 if ((cnp->cn_flags & LOCKLEAF) == 0) { 4689 VOP_UNLOCK(tvp); 4690 } 4691 4692 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { 4693 vput(dvp); 4694 } else if ((cnp->cn_flags & LOCKPARENT) == 0) { 4695 VOP_UNLOCK(dvp); 4696 } 4697 return (cache_fpl_handled(fpl)); 4698 } 4699 4700 static int __noinline 4701 cache_fplookup_dot(struct cache_fpl *fpl) 4702 { 4703 int error; 4704 4705 MPASS(!seqc_in_modify(fpl->dvp_seqc)); 4706 /* 4707 * Just re-assign the value. seqc will be checked later for the first 4708 * non-dot path component in line and/or before deciding to return the 4709 * vnode. 4710 */ 4711 fpl->tvp = fpl->dvp; 4712 fpl->tvp_seqc = fpl->dvp_seqc; 4713 4714 counter_u64_add(dothits, 1); 4715 SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp); 4716 4717 error = 0; 4718 if (cache_fplookup_is_mp(fpl)) { 4719 error = cache_fplookup_cross_mount(fpl); 4720 } 4721 return (error); 4722 } 4723 4724 static int __noinline 4725 cache_fplookup_dotdot(struct cache_fpl *fpl) 4726 { 4727 struct nameidata *ndp; 4728 struct componentname *cnp; 4729 struct namecache *ncp; 4730 struct vnode *dvp; 4731 struct prison *pr; 4732 u_char nc_flag; 4733 4734 ndp = fpl->ndp; 4735 cnp = fpl->cnp; 4736 dvp = fpl->dvp; 4737 4738 MPASS(cache_fpl_isdotdot(cnp)); 4739 4740 /* 4741 * XXX this is racy the same way regular lookup is 4742 */ 4743 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 4744 pr = pr->pr_parent) 4745 if (dvp == pr->pr_root) 4746 break; 4747 4748 if (dvp == ndp->ni_rootdir || 4749 dvp == ndp->ni_topdir || 4750 dvp == rootvnode || 4751 pr != NULL) { 4752 fpl->tvp = dvp; 4753 fpl->tvp_seqc = vn_seqc_read_any(dvp); 4754 if (seqc_in_modify(fpl->tvp_seqc)) { 4755 return (cache_fpl_aborted(fpl)); 4756 } 4757 return (0); 4758 } 4759 4760 if ((dvp->v_vflag & VV_ROOT) != 0) { 4761 /* 4762 * TODO 4763 * The opposite of climb mount is needed here. 4764 */ 4765 return (cache_fpl_partial(fpl)); 4766 } 4767 4768 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); 4769 if (ncp == NULL) { 4770 return (cache_fpl_aborted(fpl)); 4771 } 4772 4773 nc_flag = atomic_load_char(&ncp->nc_flag); 4774 if ((nc_flag & NCF_ISDOTDOT) != 0) { 4775 if ((nc_flag & NCF_NEGATIVE) != 0) 4776 return (cache_fpl_aborted(fpl)); 4777 fpl->tvp = ncp->nc_vp; 4778 } else { 4779 fpl->tvp = ncp->nc_dvp; 4780 } 4781 4782 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 4783 if (seqc_in_modify(fpl->tvp_seqc)) { 4784 return (cache_fpl_partial(fpl)); 4785 } 4786 4787 /* 4788 * Acquire fence provided by vn_seqc_read_any above. 4789 */ 4790 if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) { 4791 return (cache_fpl_aborted(fpl)); 4792 } 4793 4794 if (!cache_ncp_canuse(ncp)) { 4795 return (cache_fpl_aborted(fpl)); 4796 } 4797 4798 counter_u64_add(dotdothits, 1); 4799 return (0); 4800 } 4801 4802 static int __noinline 4803 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) 4804 { 4805 u_char nc_flag; 4806 bool neg_promote; 4807 4808 nc_flag = atomic_load_char(&ncp->nc_flag); 4809 MPASS((nc_flag & NCF_NEGATIVE) != 0); 4810 /* 4811 * If they want to create an entry we need to replace this one. 4812 */ 4813 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 4814 fpl->tvp = NULL; 4815 return (cache_fplookup_modifying(fpl)); 4816 } 4817 neg_promote = cache_neg_hit_prep(ncp); 4818 if (!cache_fpl_neg_ncp_canuse(ncp)) { 4819 cache_neg_hit_abort(ncp); 4820 return (cache_fpl_partial(fpl)); 4821 } 4822 if (neg_promote) { 4823 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 4824 } 4825 cache_neg_hit_finish(ncp); 4826 cache_fpl_smr_exit(fpl); 4827 return (cache_fpl_handled_error(fpl, ENOENT)); 4828 } 4829 4830 /* 4831 * Resolve a symlink. Called by filesystem-specific routines. 4832 * 4833 * Code flow is: 4834 * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve 4835 */ 4836 int 4837 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len) 4838 { 4839 struct nameidata *ndp; 4840 struct componentname *cnp; 4841 size_t adjust; 4842 4843 ndp = fpl->ndp; 4844 cnp = fpl->cnp; 4845 4846 if (__predict_false(len == 0)) { 4847 return (ENOENT); 4848 } 4849 4850 if (__predict_false(len > MAXPATHLEN - 2)) { 4851 if (cache_fpl_istrailingslash(fpl)) { 4852 return (EAGAIN); 4853 } 4854 } 4855 4856 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1; 4857 #ifdef INVARIANTS 4858 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { 4859 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 4860 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 4861 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 4862 } 4863 #endif 4864 4865 if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) { 4866 return (ENAMETOOLONG); 4867 } 4868 4869 if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) { 4870 return (ELOOP); 4871 } 4872 4873 adjust = len; 4874 if (ndp->ni_pathlen > 1) { 4875 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen); 4876 } else { 4877 if (cache_fpl_istrailingslash(fpl)) { 4878 adjust = len + 1; 4879 cnp->cn_pnbuf[len] = '/'; 4880 cnp->cn_pnbuf[len + 1] = '\0'; 4881 } else { 4882 cnp->cn_pnbuf[len] = '\0'; 4883 } 4884 } 4885 bcopy(string, cnp->cn_pnbuf, len); 4886 4887 ndp->ni_pathlen += adjust; 4888 cache_fpl_pathlen_add(fpl, adjust); 4889 cnp->cn_nameptr = cnp->cn_pnbuf; 4890 fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; 4891 fpl->tvp = NULL; 4892 return (0); 4893 } 4894 4895 static int __noinline 4896 cache_fplookup_symlink(struct cache_fpl *fpl) 4897 { 4898 struct mount *mp; 4899 struct nameidata *ndp; 4900 struct componentname *cnp; 4901 struct vnode *dvp, *tvp; 4902 int error; 4903 4904 ndp = fpl->ndp; 4905 cnp = fpl->cnp; 4906 dvp = fpl->dvp; 4907 tvp = fpl->tvp; 4908 4909 if (cache_fpl_islastcn(ndp)) { 4910 if ((cnp->cn_flags & FOLLOW) == 0) { 4911 return (cache_fplookup_final(fpl)); 4912 } 4913 } 4914 4915 mp = atomic_load_ptr(&dvp->v_mount); 4916 if (__predict_false(mp == NULL)) { 4917 return (cache_fpl_aborted(fpl)); 4918 } 4919 4920 /* 4921 * Note this check races against setting the flag just like regular 4922 * lookup. 4923 */ 4924 if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) { 4925 cache_fpl_smr_exit(fpl); 4926 return (cache_fpl_handled_error(fpl, EACCES)); 4927 } 4928 4929 error = VOP_FPLOOKUP_SYMLINK(tvp, fpl); 4930 if (__predict_false(error != 0)) { 4931 switch (error) { 4932 case EAGAIN: 4933 return (cache_fpl_partial(fpl)); 4934 case ENOENT: 4935 case ENAMETOOLONG: 4936 case ELOOP: 4937 cache_fpl_smr_exit(fpl); 4938 return (cache_fpl_handled_error(fpl, error)); 4939 default: 4940 return (cache_fpl_aborted(fpl)); 4941 } 4942 } 4943 4944 if (*(cnp->cn_nameptr) == '/') { 4945 fpl->dvp = cache_fpl_handle_root(fpl); 4946 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 4947 if (seqc_in_modify(fpl->dvp_seqc)) { 4948 return (cache_fpl_aborted(fpl)); 4949 } 4950 } 4951 return (0); 4952 } 4953 4954 static int 4955 cache_fplookup_next(struct cache_fpl *fpl) 4956 { 4957 struct componentname *cnp; 4958 struct namecache *ncp; 4959 struct vnode *dvp, *tvp; 4960 u_char nc_flag; 4961 uint32_t hash; 4962 int error; 4963 4964 cnp = fpl->cnp; 4965 dvp = fpl->dvp; 4966 hash = fpl->hash; 4967 4968 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 4969 if (cnp->cn_namelen == 1) { 4970 return (cache_fplookup_dot(fpl)); 4971 } 4972 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 4973 return (cache_fplookup_dotdot(fpl)); 4974 } 4975 } 4976 4977 MPASS(!cache_fpl_isdotdot(cnp)); 4978 4979 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 4980 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 4981 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 4982 break; 4983 } 4984 4985 if (__predict_false(ncp == NULL)) { 4986 return (cache_fplookup_noentry(fpl)); 4987 } 4988 4989 tvp = atomic_load_ptr(&ncp->nc_vp); 4990 nc_flag = atomic_load_char(&ncp->nc_flag); 4991 if ((nc_flag & NCF_NEGATIVE) != 0) { 4992 return (cache_fplookup_neg(fpl, ncp, hash)); 4993 } 4994 4995 if (!cache_ncp_canuse(ncp)) { 4996 return (cache_fpl_partial(fpl)); 4997 } 4998 4999 fpl->tvp = tvp; 5000 fpl->tvp_seqc = vn_seqc_read_any(tvp); 5001 if (seqc_in_modify(fpl->tvp_seqc)) { 5002 return (cache_fpl_partial(fpl)); 5003 } 5004 5005 counter_u64_add(numposhits, 1); 5006 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 5007 5008 error = 0; 5009 if (cache_fplookup_is_mp(fpl)) { 5010 error = cache_fplookup_cross_mount(fpl); 5011 } 5012 return (error); 5013 } 5014 5015 static bool 5016 cache_fplookup_mp_supported(struct mount *mp) 5017 { 5018 5019 MPASS(mp != NULL); 5020 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 5021 return (false); 5022 return (true); 5023 } 5024 5025 /* 5026 * Walk up the mount stack (if any). 5027 * 5028 * Correctness is provided in the following ways: 5029 * - all vnodes are protected from freeing with SMR 5030 * - struct mount objects are type stable making them always safe to access 5031 * - stability of the particular mount is provided by busying it 5032 * - relationship between the vnode which is mounted on and the mount is 5033 * verified with the vnode sequence counter after busying 5034 * - association between root vnode of the mount and the mount is protected 5035 * by busy 5036 * 5037 * From that point on we can read the sequence counter of the root vnode 5038 * and get the next mount on the stack (if any) using the same protection. 5039 * 5040 * By the end of successful walk we are guaranteed the reached state was 5041 * indeed present at least at some point which matches the regular lookup. 5042 */ 5043 static int __noinline 5044 cache_fplookup_climb_mount(struct cache_fpl *fpl) 5045 { 5046 struct mount *mp, *prev_mp; 5047 struct mount_pcpu *mpcpu, *prev_mpcpu; 5048 struct vnode *vp; 5049 seqc_t vp_seqc; 5050 5051 vp = fpl->tvp; 5052 vp_seqc = fpl->tvp_seqc; 5053 5054 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 5055 mp = atomic_load_ptr(&vp->v_mountedhere); 5056 if (__predict_false(mp == NULL)) { 5057 return (0); 5058 } 5059 5060 prev_mp = NULL; 5061 for (;;) { 5062 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 5063 if (prev_mp != NULL) 5064 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5065 return (cache_fpl_partial(fpl)); 5066 } 5067 if (prev_mp != NULL) 5068 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5069 if (!vn_seqc_consistent(vp, vp_seqc)) { 5070 vfs_op_thread_exit_crit(mp, mpcpu); 5071 return (cache_fpl_partial(fpl)); 5072 } 5073 if (!cache_fplookup_mp_supported(mp)) { 5074 vfs_op_thread_exit_crit(mp, mpcpu); 5075 return (cache_fpl_partial(fpl)); 5076 } 5077 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5078 if (vp == NULL) { 5079 vfs_op_thread_exit_crit(mp, mpcpu); 5080 return (cache_fpl_partial(fpl)); 5081 } 5082 vp_seqc = vn_seqc_read_any(vp); 5083 if (seqc_in_modify(vp_seqc)) { 5084 vfs_op_thread_exit_crit(mp, mpcpu); 5085 return (cache_fpl_partial(fpl)); 5086 } 5087 prev_mp = mp; 5088 prev_mpcpu = mpcpu; 5089 mp = atomic_load_ptr(&vp->v_mountedhere); 5090 if (mp == NULL) 5091 break; 5092 } 5093 5094 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); 5095 fpl->tvp = vp; 5096 fpl->tvp_seqc = vp_seqc; 5097 return (0); 5098 } 5099 5100 static int __noinline 5101 cache_fplookup_cross_mount(struct cache_fpl *fpl) 5102 { 5103 struct mount *mp; 5104 struct mount_pcpu *mpcpu; 5105 struct vnode *vp; 5106 seqc_t vp_seqc; 5107 5108 vp = fpl->tvp; 5109 vp_seqc = fpl->tvp_seqc; 5110 5111 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 5112 mp = atomic_load_ptr(&vp->v_mountedhere); 5113 if (__predict_false(mp == NULL)) { 5114 return (0); 5115 } 5116 5117 if (!vfs_op_thread_enter_crit(mp, mpcpu)) { 5118 return (cache_fpl_partial(fpl)); 5119 } 5120 if (!vn_seqc_consistent(vp, vp_seqc)) { 5121 vfs_op_thread_exit_crit(mp, mpcpu); 5122 return (cache_fpl_partial(fpl)); 5123 } 5124 if (!cache_fplookup_mp_supported(mp)) { 5125 vfs_op_thread_exit_crit(mp, mpcpu); 5126 return (cache_fpl_partial(fpl)); 5127 } 5128 vp = atomic_load_ptr(&mp->mnt_rootvnode); 5129 if (__predict_false(vp == NULL)) { 5130 vfs_op_thread_exit_crit(mp, mpcpu); 5131 return (cache_fpl_partial(fpl)); 5132 } 5133 vp_seqc = vn_seqc_read_any(vp); 5134 vfs_op_thread_exit_crit(mp, mpcpu); 5135 if (seqc_in_modify(vp_seqc)) { 5136 return (cache_fpl_partial(fpl)); 5137 } 5138 mp = atomic_load_ptr(&vp->v_mountedhere); 5139 if (__predict_false(mp != NULL)) { 5140 /* 5141 * There are possibly more mount points on top. 5142 * Normally this does not happen so for simplicity just start 5143 * over. 5144 */ 5145 return (cache_fplookup_climb_mount(fpl)); 5146 } 5147 5148 fpl->tvp = vp; 5149 fpl->tvp_seqc = vp_seqc; 5150 return (0); 5151 } 5152 5153 /* 5154 * Check if a vnode is mounted on. 5155 */ 5156 static bool 5157 cache_fplookup_is_mp(struct cache_fpl *fpl) 5158 { 5159 struct vnode *vp; 5160 5161 vp = fpl->tvp; 5162 return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0); 5163 } 5164 5165 /* 5166 * Parse the path. 5167 * 5168 * The code was originally copy-pasted from regular lookup and despite 5169 * clean ups leaves performance on the table. Any modifications here 5170 * must take into account that in case off fallback the resulting 5171 * nameidata state has to be compatible with the original. 5172 */ 5173 5174 /* 5175 * Debug ni_pathlen tracking. 5176 */ 5177 #ifdef INVARIANTS 5178 static void 5179 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) 5180 { 5181 5182 fpl->debug.ni_pathlen += n; 5183 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, 5184 ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen)); 5185 } 5186 5187 static void 5188 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) 5189 { 5190 5191 fpl->debug.ni_pathlen -= n; 5192 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, 5193 ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen)); 5194 } 5195 5196 static void 5197 cache_fpl_pathlen_inc(struct cache_fpl *fpl) 5198 { 5199 5200 cache_fpl_pathlen_add(fpl, 1); 5201 } 5202 5203 static void 5204 cache_fpl_pathlen_dec(struct cache_fpl *fpl) 5205 { 5206 5207 cache_fpl_pathlen_sub(fpl, 1); 5208 } 5209 #else 5210 static void 5211 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) 5212 { 5213 } 5214 5215 static void 5216 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) 5217 { 5218 } 5219 5220 static void 5221 cache_fpl_pathlen_inc(struct cache_fpl *fpl) 5222 { 5223 } 5224 5225 static void 5226 cache_fpl_pathlen_dec(struct cache_fpl *fpl) 5227 { 5228 } 5229 #endif 5230 5231 static void 5232 cache_fplookup_parse(struct cache_fpl *fpl) 5233 { 5234 struct nameidata *ndp; 5235 struct componentname *cnp; 5236 struct vnode *dvp; 5237 char *cp; 5238 uint32_t hash; 5239 5240 ndp = fpl->ndp; 5241 cnp = fpl->cnp; 5242 dvp = fpl->dvp; 5243 5244 /* 5245 * Find the end of this path component, it is either / or nul. 5246 * 5247 * Store / as a temporary sentinel so that we only have one character 5248 * to test for. Pathnames tend to be short so this should not be 5249 * resulting in cache misses. 5250 * 5251 * TODO: fix this to be word-sized. 5252 */ 5253 KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar, 5254 ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n", 5255 __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1], 5256 fpl->nulchar, cnp->cn_pnbuf)); 5257 KASSERT(*fpl->nulchar == '\0', 5258 ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar, 5259 cnp->cn_pnbuf)); 5260 hash = cache_get_hash_iter_start(dvp); 5261 *fpl->nulchar = '/'; 5262 for (cp = cnp->cn_nameptr; *cp != '/'; cp++) { 5263 KASSERT(*cp != '\0', 5264 ("%s: encountered unexpected nul; string [%s]\n", __func__, 5265 cnp->cn_nameptr)); 5266 hash = cache_get_hash_iter(*cp, hash); 5267 continue; 5268 } 5269 *fpl->nulchar = '\0'; 5270 fpl->hash = cache_get_hash_iter_finish(hash); 5271 5272 cnp->cn_namelen = cp - cnp->cn_nameptr; 5273 cache_fpl_pathlen_sub(fpl, cnp->cn_namelen); 5274 5275 #ifdef INVARIANTS 5276 /* 5277 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since 5278 * we are going to fail this lookup with ENAMETOOLONG (see below). 5279 */ 5280 if (cnp->cn_namelen <= NAME_MAX) { 5281 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) { 5282 panic("%s: mismatched hash for [%s] len %ld", __func__, 5283 cnp->cn_nameptr, cnp->cn_namelen); 5284 } 5285 } 5286 #endif 5287 5288 /* 5289 * Hack: we have to check if the found path component's length exceeds 5290 * NAME_MAX. However, the condition is very rarely true and check can 5291 * be elided in the common case -- if an entry was found in the cache, 5292 * then it could not have been too long to begin with. 5293 */ 5294 ndp->ni_next = cp; 5295 } 5296 5297 static void 5298 cache_fplookup_parse_advance(struct cache_fpl *fpl) 5299 { 5300 struct nameidata *ndp; 5301 struct componentname *cnp; 5302 5303 ndp = fpl->ndp; 5304 cnp = fpl->cnp; 5305 5306 cnp->cn_nameptr = ndp->ni_next; 5307 KASSERT(*(cnp->cn_nameptr) == '/', 5308 ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__, 5309 cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf)); 5310 cnp->cn_nameptr++; 5311 cache_fpl_pathlen_dec(fpl); 5312 } 5313 5314 /* 5315 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry. 5316 * 5317 * Lockless lookup tries to elide checking for spurious slashes and should they 5318 * be present is guaranteed to fail to find an entry. In this case the caller 5319 * must check if the name starts with a slash and call this routine. It is 5320 * going to fast forward across the spurious slashes and set the state up for 5321 * retry. 5322 */ 5323 static int __noinline 5324 cache_fplookup_skip_slashes(struct cache_fpl *fpl) 5325 { 5326 struct nameidata *ndp; 5327 struct componentname *cnp; 5328 5329 ndp = fpl->ndp; 5330 cnp = fpl->cnp; 5331 5332 MPASS(*(cnp->cn_nameptr) == '/'); 5333 do { 5334 cnp->cn_nameptr++; 5335 cache_fpl_pathlen_dec(fpl); 5336 } while (*(cnp->cn_nameptr) == '/'); 5337 5338 /* 5339 * Go back to one slash so that cache_fplookup_parse_advance has 5340 * something to skip. 5341 */ 5342 cnp->cn_nameptr--; 5343 cache_fpl_pathlen_inc(fpl); 5344 5345 /* 5346 * cache_fplookup_parse_advance starts from ndp->ni_next 5347 */ 5348 ndp->ni_next = cnp->cn_nameptr; 5349 5350 /* 5351 * See cache_fplookup_dot. 5352 */ 5353 fpl->tvp = fpl->dvp; 5354 fpl->tvp_seqc = fpl->dvp_seqc; 5355 5356 return (0); 5357 } 5358 5359 /* 5360 * Handle trailing slashes (e.g., "foo/"). 5361 * 5362 * If a trailing slash is found the terminal vnode must be a directory. 5363 * Regular lookup shortens the path by nulifying the first trailing slash and 5364 * sets the TRAILINGSLASH flag to denote this took place. There are several 5365 * checks on it performed later. 5366 * 5367 * Similarly to spurious slashes, lockless lookup handles this in a speculative 5368 * manner relying on an invariant that a non-directory vnode will get a miss. 5369 * In this case cn_nameptr[0] == '\0' and cn_namelen == 0. 5370 * 5371 * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/" 5372 * and denotes this is the last path component, which avoids looping back. 5373 * 5374 * Only plain lookups are supported for now to restrict corner cases to handle. 5375 */ 5376 static int __noinline 5377 cache_fplookup_trailingslash(struct cache_fpl *fpl) 5378 { 5379 #ifdef INVARIANTS 5380 size_t ni_pathlen; 5381 #endif 5382 struct nameidata *ndp; 5383 struct componentname *cnp; 5384 struct namecache *ncp; 5385 struct vnode *tvp; 5386 char *cn_nameptr_orig, *cn_nameptr_slash; 5387 seqc_t tvp_seqc; 5388 u_char nc_flag; 5389 5390 ndp = fpl->ndp; 5391 cnp = fpl->cnp; 5392 tvp = fpl->tvp; 5393 tvp_seqc = fpl->tvp_seqc; 5394 5395 MPASS(fpl->dvp == fpl->tvp); 5396 KASSERT(cache_fpl_istrailingslash(fpl), 5397 ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1, 5398 cnp->cn_pnbuf)); 5399 KASSERT(cnp->cn_nameptr[0] == '\0', 5400 ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0], 5401 cnp->cn_pnbuf)); 5402 KASSERT(cnp->cn_namelen == 0, 5403 ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen, 5404 cnp->cn_pnbuf)); 5405 MPASS(cnp->cn_nameptr > cnp->cn_pnbuf); 5406 5407 if (cnp->cn_nameiop != LOOKUP) { 5408 return (cache_fpl_aborted(fpl)); 5409 } 5410 5411 if (__predict_false(tvp->v_type != VDIR)) { 5412 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 5413 return (cache_fpl_aborted(fpl)); 5414 } 5415 cache_fpl_smr_exit(fpl); 5416 return (cache_fpl_handled_error(fpl, ENOTDIR)); 5417 } 5418 5419 /* 5420 * Denote the last component. 5421 */ 5422 ndp->ni_next = &cnp->cn_nameptr[0]; 5423 MPASS(cache_fpl_islastcn(ndp)); 5424 5425 /* 5426 * Unwind trailing slashes. 5427 */ 5428 cn_nameptr_orig = cnp->cn_nameptr; 5429 while (cnp->cn_nameptr >= cnp->cn_pnbuf) { 5430 cnp->cn_nameptr--; 5431 if (cnp->cn_nameptr[0] != '/') { 5432 break; 5433 } 5434 } 5435 5436 /* 5437 * Unwind to the beginning of the path component. 5438 * 5439 * Note the path may or may not have started with a slash. 5440 */ 5441 cn_nameptr_slash = cnp->cn_nameptr; 5442 while (cnp->cn_nameptr > cnp->cn_pnbuf) { 5443 cnp->cn_nameptr--; 5444 if (cnp->cn_nameptr[0] == '/') { 5445 break; 5446 } 5447 } 5448 if (cnp->cn_nameptr[0] == '/') { 5449 cnp->cn_nameptr++; 5450 } 5451 5452 cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1; 5453 cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr); 5454 cache_fpl_checkpoint(fpl); 5455 5456 #ifdef INVARIANTS 5457 ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; 5458 if (ni_pathlen != fpl->debug.ni_pathlen) { 5459 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", 5460 __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, 5461 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); 5462 } 5463 #endif 5464 5465 /* 5466 * If this was a "./" lookup the parent directory is already correct. 5467 */ 5468 if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) { 5469 return (0); 5470 } 5471 5472 /* 5473 * Otherwise we need to look it up. 5474 */ 5475 tvp = fpl->tvp; 5476 ncp = atomic_load_consume_ptr(&tvp->v_cache_dd); 5477 if (__predict_false(ncp == NULL)) { 5478 return (cache_fpl_aborted(fpl)); 5479 } 5480 nc_flag = atomic_load_char(&ncp->nc_flag); 5481 if ((nc_flag & NCF_ISDOTDOT) != 0) { 5482 return (cache_fpl_aborted(fpl)); 5483 } 5484 fpl->dvp = ncp->nc_dvp; 5485 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 5486 if (seqc_in_modify(fpl->dvp_seqc)) { 5487 return (cache_fpl_aborted(fpl)); 5488 } 5489 return (0); 5490 } 5491 5492 /* 5493 * See the API contract for VOP_FPLOOKUP_VEXEC. 5494 */ 5495 static int __noinline 5496 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 5497 { 5498 struct componentname *cnp; 5499 struct vnode *dvp; 5500 seqc_t dvp_seqc; 5501 5502 cnp = fpl->cnp; 5503 dvp = fpl->dvp; 5504 dvp_seqc = fpl->dvp_seqc; 5505 5506 /* 5507 * TODO: Due to ignoring trailing slashes lookup will perform a 5508 * permission check on the last dir when it should not be doing it. It 5509 * may fail, but said failure should be ignored. It is possible to fix 5510 * it up fully without resorting to regular lookup, but for now just 5511 * abort. 5512 */ 5513 if (cache_fpl_istrailingslash(fpl)) { 5514 return (cache_fpl_aborted(fpl)); 5515 } 5516 5517 /* 5518 * Hack: delayed degenerate path checking. 5519 */ 5520 if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) { 5521 return (cache_fplookup_degenerate(fpl)); 5522 } 5523 5524 /* 5525 * Hack: delayed name len checking. 5526 */ 5527 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 5528 cache_fpl_smr_exit(fpl); 5529 return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); 5530 } 5531 5532 /* 5533 * Hack: they may be looking up foo/bar, where foo is not a directory. 5534 * In such a case we need to return ENOTDIR, but we may happen to get 5535 * here with a different error. 5536 */ 5537 if (dvp->v_type != VDIR) { 5538 error = ENOTDIR; 5539 } 5540 5541 /* 5542 * Hack: handle O_SEARCH. 5543 * 5544 * Open Group Base Specifications Issue 7, 2018 edition states: 5545 * <quote> 5546 * If the access mode of the open file description associated with the 5547 * file descriptor is not O_SEARCH, the function shall check whether 5548 * directory searches are permitted using the current permissions of 5549 * the directory underlying the file descriptor. If the access mode is 5550 * O_SEARCH, the function shall not perform the check. 5551 * </quote> 5552 * 5553 * Regular lookup tests for the NOEXECCHECK flag for every path 5554 * component to decide whether to do the permission check. However, 5555 * since most lookups never have the flag (and when they do it is only 5556 * present for the first path component), lockless lookup only acts on 5557 * it if there is a permission problem. Here the flag is represented 5558 * with a boolean so that we don't have to clear it on the way out. 5559 * 5560 * For simplicity this always aborts. 5561 * TODO: check if this is the first lookup and ignore the permission 5562 * problem. Note the flag has to survive fallback (if it happens to be 5563 * performed). 5564 */ 5565 if (fpl->fsearch) { 5566 return (cache_fpl_aborted(fpl)); 5567 } 5568 5569 switch (error) { 5570 case EAGAIN: 5571 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 5572 error = cache_fpl_aborted(fpl); 5573 } else { 5574 cache_fpl_partial(fpl); 5575 } 5576 break; 5577 default: 5578 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 5579 error = cache_fpl_aborted(fpl); 5580 } else { 5581 cache_fpl_smr_exit(fpl); 5582 cache_fpl_handled_error(fpl, error); 5583 } 5584 break; 5585 } 5586 return (error); 5587 } 5588 5589 static int 5590 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 5591 { 5592 struct nameidata *ndp; 5593 struct componentname *cnp; 5594 struct mount *mp; 5595 int error; 5596 5597 ndp = fpl->ndp; 5598 cnp = fpl->cnp; 5599 5600 cache_fpl_checkpoint(fpl); 5601 5602 /* 5603 * The vnode at hand is almost always stable, skip checking for it. 5604 * Worst case this postpones the check towards the end of the iteration 5605 * of the main loop. 5606 */ 5607 fpl->dvp = dvp; 5608 fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp); 5609 5610 mp = atomic_load_ptr(&dvp->v_mount); 5611 if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) { 5612 return (cache_fpl_aborted(fpl)); 5613 } 5614 5615 MPASS(fpl->tvp == NULL); 5616 5617 for (;;) { 5618 cache_fplookup_parse(fpl); 5619 5620 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); 5621 if (__predict_false(error != 0)) { 5622 error = cache_fplookup_failed_vexec(fpl, error); 5623 break; 5624 } 5625 5626 error = cache_fplookup_next(fpl); 5627 if (__predict_false(cache_fpl_terminated(fpl))) { 5628 break; 5629 } 5630 5631 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 5632 5633 if (fpl->tvp->v_type == VLNK) { 5634 error = cache_fplookup_symlink(fpl); 5635 if (cache_fpl_terminated(fpl)) { 5636 break; 5637 } 5638 } else { 5639 if (cache_fpl_islastcn(ndp)) { 5640 error = cache_fplookup_final(fpl); 5641 break; 5642 } 5643 5644 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 5645 error = cache_fpl_aborted(fpl); 5646 break; 5647 } 5648 5649 fpl->dvp = fpl->tvp; 5650 fpl->dvp_seqc = fpl->tvp_seqc; 5651 cache_fplookup_parse_advance(fpl); 5652 } 5653 5654 cache_fpl_checkpoint(fpl); 5655 } 5656 5657 return (error); 5658 } 5659 5660 /* 5661 * Fast path lookup protected with SMR and sequence counters. 5662 * 5663 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 5664 * 5665 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria 5666 * outlined below. 5667 * 5668 * Traditional vnode lookup conceptually looks like this: 5669 * 5670 * vn_lock(current); 5671 * for (;;) { 5672 * next = find(); 5673 * vn_lock(next); 5674 * vn_unlock(current); 5675 * current = next; 5676 * if (last) 5677 * break; 5678 * } 5679 * return (current); 5680 * 5681 * Each jump to the next vnode is safe memory-wise and atomic with respect to 5682 * any modifications thanks to holding respective locks. 5683 * 5684 * The same guarantee can be provided with a combination of safe memory 5685 * reclamation and sequence counters instead. If all operations which affect 5686 * the relationship between the current vnode and the one we are looking for 5687 * also modify the counter, we can verify whether all the conditions held as 5688 * we made the jump. This includes things like permissions, mount points etc. 5689 * Counter modification is provided by enclosing relevant places in 5690 * vn_seqc_write_begin()/end() calls. 5691 * 5692 * Thus this translates to: 5693 * 5694 * vfs_smr_enter(); 5695 * dvp_seqc = seqc_read_any(dvp); 5696 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode 5697 * abort(); 5698 * for (;;) { 5699 * tvp = find(); 5700 * tvp_seqc = seqc_read_any(tvp); 5701 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode 5702 * abort(); 5703 * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode 5704 * abort(); 5705 * dvp = tvp; // we know nothing of importance has changed 5706 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration 5707 * if (last) 5708 * break; 5709 * } 5710 * vget(); // secure the vnode 5711 * if (!seqc_consistent(tvp, tvp_seqc) // final check 5712 * abort(); 5713 * // at this point we know nothing has changed for any parent<->child pair 5714 * // as they were crossed during the lookup, meaning we matched the guarantee 5715 * // of the locked variant 5716 * return (tvp); 5717 * 5718 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: 5719 * - they are called while within vfs_smr protection which they must never exit 5720 * - EAGAIN can be returned to denote checking could not be performed, it is 5721 * always valid to return it 5722 * - if the sequence counter has not changed the result must be valid 5723 * - if the sequence counter has changed both false positives and false negatives 5724 * are permitted (since the result will be rejected later) 5725 * - for simple cases of unix permission checks vaccess_vexec_smr can be used 5726 * 5727 * Caveats to watch out for: 5728 * - vnodes are passed unlocked and unreferenced with nothing stopping 5729 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised 5730 * to use atomic_load_ptr to fetch it. 5731 * - the aforementioned object can also get freed, meaning absent other means it 5732 * should be protected with vfs_smr 5733 * - either safely checking permissions as they are modified or guaranteeing 5734 * their stability is left to the routine 5735 */ 5736 int 5737 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, 5738 struct pwd **pwdp) 5739 { 5740 struct cache_fpl fpl; 5741 struct pwd *pwd; 5742 struct vnode *dvp; 5743 struct componentname *cnp; 5744 int error; 5745 5746 fpl.status = CACHE_FPL_STATUS_UNSET; 5747 fpl.in_smr = false; 5748 fpl.ndp = ndp; 5749 fpl.cnp = cnp = &ndp->ni_cnd; 5750 MPASS(ndp->ni_lcf == 0); 5751 MPASS(curthread == cnp->cn_thread); 5752 KASSERT ((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 5753 ("%s: internal flags found in cn_flags %" PRIx64, __func__, 5754 cnp->cn_flags)); 5755 if ((cnp->cn_flags & SAVESTART) != 0) { 5756 MPASS(cnp->cn_nameiop != LOOKUP); 5757 } 5758 MPASS(cnp->cn_nameptr == cnp->cn_pnbuf); 5759 5760 if (__predict_false(!cache_can_fplookup(&fpl))) { 5761 *status = fpl.status; 5762 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 5763 return (EOPNOTSUPP); 5764 } 5765 5766 cache_fpl_checkpoint_outer(&fpl); 5767 5768 cache_fpl_smr_enter_initial(&fpl); 5769 #ifdef INVARIANTS 5770 fpl.debug.ni_pathlen = ndp->ni_pathlen; 5771 #endif 5772 fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; 5773 fpl.fsearch = false; 5774 fpl.savename = (cnp->cn_flags & SAVENAME) != 0; 5775 fpl.tvp = NULL; /* for degenerate path handling */ 5776 fpl.pwd = pwdp; 5777 pwd = pwd_get_smr(); 5778 *(fpl.pwd) = pwd; 5779 ndp->ni_rootdir = pwd->pwd_rdir; 5780 ndp->ni_topdir = pwd->pwd_jdir; 5781 5782 if (cnp->cn_pnbuf[0] == '/') { 5783 dvp = cache_fpl_handle_root(&fpl); 5784 MPASS(ndp->ni_resflags == 0); 5785 ndp->ni_resflags = NIRES_ABS; 5786 } else { 5787 if (ndp->ni_dirfd == AT_FDCWD) { 5788 dvp = pwd->pwd_cdir; 5789 } else { 5790 error = cache_fplookup_dirfd(&fpl, &dvp); 5791 if (__predict_false(error != 0)) { 5792 goto out; 5793 } 5794 } 5795 } 5796 5797 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); 5798 error = cache_fplookup_impl(dvp, &fpl); 5799 out: 5800 cache_fpl_smr_assert_not_entered(&fpl); 5801 cache_fpl_assert_status(&fpl); 5802 *status = fpl.status; 5803 if (SDT_PROBES_ENABLED()) { 5804 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 5805 if (fpl.status == CACHE_FPL_STATUS_HANDLED) 5806 SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true, 5807 ndp); 5808 } 5809 5810 if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) { 5811 MPASS(error != CACHE_FPL_FAILED); 5812 if (error != 0) { 5813 MPASS(fpl.dvp == NULL); 5814 MPASS(fpl.tvp == NULL); 5815 MPASS(fpl.savename == false); 5816 } 5817 ndp->ni_dvp = fpl.dvp; 5818 ndp->ni_vp = fpl.tvp; 5819 if (fpl.savename) { 5820 cnp->cn_flags |= HASBUF; 5821 } else { 5822 cache_fpl_cleanup_cnp(cnp); 5823 } 5824 } 5825 return (error); 5826 } 5827