1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ddb.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/capsicum.h> 46 #include <sys/counter.h> 47 #include <sys/filedesc.h> 48 #include <sys/fnv_hash.h> 49 #include <sys/kernel.h> 50 #include <sys/ktr.h> 51 #include <sys/lock.h> 52 #include <sys/malloc.h> 53 #include <sys/fcntl.h> 54 #include <sys/jail.h> 55 #include <sys/mount.h> 56 #include <sys/namei.h> 57 #include <sys/proc.h> 58 #include <sys/seqc.h> 59 #include <sys/sdt.h> 60 #include <sys/smr.h> 61 #include <sys/smp.h> 62 #include <sys/syscallsubr.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysproto.h> 65 #include <sys/vnode.h> 66 #include <ck_queue.h> 67 #ifdef KTRACE 68 #include <sys/ktrace.h> 69 #endif 70 71 #include <sys/capsicum.h> 72 73 #include <security/audit/audit.h> 74 #include <security/mac/mac_framework.h> 75 76 #ifdef DDB 77 #include <ddb/ddb.h> 78 #endif 79 80 #include <vm/uma.h> 81 82 SDT_PROVIDER_DECLARE(vfs); 83 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 84 "struct vnode *"); 85 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *", 86 "struct vnode *"); 87 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 88 "char *"); 89 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *", 90 "const char *"); 91 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *", 92 "struct namecache *", "int", "int"); 93 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 94 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 95 "char *", "struct vnode *"); 96 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 97 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 98 "struct vnode *", "char *"); 99 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 100 "struct vnode *"); 101 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 102 "struct vnode *", "char *"); 103 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 104 "char *"); 105 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", 106 "struct componentname *"); 107 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", 108 "struct componentname *"); 109 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 110 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 111 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 112 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 113 "struct vnode *"); 114 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 115 "char *"); 116 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *", 117 "char *"); 118 119 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); 120 SDT_PROBE_DECLARE(vfs, namei, lookup, entry); 121 SDT_PROBE_DECLARE(vfs, namei, lookup, return); 122 123 /* 124 * This structure describes the elements in the cache of recent 125 * names looked up by namei. 126 */ 127 struct negstate { 128 u_char neg_flag; 129 }; 130 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), 131 "the state must fit in a union with a pointer without growing it"); 132 133 struct namecache { 134 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 135 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 136 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */ 137 struct vnode *nc_dvp; /* vnode of parent of name */ 138 union { 139 struct vnode *nu_vp; /* vnode the name refers to */ 140 struct negstate nu_neg;/* negative entry state */ 141 } n_un; 142 u_char nc_flag; /* flag bits */ 143 u_char nc_nlen; /* length of name */ 144 char nc_name[0]; /* segment name + nul */ 145 }; 146 147 /* 148 * struct namecache_ts repeats struct namecache layout up to the 149 * nc_nlen member. 150 * struct namecache_ts is used in place of struct namecache when time(s) need 151 * to be stored. The nc_dotdottime field is used when a cache entry is mapping 152 * both a non-dotdot directory name plus dotdot for the directory's 153 * parent. 154 * 155 * See below for alignment requirement. 156 */ 157 struct namecache_ts { 158 struct timespec nc_time; /* timespec provided by fs */ 159 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 160 int nc_ticks; /* ticks value when entry was added */ 161 struct namecache nc_nc; 162 }; 163 164 /* 165 * At least mips n32 performs 64-bit accesses to timespec as found 166 * in namecache_ts and requires them to be aligned. Since others 167 * may be in the same spot suffer a little bit and enforce the 168 * alignment for everyone. Note this is a nop for 64-bit platforms. 169 */ 170 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) 171 #define CACHE_PATH_CUTOFF 39 172 173 #define CACHE_ZONE_SMALL_SIZE (sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1) 174 #define CACHE_ZONE_SMALL_TS_SIZE (sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1) 175 #define CACHE_ZONE_LARGE_SIZE (sizeof(struct namecache) + NAME_MAX + 1) 176 #define CACHE_ZONE_LARGE_TS_SIZE (sizeof(struct namecache_ts) + NAME_MAX + 1) 177 178 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 179 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 180 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 181 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 182 183 #define nc_vp n_un.nu_vp 184 #define nc_neg n_un.nu_neg 185 186 /* 187 * Flags in namecache.nc_flag 188 */ 189 #define NCF_WHITE 0x01 190 #define NCF_ISDOTDOT 0x02 191 #define NCF_TS 0x04 192 #define NCF_DTS 0x08 193 #define NCF_DVDROP 0x10 194 #define NCF_NEGATIVE 0x20 195 #define NCF_INVALID 0x40 196 #define NCF_WIP 0x80 197 198 /* 199 * Flags in negstate.neg_flag 200 */ 201 #define NEG_HOT 0x01 202 203 /* 204 * Mark an entry as invalid. 205 * 206 * This is called before it starts getting deconstructed. 207 */ 208 static void 209 cache_ncp_invalidate(struct namecache *ncp) 210 { 211 212 KASSERT((ncp->nc_flag & NCF_INVALID) == 0, 213 ("%s: entry %p already invalid", __func__, ncp)); 214 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); 215 atomic_thread_fence_rel(); 216 } 217 218 /* 219 * Check whether the entry can be safely used. 220 * 221 * All places which elide locks are supposed to call this after they are 222 * done with reading from an entry. 223 */ 224 static bool 225 cache_ncp_canuse(struct namecache *ncp) 226 { 227 228 atomic_thread_fence_acq(); 229 return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0); 230 } 231 232 /* 233 * Name caching works as follows: 234 * 235 * Names found by directory scans are retained in a cache 236 * for future reference. It is managed LRU, so frequently 237 * used names will hang around. Cache is indexed by hash value 238 * obtained from (dvp, name) where dvp refers to the directory 239 * containing name. 240 * 241 * If it is a "negative" entry, (i.e. for a name that is known NOT to 242 * exist) the vnode pointer will be NULL. 243 * 244 * Upon reaching the last segment of a path, if the reference 245 * is for DELETE, or NOCACHE is set (rewrite), and the 246 * name is located in the cache, it will be dropped. 247 * 248 * These locks are used (in the order in which they can be taken): 249 * NAME TYPE ROLE 250 * vnodelock mtx vnode lists and v_cache_dd field protection 251 * bucketlock mtx for access to given set of hash buckets 252 * neglist mtx negative entry LRU management 253 * 254 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread 255 * shrinking the LRU list. 256 * 257 * It is legal to take multiple vnodelock and bucketlock locks. The locking 258 * order is lower address first. Both are recursive. 259 * 260 * "." lookups are lockless. 261 * 262 * ".." and vnode -> name lookups require vnodelock. 263 * 264 * name -> vnode lookup requires the relevant bucketlock to be held for reading. 265 * 266 * Insertions and removals of entries require involved vnodes and bucketlocks 267 * to be locked to provide safe operation against other threads modifying the 268 * cache. 269 * 270 * Some lookups result in removal of the found entry (e.g. getting rid of a 271 * negative entry with the intent to create a positive one), which poses a 272 * problem when multiple threads reach the state. Similarly, two different 273 * threads can purge two different vnodes and try to remove the same name. 274 * 275 * If the already held vnode lock is lower than the second required lock, we 276 * can just take the other lock. However, in the opposite case, this could 277 * deadlock. As such, this is resolved by trylocking and if that fails unlocking 278 * the first node, locking everything in order and revalidating the state. 279 */ 280 281 VFS_SMR_DECLARE; 282 283 /* 284 * Structures associated with name caching. 285 */ 286 #define NCHHASH(hash) \ 287 (&nchashtbl[(hash) & nchash]) 288 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 289 static u_long __read_mostly nchash; /* size of hash table */ 290 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 291 "Size of namecache hash table"); 292 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ 293 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 294 "Ratio of negative namecache entries"); 295 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 296 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 297 u_int ncsizefactor = 2; 298 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, 299 "Size factor for namecache"); 300 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ 301 302 struct nchstats nchstats; /* cache effectiveness statistics */ 303 304 static bool __read_frequently cache_fast_revlookup = true; 305 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW, 306 &cache_fast_revlookup, 0, ""); 307 308 static struct mtx __exclusive_cache_line ncneg_shrink_lock; 309 310 #define ncneghash 3 311 #define numneglists (ncneghash + 1) 312 313 struct neglist { 314 struct mtx nl_lock; 315 TAILQ_HEAD(, namecache) nl_list; 316 TAILQ_HEAD(, namecache) nl_hotlist; 317 u_long nl_hotnum; 318 } __aligned(CACHE_LINE_SIZE); 319 320 static struct neglist neglists[numneglists]; 321 322 static inline struct neglist * 323 NCP2NEGLIST(struct namecache *ncp) 324 { 325 326 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 327 } 328 329 static inline struct negstate * 330 NCP2NEGSTATE(struct namecache *ncp) 331 { 332 333 MPASS(ncp->nc_flag & NCF_NEGATIVE); 334 return (&ncp->nc_neg); 335 } 336 337 #define numbucketlocks (ncbuckethash + 1) 338 static u_int __read_mostly ncbuckethash; 339 static struct mtx_padalign __read_mostly *bucketlocks; 340 #define HASH2BUCKETLOCK(hash) \ 341 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)])) 342 343 #define numvnodelocks (ncvnodehash + 1) 344 static u_int __read_mostly ncvnodehash; 345 static struct mtx __read_mostly *vnodelocks; 346 static inline struct mtx * 347 VP2VNODELOCK(struct vnode *vp) 348 { 349 350 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 351 } 352 353 /* 354 * UMA zones for the VFS cache. 355 * 356 * The small cache is used for entries with short names, which are the 357 * most common. The large cache is used for entries which are too big to 358 * fit in the small cache. 359 */ 360 static uma_zone_t __read_mostly cache_zone_small; 361 static uma_zone_t __read_mostly cache_zone_small_ts; 362 static uma_zone_t __read_mostly cache_zone_large; 363 static uma_zone_t __read_mostly cache_zone_large_ts; 364 365 static struct namecache * 366 cache_alloc(int len, int ts) 367 { 368 struct namecache_ts *ncp_ts; 369 struct namecache *ncp; 370 371 if (__predict_false(ts)) { 372 if (len <= CACHE_PATH_CUTOFF) 373 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); 374 else 375 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); 376 ncp = &ncp_ts->nc_nc; 377 } else { 378 if (len <= CACHE_PATH_CUTOFF) 379 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); 380 else 381 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); 382 } 383 return (ncp); 384 } 385 386 static void 387 cache_free(struct namecache *ncp) 388 { 389 struct namecache_ts *ncp_ts; 390 391 MPASS(ncp != NULL); 392 if ((ncp->nc_flag & NCF_DVDROP) != 0) 393 vdrop(ncp->nc_dvp); 394 if (__predict_false(ncp->nc_flag & NCF_TS)) { 395 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 396 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 397 uma_zfree_smr(cache_zone_small_ts, ncp_ts); 398 else 399 uma_zfree_smr(cache_zone_large_ts, ncp_ts); 400 } else { 401 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 402 uma_zfree_smr(cache_zone_small, ncp); 403 else 404 uma_zfree_smr(cache_zone_large, ncp); 405 } 406 } 407 408 static void 409 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 410 { 411 struct namecache_ts *ncp_ts; 412 413 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 414 (tsp == NULL && ticksp == NULL), 415 ("No NCF_TS")); 416 417 if (tsp == NULL) 418 return; 419 420 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 421 *tsp = ncp_ts->nc_time; 422 *ticksp = ncp_ts->nc_ticks; 423 } 424 425 #ifdef DEBUG_CACHE 426 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 427 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 428 "VFS namecache enabled"); 429 #endif 430 431 /* Export size information to userland */ 432 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 433 sizeof(struct namecache), "sizeof(struct namecache)"); 434 435 /* 436 * The new name cache statistics 437 */ 438 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 439 "Name cache statistics"); 440 #define STATNODE_ULONG(name, descr) \ 441 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); 442 #define STATNODE_COUNTER(name, descr) \ 443 static COUNTER_U64_DEFINE_EARLY(name); \ 444 SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \ 445 descr); 446 STATNODE_ULONG(numneg, "Number of negative cache entries"); 447 STATNODE_ULONG(numcache, "Number of cache entries"); 448 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held"); 449 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit"); 450 STATNODE_COUNTER(dothits, "Number of '.' hits"); 451 STATNODE_COUNTER(dotdothits, "Number of '..' hits"); 452 STATNODE_COUNTER(nummiss, "Number of cache misses"); 453 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 454 STATNODE_COUNTER(numposzaps, 455 "Number of cache hits (positive) we do not want to cache"); 456 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 457 STATNODE_COUNTER(numnegzaps, 458 "Number of cache hits (negative) we do not want to cache"); 459 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 460 /* These count for vn_getcwd(), too. */ 461 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 462 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 463 STATNODE_COUNTER(numfullpathfail2, 464 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 465 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 466 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 467 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 468 "Number of successful removals after relocking"); 469 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 470 "Number of times zap_and_exit failed to lock"); 471 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 472 "Number of times zap_and_exit failed to lock"); 473 static long cache_lock_vnodes_cel_3_failures; 474 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 475 "Number of times 3-way vnode locking failed"); 476 STATNODE_COUNTER(numneg_evicted, 477 "Number of negative entries evicted when adding a new entry"); 478 STATNODE_COUNTER(shrinking_skipped, 479 "Number of times shrinking was already in progress"); 480 481 static void cache_zap_locked(struct namecache *ncp); 482 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, 483 char **freebuf, size_t *buflen); 484 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 485 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend); 486 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 487 char **retbuf, size_t *buflen); 488 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 489 char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 490 491 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 492 493 static inline void 494 cache_assert_vlp_locked(struct mtx *vlp) 495 { 496 497 if (vlp != NULL) 498 mtx_assert(vlp, MA_OWNED); 499 } 500 501 static inline void 502 cache_assert_vnode_locked(struct vnode *vp) 503 { 504 struct mtx *vlp; 505 506 vlp = VP2VNODELOCK(vp); 507 cache_assert_vlp_locked(vlp); 508 } 509 510 /* 511 * TODO: With the value stored we can do better than computing the hash based 512 * on the address. The choice of FNV should also be revisited. 513 */ 514 static void 515 cache_prehash(struct vnode *vp) 516 { 517 518 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); 519 } 520 521 static uint32_t 522 cache_get_hash(char *name, u_char len, struct vnode *dvp) 523 { 524 525 return (fnv_32_buf(name, len, dvp->v_nchash)); 526 } 527 528 static inline struct nchashhead * 529 NCP2BUCKET(struct namecache *ncp) 530 { 531 uint32_t hash; 532 533 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 534 return (NCHHASH(hash)); 535 } 536 537 static inline struct mtx * 538 NCP2BUCKETLOCK(struct namecache *ncp) 539 { 540 uint32_t hash; 541 542 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 543 return (HASH2BUCKETLOCK(hash)); 544 } 545 546 #ifdef INVARIANTS 547 static void 548 cache_assert_bucket_locked(struct namecache *ncp) 549 { 550 struct mtx *blp; 551 552 blp = NCP2BUCKETLOCK(ncp); 553 mtx_assert(blp, MA_OWNED); 554 } 555 556 static void 557 cache_assert_bucket_unlocked(struct namecache *ncp) 558 { 559 struct mtx *blp; 560 561 blp = NCP2BUCKETLOCK(ncp); 562 mtx_assert(blp, MA_NOTOWNED); 563 } 564 #else 565 #define cache_assert_bucket_locked(x) do { } while (0) 566 #define cache_assert_bucket_unlocked(x) do { } while (0) 567 #endif 568 569 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 570 static void 571 _cache_sort_vnodes(void **p1, void **p2) 572 { 573 void *tmp; 574 575 MPASS(*p1 != NULL || *p2 != NULL); 576 577 if (*p1 > *p2) { 578 tmp = *p2; 579 *p2 = *p1; 580 *p1 = tmp; 581 } 582 } 583 584 static void 585 cache_lock_all_buckets(void) 586 { 587 u_int i; 588 589 for (i = 0; i < numbucketlocks; i++) 590 mtx_lock(&bucketlocks[i]); 591 } 592 593 static void 594 cache_unlock_all_buckets(void) 595 { 596 u_int i; 597 598 for (i = 0; i < numbucketlocks; i++) 599 mtx_unlock(&bucketlocks[i]); 600 } 601 602 static void 603 cache_lock_all_vnodes(void) 604 { 605 u_int i; 606 607 for (i = 0; i < numvnodelocks; i++) 608 mtx_lock(&vnodelocks[i]); 609 } 610 611 static void 612 cache_unlock_all_vnodes(void) 613 { 614 u_int i; 615 616 for (i = 0; i < numvnodelocks; i++) 617 mtx_unlock(&vnodelocks[i]); 618 } 619 620 static int 621 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 622 { 623 624 cache_sort_vnodes(&vlp1, &vlp2); 625 626 if (vlp1 != NULL) { 627 if (!mtx_trylock(vlp1)) 628 return (EAGAIN); 629 } 630 if (!mtx_trylock(vlp2)) { 631 if (vlp1 != NULL) 632 mtx_unlock(vlp1); 633 return (EAGAIN); 634 } 635 636 return (0); 637 } 638 639 static void 640 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 641 { 642 643 MPASS(vlp1 != NULL || vlp2 != NULL); 644 MPASS(vlp1 <= vlp2); 645 646 if (vlp1 != NULL) 647 mtx_lock(vlp1); 648 if (vlp2 != NULL) 649 mtx_lock(vlp2); 650 } 651 652 static void 653 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 654 { 655 656 MPASS(vlp1 != NULL || vlp2 != NULL); 657 658 if (vlp1 != NULL) 659 mtx_unlock(vlp1); 660 if (vlp2 != NULL) 661 mtx_unlock(vlp2); 662 } 663 664 static int 665 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 666 { 667 struct nchstats snap; 668 669 if (req->oldptr == NULL) 670 return (SYSCTL_OUT(req, 0, sizeof(snap))); 671 672 snap = nchstats; 673 snap.ncs_goodhits = counter_u64_fetch(numposhits); 674 snap.ncs_neghits = counter_u64_fetch(numneghits); 675 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 676 counter_u64_fetch(numnegzaps); 677 snap.ncs_miss = counter_u64_fetch(nummisszap) + 678 counter_u64_fetch(nummiss); 679 680 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 681 } 682 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 683 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 684 "VFS cache effectiveness statistics"); 685 686 static int 687 sysctl_hotnum(SYSCTL_HANDLER_ARGS) 688 { 689 int i, out; 690 691 out = 0; 692 for (i = 0; i < numneglists; i++) 693 out += neglists[i].nl_hotnum; 694 695 return (SYSCTL_OUT(req, &out, sizeof(out))); 696 } 697 SYSCTL_PROC(_vfs_cache, OID_AUTO, hotnum, CTLTYPE_INT | CTLFLAG_RD | 698 CTLFLAG_MPSAFE, 0, 0, sysctl_hotnum, "I", 699 "Number of hot negative entries"); 700 701 #ifdef DIAGNOSTIC 702 /* 703 * Grab an atomic snapshot of the name cache hash chain lengths 704 */ 705 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 706 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 707 "hash table stats"); 708 709 static int 710 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 711 { 712 struct nchashhead *ncpp; 713 struct namecache *ncp; 714 int i, error, n_nchash, *cntbuf; 715 716 retry: 717 n_nchash = nchash + 1; /* nchash is max index, not count */ 718 if (req->oldptr == NULL) 719 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 720 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 721 cache_lock_all_buckets(); 722 if (n_nchash != nchash + 1) { 723 cache_unlock_all_buckets(); 724 free(cntbuf, M_TEMP); 725 goto retry; 726 } 727 /* Scan hash tables counting entries */ 728 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 729 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) 730 cntbuf[i]++; 731 cache_unlock_all_buckets(); 732 for (error = 0, i = 0; i < n_nchash; i++) 733 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 734 break; 735 free(cntbuf, M_TEMP); 736 return (error); 737 } 738 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 739 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 740 "nchash chain lengths"); 741 742 static int 743 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 744 { 745 int error; 746 struct nchashhead *ncpp; 747 struct namecache *ncp; 748 int n_nchash; 749 int count, maxlength, used, pct; 750 751 if (!req->oldptr) 752 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 753 754 cache_lock_all_buckets(); 755 n_nchash = nchash + 1; /* nchash is max index, not count */ 756 used = 0; 757 maxlength = 0; 758 759 /* Scan hash tables for applicable entries */ 760 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 761 count = 0; 762 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { 763 count++; 764 } 765 if (count) 766 used++; 767 if (maxlength < count) 768 maxlength = count; 769 } 770 n_nchash = nchash + 1; 771 cache_unlock_all_buckets(); 772 pct = (used * 100) / (n_nchash / 100); 773 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 774 if (error) 775 return (error); 776 error = SYSCTL_OUT(req, &used, sizeof(used)); 777 if (error) 778 return (error); 779 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 780 if (error) 781 return (error); 782 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 783 if (error) 784 return (error); 785 return (0); 786 } 787 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 788 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 789 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 790 #endif 791 792 /* 793 * Negative entries management 794 * 795 * A variation of LRU scheme is used. New entries are hashed into one of 796 * numneglists cold lists. Entries get promoted to the hot list on first hit. 797 * 798 * The shrinker will demote hot list head and evict from the cold list in a 799 * round-robin manner. 800 */ 801 static void 802 cache_negative_init(struct namecache *ncp) 803 { 804 struct negstate *ns; 805 806 ncp->nc_flag |= NCF_NEGATIVE; 807 ns = NCP2NEGSTATE(ncp); 808 ns->neg_flag = 0; 809 } 810 811 /* 812 * Move a negative entry to the hot list. 813 */ 814 static void 815 cache_negative_promote(struct namecache *ncp) 816 { 817 struct neglist *nl; 818 struct negstate *ns; 819 820 ns = NCP2NEGSTATE(ncp); 821 nl = NCP2NEGLIST(ncp); 822 mtx_assert(&nl->nl_lock, MA_OWNED); 823 if ((ns->neg_flag & NEG_HOT) == 0) { 824 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 825 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst); 826 nl->nl_hotnum++; 827 ns->neg_flag |= NEG_HOT; 828 } 829 } 830 831 /* 832 * Move a negative entry to the hot list if it matches the lookup. 833 * 834 * We have to take locks, but they may be contended and in the worst 835 * case we may need to go off CPU. We don't want to spin within the 836 * smr section and we can't block with it. Exiting the section means 837 * the found entry could have been evicted. We are going to look it 838 * up again. 839 */ 840 static bool 841 cache_negative_promote_cond(struct vnode *dvp, struct componentname *cnp, 842 struct namecache *oncp, uint32_t hash) 843 { 844 struct namecache *ncp; 845 struct neglist *nl; 846 u_char nc_flag; 847 848 nl = NCP2NEGLIST(oncp); 849 850 mtx_lock(&nl->nl_lock); 851 /* 852 * For hash iteration. 853 */ 854 vfs_smr_enter(); 855 856 /* 857 * Avoid all surprises by only succeeding if we got the same entry and 858 * bailing completely otherwise. 859 * XXX There are no provisions to keep the vnode around, meaning we may 860 * end up promoting a negative entry for a *new* vnode and returning 861 * ENOENT on its account. This is the error we want to return anyway 862 * and promotion is harmless. 863 * 864 * In particular at this point there can be a new ncp which matches the 865 * search but hashes to a different neglist. 866 */ 867 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 868 if (ncp == oncp) 869 break; 870 } 871 872 /* 873 * No match to begin with. 874 */ 875 if (__predict_false(ncp == NULL)) { 876 goto out_abort; 877 } 878 879 /* 880 * The newly found entry may be something different... 881 */ 882 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 883 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 884 goto out_abort; 885 } 886 887 /* 888 * ... and not even negative. 889 */ 890 nc_flag = atomic_load_char(&ncp->nc_flag); 891 if ((nc_flag & NCF_NEGATIVE) == 0) { 892 goto out_abort; 893 } 894 895 if (__predict_false(!cache_ncp_canuse(ncp))) { 896 goto out_abort; 897 } 898 899 cache_negative_promote(ncp); 900 901 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 902 counter_u64_add(numneghits, 1); 903 vfs_smr_exit(); 904 mtx_unlock(&nl->nl_lock); 905 return (true); 906 out_abort: 907 vfs_smr_exit(); 908 mtx_unlock(&nl->nl_lock); 909 return (false); 910 } 911 912 static void 913 cache_negative_hit(struct namecache *ncp) 914 { 915 struct neglist *nl; 916 struct negstate *ns; 917 918 ns = NCP2NEGSTATE(ncp); 919 if ((ns->neg_flag & NEG_HOT) != 0) 920 return; 921 nl = NCP2NEGLIST(ncp); 922 mtx_lock(&nl->nl_lock); 923 cache_negative_promote(ncp); 924 mtx_unlock(&nl->nl_lock); 925 } 926 927 static void 928 cache_negative_insert(struct namecache *ncp) 929 { 930 struct neglist *nl; 931 932 MPASS(ncp->nc_flag & NCF_NEGATIVE); 933 cache_assert_bucket_locked(ncp); 934 nl = NCP2NEGLIST(ncp); 935 mtx_lock(&nl->nl_lock); 936 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 937 mtx_unlock(&nl->nl_lock); 938 atomic_add_long(&numneg, 1); 939 } 940 941 static void 942 cache_negative_remove(struct namecache *ncp) 943 { 944 struct neglist *nl; 945 struct negstate *ns; 946 947 cache_assert_bucket_locked(ncp); 948 nl = NCP2NEGLIST(ncp); 949 ns = NCP2NEGSTATE(ncp); 950 mtx_lock(&nl->nl_lock); 951 if ((ns->neg_flag & NEG_HOT) != 0) { 952 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 953 nl->nl_hotnum--; 954 } else { 955 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); 956 } 957 mtx_unlock(&nl->nl_lock); 958 atomic_subtract_long(&numneg, 1); 959 } 960 961 static struct neglist * 962 cache_negative_shrink_select(void) 963 { 964 struct neglist *nl; 965 static u_int cycle; 966 u_int i; 967 968 cycle++; 969 for (i = 0; i < numneglists; i++) { 970 nl = &neglists[(cycle + i) % numneglists]; 971 if (TAILQ_FIRST(&nl->nl_list) == NULL && 972 TAILQ_FIRST(&nl->nl_hotlist) == NULL) 973 continue; 974 mtx_lock(&nl->nl_lock); 975 if (TAILQ_FIRST(&nl->nl_list) != NULL || 976 TAILQ_FIRST(&nl->nl_hotlist) != NULL) 977 return (nl); 978 mtx_unlock(&nl->nl_lock); 979 } 980 981 return (NULL); 982 } 983 984 static void 985 cache_negative_zap_one(void) 986 { 987 struct namecache *ncp, *ncp2; 988 struct neglist *nl; 989 struct negstate *ns; 990 struct mtx *dvlp; 991 struct mtx *blp; 992 993 if (mtx_owner(&ncneg_shrink_lock) != NULL || 994 !mtx_trylock(&ncneg_shrink_lock)) { 995 counter_u64_add(shrinking_skipped, 1); 996 return; 997 } 998 999 nl = cache_negative_shrink_select(); 1000 mtx_unlock(&ncneg_shrink_lock); 1001 if (nl == NULL) { 1002 return; 1003 } 1004 1005 ncp = TAILQ_FIRST(&nl->nl_hotlist); 1006 if (ncp != NULL) { 1007 ns = NCP2NEGSTATE(ncp); 1008 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); 1009 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); 1010 nl->nl_hotnum--; 1011 ns->neg_flag &= ~NEG_HOT; 1012 } 1013 ncp = TAILQ_FIRST(&nl->nl_list); 1014 MPASS(ncp != NULL); 1015 ns = NCP2NEGSTATE(ncp); 1016 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1017 blp = NCP2BUCKETLOCK(ncp); 1018 mtx_unlock(&nl->nl_lock); 1019 mtx_lock(dvlp); 1020 mtx_lock(blp); 1021 /* 1022 * Enter SMR to safely check the negative list. 1023 * Even if the found pointer matches, the entry may now be reallocated 1024 * and used by a different vnode. 1025 */ 1026 vfs_smr_enter(); 1027 ncp2 = TAILQ_FIRST(&nl->nl_list); 1028 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || 1029 blp != NCP2BUCKETLOCK(ncp2)) { 1030 vfs_smr_exit(); 1031 ncp = NULL; 1032 } else { 1033 vfs_smr_exit(); 1034 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp, 1035 ncp->nc_name); 1036 cache_zap_locked(ncp); 1037 counter_u64_add(numneg_evicted, 1); 1038 } 1039 mtx_unlock(blp); 1040 mtx_unlock(dvlp); 1041 if (ncp != NULL) 1042 cache_free(ncp); 1043 } 1044 1045 /* 1046 * cache_zap_locked(): 1047 * 1048 * Removes a namecache entry from cache, whether it contains an actual 1049 * pointer to a vnode or if it is just a negative cache entry. 1050 */ 1051 static void 1052 cache_zap_locked(struct namecache *ncp) 1053 { 1054 struct nchashhead *ncpp; 1055 1056 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1057 cache_assert_vnode_locked(ncp->nc_vp); 1058 cache_assert_vnode_locked(ncp->nc_dvp); 1059 cache_assert_bucket_locked(ncp); 1060 1061 cache_ncp_invalidate(ncp); 1062 1063 ncpp = NCP2BUCKET(ncp); 1064 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); 1065 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 1066 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 1067 ncp->nc_name, ncp->nc_vp); 1068 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 1069 if (ncp == ncp->nc_vp->v_cache_dd) { 1070 vn_seqc_write_begin_unheld(ncp->nc_vp); 1071 ncp->nc_vp->v_cache_dd = NULL; 1072 vn_seqc_write_end(ncp->nc_vp); 1073 } 1074 } else { 1075 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 1076 ncp->nc_name); 1077 cache_negative_remove(ncp); 1078 } 1079 if (ncp->nc_flag & NCF_ISDOTDOT) { 1080 if (ncp == ncp->nc_dvp->v_cache_dd) { 1081 vn_seqc_write_begin_unheld(ncp->nc_dvp); 1082 ncp->nc_dvp->v_cache_dd = NULL; 1083 vn_seqc_write_end(ncp->nc_dvp); 1084 } 1085 } else { 1086 LIST_REMOVE(ncp, nc_src); 1087 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 1088 ncp->nc_flag |= NCF_DVDROP; 1089 counter_u64_add(numcachehv, -1); 1090 } 1091 } 1092 atomic_subtract_long(&numcache, 1); 1093 } 1094 1095 static void 1096 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 1097 { 1098 struct mtx *blp; 1099 1100 MPASS(ncp->nc_dvp == vp); 1101 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1102 cache_assert_vnode_locked(vp); 1103 1104 blp = NCP2BUCKETLOCK(ncp); 1105 mtx_lock(blp); 1106 cache_zap_locked(ncp); 1107 mtx_unlock(blp); 1108 } 1109 1110 static bool 1111 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 1112 struct mtx **vlpp) 1113 { 1114 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1115 struct mtx *blp; 1116 1117 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1118 cache_assert_vnode_locked(vp); 1119 1120 if (ncp->nc_flag & NCF_NEGATIVE) { 1121 if (*vlpp != NULL) { 1122 mtx_unlock(*vlpp); 1123 *vlpp = NULL; 1124 } 1125 cache_zap_negative_locked_vnode_kl(ncp, vp); 1126 return (true); 1127 } 1128 1129 pvlp = VP2VNODELOCK(vp); 1130 blp = NCP2BUCKETLOCK(ncp); 1131 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1132 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1133 1134 if (*vlpp == vlp1 || *vlpp == vlp2) { 1135 to_unlock = *vlpp; 1136 *vlpp = NULL; 1137 } else { 1138 if (*vlpp != NULL) { 1139 mtx_unlock(*vlpp); 1140 *vlpp = NULL; 1141 } 1142 cache_sort_vnodes(&vlp1, &vlp2); 1143 if (vlp1 == pvlp) { 1144 mtx_lock(vlp2); 1145 to_unlock = vlp2; 1146 } else { 1147 if (!mtx_trylock(vlp1)) 1148 goto out_relock; 1149 to_unlock = vlp1; 1150 } 1151 } 1152 mtx_lock(blp); 1153 cache_zap_locked(ncp); 1154 mtx_unlock(blp); 1155 if (to_unlock != NULL) 1156 mtx_unlock(to_unlock); 1157 return (true); 1158 1159 out_relock: 1160 mtx_unlock(vlp2); 1161 mtx_lock(vlp1); 1162 mtx_lock(vlp2); 1163 MPASS(*vlpp == NULL); 1164 *vlpp = vlp1; 1165 return (false); 1166 } 1167 1168 /* 1169 * If trylocking failed we can get here. We know enough to take all needed locks 1170 * in the right order and re-lookup the entry. 1171 */ 1172 static int 1173 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1174 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1175 struct mtx *blp) 1176 { 1177 struct namecache *rncp; 1178 1179 cache_assert_bucket_unlocked(ncp); 1180 1181 cache_sort_vnodes(&dvlp, &vlp); 1182 cache_lock_vnodes(dvlp, vlp); 1183 mtx_lock(blp); 1184 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1185 if (rncp == ncp && rncp->nc_dvp == dvp && 1186 rncp->nc_nlen == cnp->cn_namelen && 1187 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1188 break; 1189 } 1190 if (rncp != NULL) { 1191 cache_zap_locked(rncp); 1192 mtx_unlock(blp); 1193 cache_unlock_vnodes(dvlp, vlp); 1194 counter_u64_add(zap_and_exit_bucket_relock_success, 1); 1195 return (0); 1196 } 1197 1198 mtx_unlock(blp); 1199 cache_unlock_vnodes(dvlp, vlp); 1200 return (EAGAIN); 1201 } 1202 1203 static int __noinline 1204 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp, 1205 uint32_t hash, struct mtx *blp) 1206 { 1207 struct mtx *dvlp, *vlp; 1208 struct vnode *dvp; 1209 1210 cache_assert_bucket_locked(ncp); 1211 1212 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1213 vlp = NULL; 1214 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1215 vlp = VP2VNODELOCK(ncp->nc_vp); 1216 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1217 cache_zap_locked(ncp); 1218 mtx_unlock(blp); 1219 cache_unlock_vnodes(dvlp, vlp); 1220 return (0); 1221 } 1222 1223 dvp = ncp->nc_dvp; 1224 mtx_unlock(blp); 1225 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1226 } 1227 1228 static __noinline int 1229 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) 1230 { 1231 struct namecache *ncp; 1232 struct mtx *blp; 1233 struct mtx *dvlp, *dvlp2; 1234 uint32_t hash; 1235 int error; 1236 1237 if (cnp->cn_namelen == 2 && 1238 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1239 dvlp = VP2VNODELOCK(dvp); 1240 dvlp2 = NULL; 1241 mtx_lock(dvlp); 1242 retry_dotdot: 1243 ncp = dvp->v_cache_dd; 1244 if (ncp == NULL) { 1245 mtx_unlock(dvlp); 1246 if (dvlp2 != NULL) 1247 mtx_unlock(dvlp2); 1248 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1249 return (0); 1250 } 1251 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1252 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) 1253 goto retry_dotdot; 1254 MPASS(dvp->v_cache_dd == NULL); 1255 mtx_unlock(dvlp); 1256 if (dvlp2 != NULL) 1257 mtx_unlock(dvlp2); 1258 cache_free(ncp); 1259 } else { 1260 vn_seqc_write_begin(dvp); 1261 dvp->v_cache_dd = NULL; 1262 vn_seqc_write_end(dvp); 1263 mtx_unlock(dvlp); 1264 if (dvlp2 != NULL) 1265 mtx_unlock(dvlp2); 1266 } 1267 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1268 return (1); 1269 } 1270 1271 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1272 blp = HASH2BUCKETLOCK(hash); 1273 retry: 1274 if (CK_SLIST_EMPTY(NCHHASH(hash))) 1275 goto out_no_entry; 1276 1277 mtx_lock(blp); 1278 1279 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1280 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1281 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1282 break; 1283 } 1284 1285 if (ncp == NULL) { 1286 mtx_unlock(blp); 1287 goto out_no_entry; 1288 } 1289 1290 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1291 if (__predict_false(error != 0)) { 1292 zap_and_exit_bucket_fail++; 1293 goto retry; 1294 } 1295 counter_u64_add(numposzaps, 1); 1296 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1297 cache_free(ncp); 1298 return (1); 1299 out_no_entry: 1300 counter_u64_add(nummisszap, 1); 1301 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1302 return (0); 1303 } 1304 1305 static int __noinline 1306 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1307 struct timespec *tsp, int *ticksp) 1308 { 1309 int ltype; 1310 1311 *vpp = dvp; 1312 counter_u64_add(dothits, 1); 1313 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1314 if (tsp != NULL) 1315 timespecclear(tsp); 1316 if (ticksp != NULL) 1317 *ticksp = ticks; 1318 vrefact(*vpp); 1319 /* 1320 * When we lookup "." we still can be asked to lock it 1321 * differently... 1322 */ 1323 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1324 if (ltype != VOP_ISLOCKED(*vpp)) { 1325 if (ltype == LK_EXCLUSIVE) { 1326 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1327 if (VN_IS_DOOMED((*vpp))) { 1328 /* forced unmount */ 1329 vrele(*vpp); 1330 *vpp = NULL; 1331 return (ENOENT); 1332 } 1333 } else 1334 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1335 } 1336 return (-1); 1337 } 1338 1339 static int __noinline 1340 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1341 struct timespec *tsp, int *ticksp) 1342 { 1343 struct namecache_ts *ncp_ts; 1344 struct namecache *ncp; 1345 struct mtx *dvlp; 1346 enum vgetstate vs; 1347 int error, ltype; 1348 bool whiteout; 1349 1350 MPASS((cnp->cn_flags & ISDOTDOT) != 0); 1351 1352 if ((cnp->cn_flags & MAKEENTRY) == 0) { 1353 cache_remove_cnp(dvp, cnp); 1354 return (0); 1355 } 1356 1357 counter_u64_add(dotdothits, 1); 1358 retry: 1359 dvlp = VP2VNODELOCK(dvp); 1360 mtx_lock(dvlp); 1361 ncp = dvp->v_cache_dd; 1362 if (ncp == NULL) { 1363 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL); 1364 mtx_unlock(dvlp); 1365 return (0); 1366 } 1367 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1368 if (ncp->nc_flag & NCF_NEGATIVE) 1369 *vpp = NULL; 1370 else 1371 *vpp = ncp->nc_vp; 1372 } else 1373 *vpp = ncp->nc_dvp; 1374 if (*vpp == NULL) 1375 goto negative_success; 1376 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); 1377 cache_out_ts(ncp, tsp, ticksp); 1378 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1379 NCF_DTS && tsp != NULL) { 1380 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1381 *tsp = ncp_ts->nc_dotdottime; 1382 } 1383 1384 MPASS(dvp != *vpp); 1385 ltype = VOP_ISLOCKED(dvp); 1386 VOP_UNLOCK(dvp); 1387 vs = vget_prep(*vpp); 1388 mtx_unlock(dvlp); 1389 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1390 vn_lock(dvp, ltype | LK_RETRY); 1391 if (VN_IS_DOOMED(dvp)) { 1392 if (error == 0) 1393 vput(*vpp); 1394 *vpp = NULL; 1395 return (ENOENT); 1396 } 1397 if (error) { 1398 *vpp = NULL; 1399 goto retry; 1400 } 1401 return (-1); 1402 negative_success: 1403 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1404 if (cnp->cn_flags & ISLASTCN) { 1405 counter_u64_add(numnegzaps, 1); 1406 cache_zap_negative_locked_vnode_kl(ncp, dvp); 1407 mtx_unlock(dvlp); 1408 cache_free(ncp); 1409 return (0); 1410 } 1411 } 1412 1413 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 1414 cache_out_ts(ncp, tsp, ticksp); 1415 counter_u64_add(numneghits, 1); 1416 whiteout = (ncp->nc_flag & NCF_WHITE); 1417 cache_negative_hit(ncp); 1418 mtx_unlock(dvlp); 1419 if (whiteout) 1420 cnp->cn_flags |= ISWHITEOUT; 1421 return (ENOENT); 1422 } 1423 1424 /** 1425 * Lookup a name in the name cache 1426 * 1427 * # Arguments 1428 * 1429 * - dvp: Parent directory in which to search. 1430 * - vpp: Return argument. Will contain desired vnode on cache hit. 1431 * - cnp: Parameters of the name search. The most interesting bits of 1432 * the cn_flags field have the following meanings: 1433 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1434 * it up. 1435 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1436 * - tsp: Return storage for cache timestamp. On a successful (positive 1437 * or negative) lookup, tsp will be filled with any timespec that 1438 * was stored when this cache entry was created. However, it will 1439 * be clear for "." entries. 1440 * - ticks: Return storage for alternate cache timestamp. On a successful 1441 * (positive or negative) lookup, it will contain the ticks value 1442 * that was current when the cache entry was created, unless cnp 1443 * was ".". 1444 * 1445 * Either both tsp and ticks have to be provided or neither of them. 1446 * 1447 * # Returns 1448 * 1449 * - -1: A positive cache hit. vpp will contain the desired vnode. 1450 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1451 * to a forced unmount. vpp will not be modified. If the entry 1452 * is a whiteout, then the ISWHITEOUT flag will be set in 1453 * cnp->cn_flags. 1454 * - 0: A cache miss. vpp will not be modified. 1455 * 1456 * # Locking 1457 * 1458 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1459 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1460 * lock is not recursively acquired. 1461 */ 1462 static int __noinline 1463 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1464 struct timespec *tsp, int *ticksp) 1465 { 1466 struct namecache *ncp; 1467 struct mtx *blp; 1468 uint32_t hash; 1469 enum vgetstate vs; 1470 int error; 1471 bool whiteout; 1472 1473 MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY); 1474 1475 retry: 1476 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1477 blp = HASH2BUCKETLOCK(hash); 1478 mtx_lock(blp); 1479 1480 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1481 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1482 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1483 break; 1484 } 1485 1486 if (__predict_false(ncp == NULL)) { 1487 mtx_unlock(blp); 1488 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1489 NULL); 1490 counter_u64_add(nummiss, 1); 1491 return (0); 1492 } 1493 1494 if (ncp->nc_flag & NCF_NEGATIVE) 1495 goto negative_success; 1496 1497 counter_u64_add(numposhits, 1); 1498 *vpp = ncp->nc_vp; 1499 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1500 cache_out_ts(ncp, tsp, ticksp); 1501 MPASS(dvp != *vpp); 1502 vs = vget_prep(*vpp); 1503 mtx_unlock(blp); 1504 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1505 if (error) { 1506 *vpp = NULL; 1507 goto retry; 1508 } 1509 return (-1); 1510 negative_success: 1511 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1512 if (cnp->cn_flags & ISLASTCN) { 1513 counter_u64_add(numnegzaps, 1); 1514 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1515 if (__predict_false(error != 0)) { 1516 zap_and_exit_bucket_fail2++; 1517 goto retry; 1518 } 1519 cache_free(ncp); 1520 return (0); 1521 } 1522 } 1523 1524 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 1525 cache_out_ts(ncp, tsp, ticksp); 1526 counter_u64_add(numneghits, 1); 1527 whiteout = (ncp->nc_flag & NCF_WHITE); 1528 cache_negative_hit(ncp); 1529 mtx_unlock(blp); 1530 if (whiteout) 1531 cnp->cn_flags |= ISWHITEOUT; 1532 return (ENOENT); 1533 } 1534 1535 int 1536 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1537 struct timespec *tsp, int *ticksp) 1538 { 1539 struct namecache *ncp; 1540 struct negstate *ns; 1541 uint32_t hash; 1542 enum vgetstate vs; 1543 int error; 1544 bool whiteout, neg_hot; 1545 u_short nc_flag; 1546 1547 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); 1548 1549 #ifdef DEBUG_CACHE 1550 if (__predict_false(!doingcache)) { 1551 cnp->cn_flags &= ~MAKEENTRY; 1552 return (0); 1553 } 1554 #endif 1555 1556 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1557 if (cnp->cn_namelen == 1) 1558 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1559 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') 1560 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); 1561 } 1562 1563 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 1564 1565 if ((cnp->cn_flags & MAKEENTRY) == 0) { 1566 cache_remove_cnp(dvp, cnp); 1567 return (0); 1568 } 1569 1570 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1571 vfs_smr_enter(); 1572 1573 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1574 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1575 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1576 break; 1577 } 1578 1579 if (__predict_false(ncp == NULL)) { 1580 vfs_smr_exit(); 1581 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1582 NULL); 1583 counter_u64_add(nummiss, 1); 1584 return (0); 1585 } 1586 1587 nc_flag = atomic_load_char(&ncp->nc_flag); 1588 if (nc_flag & NCF_NEGATIVE) 1589 goto negative_success; 1590 1591 counter_u64_add(numposhits, 1); 1592 *vpp = ncp->nc_vp; 1593 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1594 cache_out_ts(ncp, tsp, ticksp); 1595 MPASS(dvp != *vpp); 1596 if (!cache_ncp_canuse(ncp)) { 1597 vfs_smr_exit(); 1598 *vpp = NULL; 1599 goto out_fallback; 1600 } 1601 vs = vget_prep_smr(*vpp); 1602 vfs_smr_exit(); 1603 if (__predict_false(vs == VGET_NONE)) { 1604 *vpp = NULL; 1605 goto out_fallback; 1606 } 1607 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1608 if (error) { 1609 *vpp = NULL; 1610 goto out_fallback; 1611 } 1612 return (-1); 1613 negative_success: 1614 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1615 if (cnp->cn_flags & ISLASTCN) { 1616 vfs_smr_exit(); 1617 goto out_fallback; 1618 } 1619 } 1620 1621 cache_out_ts(ncp, tsp, ticksp); 1622 whiteout = (ncp->nc_flag & NCF_WHITE); 1623 ns = NCP2NEGSTATE(ncp); 1624 neg_hot = ((ns->neg_flag & NEG_HOT) != 0); 1625 if (__predict_false(!cache_ncp_canuse(ncp))) { 1626 vfs_smr_exit(); 1627 goto out_fallback; 1628 } 1629 if (!neg_hot) { 1630 vfs_smr_exit(); 1631 if (!cache_negative_promote_cond(dvp, cnp, ncp, hash)) 1632 goto out_fallback; 1633 } else { 1634 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 1635 counter_u64_add(numneghits, 1); 1636 vfs_smr_exit(); 1637 } 1638 if (whiteout) 1639 cnp->cn_flags |= ISWHITEOUT; 1640 return (ENOENT); 1641 out_fallback: 1642 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp)); 1643 } 1644 1645 struct celockstate { 1646 struct mtx *vlp[3]; 1647 struct mtx *blp[2]; 1648 }; 1649 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1650 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1651 1652 static inline void 1653 cache_celockstate_init(struct celockstate *cel) 1654 { 1655 1656 bzero(cel, sizeof(*cel)); 1657 } 1658 1659 static void 1660 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1661 struct vnode *dvp) 1662 { 1663 struct mtx *vlp1, *vlp2; 1664 1665 MPASS(cel->vlp[0] == NULL); 1666 MPASS(cel->vlp[1] == NULL); 1667 MPASS(cel->vlp[2] == NULL); 1668 1669 MPASS(vp != NULL || dvp != NULL); 1670 1671 vlp1 = VP2VNODELOCK(vp); 1672 vlp2 = VP2VNODELOCK(dvp); 1673 cache_sort_vnodes(&vlp1, &vlp2); 1674 1675 if (vlp1 != NULL) { 1676 mtx_lock(vlp1); 1677 cel->vlp[0] = vlp1; 1678 } 1679 mtx_lock(vlp2); 1680 cel->vlp[1] = vlp2; 1681 } 1682 1683 static void 1684 cache_unlock_vnodes_cel(struct celockstate *cel) 1685 { 1686 1687 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1688 1689 if (cel->vlp[0] != NULL) 1690 mtx_unlock(cel->vlp[0]); 1691 if (cel->vlp[1] != NULL) 1692 mtx_unlock(cel->vlp[1]); 1693 if (cel->vlp[2] != NULL) 1694 mtx_unlock(cel->vlp[2]); 1695 } 1696 1697 static bool 1698 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1699 { 1700 struct mtx *vlp; 1701 bool ret; 1702 1703 cache_assert_vlp_locked(cel->vlp[0]); 1704 cache_assert_vlp_locked(cel->vlp[1]); 1705 MPASS(cel->vlp[2] == NULL); 1706 1707 MPASS(vp != NULL); 1708 vlp = VP2VNODELOCK(vp); 1709 1710 ret = true; 1711 if (vlp >= cel->vlp[1]) { 1712 mtx_lock(vlp); 1713 } else { 1714 if (mtx_trylock(vlp)) 1715 goto out; 1716 cache_lock_vnodes_cel_3_failures++; 1717 cache_unlock_vnodes_cel(cel); 1718 if (vlp < cel->vlp[0]) { 1719 mtx_lock(vlp); 1720 mtx_lock(cel->vlp[0]); 1721 mtx_lock(cel->vlp[1]); 1722 } else { 1723 if (cel->vlp[0] != NULL) 1724 mtx_lock(cel->vlp[0]); 1725 mtx_lock(vlp); 1726 mtx_lock(cel->vlp[1]); 1727 } 1728 ret = false; 1729 } 1730 out: 1731 cel->vlp[2] = vlp; 1732 return (ret); 1733 } 1734 1735 static void 1736 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 1737 struct mtx *blp2) 1738 { 1739 1740 MPASS(cel->blp[0] == NULL); 1741 MPASS(cel->blp[1] == NULL); 1742 1743 cache_sort_vnodes(&blp1, &blp2); 1744 1745 if (blp1 != NULL) { 1746 mtx_lock(blp1); 1747 cel->blp[0] = blp1; 1748 } 1749 mtx_lock(blp2); 1750 cel->blp[1] = blp2; 1751 } 1752 1753 static void 1754 cache_unlock_buckets_cel(struct celockstate *cel) 1755 { 1756 1757 if (cel->blp[0] != NULL) 1758 mtx_unlock(cel->blp[0]); 1759 mtx_unlock(cel->blp[1]); 1760 } 1761 1762 /* 1763 * Lock part of the cache affected by the insertion. 1764 * 1765 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1766 * However, insertion can result in removal of an old entry. In this 1767 * case we have an additional vnode and bucketlock pair to lock. 1768 * 1769 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1770 * preserving the locking order (smaller address first). 1771 */ 1772 static void 1773 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1774 uint32_t hash) 1775 { 1776 struct namecache *ncp; 1777 struct mtx *blps[2]; 1778 1779 blps[0] = HASH2BUCKETLOCK(hash); 1780 for (;;) { 1781 blps[1] = NULL; 1782 cache_lock_vnodes_cel(cel, dvp, vp); 1783 if (vp == NULL || vp->v_type != VDIR) 1784 break; 1785 ncp = vp->v_cache_dd; 1786 if (ncp == NULL) 1787 break; 1788 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1789 break; 1790 MPASS(ncp->nc_dvp == vp); 1791 blps[1] = NCP2BUCKETLOCK(ncp); 1792 if (ncp->nc_flag & NCF_NEGATIVE) 1793 break; 1794 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1795 break; 1796 /* 1797 * All vnodes got re-locked. Re-validate the state and if 1798 * nothing changed we are done. Otherwise restart. 1799 */ 1800 if (ncp == vp->v_cache_dd && 1801 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1802 blps[1] == NCP2BUCKETLOCK(ncp) && 1803 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1804 break; 1805 cache_unlock_vnodes_cel(cel); 1806 cel->vlp[0] = NULL; 1807 cel->vlp[1] = NULL; 1808 cel->vlp[2] = NULL; 1809 } 1810 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1811 } 1812 1813 static void 1814 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1815 uint32_t hash) 1816 { 1817 struct namecache *ncp; 1818 struct mtx *blps[2]; 1819 1820 blps[0] = HASH2BUCKETLOCK(hash); 1821 for (;;) { 1822 blps[1] = NULL; 1823 cache_lock_vnodes_cel(cel, dvp, vp); 1824 ncp = dvp->v_cache_dd; 1825 if (ncp == NULL) 1826 break; 1827 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1828 break; 1829 MPASS(ncp->nc_dvp == dvp); 1830 blps[1] = NCP2BUCKETLOCK(ncp); 1831 if (ncp->nc_flag & NCF_NEGATIVE) 1832 break; 1833 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1834 break; 1835 if (ncp == dvp->v_cache_dd && 1836 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1837 blps[1] == NCP2BUCKETLOCK(ncp) && 1838 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1839 break; 1840 cache_unlock_vnodes_cel(cel); 1841 cel->vlp[0] = NULL; 1842 cel->vlp[1] = NULL; 1843 cel->vlp[2] = NULL; 1844 } 1845 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1846 } 1847 1848 static void 1849 cache_enter_unlock(struct celockstate *cel) 1850 { 1851 1852 cache_unlock_buckets_cel(cel); 1853 cache_unlock_vnodes_cel(cel); 1854 } 1855 1856 static void __noinline 1857 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 1858 struct componentname *cnp) 1859 { 1860 struct celockstate cel; 1861 struct namecache *ncp; 1862 uint32_t hash; 1863 int len; 1864 1865 if (dvp->v_cache_dd == NULL) 1866 return; 1867 len = cnp->cn_namelen; 1868 cache_celockstate_init(&cel); 1869 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1870 cache_enter_lock_dd(&cel, dvp, vp, hash); 1871 vn_seqc_write_begin(dvp); 1872 ncp = dvp->v_cache_dd; 1873 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 1874 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 1875 cache_zap_locked(ncp); 1876 } else { 1877 ncp = NULL; 1878 } 1879 dvp->v_cache_dd = NULL; 1880 vn_seqc_write_end(dvp); 1881 cache_enter_unlock(&cel); 1882 if (ncp != NULL) 1883 cache_free(ncp); 1884 } 1885 1886 /* 1887 * Add an entry to the cache. 1888 */ 1889 void 1890 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 1891 struct timespec *tsp, struct timespec *dtsp) 1892 { 1893 struct celockstate cel; 1894 struct namecache *ncp, *n2, *ndd; 1895 struct namecache_ts *ncp_ts; 1896 struct nchashhead *ncpp; 1897 uint32_t hash; 1898 int flag; 1899 int len; 1900 u_long lnumcache; 1901 1902 VNPASS(!VN_IS_DOOMED(dvp), dvp); 1903 VNPASS(dvp->v_type != VNON, dvp); 1904 if (vp != NULL) { 1905 VNPASS(!VN_IS_DOOMED(vp), vp); 1906 VNPASS(vp->v_type != VNON, vp); 1907 } 1908 1909 #ifdef DEBUG_CACHE 1910 if (__predict_false(!doingcache)) 1911 return; 1912 #endif 1913 1914 flag = 0; 1915 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1916 if (cnp->cn_namelen == 1) 1917 return; 1918 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 1919 cache_enter_dotdot_prep(dvp, vp, cnp); 1920 flag = NCF_ISDOTDOT; 1921 } 1922 } 1923 1924 /* 1925 * Avoid blowout in namecache entries. 1926 */ 1927 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1928 if (__predict_false(lnumcache >= ncsize)) { 1929 atomic_subtract_long(&numcache, 1); 1930 counter_u64_add(numdrops, 1); 1931 return; 1932 } 1933 1934 cache_celockstate_init(&cel); 1935 ndd = NULL; 1936 ncp_ts = NULL; 1937 1938 /* 1939 * Calculate the hash key and setup as much of the new 1940 * namecache entry as possible before acquiring the lock. 1941 */ 1942 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1943 ncp->nc_flag = flag | NCF_WIP; 1944 ncp->nc_vp = vp; 1945 if (vp == NULL) 1946 cache_negative_init(ncp); 1947 ncp->nc_dvp = dvp; 1948 if (tsp != NULL) { 1949 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1950 ncp_ts->nc_time = *tsp; 1951 ncp_ts->nc_ticks = ticks; 1952 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1953 if (dtsp != NULL) { 1954 ncp_ts->nc_dotdottime = *dtsp; 1955 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1956 } 1957 } 1958 len = ncp->nc_nlen = cnp->cn_namelen; 1959 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1960 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 1961 ncp->nc_name[len] = '\0'; 1962 cache_enter_lock(&cel, dvp, vp, hash); 1963 1964 /* 1965 * See if this vnode or negative entry is already in the cache 1966 * with this name. This can happen with concurrent lookups of 1967 * the same path name. 1968 */ 1969 ncpp = NCHHASH(hash); 1970 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 1971 if (n2->nc_dvp == dvp && 1972 n2->nc_nlen == cnp->cn_namelen && 1973 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1974 MPASS(cache_ncp_canuse(n2)); 1975 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 1976 KASSERT(vp == NULL, 1977 ("%s: found entry pointing to a different vnode (%p != %p)", 1978 __func__, NULL, vp)); 1979 else 1980 KASSERT(n2->nc_vp == vp, 1981 ("%s: found entry pointing to a different vnode (%p != %p)", 1982 __func__, n2->nc_vp, vp)); 1983 /* 1984 * Entries are supposed to be immutable unless in the 1985 * process of getting destroyed. Accommodating for 1986 * changing timestamps is possible but not worth it. 1987 * This should be harmless in terms of correctness, in 1988 * the worst case resulting in an earlier expiration. 1989 * Alternatively, the found entry can be replaced 1990 * altogether. 1991 */ 1992 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 1993 #if 0 1994 if (tsp != NULL) { 1995 KASSERT((n2->nc_flag & NCF_TS) != 0, 1996 ("no NCF_TS")); 1997 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1998 n2_ts->nc_time = ncp_ts->nc_time; 1999 n2_ts->nc_ticks = ncp_ts->nc_ticks; 2000 if (dtsp != NULL) { 2001 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 2002 n2_ts->nc_nc.nc_flag |= NCF_DTS; 2003 } 2004 } 2005 #endif 2006 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, 2007 vp); 2008 goto out_unlock_free; 2009 } 2010 } 2011 2012 if (flag == NCF_ISDOTDOT) { 2013 /* 2014 * See if we are trying to add .. entry, but some other lookup 2015 * has populated v_cache_dd pointer already. 2016 */ 2017 if (dvp->v_cache_dd != NULL) 2018 goto out_unlock_free; 2019 KASSERT(vp == NULL || vp->v_type == VDIR, 2020 ("wrong vnode type %p", vp)); 2021 vn_seqc_write_begin(dvp); 2022 dvp->v_cache_dd = ncp; 2023 vn_seqc_write_end(dvp); 2024 } 2025 2026 if (vp != NULL) { 2027 if (flag != NCF_ISDOTDOT) { 2028 /* 2029 * For this case, the cache entry maps both the 2030 * directory name in it and the name ".." for the 2031 * directory's parent. 2032 */ 2033 vn_seqc_write_begin(vp); 2034 if ((ndd = vp->v_cache_dd) != NULL) { 2035 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2036 cache_zap_locked(ndd); 2037 else 2038 ndd = NULL; 2039 } 2040 vp->v_cache_dd = ncp; 2041 vn_seqc_write_end(vp); 2042 } else if (vp->v_type != VDIR) { 2043 if (vp->v_cache_dd != NULL) { 2044 vn_seqc_write_begin(vp); 2045 vp->v_cache_dd = NULL; 2046 vn_seqc_write_end(vp); 2047 } 2048 } 2049 } 2050 2051 if (flag != NCF_ISDOTDOT) { 2052 if (LIST_EMPTY(&dvp->v_cache_src)) { 2053 vhold(dvp); 2054 counter_u64_add(numcachehv, 1); 2055 } 2056 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2057 } 2058 2059 /* 2060 * If the entry is "negative", we place it into the 2061 * "negative" cache queue, otherwise, we place it into the 2062 * destination vnode's cache entries queue. 2063 */ 2064 if (vp != NULL) { 2065 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2066 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2067 vp); 2068 } else { 2069 if (cnp->cn_flags & ISWHITEOUT) 2070 ncp->nc_flag |= NCF_WHITE; 2071 cache_negative_insert(ncp); 2072 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2073 ncp->nc_name); 2074 } 2075 2076 /* 2077 * Insert the new namecache entry into the appropriate chain 2078 * within the cache entries table. 2079 */ 2080 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2081 2082 atomic_thread_fence_rel(); 2083 /* 2084 * Mark the entry as fully constructed. 2085 * It is immutable past this point until its removal. 2086 */ 2087 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2088 2089 cache_enter_unlock(&cel); 2090 if (numneg * ncnegfactor > lnumcache) 2091 cache_negative_zap_one(); 2092 if (ndd != NULL) 2093 cache_free(ndd); 2094 return; 2095 out_unlock_free: 2096 cache_enter_unlock(&cel); 2097 atomic_subtract_long(&numcache, 1); 2098 cache_free(ncp); 2099 return; 2100 } 2101 2102 static u_int 2103 cache_roundup_2(u_int val) 2104 { 2105 u_int res; 2106 2107 for (res = 1; res <= val; res <<= 1) 2108 continue; 2109 2110 return (res); 2111 } 2112 2113 static struct nchashhead * 2114 nchinittbl(u_long elements, u_long *hashmask) 2115 { 2116 struct nchashhead *hashtbl; 2117 u_long hashsize, i; 2118 2119 hashsize = cache_roundup_2(elements) / 2; 2120 2121 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2122 for (i = 0; i < hashsize; i++) 2123 CK_SLIST_INIT(&hashtbl[i]); 2124 *hashmask = hashsize - 1; 2125 return (hashtbl); 2126 } 2127 2128 static void 2129 ncfreetbl(struct nchashhead *hashtbl) 2130 { 2131 2132 free(hashtbl, M_VFSCACHE); 2133 } 2134 2135 /* 2136 * Name cache initialization, from vfs_init() when we are booting 2137 */ 2138 static void 2139 nchinit(void *dummy __unused) 2140 { 2141 u_int i; 2142 2143 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2144 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2145 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2146 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2147 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2148 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2149 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2150 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2151 2152 VFS_SMR_ZONE_SET(cache_zone_small); 2153 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2154 VFS_SMR_ZONE_SET(cache_zone_large); 2155 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2156 2157 ncsize = desiredvnodes * ncsizefactor; 2158 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2159 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2160 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2161 ncbuckethash = 7; 2162 if (ncbuckethash > nchash) 2163 ncbuckethash = nchash; 2164 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2165 M_WAITOK | M_ZERO); 2166 for (i = 0; i < numbucketlocks; i++) 2167 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2168 ncvnodehash = ncbuckethash; 2169 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2170 M_WAITOK | M_ZERO); 2171 for (i = 0; i < numvnodelocks; i++) 2172 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2173 2174 for (i = 0; i < numneglists; i++) { 2175 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2176 TAILQ_INIT(&neglists[i].nl_list); 2177 TAILQ_INIT(&neglists[i].nl_hotlist); 2178 } 2179 2180 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2181 } 2182 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2183 2184 void 2185 cache_vnode_init(struct vnode *vp) 2186 { 2187 2188 LIST_INIT(&vp->v_cache_src); 2189 TAILQ_INIT(&vp->v_cache_dst); 2190 vp->v_cache_dd = NULL; 2191 cache_prehash(vp); 2192 } 2193 2194 void 2195 cache_changesize(u_long newmaxvnodes) 2196 { 2197 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2198 u_long new_nchash, old_nchash; 2199 struct namecache *ncp; 2200 uint32_t hash; 2201 u_long newncsize; 2202 int i; 2203 2204 newncsize = newmaxvnodes * ncsizefactor; 2205 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2206 if (newmaxvnodes < numbucketlocks) 2207 newmaxvnodes = numbucketlocks; 2208 2209 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2210 /* If same hash table size, nothing to do */ 2211 if (nchash == new_nchash) { 2212 ncfreetbl(new_nchashtbl); 2213 return; 2214 } 2215 /* 2216 * Move everything from the old hash table to the new table. 2217 * None of the namecache entries in the table can be removed 2218 * because to do so, they have to be removed from the hash table. 2219 */ 2220 cache_lock_all_vnodes(); 2221 cache_lock_all_buckets(); 2222 old_nchashtbl = nchashtbl; 2223 old_nchash = nchash; 2224 nchashtbl = new_nchashtbl; 2225 nchash = new_nchash; 2226 for (i = 0; i <= old_nchash; i++) { 2227 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2228 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2229 ncp->nc_dvp); 2230 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2231 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2232 } 2233 } 2234 ncsize = newncsize; 2235 cache_unlock_all_buckets(); 2236 cache_unlock_all_vnodes(); 2237 ncfreetbl(old_nchashtbl); 2238 } 2239 2240 /* 2241 * Invalidate all entries from and to a particular vnode. 2242 */ 2243 static void 2244 cache_purge_impl(struct vnode *vp) 2245 { 2246 TAILQ_HEAD(, namecache) ncps; 2247 struct namecache *ncp, *nnp; 2248 struct mtx *vlp, *vlp2; 2249 2250 TAILQ_INIT(&ncps); 2251 vlp = VP2VNODELOCK(vp); 2252 vlp2 = NULL; 2253 mtx_lock(vlp); 2254 retry: 2255 while (!LIST_EMPTY(&vp->v_cache_src)) { 2256 ncp = LIST_FIRST(&vp->v_cache_src); 2257 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2258 goto retry; 2259 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2260 } 2261 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2262 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2263 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2264 goto retry; 2265 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2266 } 2267 ncp = vp->v_cache_dd; 2268 if (ncp != NULL) { 2269 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2270 ("lost dotdot link")); 2271 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2272 goto retry; 2273 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2274 } 2275 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2276 mtx_unlock(vlp); 2277 if (vlp2 != NULL) 2278 mtx_unlock(vlp2); 2279 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2280 cache_free(ncp); 2281 } 2282 } 2283 2284 /* 2285 * Opportunistic check to see if there is anything to do. 2286 */ 2287 static bool 2288 cache_has_entries(struct vnode *vp) 2289 { 2290 2291 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2292 vp->v_cache_dd == NULL) 2293 return (false); 2294 return (true); 2295 } 2296 2297 void 2298 cache_purge(struct vnode *vp) 2299 { 2300 2301 SDT_PROBE1(vfs, namecache, purge, done, vp); 2302 if (!cache_has_entries(vp)) 2303 return; 2304 cache_purge_impl(vp); 2305 } 2306 2307 /* 2308 * Only to be used by vgone. 2309 */ 2310 void 2311 cache_purge_vgone(struct vnode *vp) 2312 { 2313 struct mtx *vlp; 2314 2315 VNPASS(VN_IS_DOOMED(vp), vp); 2316 if (cache_has_entries(vp)) { 2317 cache_purge_impl(vp); 2318 return; 2319 } 2320 2321 /* 2322 * Serialize against a potential thread doing cache_purge. 2323 */ 2324 vlp = VP2VNODELOCK(vp); 2325 mtx_wait_unlocked(vlp); 2326 if (cache_has_entries(vp)) { 2327 cache_purge_impl(vp); 2328 return; 2329 } 2330 return; 2331 } 2332 2333 /* 2334 * Invalidate all negative entries for a particular directory vnode. 2335 */ 2336 void 2337 cache_purge_negative(struct vnode *vp) 2338 { 2339 TAILQ_HEAD(, namecache) ncps; 2340 struct namecache *ncp, *nnp; 2341 struct mtx *vlp; 2342 2343 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2344 if (LIST_EMPTY(&vp->v_cache_src)) 2345 return; 2346 TAILQ_INIT(&ncps); 2347 vlp = VP2VNODELOCK(vp); 2348 mtx_lock(vlp); 2349 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2350 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2351 continue; 2352 cache_zap_negative_locked_vnode_kl(ncp, vp); 2353 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2354 } 2355 mtx_unlock(vlp); 2356 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2357 cache_free(ncp); 2358 } 2359 } 2360 2361 void 2362 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2363 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2364 { 2365 2366 ASSERT_VOP_IN_SEQC(fdvp); 2367 ASSERT_VOP_IN_SEQC(fvp); 2368 ASSERT_VOP_IN_SEQC(tdvp); 2369 if (tvp != NULL) 2370 ASSERT_VOP_IN_SEQC(tvp); 2371 2372 cache_purge(fvp); 2373 if (tvp != NULL) { 2374 cache_purge(tvp); 2375 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2376 ("%s: lingering negative entry", __func__)); 2377 } else { 2378 cache_remove_cnp(tdvp, tcnp); 2379 } 2380 } 2381 2382 /* 2383 * Flush all entries referencing a particular filesystem. 2384 */ 2385 void 2386 cache_purgevfs(struct mount *mp) 2387 { 2388 struct vnode *vp, *mvp; 2389 2390 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2391 /* 2392 * Somewhat wasteful iteration over all vnodes. Would be better to 2393 * support filtering and avoid the interlock to begin with. 2394 */ 2395 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2396 if (!cache_has_entries(vp)) { 2397 VI_UNLOCK(vp); 2398 continue; 2399 } 2400 vholdl(vp); 2401 VI_UNLOCK(vp); 2402 cache_purge(vp); 2403 vdrop(vp); 2404 } 2405 } 2406 2407 /* 2408 * Perform canonical checks and cache lookup and pass on to filesystem 2409 * through the vop_cachedlookup only if needed. 2410 */ 2411 2412 int 2413 vfs_cache_lookup(struct vop_lookup_args *ap) 2414 { 2415 struct vnode *dvp; 2416 int error; 2417 struct vnode **vpp = ap->a_vpp; 2418 struct componentname *cnp = ap->a_cnp; 2419 int flags = cnp->cn_flags; 2420 2421 *vpp = NULL; 2422 dvp = ap->a_dvp; 2423 2424 if (dvp->v_type != VDIR) 2425 return (ENOTDIR); 2426 2427 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2428 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2429 return (EROFS); 2430 2431 error = vn_dir_check_exec(dvp, cnp); 2432 if (error != 0) 2433 return (error); 2434 2435 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2436 if (error == 0) 2437 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2438 if (error == -1) 2439 return (0); 2440 return (error); 2441 } 2442 2443 /* Implementation of the getcwd syscall. */ 2444 int 2445 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2446 { 2447 char *buf, *retbuf; 2448 size_t buflen; 2449 int error; 2450 2451 buflen = uap->buflen; 2452 if (__predict_false(buflen < 2)) 2453 return (EINVAL); 2454 if (buflen > MAXPATHLEN) 2455 buflen = MAXPATHLEN; 2456 2457 buf = uma_zalloc(namei_zone, M_WAITOK); 2458 error = vn_getcwd(buf, &retbuf, &buflen); 2459 if (error == 0) 2460 error = copyout(retbuf, uap->buf, buflen); 2461 uma_zfree(namei_zone, buf); 2462 return (error); 2463 } 2464 2465 int 2466 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2467 { 2468 struct pwd *pwd; 2469 int error; 2470 2471 vfs_smr_enter(); 2472 pwd = pwd_get_smr(); 2473 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2474 buflen, false, 0); 2475 VFS_SMR_ASSERT_NOT_ENTERED(); 2476 if (error < 0) { 2477 pwd = pwd_hold(curthread); 2478 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2479 retbuf, buflen); 2480 pwd_drop(pwd); 2481 } 2482 2483 #ifdef KTRACE 2484 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2485 ktrnamei(*retbuf); 2486 #endif 2487 return (error); 2488 } 2489 2490 static int 2491 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2492 size_t size, int flags, enum uio_seg pathseg) 2493 { 2494 struct nameidata nd; 2495 char *retbuf, *freebuf; 2496 int error; 2497 2498 if (flags != 0) 2499 return (EINVAL); 2500 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2501 pathseg, path, fd, &cap_fstat_rights, td); 2502 if ((error = namei(&nd)) != 0) 2503 return (error); 2504 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2505 if (error == 0) { 2506 error = copyout(retbuf, buf, size); 2507 free(freebuf, M_TEMP); 2508 } 2509 NDFREE(&nd, 0); 2510 return (error); 2511 } 2512 2513 int 2514 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2515 { 2516 2517 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2518 uap->flags, UIO_USERSPACE)); 2519 } 2520 2521 /* 2522 * Retrieve the full filesystem path that correspond to a vnode from the name 2523 * cache (if available) 2524 */ 2525 int 2526 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2527 { 2528 struct pwd *pwd; 2529 char *buf; 2530 size_t buflen; 2531 int error; 2532 2533 if (__predict_false(vp == NULL)) 2534 return (EINVAL); 2535 2536 buflen = MAXPATHLEN; 2537 buf = malloc(buflen, M_TEMP, M_WAITOK); 2538 vfs_smr_enter(); 2539 pwd = pwd_get_smr(); 2540 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0); 2541 VFS_SMR_ASSERT_NOT_ENTERED(); 2542 if (error < 0) { 2543 pwd = pwd_hold(curthread); 2544 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); 2545 pwd_drop(pwd); 2546 } 2547 if (error == 0) 2548 *freebuf = buf; 2549 else 2550 free(buf, M_TEMP); 2551 return (error); 2552 } 2553 2554 /* 2555 * This function is similar to vn_fullpath, but it attempts to lookup the 2556 * pathname relative to the global root mount point. This is required for the 2557 * auditing sub-system, as audited pathnames must be absolute, relative to the 2558 * global root mount point. 2559 */ 2560 int 2561 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2562 { 2563 char *buf; 2564 size_t buflen; 2565 int error; 2566 2567 if (__predict_false(vp == NULL)) 2568 return (EINVAL); 2569 buflen = MAXPATHLEN; 2570 buf = malloc(buflen, M_TEMP, M_WAITOK); 2571 vfs_smr_enter(); 2572 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0); 2573 VFS_SMR_ASSERT_NOT_ENTERED(); 2574 if (error < 0) { 2575 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2576 } 2577 if (error == 0) 2578 *freebuf = buf; 2579 else 2580 free(buf, M_TEMP); 2581 return (error); 2582 } 2583 2584 static struct namecache * 2585 vn_dd_from_dst(struct vnode *vp) 2586 { 2587 struct namecache *ncp; 2588 2589 cache_assert_vnode_locked(vp); 2590 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2591 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2592 return (ncp); 2593 } 2594 return (NULL); 2595 } 2596 2597 int 2598 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2599 { 2600 struct vnode *dvp; 2601 struct namecache *ncp; 2602 struct mtx *vlp; 2603 int error; 2604 2605 vlp = VP2VNODELOCK(*vp); 2606 mtx_lock(vlp); 2607 ncp = (*vp)->v_cache_dd; 2608 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 2609 KASSERT(ncp == vn_dd_from_dst(*vp), 2610 ("%s: mismatch for dd entry (%p != %p)", __func__, 2611 ncp, vn_dd_from_dst(*vp))); 2612 } else { 2613 ncp = vn_dd_from_dst(*vp); 2614 } 2615 if (ncp != NULL) { 2616 if (*buflen < ncp->nc_nlen) { 2617 mtx_unlock(vlp); 2618 vrele(*vp); 2619 counter_u64_add(numfullpathfail4, 1); 2620 error = ENOMEM; 2621 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2622 vp, NULL); 2623 return (error); 2624 } 2625 *buflen -= ncp->nc_nlen; 2626 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2627 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2628 ncp->nc_name, vp); 2629 dvp = *vp; 2630 *vp = ncp->nc_dvp; 2631 vref(*vp); 2632 mtx_unlock(vlp); 2633 vrele(dvp); 2634 return (0); 2635 } 2636 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2637 2638 mtx_unlock(vlp); 2639 vn_lock(*vp, LK_SHARED | LK_RETRY); 2640 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2641 vput(*vp); 2642 if (error) { 2643 counter_u64_add(numfullpathfail2, 1); 2644 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2645 return (error); 2646 } 2647 2648 *vp = dvp; 2649 if (VN_IS_DOOMED(dvp)) { 2650 /* forced unmount */ 2651 vrele(dvp); 2652 error = ENOENT; 2653 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2654 return (error); 2655 } 2656 /* 2657 * *vp has its use count incremented still. 2658 */ 2659 2660 return (0); 2661 } 2662 2663 /* 2664 * Resolve a directory to a pathname. 2665 * 2666 * The name of the directory can always be found in the namecache or fetched 2667 * from the filesystem. There is also guaranteed to be only one parent, meaning 2668 * we can just follow vnodes up until we find the root. 2669 * 2670 * The vnode must be referenced. 2671 */ 2672 static int 2673 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2674 size_t *len, bool slash_prefixed, size_t addend) 2675 { 2676 #ifdef KDTRACE_HOOKS 2677 struct vnode *startvp = vp; 2678 #endif 2679 struct vnode *vp1; 2680 size_t buflen; 2681 int error; 2682 2683 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2684 VNPASS(vp->v_usecount > 0, vp); 2685 2686 buflen = *len; 2687 2688 if (!slash_prefixed) { 2689 MPASS(*len >= 2); 2690 buflen--; 2691 buf[buflen] = '\0'; 2692 } 2693 2694 error = 0; 2695 2696 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2697 counter_u64_add(numfullpathcalls, 1); 2698 while (vp != rdir && vp != rootvnode) { 2699 /* 2700 * The vp vnode must be already fully constructed, 2701 * since it is either found in namecache or obtained 2702 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2703 * without obtaining the vnode lock. 2704 */ 2705 if ((vp->v_vflag & VV_ROOT) != 0) { 2706 vn_lock(vp, LK_RETRY | LK_SHARED); 2707 2708 /* 2709 * With the vnode locked, check for races with 2710 * unmount, forced or not. Note that we 2711 * already verified that vp is not equal to 2712 * the root vnode, which means that 2713 * mnt_vnodecovered can be NULL only for the 2714 * case of unmount. 2715 */ 2716 if (VN_IS_DOOMED(vp) || 2717 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2718 vp1->v_mountedhere != vp->v_mount) { 2719 vput(vp); 2720 error = ENOENT; 2721 SDT_PROBE3(vfs, namecache, fullpath, return, 2722 error, vp, NULL); 2723 break; 2724 } 2725 2726 vref(vp1); 2727 vput(vp); 2728 vp = vp1; 2729 continue; 2730 } 2731 if (vp->v_type != VDIR) { 2732 vrele(vp); 2733 counter_u64_add(numfullpathfail1, 1); 2734 error = ENOTDIR; 2735 SDT_PROBE3(vfs, namecache, fullpath, return, 2736 error, vp, NULL); 2737 break; 2738 } 2739 error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen); 2740 if (error) 2741 break; 2742 if (buflen == 0) { 2743 vrele(vp); 2744 error = ENOMEM; 2745 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2746 startvp, NULL); 2747 break; 2748 } 2749 buf[--buflen] = '/'; 2750 slash_prefixed = true; 2751 } 2752 if (error) 2753 return (error); 2754 if (!slash_prefixed) { 2755 if (buflen == 0) { 2756 vrele(vp); 2757 counter_u64_add(numfullpathfail4, 1); 2758 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2759 startvp, NULL); 2760 return (ENOMEM); 2761 } 2762 buf[--buflen] = '/'; 2763 } 2764 counter_u64_add(numfullpathfound, 1); 2765 vrele(vp); 2766 2767 *retbuf = buf + buflen; 2768 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2769 *len -= buflen; 2770 *len += addend; 2771 return (0); 2772 } 2773 2774 /* 2775 * Resolve an arbitrary vnode to a pathname. 2776 * 2777 * Note 2 caveats: 2778 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2779 * resolve to a different path than the one used to find it 2780 * - namecache is not mandatory, meaning names are not guaranteed to be added 2781 * (in which case resolving fails) 2782 */ 2783 static void __inline 2784 cache_rev_failed_impl(int *reason, int line) 2785 { 2786 2787 *reason = line; 2788 } 2789 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 2790 2791 static int 2792 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 2793 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend) 2794 { 2795 #ifdef KDTRACE_HOOKS 2796 struct vnode *startvp = vp; 2797 #endif 2798 struct vnode *tvp; 2799 struct mount *mp; 2800 struct namecache *ncp; 2801 size_t orig_buflen; 2802 int reason; 2803 int error; 2804 #ifdef KDTRACE_HOOKS 2805 int i; 2806 #endif 2807 seqc_t vp_seqc, tvp_seqc; 2808 u_char nc_flag; 2809 2810 VFS_SMR_ASSERT_ENTERED(); 2811 2812 if (!cache_fast_revlookup) { 2813 vfs_smr_exit(); 2814 return (-1); 2815 } 2816 2817 orig_buflen = *buflen; 2818 2819 if (!slash_prefixed) { 2820 MPASS(*buflen >= 2); 2821 *buflen -= 1; 2822 buf[*buflen] = '\0'; 2823 } 2824 2825 if (vp == rdir || vp == rootvnode) { 2826 if (!slash_prefixed) { 2827 *buflen -= 1; 2828 buf[*buflen] = '/'; 2829 } 2830 goto out_ok; 2831 } 2832 2833 #ifdef KDTRACE_HOOKS 2834 i = 0; 2835 #endif 2836 error = -1; 2837 ncp = NULL; /* for sdt probe down below */ 2838 vp_seqc = vn_seqc_read_any(vp); 2839 if (seqc_in_modify(vp_seqc)) { 2840 cache_rev_failed(&reason); 2841 goto out_abort; 2842 } 2843 2844 for (;;) { 2845 #ifdef KDTRACE_HOOKS 2846 i++; 2847 #endif 2848 if ((vp->v_vflag & VV_ROOT) != 0) { 2849 mp = atomic_load_ptr(&vp->v_mount); 2850 if (mp == NULL) { 2851 cache_rev_failed(&reason); 2852 goto out_abort; 2853 } 2854 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 2855 tvp_seqc = vn_seqc_read_any(tvp); 2856 if (seqc_in_modify(tvp_seqc)) { 2857 cache_rev_failed(&reason); 2858 goto out_abort; 2859 } 2860 if (!vn_seqc_consistent(vp, vp_seqc)) { 2861 cache_rev_failed(&reason); 2862 goto out_abort; 2863 } 2864 vp = tvp; 2865 vp_seqc = tvp_seqc; 2866 continue; 2867 } 2868 ncp = atomic_load_ptr(&vp->v_cache_dd); 2869 if (ncp == NULL) { 2870 cache_rev_failed(&reason); 2871 goto out_abort; 2872 } 2873 nc_flag = atomic_load_char(&ncp->nc_flag); 2874 if ((nc_flag & NCF_ISDOTDOT) != 0) { 2875 cache_rev_failed(&reason); 2876 goto out_abort; 2877 } 2878 if (!cache_ncp_canuse(ncp)) { 2879 cache_rev_failed(&reason); 2880 goto out_abort; 2881 } 2882 if (ncp->nc_nlen >= *buflen) { 2883 cache_rev_failed(&reason); 2884 error = ENOMEM; 2885 goto out_abort; 2886 } 2887 *buflen -= ncp->nc_nlen; 2888 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2889 *buflen -= 1; 2890 buf[*buflen] = '/'; 2891 tvp = ncp->nc_dvp; 2892 tvp_seqc = vn_seqc_read_any(tvp); 2893 if (seqc_in_modify(tvp_seqc)) { 2894 cache_rev_failed(&reason); 2895 goto out_abort; 2896 } 2897 if (!vn_seqc_consistent(vp, vp_seqc)) { 2898 cache_rev_failed(&reason); 2899 goto out_abort; 2900 } 2901 vp = tvp; 2902 vp_seqc = tvp_seqc; 2903 if (vp == rdir || vp == rootvnode) 2904 break; 2905 } 2906 out_ok: 2907 vfs_smr_exit(); 2908 *retbuf = buf + *buflen; 2909 *buflen = orig_buflen - *buflen + addend; 2910 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 2911 return (0); 2912 2913 out_abort: 2914 *buflen = orig_buflen; 2915 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 2916 vfs_smr_exit(); 2917 return (error); 2918 } 2919 2920 static int 2921 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2922 size_t *buflen) 2923 { 2924 size_t orig_buflen; 2925 bool slash_prefixed; 2926 int error; 2927 2928 if (*buflen < 2) 2929 return (EINVAL); 2930 2931 orig_buflen = *buflen; 2932 2933 vref(vp); 2934 slash_prefixed = false; 2935 if (vp->v_type != VDIR) { 2936 *buflen -= 1; 2937 buf[*buflen] = '\0'; 2938 error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen); 2939 if (error) 2940 return (error); 2941 if (*buflen == 0) { 2942 vrele(vp); 2943 return (ENOMEM); 2944 } 2945 *buflen -= 1; 2946 buf[*buflen] = '/'; 2947 slash_prefixed = true; 2948 } 2949 2950 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed, 2951 orig_buflen - *buflen)); 2952 } 2953 2954 /* 2955 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2956 * 2957 * Since the namecache does not track handlings, the caller is expected to first 2958 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 2959 * 2960 * Then we have 2 cases: 2961 * - if the found vnode is a directory, the path can be constructed just by 2962 * fullowing names up the chain 2963 * - otherwise we populate the buffer with the saved name and start resolving 2964 * from the parent 2965 */ 2966 static int 2967 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 2968 size_t *buflen) 2969 { 2970 char *buf, *tmpbuf; 2971 struct pwd *pwd; 2972 struct componentname *cnp; 2973 struct vnode *vp; 2974 size_t addend; 2975 int error; 2976 bool slash_prefixed; 2977 enum vtype type; 2978 2979 if (*buflen < 2) 2980 return (EINVAL); 2981 if (*buflen > MAXPATHLEN) 2982 *buflen = MAXPATHLEN; 2983 2984 slash_prefixed = false; 2985 2986 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2987 2988 addend = 0; 2989 vp = ndp->ni_vp; 2990 /* 2991 * Check for VBAD to work around the vp_crossmp bug in lookup(). 2992 * 2993 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 2994 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 2995 * If the type is VDIR (like in this very case) we can skip looking 2996 * at ni_dvp in the first place. However, since vnodes get passed here 2997 * unlocked the target may transition to doomed state (type == VBAD) 2998 * before we get to evaluate the condition. If this happens, we will 2999 * populate part of the buffer and descend to vn_fullpath_dir with 3000 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 3001 * 3002 * This should be atomic_load(&vp->v_type) but it is ilegal to take 3003 * an address of a bit field, even if said field is sized to char. 3004 * Work around the problem by reading the value into a full-sized enum 3005 * and then re-reading it with atomic_load which will still prevent 3006 * the compiler from re-reading down the road. 3007 */ 3008 type = vp->v_type; 3009 type = atomic_load_int(&type); 3010 if (type == VBAD) { 3011 error = ENOENT; 3012 goto out_bad; 3013 } 3014 if (type != VDIR) { 3015 cnp = &ndp->ni_cnd; 3016 addend = cnp->cn_namelen + 2; 3017 if (*buflen < addend) { 3018 error = ENOMEM; 3019 goto out_bad; 3020 } 3021 *buflen -= addend; 3022 tmpbuf = buf + *buflen; 3023 tmpbuf[0] = '/'; 3024 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 3025 tmpbuf[addend - 1] = '\0'; 3026 slash_prefixed = true; 3027 vp = ndp->ni_dvp; 3028 } 3029 3030 vfs_smr_enter(); 3031 pwd = pwd_get_smr(); 3032 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3033 slash_prefixed, addend); 3034 VFS_SMR_ASSERT_NOT_ENTERED(); 3035 if (error < 0) { 3036 pwd = pwd_hold(curthread); 3037 vref(vp); 3038 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3039 slash_prefixed, addend); 3040 pwd_drop(pwd); 3041 if (error != 0) 3042 goto out_bad; 3043 } 3044 3045 *freebuf = buf; 3046 3047 return (0); 3048 out_bad: 3049 free(buf, M_TEMP); 3050 return (error); 3051 } 3052 3053 struct vnode * 3054 vn_dir_dd_ino(struct vnode *vp) 3055 { 3056 struct namecache *ncp; 3057 struct vnode *ddvp; 3058 struct mtx *vlp; 3059 enum vgetstate vs; 3060 3061 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3062 vlp = VP2VNODELOCK(vp); 3063 mtx_lock(vlp); 3064 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3065 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3066 continue; 3067 ddvp = ncp->nc_dvp; 3068 vs = vget_prep(ddvp); 3069 mtx_unlock(vlp); 3070 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3071 return (NULL); 3072 return (ddvp); 3073 } 3074 mtx_unlock(vlp); 3075 return (NULL); 3076 } 3077 3078 int 3079 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3080 { 3081 struct namecache *ncp; 3082 struct mtx *vlp; 3083 int l; 3084 3085 vlp = VP2VNODELOCK(vp); 3086 mtx_lock(vlp); 3087 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3088 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3089 break; 3090 if (ncp == NULL) { 3091 mtx_unlock(vlp); 3092 return (ENOENT); 3093 } 3094 l = min(ncp->nc_nlen, buflen - 1); 3095 memcpy(buf, ncp->nc_name, l); 3096 mtx_unlock(vlp); 3097 buf[l] = '\0'; 3098 return (0); 3099 } 3100 3101 /* 3102 * This function updates path string to vnode's full global path 3103 * and checks the size of the new path string against the pathlen argument. 3104 * 3105 * Requires a locked, referenced vnode. 3106 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3107 * 3108 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3109 * because it falls back to the ".." lookup if the namecache lookup fails. 3110 */ 3111 int 3112 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3113 u_int pathlen) 3114 { 3115 struct nameidata nd; 3116 struct vnode *vp1; 3117 char *rpath, *fbuf; 3118 int error; 3119 3120 ASSERT_VOP_ELOCKED(vp, __func__); 3121 3122 /* Construct global filesystem path from vp. */ 3123 VOP_UNLOCK(vp); 3124 error = vn_fullpath_global(vp, &rpath, &fbuf); 3125 3126 if (error != 0) { 3127 vrele(vp); 3128 return (error); 3129 } 3130 3131 if (strlen(rpath) >= pathlen) { 3132 vrele(vp); 3133 error = ENAMETOOLONG; 3134 goto out; 3135 } 3136 3137 /* 3138 * Re-lookup the vnode by path to detect a possible rename. 3139 * As a side effect, the vnode is relocked. 3140 * If vnode was renamed, return ENOENT. 3141 */ 3142 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3143 UIO_SYSSPACE, path, td); 3144 error = namei(&nd); 3145 if (error != 0) { 3146 vrele(vp); 3147 goto out; 3148 } 3149 NDFREE(&nd, NDF_ONLY_PNBUF); 3150 vp1 = nd.ni_vp; 3151 vrele(vp); 3152 if (vp1 == vp) 3153 strcpy(path, rpath); 3154 else { 3155 vput(vp1); 3156 error = ENOENT; 3157 } 3158 3159 out: 3160 free(fbuf, M_TEMP); 3161 return (error); 3162 } 3163 3164 #ifdef DDB 3165 static void 3166 db_print_vpath(struct vnode *vp) 3167 { 3168 3169 while (vp != NULL) { 3170 db_printf("%p: ", vp); 3171 if (vp == rootvnode) { 3172 db_printf("/"); 3173 vp = NULL; 3174 } else { 3175 if (vp->v_vflag & VV_ROOT) { 3176 db_printf("<mount point>"); 3177 vp = vp->v_mount->mnt_vnodecovered; 3178 } else { 3179 struct namecache *ncp; 3180 char *ncn; 3181 int i; 3182 3183 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3184 if (ncp != NULL) { 3185 ncn = ncp->nc_name; 3186 for (i = 0; i < ncp->nc_nlen; i++) 3187 db_printf("%c", *ncn++); 3188 vp = ncp->nc_dvp; 3189 } else { 3190 vp = NULL; 3191 } 3192 } 3193 } 3194 db_printf("\n"); 3195 } 3196 3197 return; 3198 } 3199 3200 DB_SHOW_COMMAND(vpath, db_show_vpath) 3201 { 3202 struct vnode *vp; 3203 3204 if (!have_addr) { 3205 db_printf("usage: show vpath <struct vnode *>\n"); 3206 return; 3207 } 3208 3209 vp = (struct vnode *)addr; 3210 db_print_vpath(vp); 3211 } 3212 3213 #endif 3214 3215 static bool __read_frequently cache_fast_lookup = true; 3216 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3217 &cache_fast_lookup, 0, ""); 3218 3219 #define CACHE_FPL_FAILED -2020 3220 3221 static void 3222 cache_fpl_cleanup_cnp(struct componentname *cnp) 3223 { 3224 3225 uma_zfree(namei_zone, cnp->cn_pnbuf); 3226 #ifdef DIAGNOSTIC 3227 cnp->cn_pnbuf = NULL; 3228 cnp->cn_nameptr = NULL; 3229 #endif 3230 } 3231 3232 static void 3233 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3234 { 3235 struct componentname *cnp; 3236 3237 cnp = &ndp->ni_cnd; 3238 while (*(cnp->cn_nameptr) == '/') { 3239 cnp->cn_nameptr++; 3240 ndp->ni_pathlen--; 3241 } 3242 3243 *dpp = ndp->ni_rootdir; 3244 } 3245 3246 /* 3247 * Components of nameidata (or objects it can point to) which may 3248 * need restoring in case fast path lookup fails. 3249 */ 3250 struct nameidata_saved { 3251 long cn_namelen; 3252 char *cn_nameptr; 3253 size_t ni_pathlen; 3254 int cn_flags; 3255 }; 3256 3257 struct cache_fpl { 3258 struct nameidata *ndp; 3259 struct componentname *cnp; 3260 struct pwd *pwd; 3261 struct vnode *dvp; 3262 struct vnode *tvp; 3263 seqc_t dvp_seqc; 3264 seqc_t tvp_seqc; 3265 struct nameidata_saved snd; 3266 int line; 3267 enum cache_fpl_status status:8; 3268 bool in_smr; 3269 bool fsearch; 3270 }; 3271 3272 static void 3273 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3274 { 3275 3276 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3277 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3278 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3279 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3280 } 3281 3282 static void 3283 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3284 { 3285 3286 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3287 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3288 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3289 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3290 } 3291 3292 #ifdef INVARIANTS 3293 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3294 struct cache_fpl *_fpl = (fpl); \ 3295 MPASS(_fpl->in_smr == true); \ 3296 VFS_SMR_ASSERT_ENTERED(); \ 3297 }) 3298 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3299 struct cache_fpl *_fpl = (fpl); \ 3300 MPASS(_fpl->in_smr == false); \ 3301 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3302 }) 3303 #else 3304 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3305 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3306 #endif 3307 3308 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3309 struct cache_fpl *_fpl = (fpl); \ 3310 vfs_smr_enter(); \ 3311 _fpl->in_smr = true; \ 3312 }) 3313 3314 #define cache_fpl_smr_enter(fpl) ({ \ 3315 struct cache_fpl *_fpl = (fpl); \ 3316 MPASS(_fpl->in_smr == false); \ 3317 vfs_smr_enter(); \ 3318 _fpl->in_smr = true; \ 3319 }) 3320 3321 #define cache_fpl_smr_exit(fpl) ({ \ 3322 struct cache_fpl *_fpl = (fpl); \ 3323 MPASS(_fpl->in_smr == true); \ 3324 vfs_smr_exit(); \ 3325 _fpl->in_smr = false; \ 3326 }) 3327 3328 static int 3329 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3330 { 3331 3332 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3333 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3334 ("%s: converting to abort from %d at %d, set at %d\n", 3335 __func__, fpl->status, line, fpl->line)); 3336 } 3337 fpl->status = CACHE_FPL_STATUS_ABORTED; 3338 fpl->line = line; 3339 return (CACHE_FPL_FAILED); 3340 } 3341 3342 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3343 3344 static int 3345 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3346 { 3347 3348 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3349 ("%s: setting to partial at %d, but already set to %d at %d\n", 3350 __func__, line, fpl->status, fpl->line)); 3351 cache_fpl_smr_assert_entered(fpl); 3352 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3353 fpl->line = line; 3354 return (CACHE_FPL_FAILED); 3355 } 3356 3357 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3358 3359 static int 3360 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3361 { 3362 3363 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3364 ("%s: setting to handled at %d, but already set to %d at %d\n", 3365 __func__, line, fpl->status, fpl->line)); 3366 cache_fpl_smr_assert_not_entered(fpl); 3367 MPASS(error != CACHE_FPL_FAILED); 3368 fpl->status = CACHE_FPL_STATUS_HANDLED; 3369 fpl->line = line; 3370 return (error); 3371 } 3372 3373 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3374 3375 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3376 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ 3377 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3378 3379 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3380 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3381 3382 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3383 "supported and internal flags overlap"); 3384 3385 static bool 3386 cache_fpl_islastcn(struct nameidata *ndp) 3387 { 3388 3389 return (*ndp->ni_next == 0); 3390 } 3391 3392 static bool 3393 cache_fpl_isdotdot(struct componentname *cnp) 3394 { 3395 3396 if (cnp->cn_namelen == 2 && 3397 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3398 return (true); 3399 return (false); 3400 } 3401 3402 static bool 3403 cache_can_fplookup(struct cache_fpl *fpl) 3404 { 3405 struct nameidata *ndp; 3406 struct componentname *cnp; 3407 struct thread *td; 3408 3409 ndp = fpl->ndp; 3410 cnp = fpl->cnp; 3411 td = cnp->cn_thread; 3412 3413 if (!cache_fast_lookup) { 3414 cache_fpl_aborted(fpl); 3415 return (false); 3416 } 3417 #ifdef MAC 3418 if (mac_vnode_check_lookup_enabled()) { 3419 cache_fpl_aborted(fpl); 3420 return (false); 3421 } 3422 #endif 3423 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3424 cache_fpl_aborted(fpl); 3425 return (false); 3426 } 3427 if (IN_CAPABILITY_MODE(td)) { 3428 cache_fpl_aborted(fpl); 3429 return (false); 3430 } 3431 if (AUDITING_TD(td)) { 3432 cache_fpl_aborted(fpl); 3433 return (false); 3434 } 3435 if (ndp->ni_startdir != NULL) { 3436 cache_fpl_aborted(fpl); 3437 return (false); 3438 } 3439 return (true); 3440 } 3441 3442 static int 3443 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 3444 { 3445 struct nameidata *ndp; 3446 int error; 3447 bool fsearch; 3448 3449 ndp = fpl->ndp; 3450 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); 3451 if (__predict_false(error != 0)) { 3452 cache_fpl_smr_exit(fpl); 3453 return (cache_fpl_aborted(fpl)); 3454 } 3455 fpl->fsearch = fsearch; 3456 return (0); 3457 } 3458 3459 static bool 3460 cache_fplookup_vnode_supported(struct vnode *vp) 3461 { 3462 3463 return (vp->v_type != VLNK); 3464 } 3465 3466 static int __noinline 3467 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3468 uint32_t hash) 3469 { 3470 struct componentname *cnp; 3471 struct vnode *dvp; 3472 3473 cnp = fpl->cnp; 3474 dvp = fpl->dvp; 3475 3476 cache_fpl_smr_exit(fpl); 3477 if (cache_negative_promote_cond(dvp, cnp, oncp, hash)) 3478 return (cache_fpl_handled(fpl, ENOENT)); 3479 else 3480 return (cache_fpl_aborted(fpl)); 3481 } 3482 3483 /* 3484 * The target vnode is not supported, prepare for the slow path to take over. 3485 */ 3486 static int __noinline 3487 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3488 { 3489 struct nameidata *ndp; 3490 struct componentname *cnp; 3491 enum vgetstate dvs; 3492 struct vnode *dvp; 3493 struct pwd *pwd; 3494 seqc_t dvp_seqc; 3495 3496 ndp = fpl->ndp; 3497 cnp = fpl->cnp; 3498 pwd = fpl->pwd; 3499 dvp = fpl->dvp; 3500 dvp_seqc = fpl->dvp_seqc; 3501 3502 if (!pwd_hold_smr(pwd)) { 3503 cache_fpl_smr_exit(fpl); 3504 return (cache_fpl_aborted(fpl)); 3505 } 3506 3507 dvs = vget_prep_smr(dvp); 3508 cache_fpl_smr_exit(fpl); 3509 if (__predict_false(dvs == VGET_NONE)) { 3510 pwd_drop(pwd); 3511 return (cache_fpl_aborted(fpl)); 3512 } 3513 3514 vget_finish_ref(dvp, dvs); 3515 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3516 vrele(dvp); 3517 pwd_drop(pwd); 3518 return (cache_fpl_aborted(fpl)); 3519 } 3520 3521 cache_fpl_restore(fpl, &fpl->snd); 3522 3523 ndp->ni_startdir = dvp; 3524 cnp->cn_flags |= MAKEENTRY; 3525 if (cache_fpl_islastcn(ndp)) 3526 cnp->cn_flags |= ISLASTCN; 3527 if (cache_fpl_isdotdot(cnp)) 3528 cnp->cn_flags |= ISDOTDOT; 3529 3530 return (0); 3531 } 3532 3533 static int 3534 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3535 { 3536 struct componentname *cnp; 3537 struct vnode *tvp; 3538 seqc_t tvp_seqc; 3539 int error, lkflags; 3540 3541 cnp = fpl->cnp; 3542 tvp = fpl->tvp; 3543 tvp_seqc = fpl->tvp_seqc; 3544 3545 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3546 lkflags = LK_SHARED; 3547 if ((cnp->cn_flags & LOCKSHARED) == 0) 3548 lkflags = LK_EXCLUSIVE; 3549 error = vget_finish(tvp, lkflags, tvs); 3550 if (__predict_false(error != 0)) { 3551 return (cache_fpl_aborted(fpl)); 3552 } 3553 } else { 3554 vget_finish_ref(tvp, tvs); 3555 } 3556 3557 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3558 if ((cnp->cn_flags & LOCKLEAF) != 0) 3559 vput(tvp); 3560 else 3561 vrele(tvp); 3562 return (cache_fpl_aborted(fpl)); 3563 } 3564 3565 return (cache_fpl_handled(fpl, 0)); 3566 } 3567 3568 /* 3569 * They want to possibly modify the state of the namecache. 3570 * 3571 * Don't try to match the API contract, just leave. 3572 * TODO: this leaves scalability on the table 3573 */ 3574 static int 3575 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3576 { 3577 struct componentname *cnp; 3578 3579 cnp = fpl->cnp; 3580 MPASS(cnp->cn_nameiop != LOOKUP); 3581 return (cache_fpl_partial(fpl)); 3582 } 3583 3584 static int __noinline 3585 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3586 { 3587 struct componentname *cnp; 3588 enum vgetstate dvs, tvs; 3589 struct vnode *dvp, *tvp; 3590 seqc_t dvp_seqc; 3591 int error; 3592 3593 cnp = fpl->cnp; 3594 dvp = fpl->dvp; 3595 dvp_seqc = fpl->dvp_seqc; 3596 tvp = fpl->tvp; 3597 3598 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3599 3600 /* 3601 * This is less efficient than it can be for simplicity. 3602 */ 3603 dvs = vget_prep_smr(dvp); 3604 if (__predict_false(dvs == VGET_NONE)) { 3605 return (cache_fpl_aborted(fpl)); 3606 } 3607 tvs = vget_prep_smr(tvp); 3608 if (__predict_false(tvs == VGET_NONE)) { 3609 cache_fpl_smr_exit(fpl); 3610 vget_abort(dvp, dvs); 3611 return (cache_fpl_aborted(fpl)); 3612 } 3613 3614 cache_fpl_smr_exit(fpl); 3615 3616 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3617 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3618 if (__predict_false(error != 0)) { 3619 vget_abort(tvp, tvs); 3620 return (cache_fpl_aborted(fpl)); 3621 } 3622 } else { 3623 vget_finish_ref(dvp, dvs); 3624 } 3625 3626 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3627 vget_abort(tvp, tvs); 3628 if ((cnp->cn_flags & LOCKPARENT) != 0) 3629 vput(dvp); 3630 else 3631 vrele(dvp); 3632 return (cache_fpl_aborted(fpl)); 3633 } 3634 3635 error = cache_fplookup_final_child(fpl, tvs); 3636 if (__predict_false(error != 0)) { 3637 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3638 if ((cnp->cn_flags & LOCKPARENT) != 0) 3639 vput(dvp); 3640 else 3641 vrele(dvp); 3642 return (error); 3643 } 3644 3645 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3646 return (0); 3647 } 3648 3649 static int 3650 cache_fplookup_final(struct cache_fpl *fpl) 3651 { 3652 struct componentname *cnp; 3653 enum vgetstate tvs; 3654 struct vnode *dvp, *tvp; 3655 seqc_t dvp_seqc; 3656 3657 cnp = fpl->cnp; 3658 dvp = fpl->dvp; 3659 dvp_seqc = fpl->dvp_seqc; 3660 tvp = fpl->tvp; 3661 3662 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3663 3664 if (cnp->cn_nameiop != LOOKUP) { 3665 return (cache_fplookup_final_modifying(fpl)); 3666 } 3667 3668 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3669 return (cache_fplookup_final_withparent(fpl)); 3670 3671 tvs = vget_prep_smr(tvp); 3672 if (__predict_false(tvs == VGET_NONE)) { 3673 return (cache_fpl_partial(fpl)); 3674 } 3675 3676 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3677 cache_fpl_smr_exit(fpl); 3678 vget_abort(tvp, tvs); 3679 return (cache_fpl_aborted(fpl)); 3680 } 3681 3682 cache_fpl_smr_exit(fpl); 3683 return (cache_fplookup_final_child(fpl, tvs)); 3684 } 3685 3686 static int __noinline 3687 cache_fplookup_dot(struct cache_fpl *fpl) 3688 { 3689 struct vnode *dvp; 3690 3691 dvp = fpl->dvp; 3692 3693 fpl->tvp = dvp; 3694 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3695 if (seqc_in_modify(fpl->tvp_seqc)) { 3696 return (cache_fpl_aborted(fpl)); 3697 } 3698 3699 counter_u64_add(dothits, 1); 3700 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3701 3702 return (0); 3703 } 3704 3705 static int __noinline 3706 cache_fplookup_dotdot(struct cache_fpl *fpl) 3707 { 3708 struct nameidata *ndp; 3709 struct componentname *cnp; 3710 struct namecache *ncp; 3711 struct vnode *dvp; 3712 struct prison *pr; 3713 u_char nc_flag; 3714 3715 ndp = fpl->ndp; 3716 cnp = fpl->cnp; 3717 dvp = fpl->dvp; 3718 3719 /* 3720 * XXX this is racy the same way regular lookup is 3721 */ 3722 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3723 pr = pr->pr_parent) 3724 if (dvp == pr->pr_root) 3725 break; 3726 3727 if (dvp == ndp->ni_rootdir || 3728 dvp == ndp->ni_topdir || 3729 dvp == rootvnode || 3730 pr != NULL) { 3731 fpl->tvp = dvp; 3732 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3733 if (seqc_in_modify(fpl->tvp_seqc)) { 3734 return (cache_fpl_aborted(fpl)); 3735 } 3736 return (0); 3737 } 3738 3739 if ((dvp->v_vflag & VV_ROOT) != 0) { 3740 /* 3741 * TODO 3742 * The opposite of climb mount is needed here. 3743 */ 3744 return (cache_fpl_aborted(fpl)); 3745 } 3746 3747 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3748 if (ncp == NULL) { 3749 return (cache_fpl_aborted(fpl)); 3750 } 3751 3752 nc_flag = atomic_load_char(&ncp->nc_flag); 3753 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3754 if ((nc_flag & NCF_NEGATIVE) != 0) 3755 return (cache_fpl_aborted(fpl)); 3756 fpl->tvp = ncp->nc_vp; 3757 } else { 3758 fpl->tvp = ncp->nc_dvp; 3759 } 3760 3761 if (__predict_false(!cache_ncp_canuse(ncp))) { 3762 return (cache_fpl_aborted(fpl)); 3763 } 3764 3765 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 3766 if (seqc_in_modify(fpl->tvp_seqc)) { 3767 return (cache_fpl_partial(fpl)); 3768 } 3769 3770 counter_u64_add(dotdothits, 1); 3771 return (0); 3772 } 3773 3774 static int 3775 cache_fplookup_next(struct cache_fpl *fpl) 3776 { 3777 struct componentname *cnp; 3778 struct namecache *ncp; 3779 struct negstate *ns; 3780 struct vnode *dvp, *tvp; 3781 u_char nc_flag; 3782 uint32_t hash; 3783 bool neg_hot; 3784 3785 cnp = fpl->cnp; 3786 dvp = fpl->dvp; 3787 3788 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 3789 return (cache_fplookup_dot(fpl)); 3790 } 3791 3792 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3793 3794 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3795 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3796 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 3797 break; 3798 } 3799 3800 /* 3801 * If there is no entry we have to punt to the slow path to perform 3802 * actual lookup. Should there be nothing with this name a negative 3803 * entry will be created. 3804 */ 3805 if (__predict_false(ncp == NULL)) { 3806 return (cache_fpl_partial(fpl)); 3807 } 3808 3809 tvp = atomic_load_ptr(&ncp->nc_vp); 3810 nc_flag = atomic_load_char(&ncp->nc_flag); 3811 if ((nc_flag & NCF_NEGATIVE) != 0) { 3812 /* 3813 * If they want to create an entry we need to replace this one. 3814 */ 3815 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 3816 return (cache_fpl_partial(fpl)); 3817 } 3818 ns = NCP2NEGSTATE(ncp); 3819 neg_hot = ((ns->neg_flag & NEG_HOT) != 0); 3820 if (__predict_false(!cache_ncp_canuse(ncp))) { 3821 return (cache_fpl_partial(fpl)); 3822 } 3823 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 3824 return (cache_fpl_partial(fpl)); 3825 } 3826 if (!neg_hot) { 3827 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 3828 } 3829 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 3830 ncp->nc_name); 3831 counter_u64_add(numneghits, 1); 3832 cache_fpl_smr_exit(fpl); 3833 return (cache_fpl_handled(fpl, ENOENT)); 3834 } 3835 3836 if (__predict_false(!cache_ncp_canuse(ncp))) { 3837 return (cache_fpl_partial(fpl)); 3838 } 3839 3840 fpl->tvp = tvp; 3841 fpl->tvp_seqc = vn_seqc_read_any(tvp); 3842 if (seqc_in_modify(fpl->tvp_seqc)) { 3843 return (cache_fpl_partial(fpl)); 3844 } 3845 3846 if (!cache_fplookup_vnode_supported(tvp)) { 3847 return (cache_fpl_partial(fpl)); 3848 } 3849 3850 counter_u64_add(numposhits, 1); 3851 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 3852 return (0); 3853 } 3854 3855 static bool 3856 cache_fplookup_mp_supported(struct mount *mp) 3857 { 3858 3859 if (mp == NULL) 3860 return (false); 3861 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 3862 return (false); 3863 return (true); 3864 } 3865 3866 /* 3867 * Walk up the mount stack (if any). 3868 * 3869 * Correctness is provided in the following ways: 3870 * - all vnodes are protected from freeing with SMR 3871 * - struct mount objects are type stable making them always safe to access 3872 * - stability of the particular mount is provided by busying it 3873 * - relationship between the vnode which is mounted on and the mount is 3874 * verified with the vnode sequence counter after busying 3875 * - association between root vnode of the mount and the mount is protected 3876 * by busy 3877 * 3878 * From that point on we can read the sequence counter of the root vnode 3879 * and get the next mount on the stack (if any) using the same protection. 3880 * 3881 * By the end of successful walk we are guaranteed the reached state was 3882 * indeed present at least at some point which matches the regular lookup. 3883 */ 3884 static int __noinline 3885 cache_fplookup_climb_mount(struct cache_fpl *fpl) 3886 { 3887 struct mount *mp, *prev_mp; 3888 struct vnode *vp; 3889 seqc_t vp_seqc; 3890 3891 vp = fpl->tvp; 3892 vp_seqc = fpl->tvp_seqc; 3893 3894 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 3895 mp = atomic_load_ptr(&vp->v_mountedhere); 3896 if (mp == NULL) 3897 return (0); 3898 3899 prev_mp = NULL; 3900 for (;;) { 3901 if (!vfs_op_thread_enter_crit(mp)) { 3902 if (prev_mp != NULL) 3903 vfs_op_thread_exit_crit(prev_mp); 3904 return (cache_fpl_partial(fpl)); 3905 } 3906 if (prev_mp != NULL) 3907 vfs_op_thread_exit_crit(prev_mp); 3908 if (!vn_seqc_consistent(vp, vp_seqc)) { 3909 vfs_op_thread_exit_crit(mp); 3910 return (cache_fpl_partial(fpl)); 3911 } 3912 if (!cache_fplookup_mp_supported(mp)) { 3913 vfs_op_thread_exit_crit(mp); 3914 return (cache_fpl_partial(fpl)); 3915 } 3916 vp = atomic_load_ptr(&mp->mnt_rootvnode); 3917 if (vp == NULL || VN_IS_DOOMED(vp)) { 3918 vfs_op_thread_exit_crit(mp); 3919 return (cache_fpl_partial(fpl)); 3920 } 3921 vp_seqc = vn_seqc_read_any(vp); 3922 if (seqc_in_modify(vp_seqc)) { 3923 vfs_op_thread_exit_crit(mp); 3924 return (cache_fpl_partial(fpl)); 3925 } 3926 prev_mp = mp; 3927 mp = atomic_load_ptr(&vp->v_mountedhere); 3928 if (mp == NULL) 3929 break; 3930 } 3931 3932 vfs_op_thread_exit_crit(prev_mp); 3933 fpl->tvp = vp; 3934 fpl->tvp_seqc = vp_seqc; 3935 return (0); 3936 } 3937 3938 static bool 3939 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 3940 { 3941 struct mount *mp; 3942 struct vnode *vp; 3943 3944 vp = fpl->tvp; 3945 3946 /* 3947 * Hack: while this is a union, the pointer tends to be NULL so save on 3948 * a branch. 3949 */ 3950 mp = atomic_load_ptr(&vp->v_mountedhere); 3951 if (mp == NULL) 3952 return (false); 3953 if (vp->v_type == VDIR) 3954 return (true); 3955 return (false); 3956 } 3957 3958 /* 3959 * Parse the path. 3960 * 3961 * The code was originally copy-pasted from regular lookup and despite 3962 * clean ups leaves performance on the table. Any modifications here 3963 * must take into account that in case off fallback the resulting 3964 * nameidata state has to be compatible with the original. 3965 */ 3966 static int 3967 cache_fplookup_parse(struct cache_fpl *fpl) 3968 { 3969 struct nameidata *ndp; 3970 struct componentname *cnp; 3971 char *cp; 3972 3973 ndp = fpl->ndp; 3974 cnp = fpl->cnp; 3975 3976 /* 3977 * Search a new directory. 3978 * 3979 * The last component of the filename is left accessible via 3980 * cnp->cn_nameptr for callers that need the name. Callers needing 3981 * the name set the SAVENAME flag. When done, they assume 3982 * responsibility for freeing the pathname buffer. 3983 */ 3984 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 3985 continue; 3986 cnp->cn_namelen = cp - cnp->cn_nameptr; 3987 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 3988 cache_fpl_smr_exit(fpl); 3989 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 3990 } 3991 ndp->ni_pathlen -= cnp->cn_namelen; 3992 KASSERT(ndp->ni_pathlen <= PATH_MAX, 3993 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 3994 ndp->ni_next = cp; 3995 3996 /* 3997 * Replace multiple slashes by a single slash and trailing slashes 3998 * by a null. This must be done before VOP_LOOKUP() because some 3999 * fs's don't know about trailing slashes. Remember if there were 4000 * trailing slashes to handle symlinks, existing non-directories 4001 * and non-existing files that won't be directories specially later. 4002 */ 4003 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 4004 cp++; 4005 ndp->ni_pathlen--; 4006 if (*cp == '\0') { 4007 /* 4008 * TODO 4009 * Regular lookup performs the following: 4010 * *ndp->ni_next = '\0'; 4011 * cnp->cn_flags |= TRAILINGSLASH; 4012 * 4013 * Which is problematic since it modifies data read 4014 * from userspace. Then if fast path lookup was to 4015 * abort we would have to either restore it or convey 4016 * the flag. Since this is a corner case just ignore 4017 * it for simplicity. 4018 */ 4019 return (cache_fpl_partial(fpl)); 4020 } 4021 } 4022 ndp->ni_next = cp; 4023 4024 /* 4025 * Check for degenerate name (e.g. / or "") 4026 * which is a way of talking about a directory, 4027 * e.g. like "/." or ".". 4028 * 4029 * TODO 4030 * Another corner case handled by the regular lookup 4031 */ 4032 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 4033 return (cache_fpl_partial(fpl)); 4034 } 4035 return (0); 4036 } 4037 4038 static void 4039 cache_fplookup_parse_advance(struct cache_fpl *fpl) 4040 { 4041 struct nameidata *ndp; 4042 struct componentname *cnp; 4043 4044 ndp = fpl->ndp; 4045 cnp = fpl->cnp; 4046 4047 cnp->cn_nameptr = ndp->ni_next; 4048 while (*cnp->cn_nameptr == '/') { 4049 cnp->cn_nameptr++; 4050 ndp->ni_pathlen--; 4051 } 4052 } 4053 4054 /* 4055 * See the API contract for VOP_FPLOOKUP_VEXEC. 4056 */ 4057 static int __noinline 4058 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 4059 { 4060 struct componentname *cnp; 4061 struct vnode *dvp; 4062 seqc_t dvp_seqc; 4063 4064 cnp = fpl->cnp; 4065 dvp = fpl->dvp; 4066 dvp_seqc = fpl->dvp_seqc; 4067 4068 /* 4069 * Hack: they may be looking up foo/bar, where foo is a 4070 * regular file. In such a case we need to turn ENOTDIR, 4071 * but we may happen to get here with a different error. 4072 */ 4073 if (dvp->v_type != VDIR) { 4074 /* 4075 * The check here is predominantly to catch 4076 * EOPNOTSUPP from dead_vnodeops. If the vnode 4077 * gets doomed past this point it is going to 4078 * fail seqc verification. 4079 */ 4080 if (VN_IS_DOOMED(dvp)) { 4081 return (cache_fpl_aborted(fpl)); 4082 } 4083 error = ENOTDIR; 4084 } 4085 4086 /* 4087 * Hack: handle O_SEARCH. 4088 * 4089 * Open Group Base Specifications Issue 7, 2018 edition states: 4090 * If the access mode of the open file description associated with the 4091 * file descriptor is not O_SEARCH, the function shall check whether 4092 * directory searches are permitted using the current permissions of 4093 * the directory underlying the file descriptor. If the access mode is 4094 * O_SEARCH, the function shall not perform the check. 4095 * 4096 * Regular lookup tests for the NOEXECCHECK flag for every path 4097 * component to decide whether to do the permission check. However, 4098 * since most lookups never have the flag (and when they do it is only 4099 * present for the first path component), lockless lookup only acts on 4100 * it if there is a permission problem. Here the flag is represented 4101 * with a boolean so that we don't have to clear it on the way out. 4102 * 4103 * For simplicity this always aborts. 4104 * TODO: check if this is the first lookup and ignore the permission 4105 * problem. Note the flag has to survive fallback (if it happens to be 4106 * performed). 4107 */ 4108 if (fpl->fsearch) { 4109 return (cache_fpl_aborted(fpl)); 4110 } 4111 4112 switch (error) { 4113 case EAGAIN: 4114 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4115 error = cache_fpl_aborted(fpl); 4116 } else { 4117 cache_fpl_partial(fpl); 4118 } 4119 break; 4120 default: 4121 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 4122 error = cache_fpl_aborted(fpl); 4123 } else { 4124 cache_fpl_smr_exit(fpl); 4125 cache_fpl_handled(fpl, error); 4126 } 4127 break; 4128 } 4129 return (error); 4130 } 4131 4132 static int 4133 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 4134 { 4135 struct nameidata *ndp; 4136 struct componentname *cnp; 4137 struct mount *mp; 4138 int error; 4139 4140 error = CACHE_FPL_FAILED; 4141 ndp = fpl->ndp; 4142 cnp = fpl->cnp; 4143 4144 cache_fpl_checkpoint(fpl, &fpl->snd); 4145 4146 fpl->dvp = dvp; 4147 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 4148 if (seqc_in_modify(fpl->dvp_seqc)) { 4149 cache_fpl_aborted(fpl); 4150 goto out; 4151 } 4152 mp = atomic_load_ptr(&fpl->dvp->v_mount); 4153 if (!cache_fplookup_mp_supported(mp)) { 4154 cache_fpl_aborted(fpl); 4155 goto out; 4156 } 4157 4158 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 4159 4160 for (;;) { 4161 error = cache_fplookup_parse(fpl); 4162 if (__predict_false(error != 0)) { 4163 break; 4164 } 4165 4166 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 4167 4168 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); 4169 if (__predict_false(error != 0)) { 4170 error = cache_fplookup_failed_vexec(fpl, error); 4171 break; 4172 } 4173 4174 if (__predict_false(cache_fpl_isdotdot(cnp))) { 4175 error = cache_fplookup_dotdot(fpl); 4176 if (__predict_false(error != 0)) { 4177 break; 4178 } 4179 } else { 4180 error = cache_fplookup_next(fpl); 4181 if (__predict_false(error != 0)) { 4182 break; 4183 } 4184 4185 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 4186 4187 if (cache_fplookup_need_climb_mount(fpl)) { 4188 error = cache_fplookup_climb_mount(fpl); 4189 if (__predict_false(error != 0)) { 4190 break; 4191 } 4192 } 4193 } 4194 4195 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 4196 4197 if (cache_fpl_islastcn(ndp)) { 4198 error = cache_fplookup_final(fpl); 4199 break; 4200 } 4201 4202 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 4203 error = cache_fpl_aborted(fpl); 4204 break; 4205 } 4206 4207 fpl->dvp = fpl->tvp; 4208 fpl->dvp_seqc = fpl->tvp_seqc; 4209 4210 cache_fplookup_parse_advance(fpl); 4211 cache_fpl_checkpoint(fpl, &fpl->snd); 4212 } 4213 out: 4214 switch (fpl->status) { 4215 case CACHE_FPL_STATUS_UNSET: 4216 __assert_unreachable(); 4217 break; 4218 case CACHE_FPL_STATUS_PARTIAL: 4219 cache_fpl_smr_assert_entered(fpl); 4220 return (cache_fplookup_partial_setup(fpl)); 4221 case CACHE_FPL_STATUS_ABORTED: 4222 if (fpl->in_smr) 4223 cache_fpl_smr_exit(fpl); 4224 return (CACHE_FPL_FAILED); 4225 case CACHE_FPL_STATUS_HANDLED: 4226 MPASS(error != CACHE_FPL_FAILED); 4227 cache_fpl_smr_assert_not_entered(fpl); 4228 if (__predict_false(error != 0)) { 4229 ndp->ni_dvp = NULL; 4230 ndp->ni_vp = NULL; 4231 cache_fpl_cleanup_cnp(cnp); 4232 return (error); 4233 } 4234 ndp->ni_dvp = fpl->dvp; 4235 ndp->ni_vp = fpl->tvp; 4236 if (cnp->cn_flags & SAVENAME) 4237 cnp->cn_flags |= HASBUF; 4238 else 4239 cache_fpl_cleanup_cnp(cnp); 4240 return (error); 4241 } 4242 } 4243 4244 /* 4245 * Fast path lookup protected with SMR and sequence counters. 4246 * 4247 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 4248 * 4249 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria 4250 * outlined below. 4251 * 4252 * Traditional vnode lookup conceptually looks like this: 4253 * 4254 * vn_lock(current); 4255 * for (;;) { 4256 * next = find(); 4257 * vn_lock(next); 4258 * vn_unlock(current); 4259 * current = next; 4260 * if (last) 4261 * break; 4262 * } 4263 * return (current); 4264 * 4265 * Each jump to the next vnode is safe memory-wise and atomic with respect to 4266 * any modifications thanks to holding respective locks. 4267 * 4268 * The same guarantee can be provided with a combination of safe memory 4269 * reclamation and sequence counters instead. If all operations which affect 4270 * the relationship between the current vnode and the one we are looking for 4271 * also modify the counter, we can verify whether all the conditions held as 4272 * we made the jump. This includes things like permissions, mount points etc. 4273 * Counter modification is provided by enclosing relevant places in 4274 * vn_seqc_write_begin()/end() calls. 4275 * 4276 * Thus this translates to: 4277 * 4278 * vfs_smr_enter(); 4279 * dvp_seqc = seqc_read_any(dvp); 4280 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode 4281 * abort(); 4282 * for (;;) { 4283 * tvp = find(); 4284 * tvp_seqc = seqc_read_any(tvp); 4285 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode 4286 * abort(); 4287 * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode 4288 * abort(); 4289 * dvp = tvp; // we know nothing of importance has changed 4290 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration 4291 * if (last) 4292 * break; 4293 * } 4294 * vget(); // secure the vnode 4295 * if (!seqc_consistent(tvp, tvp_seqc) // final check 4296 * abort(); 4297 * // at this point we know nothing has changed for any parent<->child pair 4298 * // as they were crossed during the lookup, meaning we matched the guarantee 4299 * // of the locked variant 4300 * return (tvp); 4301 * 4302 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: 4303 * - they are called while within vfs_smr protection which they must never exit 4304 * - EAGAIN can be returned to denote checking could not be performed, it is 4305 * always valid to return it 4306 * - if the sequence counter has not changed the result must be valid 4307 * - if the sequence counter has changed both false positives and false negatives 4308 * are permitted (since the result will be rejected later) 4309 * - for simple cases of unix permission checks vaccess_vexec_smr can be used 4310 * 4311 * Caveats to watch out for: 4312 * - vnodes are passed unlocked and unreferenced with nothing stopping 4313 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised 4314 * to use atomic_load_ptr to fetch it. 4315 * - the aforementioned object can also get freed, meaning absent other means it 4316 * should be protected with vfs_smr 4317 * - either safely checking permissions as they are modified or guaranteeing 4318 * their stability is left to the routine 4319 */ 4320 int 4321 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, 4322 struct pwd **pwdp) 4323 { 4324 struct cache_fpl fpl; 4325 struct pwd *pwd; 4326 struct vnode *dvp; 4327 struct componentname *cnp; 4328 struct nameidata_saved orig; 4329 int error; 4330 4331 MPASS(ndp->ni_lcf == 0); 4332 4333 fpl.status = CACHE_FPL_STATUS_UNSET; 4334 fpl.ndp = ndp; 4335 fpl.cnp = &ndp->ni_cnd; 4336 MPASS(curthread == fpl.cnp->cn_thread); 4337 4338 if ((fpl.cnp->cn_flags & SAVESTART) != 0) 4339 MPASS(fpl.cnp->cn_nameiop != LOOKUP); 4340 4341 if (!cache_can_fplookup(&fpl)) { 4342 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 4343 *status = fpl.status; 4344 return (EOPNOTSUPP); 4345 } 4346 4347 cache_fpl_checkpoint(&fpl, &orig); 4348 4349 cache_fpl_smr_enter_initial(&fpl); 4350 fpl.fsearch = false; 4351 pwd = pwd_get_smr(); 4352 fpl.pwd = pwd; 4353 ndp->ni_rootdir = pwd->pwd_rdir; 4354 ndp->ni_topdir = pwd->pwd_jdir; 4355 4356 cnp = fpl.cnp; 4357 cnp->cn_nameptr = cnp->cn_pnbuf; 4358 if (cnp->cn_pnbuf[0] == '/') { 4359 cache_fpl_handle_root(ndp, &dvp); 4360 } else { 4361 if (ndp->ni_dirfd == AT_FDCWD) { 4362 dvp = pwd->pwd_cdir; 4363 } else { 4364 error = cache_fplookup_dirfd(&fpl, &dvp); 4365 if (__predict_false(error != 0)) { 4366 goto out; 4367 } 4368 } 4369 } 4370 4371 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); 4372 4373 error = cache_fplookup_impl(dvp, &fpl); 4374 out: 4375 cache_fpl_smr_assert_not_entered(&fpl); 4376 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 4377 4378 *status = fpl.status; 4379 switch (fpl.status) { 4380 case CACHE_FPL_STATUS_UNSET: 4381 __assert_unreachable(); 4382 break; 4383 case CACHE_FPL_STATUS_HANDLED: 4384 SDT_PROBE3(vfs, namei, lookup, return, error, 4385 (error == 0 ? ndp->ni_vp : NULL), true); 4386 break; 4387 case CACHE_FPL_STATUS_PARTIAL: 4388 *pwdp = fpl.pwd; 4389 /* 4390 * Status restored by cache_fplookup_partial_setup. 4391 */ 4392 break; 4393 case CACHE_FPL_STATUS_ABORTED: 4394 cache_fpl_restore(&fpl, &orig); 4395 break; 4396 } 4397 return (error); 4398 } 4399