1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ddb.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/capsicum.h> 46 #include <sys/counter.h> 47 #include <sys/filedesc.h> 48 #include <sys/fnv_hash.h> 49 #include <sys/kernel.h> 50 #include <sys/ktr.h> 51 #include <sys/lock.h> 52 #include <sys/malloc.h> 53 #include <sys/fcntl.h> 54 #include <sys/jail.h> 55 #include <sys/mount.h> 56 #include <sys/namei.h> 57 #include <sys/proc.h> 58 #include <sys/seqc.h> 59 #include <sys/sdt.h> 60 #include <sys/smr.h> 61 #include <sys/smp.h> 62 #include <sys/syscallsubr.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysproto.h> 65 #include <sys/vnode.h> 66 #include <ck_queue.h> 67 #ifdef KTRACE 68 #include <sys/ktrace.h> 69 #endif 70 71 #include <sys/capsicum.h> 72 73 #include <security/audit/audit.h> 74 #include <security/mac/mac_framework.h> 75 76 #ifdef DDB 77 #include <ddb/ddb.h> 78 #endif 79 80 #include <vm/uma.h> 81 82 SDT_PROVIDER_DECLARE(vfs); 83 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 84 "struct vnode *"); 85 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 86 "char *"); 87 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *", 88 "const char *"); 89 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *", 90 "struct namecache *", "int", "int"); 91 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 92 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 93 "char *", "struct vnode *"); 94 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 95 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 96 "struct vnode *", "char *"); 97 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 98 "struct vnode *"); 99 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 100 "struct vnode *", "char *"); 101 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 102 "char *"); 103 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", 104 "struct componentname *"); 105 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", 106 "struct componentname *"); 107 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 108 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 109 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 110 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 111 "struct vnode *"); 112 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 113 "char *"); 114 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *", 115 "char *"); 116 117 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); 118 SDT_PROBE_DECLARE(vfs, namei, lookup, entry); 119 SDT_PROBE_DECLARE(vfs, namei, lookup, return); 120 121 /* 122 * This structure describes the elements in the cache of recent 123 * names looked up by namei. 
124 */ 125 struct negstate { 126 u_char neg_flag; 127 }; 128 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), 129 "the state must fit in a union with a pointer without growing it"); 130 131 struct namecache { 132 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 133 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 134 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */ 135 struct vnode *nc_dvp; /* vnode of parent of name */ 136 union { 137 struct vnode *nu_vp; /* vnode the name refers to */ 138 struct negstate nu_neg;/* negative entry state */ 139 } n_un; 140 u_char nc_flag; /* flag bits */ 141 u_char nc_nlen; /* length of name */ 142 char nc_name[0]; /* segment name + nul */ 143 }; 144 145 /* 146 * struct namecache_ts repeats struct namecache layout up to the 147 * nc_nlen member. 148 * struct namecache_ts is used in place of struct namecache when time(s) need 149 * to be stored. The nc_dotdottime field is used when a cache entry is mapping 150 * both a non-dotdot directory name plus dotdot for the directory's 151 * parent. 152 * 153 * See below for alignment requirement. 154 */ 155 struct namecache_ts { 156 struct timespec nc_time; /* timespec provided by fs */ 157 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 158 int nc_ticks; /* ticks value when entry was added */ 159 struct namecache nc_nc; 160 }; 161 162 /* 163 * At least mips n32 performs 64-bit accesses to timespec as found 164 * in namecache_ts and requires them to be aligned. Since others 165 * may be in the same spot suffer a little bit and enforce the 166 * alignment for everyone. Note this is a nop for 64-bit platforms. 167 */ 168 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) 169 #define CACHE_PATH_CUTOFF 39 170 171 #define CACHE_ZONE_SMALL_SIZE (sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1) 172 #define CACHE_ZONE_SMALL_TS_SIZE (sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1) 173 #define CACHE_ZONE_LARGE_SIZE (sizeof(struct namecache) + NAME_MAX + 1) 174 #define CACHE_ZONE_LARGE_TS_SIZE (sizeof(struct namecache_ts) + NAME_MAX + 1) 175 176 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 177 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 178 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 179 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); 180 181 #define nc_vp n_un.nu_vp 182 #define nc_neg n_un.nu_neg 183 184 /* 185 * Flags in namecache.nc_flag 186 */ 187 #define NCF_WHITE 0x01 188 #define NCF_ISDOTDOT 0x02 189 #define NCF_TS 0x04 190 #define NCF_DTS 0x08 191 #define NCF_DVDROP 0x10 192 #define NCF_NEGATIVE 0x20 193 #define NCF_INVALID 0x40 194 #define NCF_WIP 0x80 195 196 /* 197 * Flags in negstate.neg_flag 198 */ 199 #define NEG_HOT 0x01 200 201 /* 202 * Mark an entry as invalid. 203 * 204 * This is called before it starts getting deconstructed. 205 */ 206 static void 207 cache_ncp_invalidate(struct namecache *ncp) 208 { 209 210 KASSERT((ncp->nc_flag & NCF_INVALID) == 0, 211 ("%s: entry %p already invalid", __func__, ncp)); 212 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); 213 atomic_thread_fence_rel(); 214 } 215 216 /* 217 * Check whether the entry can be safely used. 218 * 219 * All places which elide locks are supposed to call this after they are 220 * done with reading from an entry. 
221 */ 222 static bool 223 cache_ncp_canuse(struct namecache *ncp) 224 { 225 226 atomic_thread_fence_acq(); 227 return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0); 228 } 229 230 /* 231 * Name caching works as follows: 232 * 233 * Names found by directory scans are retained in a cache 234 * for future reference. It is managed LRU, so frequently 235 * used names will hang around. Cache is indexed by hash value 236 * obtained from (dvp, name) where dvp refers to the directory 237 * containing name. 238 * 239 * If it is a "negative" entry, (i.e. for a name that is known NOT to 240 * exist) the vnode pointer will be NULL. 241 * 242 * Upon reaching the last segment of a path, if the reference 243 * is for DELETE, or NOCACHE is set (rewrite), and the 244 * name is located in the cache, it will be dropped. 245 * 246 * These locks are used (in the order in which they can be taken): 247 * NAME TYPE ROLE 248 * vnodelock mtx vnode lists and v_cache_dd field protection 249 * bucketlock mtx for access to given set of hash buckets 250 * neglist mtx negative entry LRU management 251 * 252 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread 253 * shrinking the LRU list. 254 * 255 * It is legal to take multiple vnodelock and bucketlock locks. The locking 256 * order is lower address first. Both are recursive. 257 * 258 * "." lookups are lockless. 259 * 260 * ".." and vnode -> name lookups require vnodelock. 261 * 262 * name -> vnode lookup requires the relevant bucketlock to be held for reading. 263 * 264 * Insertions and removals of entries require involved vnodes and bucketlocks 265 * to be locked to provide safe operation against other threads modifying the 266 * cache. 267 * 268 * Some lookups result in removal of the found entry (e.g. getting rid of a 269 * negative entry with the intent to create a positive one), which poses a 270 * problem when multiple threads reach the state. Similarly, two different 271 * threads can purge two different vnodes and try to remove the same name. 272 * 273 * If the already held vnode lock is lower than the second required lock, we 274 * can just take the other lock. However, in the opposite case, this could 275 * deadlock. As such, this is resolved by trylocking and if that fails unlocking 276 * the first node, locking everything in order and revalidating the state. 277 */ 278 279 VFS_SMR_DECLARE; 280 281 /* 282 * Structures associated with name caching. 
283 */ 284 #define NCHHASH(hash) \ 285 (&nchashtbl[(hash) & nchash]) 286 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 287 static u_long __read_mostly nchash; /* size of hash table */ 288 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 289 "Size of namecache hash table"); 290 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ 291 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 292 "Ratio of negative namecache entries"); 293 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 294 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 295 u_int ncsizefactor = 2; 296 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, 297 "Size factor for namecache"); 298 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ 299 300 struct nchstats nchstats; /* cache effectiveness statistics */ 301 302 static bool __read_frequently cache_fast_revlookup = true; 303 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW, 304 &cache_fast_revlookup, 0, ""); 305 306 static struct mtx __exclusive_cache_line ncneg_shrink_lock; 307 308 struct neglist { 309 struct mtx nl_lock; 310 TAILQ_HEAD(, namecache) nl_list; 311 } __aligned(CACHE_LINE_SIZE); 312 313 static struct neglist __read_mostly *neglists; 314 static struct neglist ncneg_hot; 315 static u_long numhotneg; 316 317 #define ncneghash 3 318 #define numneglists (ncneghash + 1) 319 static inline struct neglist * 320 NCP2NEGLIST(struct namecache *ncp) 321 { 322 323 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 324 } 325 326 static inline struct negstate * 327 NCP2NEGSTATE(struct namecache *ncp) 328 { 329 330 MPASS(ncp->nc_flag & NCF_NEGATIVE); 331 return (&ncp->nc_neg); 332 } 333 334 #define numbucketlocks (ncbuckethash + 1) 335 static u_int __read_mostly ncbuckethash; 336 static struct mtx_padalign __read_mostly *bucketlocks; 337 #define HASH2BUCKETLOCK(hash) \ 338 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)])) 339 340 #define numvnodelocks (ncvnodehash + 1) 341 static u_int __read_mostly ncvnodehash; 342 static struct mtx __read_mostly *vnodelocks; 343 static inline struct mtx * 344 VP2VNODELOCK(struct vnode *vp) 345 { 346 347 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 348 } 349 350 /* 351 * UMA zones for the VFS cache. 352 * 353 * The small cache is used for entries with short names, which are the 354 * most common. The large cache is used for entries which are too big to 355 * fit in the small cache. 
356 */ 357 static uma_zone_t __read_mostly cache_zone_small; 358 static uma_zone_t __read_mostly cache_zone_small_ts; 359 static uma_zone_t __read_mostly cache_zone_large; 360 static uma_zone_t __read_mostly cache_zone_large_ts; 361 362 static struct namecache * 363 cache_alloc(int len, int ts) 364 { 365 struct namecache_ts *ncp_ts; 366 struct namecache *ncp; 367 368 if (__predict_false(ts)) { 369 if (len <= CACHE_PATH_CUTOFF) 370 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); 371 else 372 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); 373 ncp = &ncp_ts->nc_nc; 374 } else { 375 if (len <= CACHE_PATH_CUTOFF) 376 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); 377 else 378 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); 379 } 380 return (ncp); 381 } 382 383 static void 384 cache_free(struct namecache *ncp) 385 { 386 struct namecache_ts *ncp_ts; 387 388 MPASS(ncp != NULL); 389 if ((ncp->nc_flag & NCF_DVDROP) != 0) 390 vdrop(ncp->nc_dvp); 391 if (__predict_false(ncp->nc_flag & NCF_TS)) { 392 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 393 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 394 uma_zfree_smr(cache_zone_small_ts, ncp_ts); 395 else 396 uma_zfree_smr(cache_zone_large_ts, ncp_ts); 397 } else { 398 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 399 uma_zfree_smr(cache_zone_small, ncp); 400 else 401 uma_zfree_smr(cache_zone_large, ncp); 402 } 403 } 404 405 static void 406 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 407 { 408 struct namecache_ts *ncp_ts; 409 410 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 411 (tsp == NULL && ticksp == NULL), 412 ("No NCF_TS")); 413 414 if (tsp == NULL) 415 return; 416 417 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 418 *tsp = ncp_ts->nc_time; 419 *ticksp = ncp_ts->nc_ticks; 420 } 421 422 #ifdef DEBUG_CACHE 423 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 424 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 425 "VFS namecache enabled"); 426 #endif 427 428 /* Export size information to userland */ 429 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 430 sizeof(struct namecache), "sizeof(struct namecache)"); 431 432 /* 433 * The new name cache statistics 434 */ 435 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 436 "Name cache statistics"); 437 #define STATNODE_ULONG(name, descr) \ 438 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); 439 #define STATNODE_COUNTER(name, descr) \ 440 static COUNTER_U64_DEFINE_EARLY(name); \ 441 SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \ 442 descr); 443 STATNODE_ULONG(numneg, "Number of negative cache entries"); 444 STATNODE_ULONG(numcache, "Number of cache entries"); 445 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held"); 446 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit"); 447 STATNODE_COUNTER(dothits, "Number of '.' hits"); 448 STATNODE_COUNTER(dotdothits, "Number of '..' 
hits"); 449 STATNODE_COUNTER(nummiss, "Number of cache misses"); 450 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 451 STATNODE_COUNTER(numposzaps, 452 "Number of cache hits (positive) we do not want to cache"); 453 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 454 STATNODE_COUNTER(numnegzaps, 455 "Number of cache hits (negative) we do not want to cache"); 456 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 457 /* These count for vn_getcwd(), too. */ 458 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 459 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 460 STATNODE_COUNTER(numfullpathfail2, 461 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 462 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 463 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 464 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 465 "Number of successful removals after relocking"); 466 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 467 "Number of times zap_and_exit failed to lock"); 468 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 469 "Number of times zap_and_exit failed to lock"); 470 static long cache_lock_vnodes_cel_3_failures; 471 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 472 "Number of times 3-way vnode locking failed"); 473 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 474 STATNODE_COUNTER(numneg_evicted, 475 "Number of negative entries evicted when adding a new entry"); 476 STATNODE_COUNTER(shrinking_skipped, 477 "Number of times shrinking was already in progress"); 478 479 static void cache_zap_locked(struct namecache *ncp); 480 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, 481 char **freebuf, size_t *buflen); 482 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 483 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend); 484 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 485 char **retbuf, size_t *buflen); 486 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 487 char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 488 489 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 490 491 static inline void 492 cache_assert_vlp_locked(struct mtx *vlp) 493 { 494 495 if (vlp != NULL) 496 mtx_assert(vlp, MA_OWNED); 497 } 498 499 static inline void 500 cache_assert_vnode_locked(struct vnode *vp) 501 { 502 struct mtx *vlp; 503 504 vlp = VP2VNODELOCK(vp); 505 cache_assert_vlp_locked(vlp); 506 } 507 508 /* 509 * TODO: With the value stored we can do better than computing the hash based 510 * on the address. The choice of FNV should also be revisited. 
511 */ 512 static void 513 cache_prehash(struct vnode *vp) 514 { 515 516 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); 517 } 518 519 static uint32_t 520 cache_get_hash(char *name, u_char len, struct vnode *dvp) 521 { 522 523 return (fnv_32_buf(name, len, dvp->v_nchash)); 524 } 525 526 static inline struct nchashhead * 527 NCP2BUCKET(struct namecache *ncp) 528 { 529 uint32_t hash; 530 531 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 532 return (NCHHASH(hash)); 533 } 534 535 static inline struct mtx * 536 NCP2BUCKETLOCK(struct namecache *ncp) 537 { 538 uint32_t hash; 539 540 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 541 return (HASH2BUCKETLOCK(hash)); 542 } 543 544 #ifdef INVARIANTS 545 static void 546 cache_assert_bucket_locked(struct namecache *ncp) 547 { 548 struct mtx *blp; 549 550 blp = NCP2BUCKETLOCK(ncp); 551 mtx_assert(blp, MA_OWNED); 552 } 553 554 static void 555 cache_assert_bucket_unlocked(struct namecache *ncp) 556 { 557 struct mtx *blp; 558 559 blp = NCP2BUCKETLOCK(ncp); 560 mtx_assert(blp, MA_NOTOWNED); 561 } 562 #else 563 #define cache_assert_bucket_locked(x) do { } while (0) 564 #define cache_assert_bucket_unlocked(x) do { } while (0) 565 #endif 566 567 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 568 static void 569 _cache_sort_vnodes(void **p1, void **p2) 570 { 571 void *tmp; 572 573 MPASS(*p1 != NULL || *p2 != NULL); 574 575 if (*p1 > *p2) { 576 tmp = *p2; 577 *p2 = *p1; 578 *p1 = tmp; 579 } 580 } 581 582 static void 583 cache_lock_all_buckets(void) 584 { 585 u_int i; 586 587 for (i = 0; i < numbucketlocks; i++) 588 mtx_lock(&bucketlocks[i]); 589 } 590 591 static void 592 cache_unlock_all_buckets(void) 593 { 594 u_int i; 595 596 for (i = 0; i < numbucketlocks; i++) 597 mtx_unlock(&bucketlocks[i]); 598 } 599 600 static void 601 cache_lock_all_vnodes(void) 602 { 603 u_int i; 604 605 for (i = 0; i < numvnodelocks; i++) 606 mtx_lock(&vnodelocks[i]); 607 } 608 609 static void 610 cache_unlock_all_vnodes(void) 611 { 612 u_int i; 613 614 for (i = 0; i < numvnodelocks; i++) 615 mtx_unlock(&vnodelocks[i]); 616 } 617 618 static int 619 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 620 { 621 622 cache_sort_vnodes(&vlp1, &vlp2); 623 624 if (vlp1 != NULL) { 625 if (!mtx_trylock(vlp1)) 626 return (EAGAIN); 627 } 628 if (!mtx_trylock(vlp2)) { 629 if (vlp1 != NULL) 630 mtx_unlock(vlp1); 631 return (EAGAIN); 632 } 633 634 return (0); 635 } 636 637 static void 638 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 639 { 640 641 MPASS(vlp1 != NULL || vlp2 != NULL); 642 MPASS(vlp1 <= vlp2); 643 644 if (vlp1 != NULL) 645 mtx_lock(vlp1); 646 if (vlp2 != NULL) 647 mtx_lock(vlp2); 648 } 649 650 static void 651 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 652 { 653 654 MPASS(vlp1 != NULL || vlp2 != NULL); 655 656 if (vlp1 != NULL) 657 mtx_unlock(vlp1); 658 if (vlp2 != NULL) 659 mtx_unlock(vlp2); 660 } 661 662 static int 663 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 664 { 665 struct nchstats snap; 666 667 if (req->oldptr == NULL) 668 return (SYSCTL_OUT(req, 0, sizeof(snap))); 669 670 snap = nchstats; 671 snap.ncs_goodhits = counter_u64_fetch(numposhits); 672 snap.ncs_neghits = counter_u64_fetch(numneghits); 673 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 674 counter_u64_fetch(numnegzaps); 675 snap.ncs_miss = counter_u64_fetch(nummisszap) + 676 counter_u64_fetch(nummiss); 677 678 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 679 } 680 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, 
CTLTYPE_OPAQUE | CTLFLAG_RD | 681 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 682 "VFS cache effectiveness statistics"); 683 684 #ifdef DIAGNOSTIC 685 /* 686 * Grab an atomic snapshot of the name cache hash chain lengths 687 */ 688 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 689 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 690 "hash table stats"); 691 692 static int 693 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 694 { 695 struct nchashhead *ncpp; 696 struct namecache *ncp; 697 int i, error, n_nchash, *cntbuf; 698 699 retry: 700 n_nchash = nchash + 1; /* nchash is max index, not count */ 701 if (req->oldptr == NULL) 702 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 703 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 704 cache_lock_all_buckets(); 705 if (n_nchash != nchash + 1) { 706 cache_unlock_all_buckets(); 707 free(cntbuf, M_TEMP); 708 goto retry; 709 } 710 /* Scan hash tables counting entries */ 711 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 712 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) 713 cntbuf[i]++; 714 cache_unlock_all_buckets(); 715 for (error = 0, i = 0; i < n_nchash; i++) 716 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 717 break; 718 free(cntbuf, M_TEMP); 719 return (error); 720 } 721 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 722 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 723 "nchash chain lengths"); 724 725 static int 726 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 727 { 728 int error; 729 struct nchashhead *ncpp; 730 struct namecache *ncp; 731 int n_nchash; 732 int count, maxlength, used, pct; 733 734 if (!req->oldptr) 735 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 736 737 cache_lock_all_buckets(); 738 n_nchash = nchash + 1; /* nchash is max index, not count */ 739 used = 0; 740 maxlength = 0; 741 742 /* Scan hash tables for applicable entries */ 743 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 744 count = 0; 745 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { 746 count++; 747 } 748 if (count) 749 used++; 750 if (maxlength < count) 751 maxlength = count; 752 } 753 n_nchash = nchash + 1; 754 cache_unlock_all_buckets(); 755 pct = (used * 100) / (n_nchash / 100); 756 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 757 if (error) 758 return (error); 759 error = SYSCTL_OUT(req, &used, sizeof(used)); 760 if (error) 761 return (error); 762 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 763 if (error) 764 return (error); 765 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 766 if (error) 767 return (error); 768 return (0); 769 } 770 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 771 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 772 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 773 #endif 774 775 /* 776 * Negative entries management 777 * 778 * A variation of LRU scheme is used. New entries are hashed into one of 779 * numneglists cold lists. Entries get promoted to the hot list on first hit. 780 * 781 * The shrinker will demote hot list head and evict from the cold list in a 782 * round-robin manner. 
783 */ 784 static void 785 cache_negative_init(struct namecache *ncp) 786 { 787 struct negstate *negstate; 788 789 ncp->nc_flag |= NCF_NEGATIVE; 790 negstate = NCP2NEGSTATE(ncp); 791 negstate->neg_flag = 0; 792 } 793 794 static void 795 cache_negative_hit(struct namecache *ncp) 796 { 797 struct neglist *neglist; 798 struct negstate *negstate; 799 800 negstate = NCP2NEGSTATE(ncp); 801 if ((negstate->neg_flag & NEG_HOT) != 0) 802 return; 803 neglist = NCP2NEGLIST(ncp); 804 mtx_lock(&ncneg_hot.nl_lock); 805 mtx_lock(&neglist->nl_lock); 806 if ((negstate->neg_flag & NEG_HOT) == 0) { 807 numhotneg++; 808 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 809 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 810 negstate->neg_flag |= NEG_HOT; 811 } 812 mtx_unlock(&neglist->nl_lock); 813 mtx_unlock(&ncneg_hot.nl_lock); 814 } 815 816 static void 817 cache_negative_insert(struct namecache *ncp) 818 { 819 struct neglist *neglist; 820 821 MPASS(ncp->nc_flag & NCF_NEGATIVE); 822 cache_assert_bucket_locked(ncp); 823 neglist = NCP2NEGLIST(ncp); 824 mtx_lock(&neglist->nl_lock); 825 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 826 mtx_unlock(&neglist->nl_lock); 827 atomic_add_long(&numneg, 1); 828 } 829 830 static void 831 cache_negative_remove(struct namecache *ncp) 832 { 833 struct neglist *neglist; 834 struct negstate *negstate; 835 bool hot_locked = false; 836 bool list_locked = false; 837 838 cache_assert_bucket_locked(ncp); 839 neglist = NCP2NEGLIST(ncp); 840 negstate = NCP2NEGSTATE(ncp); 841 if ((negstate->neg_flag & NEG_HOT) != 0) { 842 hot_locked = true; 843 mtx_lock(&ncneg_hot.nl_lock); 844 if ((negstate->neg_flag & NEG_HOT) == 0) { 845 list_locked = true; 846 mtx_lock(&neglist->nl_lock); 847 } 848 } else { 849 list_locked = true; 850 mtx_lock(&neglist->nl_lock); 851 /* 852 * We may be racing against promotion in lockless lookup. 
853 */ 854 if ((negstate->neg_flag & NEG_HOT) != 0) { 855 mtx_unlock(&neglist->nl_lock); 856 hot_locked = true; 857 mtx_lock(&ncneg_hot.nl_lock); 858 mtx_lock(&neglist->nl_lock); 859 } 860 } 861 if ((negstate->neg_flag & NEG_HOT) != 0) { 862 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); 863 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 864 numhotneg--; 865 } else { 866 mtx_assert(&neglist->nl_lock, MA_OWNED); 867 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 868 } 869 if (list_locked) 870 mtx_unlock(&neglist->nl_lock); 871 if (hot_locked) 872 mtx_unlock(&ncneg_hot.nl_lock); 873 atomic_subtract_long(&numneg, 1); 874 } 875 876 static void 877 cache_negative_shrink_select(struct namecache **ncpp, 878 struct neglist **neglistpp) 879 { 880 struct neglist *neglist; 881 struct namecache *ncp; 882 static u_int cycle; 883 u_int i; 884 885 *ncpp = ncp = NULL; 886 887 for (i = 0; i < numneglists; i++) { 888 neglist = &neglists[(cycle + i) % numneglists]; 889 if (TAILQ_FIRST(&neglist->nl_list) == NULL) 890 continue; 891 mtx_lock(&neglist->nl_lock); 892 ncp = TAILQ_FIRST(&neglist->nl_list); 893 if (ncp != NULL) 894 break; 895 mtx_unlock(&neglist->nl_lock); 896 } 897 898 *neglistpp = neglist; 899 *ncpp = ncp; 900 cycle++; 901 } 902 903 static void 904 cache_negative_zap_one(void) 905 { 906 struct namecache *ncp, *ncp2; 907 struct neglist *neglist; 908 struct negstate *negstate; 909 struct mtx *dvlp; 910 struct mtx *blp; 911 912 if (mtx_owner(&ncneg_shrink_lock) != NULL || 913 !mtx_trylock(&ncneg_shrink_lock)) { 914 counter_u64_add(shrinking_skipped, 1); 915 return; 916 } 917 918 mtx_lock(&ncneg_hot.nl_lock); 919 ncp = TAILQ_FIRST(&ncneg_hot.nl_list); 920 if (ncp != NULL) { 921 neglist = NCP2NEGLIST(ncp); 922 negstate = NCP2NEGSTATE(ncp); 923 mtx_lock(&neglist->nl_lock); 924 MPASS((negstate->neg_flag & NEG_HOT) != 0); 925 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 926 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 927 negstate->neg_flag &= ~NEG_HOT; 928 numhotneg--; 929 mtx_unlock(&neglist->nl_lock); 930 } 931 mtx_unlock(&ncneg_hot.nl_lock); 932 933 cache_negative_shrink_select(&ncp, &neglist); 934 935 mtx_unlock(&ncneg_shrink_lock); 936 if (ncp == NULL) 937 return; 938 939 MPASS(ncp->nc_flag & NCF_NEGATIVE); 940 dvlp = VP2VNODELOCK(ncp->nc_dvp); 941 blp = NCP2BUCKETLOCK(ncp); 942 mtx_unlock(&neglist->nl_lock); 943 mtx_lock(dvlp); 944 mtx_lock(blp); 945 /* 946 * Enter SMR to safely check the negative list. 947 * Even if the found pointer matches, the entry may now be reallocated 948 * and used by a different vnode. 949 */ 950 vfs_smr_enter(); 951 ncp2 = TAILQ_FIRST(&neglist->nl_list); 952 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || 953 blp != NCP2BUCKETLOCK(ncp2)) { 954 vfs_smr_exit(); 955 ncp = NULL; 956 } else { 957 vfs_smr_exit(); 958 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp, 959 ncp->nc_name); 960 cache_zap_locked(ncp); 961 counter_u64_add(numneg_evicted, 1); 962 } 963 mtx_unlock(blp); 964 mtx_unlock(dvlp); 965 if (ncp != NULL) 966 cache_free(ncp); 967 } 968 969 /* 970 * cache_zap_locked(): 971 * 972 * Removes a namecache entry from cache, whether it contains an actual 973 * pointer to a vnode or if it is just a negative cache entry. 
974 */ 975 static void 976 cache_zap_locked(struct namecache *ncp) 977 { 978 struct nchashhead *ncpp; 979 980 if (!(ncp->nc_flag & NCF_NEGATIVE)) 981 cache_assert_vnode_locked(ncp->nc_vp); 982 cache_assert_vnode_locked(ncp->nc_dvp); 983 cache_assert_bucket_locked(ncp); 984 985 cache_ncp_invalidate(ncp); 986 987 ncpp = NCP2BUCKET(ncp); 988 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); 989 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 990 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 991 ncp->nc_name, ncp->nc_vp); 992 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 993 if (ncp == ncp->nc_vp->v_cache_dd) { 994 vn_seqc_write_begin_unheld(ncp->nc_vp); 995 ncp->nc_vp->v_cache_dd = NULL; 996 vn_seqc_write_end(ncp->nc_vp); 997 } 998 } else { 999 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 1000 ncp->nc_name); 1001 cache_negative_remove(ncp); 1002 } 1003 if (ncp->nc_flag & NCF_ISDOTDOT) { 1004 if (ncp == ncp->nc_dvp->v_cache_dd) { 1005 vn_seqc_write_begin_unheld(ncp->nc_dvp); 1006 ncp->nc_dvp->v_cache_dd = NULL; 1007 vn_seqc_write_end(ncp->nc_dvp); 1008 } 1009 } else { 1010 LIST_REMOVE(ncp, nc_src); 1011 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 1012 ncp->nc_flag |= NCF_DVDROP; 1013 counter_u64_add(numcachehv, -1); 1014 } 1015 } 1016 atomic_subtract_long(&numcache, 1); 1017 } 1018 1019 static void 1020 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 1021 { 1022 struct mtx *blp; 1023 1024 MPASS(ncp->nc_dvp == vp); 1025 MPASS(ncp->nc_flag & NCF_NEGATIVE); 1026 cache_assert_vnode_locked(vp); 1027 1028 blp = NCP2BUCKETLOCK(ncp); 1029 mtx_lock(blp); 1030 cache_zap_locked(ncp); 1031 mtx_unlock(blp); 1032 } 1033 1034 static bool 1035 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 1036 struct mtx **vlpp) 1037 { 1038 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1039 struct mtx *blp; 1040 1041 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1042 cache_assert_vnode_locked(vp); 1043 1044 if (ncp->nc_flag & NCF_NEGATIVE) { 1045 if (*vlpp != NULL) { 1046 mtx_unlock(*vlpp); 1047 *vlpp = NULL; 1048 } 1049 cache_zap_negative_locked_vnode_kl(ncp, vp); 1050 return (true); 1051 } 1052 1053 pvlp = VP2VNODELOCK(vp); 1054 blp = NCP2BUCKETLOCK(ncp); 1055 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1056 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1057 1058 if (*vlpp == vlp1 || *vlpp == vlp2) { 1059 to_unlock = *vlpp; 1060 *vlpp = NULL; 1061 } else { 1062 if (*vlpp != NULL) { 1063 mtx_unlock(*vlpp); 1064 *vlpp = NULL; 1065 } 1066 cache_sort_vnodes(&vlp1, &vlp2); 1067 if (vlp1 == pvlp) { 1068 mtx_lock(vlp2); 1069 to_unlock = vlp2; 1070 } else { 1071 if (!mtx_trylock(vlp1)) 1072 goto out_relock; 1073 to_unlock = vlp1; 1074 } 1075 } 1076 mtx_lock(blp); 1077 cache_zap_locked(ncp); 1078 mtx_unlock(blp); 1079 if (to_unlock != NULL) 1080 mtx_unlock(to_unlock); 1081 return (true); 1082 1083 out_relock: 1084 mtx_unlock(vlp2); 1085 mtx_lock(vlp1); 1086 mtx_lock(vlp2); 1087 MPASS(*vlpp == NULL); 1088 *vlpp = vlp1; 1089 return (false); 1090 } 1091 1092 /* 1093 * If trylocking failed we can get here. We know enough to take all needed locks 1094 * in the right order and re-lookup the entry. 
1095 */ 1096 static int 1097 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1098 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1099 struct mtx *blp) 1100 { 1101 struct namecache *rncp; 1102 1103 cache_assert_bucket_unlocked(ncp); 1104 1105 cache_sort_vnodes(&dvlp, &vlp); 1106 cache_lock_vnodes(dvlp, vlp); 1107 mtx_lock(blp); 1108 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1109 if (rncp == ncp && rncp->nc_dvp == dvp && 1110 rncp->nc_nlen == cnp->cn_namelen && 1111 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1112 break; 1113 } 1114 if (rncp != NULL) { 1115 cache_zap_locked(rncp); 1116 mtx_unlock(blp); 1117 cache_unlock_vnodes(dvlp, vlp); 1118 counter_u64_add(zap_and_exit_bucket_relock_success, 1); 1119 return (0); 1120 } 1121 1122 mtx_unlock(blp); 1123 cache_unlock_vnodes(dvlp, vlp); 1124 return (EAGAIN); 1125 } 1126 1127 static int __noinline 1128 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp, 1129 uint32_t hash, struct mtx *blp) 1130 { 1131 struct mtx *dvlp, *vlp; 1132 struct vnode *dvp; 1133 1134 cache_assert_bucket_locked(ncp); 1135 1136 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1137 vlp = NULL; 1138 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1139 vlp = VP2VNODELOCK(ncp->nc_vp); 1140 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1141 cache_zap_locked(ncp); 1142 mtx_unlock(blp); 1143 cache_unlock_vnodes(dvlp, vlp); 1144 return (0); 1145 } 1146 1147 dvp = ncp->nc_dvp; 1148 mtx_unlock(blp); 1149 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1150 } 1151 1152 static __noinline int 1153 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) 1154 { 1155 struct namecache *ncp; 1156 struct mtx *blp; 1157 struct mtx *dvlp, *dvlp2; 1158 uint32_t hash; 1159 int error; 1160 1161 if (cnp->cn_namelen == 2 && 1162 cnp->cn_nameptr[0] == '.' 
&& cnp->cn_nameptr[1] == '.') { 1163 dvlp = VP2VNODELOCK(dvp); 1164 dvlp2 = NULL; 1165 mtx_lock(dvlp); 1166 retry_dotdot: 1167 ncp = dvp->v_cache_dd; 1168 if (ncp == NULL) { 1169 mtx_unlock(dvlp); 1170 if (dvlp2 != NULL) 1171 mtx_unlock(dvlp2); 1172 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1173 return (0); 1174 } 1175 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1176 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) 1177 goto retry_dotdot; 1178 MPASS(dvp->v_cache_dd == NULL); 1179 mtx_unlock(dvlp); 1180 if (dvlp2 != NULL) 1181 mtx_unlock(dvlp2); 1182 cache_free(ncp); 1183 } else { 1184 vn_seqc_write_begin(dvp); 1185 dvp->v_cache_dd = NULL; 1186 vn_seqc_write_end(dvp); 1187 mtx_unlock(dvlp); 1188 if (dvlp2 != NULL) 1189 mtx_unlock(dvlp2); 1190 } 1191 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1192 return (1); 1193 } 1194 1195 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1196 blp = HASH2BUCKETLOCK(hash); 1197 retry: 1198 if (CK_SLIST_EMPTY(NCHHASH(hash))) 1199 goto out_no_entry; 1200 1201 mtx_lock(blp); 1202 1203 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1204 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1205 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1206 break; 1207 } 1208 1209 if (ncp == NULL) { 1210 mtx_unlock(blp); 1211 goto out_no_entry; 1212 } 1213 1214 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1215 if (__predict_false(error != 0)) { 1216 zap_and_exit_bucket_fail++; 1217 goto retry; 1218 } 1219 counter_u64_add(numposzaps, 1); 1220 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); 1221 cache_free(ncp); 1222 return (1); 1223 out_no_entry: 1224 counter_u64_add(nummisszap, 1); 1225 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); 1226 return (0); 1227 } 1228 1229 static int __noinline 1230 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1231 struct timespec *tsp, int *ticksp) 1232 { 1233 int ltype; 1234 1235 *vpp = dvp; 1236 counter_u64_add(dothits, 1); 1237 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1238 if (tsp != NULL) 1239 timespecclear(tsp); 1240 if (ticksp != NULL) 1241 *ticksp = ticks; 1242 vrefact(*vpp); 1243 /* 1244 * When we lookup "." we still can be asked to lock it 1245 * differently... 
1246 */ 1247 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1248 if (ltype != VOP_ISLOCKED(*vpp)) { 1249 if (ltype == LK_EXCLUSIVE) { 1250 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1251 if (VN_IS_DOOMED((*vpp))) { 1252 /* forced unmount */ 1253 vrele(*vpp); 1254 *vpp = NULL; 1255 return (ENOENT); 1256 } 1257 } else 1258 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1259 } 1260 return (-1); 1261 } 1262 1263 static int __noinline 1264 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1265 struct timespec *tsp, int *ticksp) 1266 { 1267 struct namecache_ts *ncp_ts; 1268 struct namecache *ncp; 1269 struct mtx *dvlp; 1270 enum vgetstate vs; 1271 int error, ltype; 1272 bool whiteout; 1273 1274 MPASS((cnp->cn_flags & ISDOTDOT) != 0); 1275 1276 if ((cnp->cn_flags & MAKEENTRY) == 0) { 1277 cache_remove_cnp(dvp, cnp); 1278 return (0); 1279 } 1280 1281 counter_u64_add(dotdothits, 1); 1282 retry: 1283 dvlp = VP2VNODELOCK(dvp); 1284 mtx_lock(dvlp); 1285 ncp = dvp->v_cache_dd; 1286 if (ncp == NULL) { 1287 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL); 1288 mtx_unlock(dvlp); 1289 return (0); 1290 } 1291 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1292 if (ncp->nc_flag & NCF_NEGATIVE) 1293 *vpp = NULL; 1294 else 1295 *vpp = ncp->nc_vp; 1296 } else 1297 *vpp = ncp->nc_dvp; 1298 if (*vpp == NULL) 1299 goto negative_success; 1300 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); 1301 cache_out_ts(ncp, tsp, ticksp); 1302 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1303 NCF_DTS && tsp != NULL) { 1304 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1305 *tsp = ncp_ts->nc_dotdottime; 1306 } 1307 1308 MPASS(dvp != *vpp); 1309 ltype = VOP_ISLOCKED(dvp); 1310 VOP_UNLOCK(dvp); 1311 vs = vget_prep(*vpp); 1312 mtx_unlock(dvlp); 1313 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1314 vn_lock(dvp, ltype | LK_RETRY); 1315 if (VN_IS_DOOMED(dvp)) { 1316 if (error == 0) 1317 vput(*vpp); 1318 *vpp = NULL; 1319 return (ENOENT); 1320 } 1321 if (error) { 1322 *vpp = NULL; 1323 goto retry; 1324 } 1325 return (-1); 1326 negative_success: 1327 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1328 if (cnp->cn_flags & ISLASTCN) { 1329 counter_u64_add(numnegzaps, 1); 1330 cache_zap_negative_locked_vnode_kl(ncp, dvp); 1331 mtx_unlock(dvlp); 1332 cache_free(ncp); 1333 return (0); 1334 } 1335 } 1336 1337 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 1338 cache_out_ts(ncp, tsp, ticksp); 1339 counter_u64_add(numneghits, 1); 1340 whiteout = (ncp->nc_flag & NCF_WHITE); 1341 cache_negative_hit(ncp); 1342 mtx_unlock(dvlp); 1343 if (whiteout) 1344 cnp->cn_flags |= ISWHITEOUT; 1345 return (ENOENT); 1346 } 1347 1348 /** 1349 * Lookup a name in the name cache 1350 * 1351 * # Arguments 1352 * 1353 * - dvp: Parent directory in which to search. 1354 * - vpp: Return argument. Will contain desired vnode on cache hit. 1355 * - cnp: Parameters of the name search. The most interesting bits of 1356 * the cn_flags field have the following meanings: 1357 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1358 * it up. 1359 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1360 * - tsp: Return storage for cache timestamp. On a successful (positive 1361 * or negative) lookup, tsp will be filled with any timespec that 1362 * was stored when this cache entry was created. However, it will 1363 * be clear for "." entries. 1364 * - ticks: Return storage for alternate cache timestamp. 
On a successful 1365 * (positive or negative) lookup, it will contain the ticks value 1366 * that was current when the cache entry was created, unless cnp 1367 * was ".". 1368 * 1369 * Either both tsp and ticks have to be provided or neither of them. 1370 * 1371 * # Returns 1372 * 1373 * - -1: A positive cache hit. vpp will contain the desired vnode. 1374 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1375 * to a forced unmount. vpp will not be modified. If the entry 1376 * is a whiteout, then the ISWHITEOUT flag will be set in 1377 * cnp->cn_flags. 1378 * - 0: A cache miss. vpp will not be modified. 1379 * 1380 * # Locking 1381 * 1382 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1383 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1384 * lock is not recursively acquired. 1385 */ 1386 static int __noinline 1387 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1388 struct timespec *tsp, int *ticksp) 1389 { 1390 struct namecache *ncp; 1391 struct mtx *blp; 1392 uint32_t hash; 1393 enum vgetstate vs; 1394 int error; 1395 bool whiteout; 1396 1397 MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY); 1398 1399 retry: 1400 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1401 blp = HASH2BUCKETLOCK(hash); 1402 mtx_lock(blp); 1403 1404 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1405 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1406 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1407 break; 1408 } 1409 1410 if (__predict_false(ncp == NULL)) { 1411 mtx_unlock(blp); 1412 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1413 NULL); 1414 counter_u64_add(nummiss, 1); 1415 return (0); 1416 } 1417 1418 if (ncp->nc_flag & NCF_NEGATIVE) 1419 goto negative_success; 1420 1421 counter_u64_add(numposhits, 1); 1422 *vpp = ncp->nc_vp; 1423 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1424 cache_out_ts(ncp, tsp, ticksp); 1425 MPASS(dvp != *vpp); 1426 vs = vget_prep(*vpp); 1427 mtx_unlock(blp); 1428 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1429 if (error) { 1430 *vpp = NULL; 1431 goto retry; 1432 } 1433 return (-1); 1434 negative_success: 1435 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1436 if (cnp->cn_flags & ISLASTCN) { 1437 counter_u64_add(numnegzaps, 1); 1438 error = cache_zap_locked_bucket(ncp, cnp, hash, blp); 1439 if (__predict_false(error != 0)) { 1440 zap_and_exit_bucket_fail2++; 1441 goto retry; 1442 } 1443 cache_free(ncp); 1444 return (0); 1445 } 1446 } 1447 1448 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 1449 cache_out_ts(ncp, tsp, ticksp); 1450 counter_u64_add(numneghits, 1); 1451 whiteout = (ncp->nc_flag & NCF_WHITE); 1452 cache_negative_hit(ncp); 1453 mtx_unlock(blp); 1454 if (whiteout) 1455 cnp->cn_flags |= ISWHITEOUT; 1456 return (ENOENT); 1457 } 1458 1459 int 1460 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1461 struct timespec *tsp, int *ticksp) 1462 { 1463 struct namecache *ncp; 1464 struct negstate *negstate; 1465 uint32_t hash; 1466 enum vgetstate vs; 1467 int error; 1468 bool whiteout; 1469 u_short nc_flag; 1470 1471 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); 1472 1473 #ifdef DEBUG_CACHE 1474 if (__predict_false(!doingcache)) { 1475 cnp->cn_flags &= ~MAKEENTRY; 1476 return (0); 1477 } 1478 #endif 1479 1480 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1481 if 
(cnp->cn_namelen == 1) 1482 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1483 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') 1484 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); 1485 } 1486 1487 MPASS((cnp->cn_flags & ISDOTDOT) == 0); 1488 1489 if ((cnp->cn_flags & MAKEENTRY) == 0) { 1490 cache_remove_cnp(dvp, cnp); 1491 return (0); 1492 } 1493 1494 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1495 vfs_smr_enter(); 1496 1497 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1498 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1499 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1500 break; 1501 } 1502 1503 if (__predict_false(ncp == NULL)) { 1504 vfs_smr_exit(); 1505 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1506 NULL); 1507 counter_u64_add(nummiss, 1); 1508 return (0); 1509 } 1510 1511 nc_flag = atomic_load_char(&ncp->nc_flag); 1512 if (nc_flag & NCF_NEGATIVE) 1513 goto negative_success; 1514 1515 counter_u64_add(numposhits, 1); 1516 *vpp = ncp->nc_vp; 1517 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); 1518 cache_out_ts(ncp, tsp, ticksp); 1519 MPASS(dvp != *vpp); 1520 if (!cache_ncp_canuse(ncp)) { 1521 vfs_smr_exit(); 1522 *vpp = NULL; 1523 goto out_fallback; 1524 } 1525 vs = vget_prep_smr(*vpp); 1526 vfs_smr_exit(); 1527 if (__predict_false(vs == VGET_NONE)) { 1528 *vpp = NULL; 1529 goto out_fallback; 1530 } 1531 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1532 if (error) { 1533 *vpp = NULL; 1534 goto out_fallback; 1535 } 1536 return (-1); 1537 negative_success: 1538 if (__predict_false(cnp->cn_nameiop == CREATE)) { 1539 if (cnp->cn_flags & ISLASTCN) { 1540 vfs_smr_exit(); 1541 goto out_fallback; 1542 } 1543 } 1544 1545 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 1546 cache_out_ts(ncp, tsp, ticksp); 1547 counter_u64_add(numneghits, 1); 1548 whiteout = (ncp->nc_flag & NCF_WHITE); 1549 /* 1550 * TODO: We need to take locks to promote an entry. Code doing it 1551 * in SMR lookup can be modified to be shared. 
1552 */ 1553 negstate = NCP2NEGSTATE(ncp); 1554 if ((negstate->neg_flag & NEG_HOT) == 0 || 1555 !cache_ncp_canuse(ncp)) { 1556 vfs_smr_exit(); 1557 goto out_fallback; 1558 } 1559 vfs_smr_exit(); 1560 if (whiteout) 1561 cnp->cn_flags |= ISWHITEOUT; 1562 return (ENOENT); 1563 out_fallback: 1564 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp)); 1565 } 1566 1567 struct celockstate { 1568 struct mtx *vlp[3]; 1569 struct mtx *blp[2]; 1570 }; 1571 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1572 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1573 1574 static inline void 1575 cache_celockstate_init(struct celockstate *cel) 1576 { 1577 1578 bzero(cel, sizeof(*cel)); 1579 } 1580 1581 static void 1582 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1583 struct vnode *dvp) 1584 { 1585 struct mtx *vlp1, *vlp2; 1586 1587 MPASS(cel->vlp[0] == NULL); 1588 MPASS(cel->vlp[1] == NULL); 1589 MPASS(cel->vlp[2] == NULL); 1590 1591 MPASS(vp != NULL || dvp != NULL); 1592 1593 vlp1 = VP2VNODELOCK(vp); 1594 vlp2 = VP2VNODELOCK(dvp); 1595 cache_sort_vnodes(&vlp1, &vlp2); 1596 1597 if (vlp1 != NULL) { 1598 mtx_lock(vlp1); 1599 cel->vlp[0] = vlp1; 1600 } 1601 mtx_lock(vlp2); 1602 cel->vlp[1] = vlp2; 1603 } 1604 1605 static void 1606 cache_unlock_vnodes_cel(struct celockstate *cel) 1607 { 1608 1609 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1610 1611 if (cel->vlp[0] != NULL) 1612 mtx_unlock(cel->vlp[0]); 1613 if (cel->vlp[1] != NULL) 1614 mtx_unlock(cel->vlp[1]); 1615 if (cel->vlp[2] != NULL) 1616 mtx_unlock(cel->vlp[2]); 1617 } 1618 1619 static bool 1620 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1621 { 1622 struct mtx *vlp; 1623 bool ret; 1624 1625 cache_assert_vlp_locked(cel->vlp[0]); 1626 cache_assert_vlp_locked(cel->vlp[1]); 1627 MPASS(cel->vlp[2] == NULL); 1628 1629 MPASS(vp != NULL); 1630 vlp = VP2VNODELOCK(vp); 1631 1632 ret = true; 1633 if (vlp >= cel->vlp[1]) { 1634 mtx_lock(vlp); 1635 } else { 1636 if (mtx_trylock(vlp)) 1637 goto out; 1638 cache_lock_vnodes_cel_3_failures++; 1639 cache_unlock_vnodes_cel(cel); 1640 if (vlp < cel->vlp[0]) { 1641 mtx_lock(vlp); 1642 mtx_lock(cel->vlp[0]); 1643 mtx_lock(cel->vlp[1]); 1644 } else { 1645 if (cel->vlp[0] != NULL) 1646 mtx_lock(cel->vlp[0]); 1647 mtx_lock(vlp); 1648 mtx_lock(cel->vlp[1]); 1649 } 1650 ret = false; 1651 } 1652 out: 1653 cel->vlp[2] = vlp; 1654 return (ret); 1655 } 1656 1657 static void 1658 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, 1659 struct mtx *blp2) 1660 { 1661 1662 MPASS(cel->blp[0] == NULL); 1663 MPASS(cel->blp[1] == NULL); 1664 1665 cache_sort_vnodes(&blp1, &blp2); 1666 1667 if (blp1 != NULL) { 1668 mtx_lock(blp1); 1669 cel->blp[0] = blp1; 1670 } 1671 mtx_lock(blp2); 1672 cel->blp[1] = blp2; 1673 } 1674 1675 static void 1676 cache_unlock_buckets_cel(struct celockstate *cel) 1677 { 1678 1679 if (cel->blp[0] != NULL) 1680 mtx_unlock(cel->blp[0]); 1681 mtx_unlock(cel->blp[1]); 1682 } 1683 1684 /* 1685 * Lock part of the cache affected by the insertion. 1686 * 1687 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1688 * However, insertion can result in removal of an old entry. In this 1689 * case we have an additional vnode and bucketlock pair to lock. 1690 * 1691 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1692 * preserving the locking order (smaller address first). 
1693 */ 1694 static void 1695 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1696 uint32_t hash) 1697 { 1698 struct namecache *ncp; 1699 struct mtx *blps[2]; 1700 1701 blps[0] = HASH2BUCKETLOCK(hash); 1702 for (;;) { 1703 blps[1] = NULL; 1704 cache_lock_vnodes_cel(cel, dvp, vp); 1705 if (vp == NULL || vp->v_type != VDIR) 1706 break; 1707 ncp = vp->v_cache_dd; 1708 if (ncp == NULL) 1709 break; 1710 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1711 break; 1712 MPASS(ncp->nc_dvp == vp); 1713 blps[1] = NCP2BUCKETLOCK(ncp); 1714 if (ncp->nc_flag & NCF_NEGATIVE) 1715 break; 1716 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1717 break; 1718 /* 1719 * All vnodes got re-locked. Re-validate the state and if 1720 * nothing changed we are done. Otherwise restart. 1721 */ 1722 if (ncp == vp->v_cache_dd && 1723 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1724 blps[1] == NCP2BUCKETLOCK(ncp) && 1725 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1726 break; 1727 cache_unlock_vnodes_cel(cel); 1728 cel->vlp[0] = NULL; 1729 cel->vlp[1] = NULL; 1730 cel->vlp[2] = NULL; 1731 } 1732 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1733 } 1734 1735 static void 1736 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1737 uint32_t hash) 1738 { 1739 struct namecache *ncp; 1740 struct mtx *blps[2]; 1741 1742 blps[0] = HASH2BUCKETLOCK(hash); 1743 for (;;) { 1744 blps[1] = NULL; 1745 cache_lock_vnodes_cel(cel, dvp, vp); 1746 ncp = dvp->v_cache_dd; 1747 if (ncp == NULL) 1748 break; 1749 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1750 break; 1751 MPASS(ncp->nc_dvp == dvp); 1752 blps[1] = NCP2BUCKETLOCK(ncp); 1753 if (ncp->nc_flag & NCF_NEGATIVE) 1754 break; 1755 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1756 break; 1757 if (ncp == dvp->v_cache_dd && 1758 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1759 blps[1] == NCP2BUCKETLOCK(ncp) && 1760 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1761 break; 1762 cache_unlock_vnodes_cel(cel); 1763 cel->vlp[0] = NULL; 1764 cel->vlp[1] = NULL; 1765 cel->vlp[2] = NULL; 1766 } 1767 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1768 } 1769 1770 static void 1771 cache_enter_unlock(struct celockstate *cel) 1772 { 1773 1774 cache_unlock_buckets_cel(cel); 1775 cache_unlock_vnodes_cel(cel); 1776 } 1777 1778 static void __noinline 1779 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 1780 struct componentname *cnp) 1781 { 1782 struct celockstate cel; 1783 struct namecache *ncp; 1784 uint32_t hash; 1785 int len; 1786 1787 if (dvp->v_cache_dd == NULL) 1788 return; 1789 len = cnp->cn_namelen; 1790 cache_celockstate_init(&cel); 1791 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1792 cache_enter_lock_dd(&cel, dvp, vp, hash); 1793 vn_seqc_write_begin(dvp); 1794 ncp = dvp->v_cache_dd; 1795 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 1796 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 1797 cache_zap_locked(ncp); 1798 } else { 1799 ncp = NULL; 1800 } 1801 dvp->v_cache_dd = NULL; 1802 vn_seqc_write_end(dvp); 1803 cache_enter_unlock(&cel); 1804 if (ncp != NULL) 1805 cache_free(ncp); 1806 } 1807 1808 /* 1809 * Add an entry to the cache. 
1810 */ 1811 void 1812 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 1813 struct timespec *tsp, struct timespec *dtsp) 1814 { 1815 struct celockstate cel; 1816 struct namecache *ncp, *n2, *ndd; 1817 struct namecache_ts *ncp_ts; 1818 struct nchashhead *ncpp; 1819 uint32_t hash; 1820 int flag; 1821 int len; 1822 u_long lnumcache; 1823 1824 VNPASS(!VN_IS_DOOMED(dvp), dvp); 1825 VNPASS(dvp->v_type != VNON, dvp); 1826 if (vp != NULL) { 1827 VNPASS(!VN_IS_DOOMED(vp), vp); 1828 VNPASS(vp->v_type != VNON, vp); 1829 } 1830 1831 #ifdef DEBUG_CACHE 1832 if (__predict_false(!doingcache)) 1833 return; 1834 #endif 1835 1836 flag = 0; 1837 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1838 if (cnp->cn_namelen == 1) 1839 return; 1840 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 1841 cache_enter_dotdot_prep(dvp, vp, cnp); 1842 flag = NCF_ISDOTDOT; 1843 } 1844 } 1845 1846 /* 1847 * Avoid blowout in namecache entries. 1848 */ 1849 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1850 if (__predict_false(lnumcache >= ncsize)) { 1851 atomic_subtract_long(&numcache, 1); 1852 counter_u64_add(numdrops, 1); 1853 return; 1854 } 1855 1856 cache_celockstate_init(&cel); 1857 ndd = NULL; 1858 ncp_ts = NULL; 1859 1860 /* 1861 * Calculate the hash key and setup as much of the new 1862 * namecache entry as possible before acquiring the lock. 1863 */ 1864 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1865 ncp->nc_flag = flag | NCF_WIP; 1866 ncp->nc_vp = vp; 1867 if (vp == NULL) 1868 cache_negative_init(ncp); 1869 ncp->nc_dvp = dvp; 1870 if (tsp != NULL) { 1871 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1872 ncp_ts->nc_time = *tsp; 1873 ncp_ts->nc_ticks = ticks; 1874 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1875 if (dtsp != NULL) { 1876 ncp_ts->nc_dotdottime = *dtsp; 1877 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1878 } 1879 } 1880 len = ncp->nc_nlen = cnp->cn_namelen; 1881 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1882 memcpy(ncp->nc_name, cnp->cn_nameptr, len); 1883 ncp->nc_name[len] = '\0'; 1884 cache_enter_lock(&cel, dvp, vp, hash); 1885 1886 /* 1887 * See if this vnode or negative entry is already in the cache 1888 * with this name. This can happen with concurrent lookups of 1889 * the same path name. 1890 */ 1891 ncpp = NCHHASH(hash); 1892 CK_SLIST_FOREACH(n2, ncpp, nc_hash) { 1893 if (n2->nc_dvp == dvp && 1894 n2->nc_nlen == cnp->cn_namelen && 1895 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1896 MPASS(cache_ncp_canuse(n2)); 1897 if ((n2->nc_flag & NCF_NEGATIVE) != 0) 1898 KASSERT(vp == NULL, 1899 ("%s: found entry pointing to a different vnode (%p != %p)", 1900 __func__, NULL, vp)); 1901 else 1902 KASSERT(n2->nc_vp == vp, 1903 ("%s: found entry pointing to a different vnode (%p != %p)", 1904 __func__, n2->nc_vp, vp)); 1905 /* 1906 * Entries are supposed to be immutable unless in the 1907 * process of getting destroyed. Accommodating for 1908 * changing timestamps is possible but not worth it. 1909 * This should be harmless in terms of correctness, in 1910 * the worst case resulting in an earlier expiration. 1911 * Alternatively, the found entry can be replaced 1912 * altogether. 
1913 */ 1914 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 1915 #if 0 1916 if (tsp != NULL) { 1917 KASSERT((n2->nc_flag & NCF_TS) != 0, 1918 ("no NCF_TS")); 1919 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1920 n2_ts->nc_time = ncp_ts->nc_time; 1921 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1922 if (dtsp != NULL) { 1923 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1924 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1925 } 1926 } 1927 #endif 1928 goto out_unlock_free; 1929 } 1930 } 1931 1932 if (flag == NCF_ISDOTDOT) { 1933 /* 1934 * See if we are trying to add .. entry, but some other lookup 1935 * has populated v_cache_dd pointer already. 1936 */ 1937 if (dvp->v_cache_dd != NULL) 1938 goto out_unlock_free; 1939 KASSERT(vp == NULL || vp->v_type == VDIR, 1940 ("wrong vnode type %p", vp)); 1941 vn_seqc_write_begin(dvp); 1942 dvp->v_cache_dd = ncp; 1943 vn_seqc_write_end(dvp); 1944 } 1945 1946 if (vp != NULL) { 1947 if (flag != NCF_ISDOTDOT) { 1948 /* 1949 * For this case, the cache entry maps both the 1950 * directory name in it and the name ".." for the 1951 * directory's parent. 1952 */ 1953 vn_seqc_write_begin(vp); 1954 if ((ndd = vp->v_cache_dd) != NULL) { 1955 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 1956 cache_zap_locked(ndd); 1957 else 1958 ndd = NULL; 1959 } 1960 vp->v_cache_dd = ncp; 1961 vn_seqc_write_end(vp); 1962 } else if (vp->v_type != VDIR) { 1963 if (vp->v_cache_dd != NULL) { 1964 vn_seqc_write_begin(vp); 1965 vp->v_cache_dd = NULL; 1966 vn_seqc_write_end(vp); 1967 } 1968 } 1969 } 1970 1971 if (flag != NCF_ISDOTDOT) { 1972 if (LIST_EMPTY(&dvp->v_cache_src)) { 1973 vhold(dvp); 1974 counter_u64_add(numcachehv, 1); 1975 } 1976 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 1977 } 1978 1979 /* 1980 * If the entry is "negative", we place it into the 1981 * "negative" cache queue, otherwise, we place it into the 1982 * destination vnode's cache entries queue. 1983 */ 1984 if (vp != NULL) { 1985 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 1986 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 1987 vp); 1988 } else { 1989 if (cnp->cn_flags & ISWHITEOUT) 1990 ncp->nc_flag |= NCF_WHITE; 1991 cache_negative_insert(ncp); 1992 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 1993 ncp->nc_name); 1994 } 1995 1996 /* 1997 * Insert the new namecache entry into the appropriate chain 1998 * within the cache entries table. 1999 */ 2000 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2001 2002 atomic_thread_fence_rel(); 2003 /* 2004 * Mark the entry as fully constructed. 2005 * It is immutable past this point until its removal. 
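 *
 * Informational note: clearing NCF_WIP below, combined with the release
 * fence above, is what publishes the entry.  Lookup code is expected to
 * reject entries which still have NCF_WIP set (see cache_ncp_canuse()),
 * so a reader which observes the flag cleared also observes the fully
 * initialized contents.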
2006 */ 2007 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2008 2009 cache_enter_unlock(&cel); 2010 if (numneg * ncnegfactor > lnumcache) 2011 cache_negative_zap_one(); 2012 if (ndd != NULL) 2013 cache_free(ndd); 2014 return; 2015 out_unlock_free: 2016 cache_enter_unlock(&cel); 2017 atomic_subtract_long(&numcache, 1); 2018 cache_free(ncp); 2019 return; 2020 } 2021 2022 static u_int 2023 cache_roundup_2(u_int val) 2024 { 2025 u_int res; 2026 2027 for (res = 1; res <= val; res <<= 1) 2028 continue; 2029 2030 return (res); 2031 } 2032 2033 static struct nchashhead * 2034 nchinittbl(u_long elements, u_long *hashmask) 2035 { 2036 struct nchashhead *hashtbl; 2037 u_long hashsize, i; 2038 2039 hashsize = cache_roundup_2(elements) / 2; 2040 2041 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2042 for (i = 0; i < hashsize; i++) 2043 CK_SLIST_INIT(&hashtbl[i]); 2044 *hashmask = hashsize - 1; 2045 return (hashtbl); 2046 } 2047 2048 static void 2049 ncfreetbl(struct nchashhead *hashtbl) 2050 { 2051 2052 free(hashtbl, M_VFSCACHE); 2053 } 2054 2055 /* 2056 * Name cache initialization, from vfs_init() when we are booting 2057 */ 2058 static void 2059 nchinit(void *dummy __unused) 2060 { 2061 u_int i; 2062 2063 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2064 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2065 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2066 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2067 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2068 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2069 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2070 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2071 2072 VFS_SMR_ZONE_SET(cache_zone_small); 2073 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2074 VFS_SMR_ZONE_SET(cache_zone_large); 2075 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2076 2077 ncsize = desiredvnodes * ncsizefactor; 2078 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2079 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2080 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2081 ncbuckethash = 7; 2082 if (ncbuckethash > nchash) 2083 ncbuckethash = nchash; 2084 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2085 M_WAITOK | M_ZERO); 2086 for (i = 0; i < numbucketlocks; i++) 2087 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2088 ncvnodehash = ncbuckethash; 2089 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2090 M_WAITOK | M_ZERO); 2091 for (i = 0; i < numvnodelocks; i++) 2092 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2093 2094 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 2095 M_WAITOK | M_ZERO); 2096 for (i = 0; i < numneglists; i++) { 2097 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2098 TAILQ_INIT(&neglists[i].nl_list); 2099 } 2100 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 2101 TAILQ_INIT(&ncneg_hot.nl_list); 2102 2103 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2104 } 2105 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2106 2107 void 2108 cache_vnode_init(struct vnode *vp) 2109 { 2110 2111 LIST_INIT(&vp->v_cache_src); 2112 TAILQ_INIT(&vp->v_cache_dst); 2113 vp->v_cache_dd = NULL; 2114 cache_prehash(vp); 2115 } 2116 2117 void 2118 cache_changesize(u_long newmaxvnodes) 2119 { 2120 struct 
nchashhead *new_nchashtbl, *old_nchashtbl; 2121 u_long new_nchash, old_nchash; 2122 struct namecache *ncp; 2123 uint32_t hash; 2124 u_long newncsize; 2125 int i; 2126 2127 newncsize = newmaxvnodes * ncsizefactor; 2128 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2129 if (newmaxvnodes < numbucketlocks) 2130 newmaxvnodes = numbucketlocks; 2131 2132 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2133 /* If same hash table size, nothing to do */ 2134 if (nchash == new_nchash) { 2135 ncfreetbl(new_nchashtbl); 2136 return; 2137 } 2138 /* 2139 * Move everything from the old hash table to the new table. 2140 * None of the namecache entries in the table can be removed 2141 * because to do so, they have to be removed from the hash table. 2142 */ 2143 cache_lock_all_vnodes(); 2144 cache_lock_all_buckets(); 2145 old_nchashtbl = nchashtbl; 2146 old_nchash = nchash; 2147 nchashtbl = new_nchashtbl; 2148 nchash = new_nchash; 2149 for (i = 0; i <= old_nchash; i++) { 2150 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2151 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2152 ncp->nc_dvp); 2153 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2154 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2155 } 2156 } 2157 ncsize = newncsize; 2158 cache_unlock_all_buckets(); 2159 cache_unlock_all_vnodes(); 2160 ncfreetbl(old_nchashtbl); 2161 } 2162 2163 /* 2164 * Invalidate all entries from and to a particular vnode. 2165 */ 2166 static void 2167 cache_purge_impl(struct vnode *vp) 2168 { 2169 TAILQ_HEAD(, namecache) ncps; 2170 struct namecache *ncp, *nnp; 2171 struct mtx *vlp, *vlp2; 2172 2173 TAILQ_INIT(&ncps); 2174 vlp = VP2VNODELOCK(vp); 2175 vlp2 = NULL; 2176 mtx_assert(vlp, MA_OWNED); 2177 retry: 2178 while (!LIST_EMPTY(&vp->v_cache_src)) { 2179 ncp = LIST_FIRST(&vp->v_cache_src); 2180 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2181 goto retry; 2182 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2183 } 2184 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2185 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2186 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2187 goto retry; 2188 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2189 } 2190 ncp = vp->v_cache_dd; 2191 if (ncp != NULL) { 2192 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2193 ("lost dotdot link")); 2194 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2195 goto retry; 2196 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2197 } 2198 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2199 mtx_unlock(vlp); 2200 if (vlp2 != NULL) 2201 mtx_unlock(vlp2); 2202 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2203 cache_free(ncp); 2204 } 2205 } 2206 2207 /* 2208 * Opportunistic check to see if there is anything to do. 2209 */ 2210 static bool 2211 cache_has_entries(struct vnode *vp) 2212 { 2213 2214 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2215 vp->v_cache_dd == NULL) 2216 return (false); 2217 return (true); 2218 } 2219 2220 void 2221 cache_purge(struct vnode *vp) 2222 { 2223 struct mtx *vlp; 2224 2225 SDT_PROBE1(vfs, namecache, purge, done, vp); 2226 if (!cache_has_entries(vp)) 2227 return; 2228 vlp = VP2VNODELOCK(vp); 2229 mtx_lock(vlp); 2230 cache_purge_impl(vp); 2231 } 2232 2233 /* 2234 * Only to be used by vgone. 
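 *
 * The doomed state is what makes the unlocked emptiness checks below safe:
 * cache_enter_time() asserts that neither vnode is doomed, so no new entries
 * can show up once vgone has marked the vnode, leaving only races with
 * teardown paths such as a concurrent cache_purge().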
2235 */ 2236 void 2237 cache_purge_vgone(struct vnode *vp) 2238 { 2239 struct mtx *vlp; 2240 2241 VNPASS(VN_IS_DOOMED(vp), vp); 2242 vlp = VP2VNODELOCK(vp); 2243 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2244 vp->v_cache_dd == NULL)) { 2245 mtx_lock(vlp); 2246 cache_purge_impl(vp); 2247 mtx_assert(vlp, MA_NOTOWNED); 2248 return; 2249 } 2250 2251 /* 2252 * All the NULL pointer state we found above may be transient. 2253 * Serialize against a possible thread doing cache_purge. 2254 */ 2255 mtx_wait_unlocked(vlp); 2256 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2257 vp->v_cache_dd == NULL)) { 2258 mtx_lock(vlp); 2259 cache_purge_impl(vp); 2260 mtx_assert(vlp, MA_NOTOWNED); 2261 return; 2262 } 2263 return; 2264 } 2265 2266 /* 2267 * Invalidate all negative entries for a particular directory vnode. 2268 */ 2269 void 2270 cache_purge_negative(struct vnode *vp) 2271 { 2272 TAILQ_HEAD(, namecache) ncps; 2273 struct namecache *ncp, *nnp; 2274 struct mtx *vlp; 2275 2276 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2277 if (LIST_EMPTY(&vp->v_cache_src)) 2278 return; 2279 TAILQ_INIT(&ncps); 2280 vlp = VP2VNODELOCK(vp); 2281 mtx_lock(vlp); 2282 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2283 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2284 continue; 2285 cache_zap_negative_locked_vnode_kl(ncp, vp); 2286 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2287 } 2288 mtx_unlock(vlp); 2289 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2290 cache_free(ncp); 2291 } 2292 } 2293 2294 void 2295 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2296 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2297 { 2298 2299 ASSERT_VOP_IN_SEQC(fdvp); 2300 ASSERT_VOP_IN_SEQC(fvp); 2301 ASSERT_VOP_IN_SEQC(tdvp); 2302 if (tvp != NULL) 2303 ASSERT_VOP_IN_SEQC(tvp); 2304 2305 cache_purge(fvp); 2306 if (tvp != NULL) { 2307 cache_purge(tvp); 2308 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2309 ("%s: lingering negative entry", __func__)); 2310 } else { 2311 cache_remove_cnp(tdvp, tcnp); 2312 } 2313 } 2314 2315 /* 2316 * Flush all entries referencing a particular filesystem. 2317 */ 2318 void 2319 cache_purgevfs(struct mount *mp) 2320 { 2321 struct vnode *vp, *mvp; 2322 2323 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2324 /* 2325 * Somewhat wasteful iteration over all vnodes. Would be better to 2326 * support filtering and avoid the interlock to begin with. 2327 */ 2328 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2329 if (!cache_has_entries(vp)) { 2330 VI_UNLOCK(vp); 2331 continue; 2332 } 2333 vholdl(vp); 2334 VI_UNLOCK(vp); 2335 cache_purge(vp); 2336 vdrop(vp); 2337 } 2338 } 2339 2340 /* 2341 * Perform canonical checks and cache lookup and pass on to filesystem 2342 * through the vop_cachedlookup only if needed. 
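 *
 * Sketch of how a filesystem typically opts in (the UFS vop names are given
 * only as a familiar example):
 *
 *	.vop_lookup =		vfs_cache_lookup,
 *	.vop_cachedlookup =	ufs_lookup,
 *
 * i.e. vfs_cache_lookup() fronts the real lookup routine and only invokes it
 * (via VOP_CACHEDLOOKUP) on a cache miss.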
2343 */ 2344 2345 int 2346 vfs_cache_lookup(struct vop_lookup_args *ap) 2347 { 2348 struct vnode *dvp; 2349 int error; 2350 struct vnode **vpp = ap->a_vpp; 2351 struct componentname *cnp = ap->a_cnp; 2352 int flags = cnp->cn_flags; 2353 2354 *vpp = NULL; 2355 dvp = ap->a_dvp; 2356 2357 if (dvp->v_type != VDIR) 2358 return (ENOTDIR); 2359 2360 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2361 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2362 return (EROFS); 2363 2364 error = vn_dir_check_exec(dvp, cnp); 2365 if (error != 0) 2366 return (error); 2367 2368 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2369 if (error == 0) 2370 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2371 if (error == -1) 2372 return (0); 2373 return (error); 2374 } 2375 2376 /* Implementation of the getcwd syscall. */ 2377 int 2378 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2379 { 2380 char *buf, *retbuf; 2381 size_t buflen; 2382 int error; 2383 2384 buflen = uap->buflen; 2385 if (__predict_false(buflen < 2)) 2386 return (EINVAL); 2387 if (buflen > MAXPATHLEN) 2388 buflen = MAXPATHLEN; 2389 2390 buf = uma_zalloc(namei_zone, M_WAITOK); 2391 error = vn_getcwd(buf, &retbuf, &buflen); 2392 if (error == 0) 2393 error = copyout(retbuf, uap->buf, buflen); 2394 uma_zfree(namei_zone, buf); 2395 return (error); 2396 } 2397 2398 int 2399 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2400 { 2401 struct pwd *pwd; 2402 int error; 2403 2404 vfs_smr_enter(); 2405 pwd = pwd_get_smr(); 2406 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2407 buflen, false, 0); 2408 VFS_SMR_ASSERT_NOT_ENTERED(); 2409 if (error < 0) { 2410 pwd = pwd_hold(curthread); 2411 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2412 retbuf, buflen); 2413 pwd_drop(pwd); 2414 } 2415 2416 #ifdef KTRACE 2417 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2418 ktrnamei(*retbuf); 2419 #endif 2420 return (error); 2421 } 2422 2423 static int 2424 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2425 size_t size, int flags, enum uio_seg pathseg) 2426 { 2427 struct nameidata nd; 2428 char *retbuf, *freebuf; 2429 int error; 2430 2431 if (flags != 0) 2432 return (EINVAL); 2433 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2434 pathseg, path, fd, &cap_fstat_rights, td); 2435 if ((error = namei(&nd)) != 0) 2436 return (error); 2437 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2438 if (error == 0) { 2439 error = copyout(retbuf, buf, size); 2440 free(freebuf, M_TEMP); 2441 } 2442 NDFREE(&nd, 0); 2443 return (error); 2444 } 2445 2446 int 2447 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2448 { 2449 2450 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2451 uap->flags, UIO_USERSPACE)); 2452 } 2453 2454 /* 2455 * Retrieve the full filesystem path that correspond to a vnode from the name 2456 * cache (if available) 2457 */ 2458 int 2459 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2460 { 2461 struct pwd *pwd; 2462 char *buf; 2463 size_t buflen; 2464 int error; 2465 2466 if (__predict_false(vp == NULL)) 2467 return (EINVAL); 2468 2469 buflen = MAXPATHLEN; 2470 buf = malloc(buflen, M_TEMP, M_WAITOK); 2471 vfs_smr_enter(); 2472 pwd = pwd_get_smr(); 2473 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0); 2474 VFS_SMR_ASSERT_NOT_ENTERED(); 2475 if (error < 0) { 2476 pwd = pwd_hold(curthread); 2477 error = vn_fullpath_any(vp, pwd->pwd_rdir, 
buf, retbuf, &buflen); 2478 pwd_drop(pwd); 2479 } 2480 if (error == 0) 2481 *freebuf = buf; 2482 else 2483 free(buf, M_TEMP); 2484 return (error); 2485 } 2486 2487 /* 2488 * This function is similar to vn_fullpath, but it attempts to lookup the 2489 * pathname relative to the global root mount point. This is required for the 2490 * auditing sub-system, as audited pathnames must be absolute, relative to the 2491 * global root mount point. 2492 */ 2493 int 2494 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2495 { 2496 char *buf; 2497 size_t buflen; 2498 int error; 2499 2500 if (__predict_false(vp == NULL)) 2501 return (EINVAL); 2502 buflen = MAXPATHLEN; 2503 buf = malloc(buflen, M_TEMP, M_WAITOK); 2504 vfs_smr_enter(); 2505 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0); 2506 VFS_SMR_ASSERT_NOT_ENTERED(); 2507 if (error < 0) { 2508 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2509 } 2510 if (error == 0) 2511 *freebuf = buf; 2512 else 2513 free(buf, M_TEMP); 2514 return (error); 2515 } 2516 2517 static struct namecache * 2518 vn_dd_from_dst(struct vnode *vp) 2519 { 2520 struct namecache *ncp; 2521 2522 cache_assert_vnode_locked(vp); 2523 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2524 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2525 return (ncp); 2526 } 2527 return (NULL); 2528 } 2529 2530 int 2531 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2532 { 2533 struct vnode *dvp; 2534 struct namecache *ncp; 2535 struct mtx *vlp; 2536 int error; 2537 2538 vlp = VP2VNODELOCK(*vp); 2539 mtx_lock(vlp); 2540 ncp = (*vp)->v_cache_dd; 2541 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 2542 KASSERT(ncp == vn_dd_from_dst(*vp), 2543 ("%s: mismatch for dd entry (%p != %p)", __func__, 2544 ncp, vn_dd_from_dst(*vp))); 2545 } else { 2546 ncp = vn_dd_from_dst(*vp); 2547 } 2548 if (ncp != NULL) { 2549 if (*buflen < ncp->nc_nlen) { 2550 mtx_unlock(vlp); 2551 vrele(*vp); 2552 counter_u64_add(numfullpathfail4, 1); 2553 error = ENOMEM; 2554 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2555 vp, NULL); 2556 return (error); 2557 } 2558 *buflen -= ncp->nc_nlen; 2559 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2560 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2561 ncp->nc_name, vp); 2562 dvp = *vp; 2563 *vp = ncp->nc_dvp; 2564 vref(*vp); 2565 mtx_unlock(vlp); 2566 vrele(dvp); 2567 return (0); 2568 } 2569 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2570 2571 mtx_unlock(vlp); 2572 vn_lock(*vp, LK_SHARED | LK_RETRY); 2573 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2574 vput(*vp); 2575 if (error) { 2576 counter_u64_add(numfullpathfail2, 1); 2577 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2578 return (error); 2579 } 2580 2581 *vp = dvp; 2582 if (VN_IS_DOOMED(dvp)) { 2583 /* forced unmount */ 2584 vrele(dvp); 2585 error = ENOENT; 2586 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2587 return (error); 2588 } 2589 /* 2590 * *vp has its use count incremented still. 2591 */ 2592 2593 return (0); 2594 } 2595 2596 /* 2597 * Resolve a directory to a pathname. 2598 * 2599 * The name of the directory can always be found in the namecache or fetched 2600 * from the filesystem. There is also guaranteed to be only one parent, meaning 2601 * we can just follow vnodes up until we find the root. 2602 * 2603 * The vnode must be referenced. 
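 *
 * Implementation note: the buffer is filled in backwards.  Each name is
 * copied in front of the previous one and a '/' is prepended, so on success
 * *retbuf points into the interior of buf rather than at buf itself.  For
 * example, a directory three levels deep ends up as "/a/b/c" flush against
 * the end of the buffer (names picked arbitrarily for illustration).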
2604 */ 2605 static int 2606 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2607 size_t *len, bool slash_prefixed, size_t addend) 2608 { 2609 #ifdef KDTRACE_HOOKS 2610 struct vnode *startvp = vp; 2611 #endif 2612 struct vnode *vp1; 2613 size_t buflen; 2614 int error; 2615 2616 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2617 VNPASS(vp->v_usecount > 0, vp); 2618 2619 buflen = *len; 2620 2621 if (!slash_prefixed) { 2622 MPASS(*len >= 2); 2623 buflen--; 2624 buf[buflen] = '\0'; 2625 } 2626 2627 error = 0; 2628 2629 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2630 counter_u64_add(numfullpathcalls, 1); 2631 while (vp != rdir && vp != rootvnode) { 2632 /* 2633 * The vp vnode must be already fully constructed, 2634 * since it is either found in namecache or obtained 2635 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2636 * without obtaining the vnode lock. 2637 */ 2638 if ((vp->v_vflag & VV_ROOT) != 0) { 2639 vn_lock(vp, LK_RETRY | LK_SHARED); 2640 2641 /* 2642 * With the vnode locked, check for races with 2643 * unmount, forced or not. Note that we 2644 * already verified that vp is not equal to 2645 * the root vnode, which means that 2646 * mnt_vnodecovered can be NULL only for the 2647 * case of unmount. 2648 */ 2649 if (VN_IS_DOOMED(vp) || 2650 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2651 vp1->v_mountedhere != vp->v_mount) { 2652 vput(vp); 2653 error = ENOENT; 2654 SDT_PROBE3(vfs, namecache, fullpath, return, 2655 error, vp, NULL); 2656 break; 2657 } 2658 2659 vref(vp1); 2660 vput(vp); 2661 vp = vp1; 2662 continue; 2663 } 2664 if (vp->v_type != VDIR) { 2665 vrele(vp); 2666 counter_u64_add(numfullpathfail1, 1); 2667 error = ENOTDIR; 2668 SDT_PROBE3(vfs, namecache, fullpath, return, 2669 error, vp, NULL); 2670 break; 2671 } 2672 error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen); 2673 if (error) 2674 break; 2675 if (buflen == 0) { 2676 vrele(vp); 2677 error = ENOMEM; 2678 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2679 startvp, NULL); 2680 break; 2681 } 2682 buf[--buflen] = '/'; 2683 slash_prefixed = true; 2684 } 2685 if (error) 2686 return (error); 2687 if (!slash_prefixed) { 2688 if (buflen == 0) { 2689 vrele(vp); 2690 counter_u64_add(numfullpathfail4, 1); 2691 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2692 startvp, NULL); 2693 return (ENOMEM); 2694 } 2695 buf[--buflen] = '/'; 2696 } 2697 counter_u64_add(numfullpathfound, 1); 2698 vrele(vp); 2699 2700 *retbuf = buf + buflen; 2701 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2702 *len -= buflen; 2703 *len += addend; 2704 return (0); 2705 } 2706 2707 /* 2708 * Resolve an arbitrary vnode to a pathname. 
2709 * 2710 * Note 2 caveats: 2711 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2712 * resolve to a different path than the one used to find it 2713 * - namecache is not mandatory, meaning names are not guaranteed to be added 2714 * (in which case resolving fails) 2715 */ 2716 static void __inline 2717 cache_rev_failed_impl(int *reason, int line) 2718 { 2719 2720 *reason = line; 2721 } 2722 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 2723 2724 static int 2725 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 2726 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend) 2727 { 2728 #ifdef KDTRACE_HOOKS 2729 struct vnode *startvp = vp; 2730 #endif 2731 struct vnode *tvp; 2732 struct mount *mp; 2733 struct namecache *ncp; 2734 size_t orig_buflen; 2735 int reason; 2736 int error; 2737 #ifdef KDTRACE_HOOKS 2738 int i; 2739 #endif 2740 seqc_t vp_seqc, tvp_seqc; 2741 u_char nc_flag; 2742 2743 VFS_SMR_ASSERT_ENTERED(); 2744 2745 if (!cache_fast_revlookup) { 2746 vfs_smr_exit(); 2747 return (-1); 2748 } 2749 2750 orig_buflen = *buflen; 2751 2752 if (!slash_prefixed) { 2753 MPASS(*buflen >= 2); 2754 *buflen -= 1; 2755 buf[*buflen] = '\0'; 2756 } 2757 2758 if (vp == rdir || vp == rootvnode) { 2759 if (!slash_prefixed) { 2760 *buflen -= 1; 2761 buf[*buflen] = '/'; 2762 } 2763 goto out_ok; 2764 } 2765 2766 #ifdef KDTRACE_HOOKS 2767 i = 0; 2768 #endif 2769 error = -1; 2770 ncp = NULL; /* for sdt probe down below */ 2771 vp_seqc = vn_seqc_read_any(vp); 2772 if (seqc_in_modify(vp_seqc)) { 2773 cache_rev_failed(&reason); 2774 goto out_abort; 2775 } 2776 2777 for (;;) { 2778 #ifdef KDTRACE_HOOKS 2779 i++; 2780 #endif 2781 if ((vp->v_vflag & VV_ROOT) != 0) { 2782 mp = atomic_load_ptr(&vp->v_mount); 2783 if (mp == NULL) { 2784 cache_rev_failed(&reason); 2785 goto out_abort; 2786 } 2787 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 2788 tvp_seqc = vn_seqc_read_any(tvp); 2789 if (seqc_in_modify(tvp_seqc)) { 2790 cache_rev_failed(&reason); 2791 goto out_abort; 2792 } 2793 if (!vn_seqc_consistent(vp, vp_seqc)) { 2794 cache_rev_failed(&reason); 2795 goto out_abort; 2796 } 2797 vp = tvp; 2798 vp_seqc = tvp_seqc; 2799 continue; 2800 } 2801 ncp = atomic_load_ptr(&vp->v_cache_dd); 2802 if (ncp == NULL) { 2803 cache_rev_failed(&reason); 2804 goto out_abort; 2805 } 2806 nc_flag = atomic_load_char(&ncp->nc_flag); 2807 if ((nc_flag & NCF_ISDOTDOT) != 0) { 2808 cache_rev_failed(&reason); 2809 goto out_abort; 2810 } 2811 if (!cache_ncp_canuse(ncp)) { 2812 cache_rev_failed(&reason); 2813 goto out_abort; 2814 } 2815 if (ncp->nc_nlen >= *buflen) { 2816 cache_rev_failed(&reason); 2817 error = ENOMEM; 2818 goto out_abort; 2819 } 2820 *buflen -= ncp->nc_nlen; 2821 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2822 *buflen -= 1; 2823 buf[*buflen] = '/'; 2824 tvp = ncp->nc_dvp; 2825 tvp_seqc = vn_seqc_read_any(tvp); 2826 if (seqc_in_modify(tvp_seqc)) { 2827 cache_rev_failed(&reason); 2828 goto out_abort; 2829 } 2830 if (!vn_seqc_consistent(vp, vp_seqc)) { 2831 cache_rev_failed(&reason); 2832 goto out_abort; 2833 } 2834 vp = tvp; 2835 vp_seqc = tvp_seqc; 2836 if (vp == rdir || vp == rootvnode) 2837 break; 2838 } 2839 out_ok: 2840 vfs_smr_exit(); 2841 *retbuf = buf + *buflen; 2842 *buflen = orig_buflen - *buflen + addend; 2843 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 2844 return (0); 2845 2846 out_abort: 2847 *buflen = orig_buflen; 2848 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 2849 
vfs_smr_exit(); 2850 return (error); 2851 } 2852 2853 static int 2854 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2855 size_t *buflen) 2856 { 2857 size_t orig_buflen; 2858 bool slash_prefixed; 2859 int error; 2860 2861 if (*buflen < 2) 2862 return (EINVAL); 2863 2864 orig_buflen = *buflen; 2865 2866 vref(vp); 2867 slash_prefixed = false; 2868 if (vp->v_type != VDIR) { 2869 *buflen -= 1; 2870 buf[*buflen] = '\0'; 2871 error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen); 2872 if (error) 2873 return (error); 2874 if (*buflen == 0) { 2875 vrele(vp); 2876 return (ENOMEM); 2877 } 2878 *buflen -= 1; 2879 buf[*buflen] = '/'; 2880 slash_prefixed = true; 2881 } 2882 2883 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed, 2884 orig_buflen - *buflen)); 2885 } 2886 2887 /* 2888 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2889 * 2890 * Since the namecache does not track hardlinks, the caller is expected to first 2891 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 2892 * 2893 * Then we have 2 cases: 2894 * - if the found vnode is a directory, the path can be constructed just by 2895 * following names up the chain 2896 * - otherwise we populate the buffer with the saved name and start resolving 2897 * from the parent 2898 */ 2899 static int 2900 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 2901 size_t *buflen) 2902 { 2903 char *buf, *tmpbuf; 2904 struct pwd *pwd; 2905 struct componentname *cnp; 2906 struct vnode *vp; 2907 size_t addend; 2908 int error; 2909 bool slash_prefixed; 2910 enum vtype type; 2911 2912 if (*buflen < 2) 2913 return (EINVAL); 2914 if (*buflen > MAXPATHLEN) 2915 *buflen = MAXPATHLEN; 2916 2917 slash_prefixed = false; 2918 2919 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2920 2921 addend = 0; 2922 vp = ndp->ni_vp; 2923 /* 2924 * Check for VBAD to work around the vp_crossmp bug in lookup(). 2925 * 2926 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 2927 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 2928 * If the type is VDIR (like in this very case) we can skip looking 2929 * at ni_dvp in the first place. However, since vnodes get passed here 2930 * unlocked the target may transition to doomed state (type == VBAD) 2931 * before we get to evaluate the condition. If this happens, we will 2932 * populate part of the buffer and descend to vn_fullpath_dir with 2933 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 2934 * 2935 * This should be atomic_load(&vp->v_type) but it is illegal to take 2936 * an address of a bit field, even if said field is sized to char. 2937 * Work around the problem by reading the value into a full-sized enum 2938 * and then re-reading it with atomic_load which will still prevent 2939 * the compiler from re-reading down the road.
2940 */ 2941 type = vp->v_type; 2942 type = atomic_load_int(&type); 2943 if (type == VBAD) { 2944 error = ENOENT; 2945 goto out_bad; 2946 } 2947 if (type != VDIR) { 2948 cnp = &ndp->ni_cnd; 2949 addend = cnp->cn_namelen + 2; 2950 if (*buflen < addend) { 2951 error = ENOMEM; 2952 goto out_bad; 2953 } 2954 *buflen -= addend; 2955 tmpbuf = buf + *buflen; 2956 tmpbuf[0] = '/'; 2957 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 2958 tmpbuf[addend - 1] = '\0'; 2959 slash_prefixed = true; 2960 vp = ndp->ni_dvp; 2961 } 2962 2963 vfs_smr_enter(); 2964 pwd = pwd_get_smr(); 2965 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 2966 slash_prefixed, addend); 2967 VFS_SMR_ASSERT_NOT_ENTERED(); 2968 if (error < 0) { 2969 pwd = pwd_hold(curthread); 2970 vref(vp); 2971 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 2972 slash_prefixed, addend); 2973 pwd_drop(pwd); 2974 if (error != 0) 2975 goto out_bad; 2976 } 2977 2978 *freebuf = buf; 2979 2980 return (0); 2981 out_bad: 2982 free(buf, M_TEMP); 2983 return (error); 2984 } 2985 2986 struct vnode * 2987 vn_dir_dd_ino(struct vnode *vp) 2988 { 2989 struct namecache *ncp; 2990 struct vnode *ddvp; 2991 struct mtx *vlp; 2992 enum vgetstate vs; 2993 2994 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 2995 vlp = VP2VNODELOCK(vp); 2996 mtx_lock(vlp); 2997 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 2998 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 2999 continue; 3000 ddvp = ncp->nc_dvp; 3001 vs = vget_prep(ddvp); 3002 mtx_unlock(vlp); 3003 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3004 return (NULL); 3005 return (ddvp); 3006 } 3007 mtx_unlock(vlp); 3008 return (NULL); 3009 } 3010 3011 int 3012 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3013 { 3014 struct namecache *ncp; 3015 struct mtx *vlp; 3016 int l; 3017 3018 vlp = VP2VNODELOCK(vp); 3019 mtx_lock(vlp); 3020 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3021 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3022 break; 3023 if (ncp == NULL) { 3024 mtx_unlock(vlp); 3025 return (ENOENT); 3026 } 3027 l = min(ncp->nc_nlen, buflen - 1); 3028 memcpy(buf, ncp->nc_name, l); 3029 mtx_unlock(vlp); 3030 buf[l] = '\0'; 3031 return (0); 3032 } 3033 3034 /* 3035 * This function updates path string to vnode's full global path 3036 * and checks the size of the new path string against the pathlen argument. 3037 * 3038 * Requires a locked, referenced vnode. 3039 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3040 * 3041 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3042 * because it falls back to the ".." lookup if the namecache lookup fails. 3043 */ 3044 int 3045 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3046 u_int pathlen) 3047 { 3048 struct nameidata nd; 3049 struct vnode *vp1; 3050 char *rpath, *fbuf; 3051 int error; 3052 3053 ASSERT_VOP_ELOCKED(vp, __func__); 3054 3055 /* Construct global filesystem path from vp. */ 3056 VOP_UNLOCK(vp); 3057 error = vn_fullpath_global(vp, &rpath, &fbuf); 3058 3059 if (error != 0) { 3060 vrele(vp); 3061 return (error); 3062 } 3063 3064 if (strlen(rpath) >= pathlen) { 3065 vrele(vp); 3066 error = ENAMETOOLONG; 3067 goto out; 3068 } 3069 3070 /* 3071 * Re-lookup the vnode by path to detect a possible rename. 3072 * As a side effect, the vnode is relocked. 3073 * If vnode was renamed, return ENOENT. 
3074 */ 3075 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3076 UIO_SYSSPACE, path, td); 3077 error = namei(&nd); 3078 if (error != 0) { 3079 vrele(vp); 3080 goto out; 3081 } 3082 NDFREE(&nd, NDF_ONLY_PNBUF); 3083 vp1 = nd.ni_vp; 3084 vrele(vp); 3085 if (vp1 == vp) 3086 strcpy(path, rpath); 3087 else { 3088 vput(vp1); 3089 error = ENOENT; 3090 } 3091 3092 out: 3093 free(fbuf, M_TEMP); 3094 return (error); 3095 } 3096 3097 #ifdef DDB 3098 static void 3099 db_print_vpath(struct vnode *vp) 3100 { 3101 3102 while (vp != NULL) { 3103 db_printf("%p: ", vp); 3104 if (vp == rootvnode) { 3105 db_printf("/"); 3106 vp = NULL; 3107 } else { 3108 if (vp->v_vflag & VV_ROOT) { 3109 db_printf("<mount point>"); 3110 vp = vp->v_mount->mnt_vnodecovered; 3111 } else { 3112 struct namecache *ncp; 3113 char *ncn; 3114 int i; 3115 3116 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3117 if (ncp != NULL) { 3118 ncn = ncp->nc_name; 3119 for (i = 0; i < ncp->nc_nlen; i++) 3120 db_printf("%c", *ncn++); 3121 vp = ncp->nc_dvp; 3122 } else { 3123 vp = NULL; 3124 } 3125 } 3126 } 3127 db_printf("\n"); 3128 } 3129 3130 return; 3131 } 3132 3133 DB_SHOW_COMMAND(vpath, db_show_vpath) 3134 { 3135 struct vnode *vp; 3136 3137 if (!have_addr) { 3138 db_printf("usage: show vpath <struct vnode *>\n"); 3139 return; 3140 } 3141 3142 vp = (struct vnode *)addr; 3143 db_print_vpath(vp); 3144 } 3145 3146 #endif 3147 3148 static bool __read_frequently cache_fast_lookup = true; 3149 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3150 &cache_fast_lookup, 0, ""); 3151 3152 #define CACHE_FPL_FAILED -2020 3153 3154 static void 3155 cache_fpl_cleanup_cnp(struct componentname *cnp) 3156 { 3157 3158 uma_zfree(namei_zone, cnp->cn_pnbuf); 3159 #ifdef DIAGNOSTIC 3160 cnp->cn_pnbuf = NULL; 3161 cnp->cn_nameptr = NULL; 3162 #endif 3163 } 3164 3165 static void 3166 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3167 { 3168 struct componentname *cnp; 3169 3170 cnp = &ndp->ni_cnd; 3171 while (*(cnp->cn_nameptr) == '/') { 3172 cnp->cn_nameptr++; 3173 ndp->ni_pathlen--; 3174 } 3175 3176 *dpp = ndp->ni_rootdir; 3177 } 3178 3179 /* 3180 * Components of nameidata (or objects it can point to) which may 3181 * need restoring in case fast path lookup fails. 
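 *
 * Usage note: cache_fpl_checkpoint() snapshots these fields before each
 * component is consumed and cache_fpl_restore() puts them back when the fast
 * path punts (see cache_fplookup_partial_setup()), so the regular lookup can
 * re-parse the component as if the fast path had never run.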
3182 */ 3183 struct nameidata_saved { 3184 long cn_namelen; 3185 char *cn_nameptr; 3186 size_t ni_pathlen; 3187 int cn_flags; 3188 }; 3189 3190 struct cache_fpl { 3191 struct nameidata *ndp; 3192 struct componentname *cnp; 3193 struct pwd *pwd; 3194 struct vnode *dvp; 3195 struct vnode *tvp; 3196 seqc_t dvp_seqc; 3197 seqc_t tvp_seqc; 3198 struct nameidata_saved snd; 3199 int line; 3200 enum cache_fpl_status status:8; 3201 bool in_smr; 3202 }; 3203 3204 static void 3205 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3206 { 3207 3208 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3209 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3210 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3211 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3212 } 3213 3214 static void 3215 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3216 { 3217 3218 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3219 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3220 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3221 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3222 } 3223 3224 #ifdef INVARIANTS 3225 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3226 struct cache_fpl *_fpl = (fpl); \ 3227 MPASS(_fpl->in_smr == true); \ 3228 VFS_SMR_ASSERT_ENTERED(); \ 3229 }) 3230 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3231 struct cache_fpl *_fpl = (fpl); \ 3232 MPASS(_fpl->in_smr == false); \ 3233 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3234 }) 3235 #else 3236 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3237 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3238 #endif 3239 3240 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3241 struct cache_fpl *_fpl = (fpl); \ 3242 vfs_smr_enter(); \ 3243 _fpl->in_smr = true; \ 3244 }) 3245 3246 #define cache_fpl_smr_enter(fpl) ({ \ 3247 struct cache_fpl *_fpl = (fpl); \ 3248 MPASS(_fpl->in_smr == false); \ 3249 vfs_smr_enter(); \ 3250 _fpl->in_smr = true; \ 3251 }) 3252 3253 #define cache_fpl_smr_exit(fpl) ({ \ 3254 struct cache_fpl *_fpl = (fpl); \ 3255 MPASS(_fpl->in_smr == true); \ 3256 vfs_smr_exit(); \ 3257 _fpl->in_smr = false; \ 3258 }) 3259 3260 static int 3261 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3262 { 3263 3264 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3265 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3266 ("%s: converting to abort from %d at %d, set at %d\n", 3267 __func__, fpl->status, line, fpl->line)); 3268 } 3269 fpl->status = CACHE_FPL_STATUS_ABORTED; 3270 fpl->line = line; 3271 return (CACHE_FPL_FAILED); 3272 } 3273 3274 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3275 3276 static int 3277 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3278 { 3279 3280 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3281 ("%s: setting to partial at %d, but already set to %d at %d\n", 3282 __func__, line, fpl->status, fpl->line)); 3283 cache_fpl_smr_assert_entered(fpl); 3284 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3285 fpl->line = line; 3286 return (CACHE_FPL_FAILED); 3287 } 3288 3289 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3290 3291 static int 3292 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3293 { 3294 3295 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3296 ("%s: setting to handled at %d, but already set to %d at %d\n", 3297 __func__, line, fpl->status, fpl->line)); 3298 cache_fpl_smr_assert_not_entered(fpl); 3299 MPASS(error != CACHE_FPL_FAILED); 3300 fpl->status = CACHE_FPL_STATUS_HANDLED; 3301 fpl->line = line; 3302 return (error); 
3303 } 3304 3305 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3306 3307 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3308 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ 3309 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3310 3311 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3312 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3313 3314 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3315 "supported and internal flags overlap"); 3316 3317 static bool 3318 cache_fpl_islastcn(struct nameidata *ndp) 3319 { 3320 3321 return (*ndp->ni_next == 0); 3322 } 3323 3324 static bool 3325 cache_fpl_isdotdot(struct componentname *cnp) 3326 { 3327 3328 if (cnp->cn_namelen == 2 && 3329 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3330 return (true); 3331 return (false); 3332 } 3333 3334 static bool 3335 cache_can_fplookup(struct cache_fpl *fpl) 3336 { 3337 struct nameidata *ndp; 3338 struct componentname *cnp; 3339 struct thread *td; 3340 3341 ndp = fpl->ndp; 3342 cnp = fpl->cnp; 3343 td = cnp->cn_thread; 3344 3345 if (!cache_fast_lookup) { 3346 cache_fpl_aborted(fpl); 3347 return (false); 3348 } 3349 #ifdef MAC 3350 if (mac_vnode_check_lookup_enabled()) { 3351 cache_fpl_aborted(fpl); 3352 return (false); 3353 } 3354 #endif 3355 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3356 cache_fpl_aborted(fpl); 3357 return (false); 3358 } 3359 if (ndp->ni_dirfd != AT_FDCWD) { 3360 cache_fpl_aborted(fpl); 3361 return (false); 3362 } 3363 if (IN_CAPABILITY_MODE(td)) { 3364 cache_fpl_aborted(fpl); 3365 return (false); 3366 } 3367 if (AUDITING_TD(td)) { 3368 cache_fpl_aborted(fpl); 3369 return (false); 3370 } 3371 if (ndp->ni_startdir != NULL) { 3372 cache_fpl_aborted(fpl); 3373 return (false); 3374 } 3375 return (true); 3376 } 3377 3378 static bool 3379 cache_fplookup_vnode_supported(struct vnode *vp) 3380 { 3381 3382 return (vp->v_type != VLNK); 3383 } 3384 3385 /* 3386 * Move a negative entry to the hot list. 3387 * 3388 * We have to take locks, but they may be contended and in the worst 3389 * case we may need to go off CPU. We don't want to spin within the 3390 * smr section and we can't block with it. Instead we are going to 3391 * look up the entry again. 3392 */ 3393 static int __noinline 3394 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3395 uint32_t hash) 3396 { 3397 struct componentname *cnp; 3398 struct namecache *ncp; 3399 struct neglist *neglist; 3400 struct negstate *negstate; 3401 struct vnode *dvp; 3402 u_char nc_flag; 3403 3404 cnp = fpl->cnp; 3405 dvp = fpl->dvp; 3406 3407 if (!vhold_smr(dvp)) 3408 return (cache_fpl_aborted(fpl)); 3409 3410 neglist = NCP2NEGLIST(oncp); 3411 cache_fpl_smr_exit(fpl); 3412 3413 mtx_lock(&ncneg_hot.nl_lock); 3414 mtx_lock(&neglist->nl_lock); 3415 /* 3416 * For hash iteration. 3417 */ 3418 cache_fpl_smr_enter(fpl); 3419 3420 /* 3421 * Avoid all surprises by only succeeding if we got the same entry and 3422 * bailing completely otherwise. 3423 * 3424 * In particular at this point there can be a new ncp which matches the 3425 * search but hashes to a different neglist. 3426 */ 3427 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3428 if (ncp == oncp) 3429 break; 3430 } 3431 3432 /* 3433 * No match to begin with. 3434 */ 3435 if (__predict_false(ncp == NULL)) { 3436 goto out_abort; 3437 } 3438 3439 /* 3440 * The newly found entry may be something different... 
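 * (the SMR section and the locks were dropped above, so the original entry
 * may have been zapped and, in principle, its memory reused for an unrelated
 * name by the time it is found again, hence the full re-check below)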
3441 */ 3442 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3443 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 3444 goto out_abort; 3445 } 3446 3447 /* 3448 * ... and not even negative. 3449 */ 3450 nc_flag = atomic_load_char(&ncp->nc_flag); 3451 if ((nc_flag & NCF_NEGATIVE) == 0) { 3452 goto out_abort; 3453 } 3454 3455 if (__predict_false(!cache_ncp_canuse(ncp))) { 3456 goto out_abort; 3457 } 3458 3459 negstate = NCP2NEGSTATE(ncp); 3460 if ((negstate->neg_flag & NEG_HOT) == 0) { 3461 numhotneg++; 3462 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 3463 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 3464 negstate->neg_flag |= NEG_HOT; 3465 } 3466 3467 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 3468 counter_u64_add(numneghits, 1); 3469 cache_fpl_smr_exit(fpl); 3470 mtx_unlock(&neglist->nl_lock); 3471 mtx_unlock(&ncneg_hot.nl_lock); 3472 vdrop(dvp); 3473 return (cache_fpl_handled(fpl, ENOENT)); 3474 out_abort: 3475 cache_fpl_smr_exit(fpl); 3476 mtx_unlock(&neglist->nl_lock); 3477 mtx_unlock(&ncneg_hot.nl_lock); 3478 vdrop(dvp); 3479 return (cache_fpl_aborted(fpl)); 3480 } 3481 3482 /* 3483 * The target vnode is not supported, prepare for the slow path to take over. 3484 */ 3485 static int __noinline 3486 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3487 { 3488 struct nameidata *ndp; 3489 struct componentname *cnp; 3490 enum vgetstate dvs; 3491 struct vnode *dvp; 3492 struct pwd *pwd; 3493 seqc_t dvp_seqc; 3494 3495 ndp = fpl->ndp; 3496 cnp = fpl->cnp; 3497 dvp = fpl->dvp; 3498 dvp_seqc = fpl->dvp_seqc; 3499 3500 dvs = vget_prep_smr(dvp); 3501 if (__predict_false(dvs == VGET_NONE)) { 3502 cache_fpl_smr_exit(fpl); 3503 return (cache_fpl_aborted(fpl)); 3504 } 3505 3506 cache_fpl_smr_exit(fpl); 3507 3508 vget_finish_ref(dvp, dvs); 3509 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3510 vrele(dvp); 3511 return (cache_fpl_aborted(fpl)); 3512 } 3513 3514 pwd = pwd_hold(curthread); 3515 if (fpl->pwd != pwd) { 3516 vrele(dvp); 3517 pwd_drop(pwd); 3518 return (cache_fpl_aborted(fpl)); 3519 } 3520 3521 cache_fpl_restore(fpl, &fpl->snd); 3522 3523 ndp->ni_startdir = dvp; 3524 cnp->cn_flags |= MAKEENTRY; 3525 if (cache_fpl_islastcn(ndp)) 3526 cnp->cn_flags |= ISLASTCN; 3527 if (cache_fpl_isdotdot(cnp)) 3528 cnp->cn_flags |= ISDOTDOT; 3529 3530 return (0); 3531 } 3532 3533 static int 3534 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3535 { 3536 struct componentname *cnp; 3537 struct vnode *tvp; 3538 seqc_t tvp_seqc; 3539 int error, lkflags; 3540 3541 cnp = fpl->cnp; 3542 tvp = fpl->tvp; 3543 tvp_seqc = fpl->tvp_seqc; 3544 3545 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3546 lkflags = LK_SHARED; 3547 if ((cnp->cn_flags & LOCKSHARED) == 0) 3548 lkflags = LK_EXCLUSIVE; 3549 error = vget_finish(tvp, lkflags, tvs); 3550 if (__predict_false(error != 0)) { 3551 return (cache_fpl_aborted(fpl)); 3552 } 3553 } else { 3554 vget_finish_ref(tvp, tvs); 3555 } 3556 3557 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3558 if ((cnp->cn_flags & LOCKLEAF) != 0) 3559 vput(tvp); 3560 else 3561 vrele(tvp); 3562 return (cache_fpl_aborted(fpl)); 3563 } 3564 3565 return (cache_fpl_handled(fpl, 0)); 3566 } 3567 3568 /* 3569 * They want to possibly modify the state of the namecache. 3570 * 3571 * Don't try to match the API contract, just leave. 
3572 * TODO: this leaves scalability on the table 3573 */ 3574 static int 3575 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3576 { 3577 struct componentname *cnp; 3578 3579 cnp = fpl->cnp; 3580 MPASS(cnp->cn_nameiop != LOOKUP); 3581 return (cache_fpl_partial(fpl)); 3582 } 3583 3584 static int __noinline 3585 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3586 { 3587 struct componentname *cnp; 3588 enum vgetstate dvs, tvs; 3589 struct vnode *dvp, *tvp; 3590 seqc_t dvp_seqc; 3591 int error; 3592 3593 cnp = fpl->cnp; 3594 dvp = fpl->dvp; 3595 dvp_seqc = fpl->dvp_seqc; 3596 tvp = fpl->tvp; 3597 3598 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3599 3600 /* 3601 * This is less efficient than it can be for simplicity. 3602 */ 3603 dvs = vget_prep_smr(dvp); 3604 if (__predict_false(dvs == VGET_NONE)) { 3605 return (cache_fpl_aborted(fpl)); 3606 } 3607 tvs = vget_prep_smr(tvp); 3608 if (__predict_false(tvs == VGET_NONE)) { 3609 cache_fpl_smr_exit(fpl); 3610 vget_abort(dvp, dvs); 3611 return (cache_fpl_aborted(fpl)); 3612 } 3613 3614 cache_fpl_smr_exit(fpl); 3615 3616 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3617 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3618 if (__predict_false(error != 0)) { 3619 vget_abort(tvp, tvs); 3620 return (cache_fpl_aborted(fpl)); 3621 } 3622 } else { 3623 vget_finish_ref(dvp, dvs); 3624 } 3625 3626 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3627 vget_abort(tvp, tvs); 3628 if ((cnp->cn_flags & LOCKPARENT) != 0) 3629 vput(dvp); 3630 else 3631 vrele(dvp); 3632 return (cache_fpl_aborted(fpl)); 3633 } 3634 3635 error = cache_fplookup_final_child(fpl, tvs); 3636 if (__predict_false(error != 0)) { 3637 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3638 if ((cnp->cn_flags & LOCKPARENT) != 0) 3639 vput(dvp); 3640 else 3641 vrele(dvp); 3642 return (error); 3643 } 3644 3645 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3646 return (0); 3647 } 3648 3649 static int 3650 cache_fplookup_final(struct cache_fpl *fpl) 3651 { 3652 struct componentname *cnp; 3653 enum vgetstate tvs; 3654 struct vnode *dvp, *tvp; 3655 seqc_t dvp_seqc; 3656 3657 cnp = fpl->cnp; 3658 dvp = fpl->dvp; 3659 dvp_seqc = fpl->dvp_seqc; 3660 tvp = fpl->tvp; 3661 3662 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3663 3664 if (cnp->cn_nameiop != LOOKUP) { 3665 return (cache_fplookup_final_modifying(fpl)); 3666 } 3667 3668 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3669 return (cache_fplookup_final_withparent(fpl)); 3670 3671 tvs = vget_prep_smr(tvp); 3672 if (__predict_false(tvs == VGET_NONE)) { 3673 return (cache_fpl_partial(fpl)); 3674 } 3675 3676 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3677 cache_fpl_smr_exit(fpl); 3678 vget_abort(tvp, tvs); 3679 return (cache_fpl_aborted(fpl)); 3680 } 3681 3682 cache_fpl_smr_exit(fpl); 3683 return (cache_fplookup_final_child(fpl, tvs)); 3684 } 3685 3686 static int __noinline 3687 cache_fplookup_dot(struct cache_fpl *fpl) 3688 { 3689 struct vnode *dvp; 3690 3691 dvp = fpl->dvp; 3692 3693 fpl->tvp = dvp; 3694 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3695 if (seqc_in_modify(fpl->tvp_seqc)) { 3696 return (cache_fpl_aborted(fpl)); 3697 } 3698 3699 counter_u64_add(dothits, 1); 3700 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3701 3702 return (0); 3703 } 3704 3705 static int __noinline 3706 cache_fplookup_dotdot(struct cache_fpl *fpl) 3707 { 3708 struct nameidata *ndp; 3709 struct componentname *cnp; 3710 struct namecache *ncp; 3711 struct vnode *dvp; 3712 struct prison *pr; 3713 u_char nc_flag; 3714 3715 ndp = fpl->ndp; 3716 cnp 
= fpl->cnp; 3717 dvp = fpl->dvp; 3718 3719 /* 3720 * XXX this is racy the same way regular lookup is 3721 */ 3722 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3723 pr = pr->pr_parent) 3724 if (dvp == pr->pr_root) 3725 break; 3726 3727 if (dvp == ndp->ni_rootdir || 3728 dvp == ndp->ni_topdir || 3729 dvp == rootvnode || 3730 pr != NULL) { 3731 fpl->tvp = dvp; 3732 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3733 if (seqc_in_modify(fpl->tvp_seqc)) { 3734 return (cache_fpl_aborted(fpl)); 3735 } 3736 return (0); 3737 } 3738 3739 if ((dvp->v_vflag & VV_ROOT) != 0) { 3740 /* 3741 * TODO 3742 * The opposite of climb mount is needed here. 3743 */ 3744 return (cache_fpl_aborted(fpl)); 3745 } 3746 3747 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3748 if (ncp == NULL) { 3749 return (cache_fpl_aborted(fpl)); 3750 } 3751 3752 nc_flag = atomic_load_char(&ncp->nc_flag); 3753 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3754 if ((nc_flag & NCF_NEGATIVE) != 0) 3755 return (cache_fpl_aborted(fpl)); 3756 fpl->tvp = ncp->nc_vp; 3757 } else { 3758 fpl->tvp = ncp->nc_dvp; 3759 } 3760 3761 if (__predict_false(!cache_ncp_canuse(ncp))) { 3762 return (cache_fpl_aborted(fpl)); 3763 } 3764 3765 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 3766 if (seqc_in_modify(fpl->tvp_seqc)) { 3767 return (cache_fpl_partial(fpl)); 3768 } 3769 3770 counter_u64_add(dotdothits, 1); 3771 return (0); 3772 } 3773 3774 static int 3775 cache_fplookup_next(struct cache_fpl *fpl) 3776 { 3777 struct componentname *cnp; 3778 struct namecache *ncp; 3779 struct negstate *negstate; 3780 struct vnode *dvp, *tvp; 3781 u_char nc_flag; 3782 uint32_t hash; 3783 bool neg_hot; 3784 3785 cnp = fpl->cnp; 3786 dvp = fpl->dvp; 3787 3788 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 3789 return (cache_fplookup_dot(fpl)); 3790 } 3791 3792 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3793 3794 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3795 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3796 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 3797 break; 3798 } 3799 3800 /* 3801 * If there is no entry we have to punt to the slow path to perform 3802 * actual lookup. Should there be nothing with this name a negative 3803 * entry will be created. 3804 */ 3805 if (__predict_false(ncp == NULL)) { 3806 return (cache_fpl_partial(fpl)); 3807 } 3808 3809 tvp = atomic_load_ptr(&ncp->nc_vp); 3810 nc_flag = atomic_load_char(&ncp->nc_flag); 3811 if ((nc_flag & NCF_NEGATIVE) != 0) { 3812 /* 3813 * If they want to create an entry we need to replace this one. 
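 * (that is, any nameiop other than LOOKUP, such as CREATE or RENAME; a
 * negative entry cannot satisfy those, so the work is left to the locked
 * lookup by returning cache_fpl_partial below)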
3814 */ 3815 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 3816 return (cache_fpl_partial(fpl)); 3817 } 3818 negstate = NCP2NEGSTATE(ncp); 3819 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0); 3820 if (__predict_false(!cache_ncp_canuse(ncp))) { 3821 return (cache_fpl_partial(fpl)); 3822 } 3823 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 3824 return (cache_fpl_partial(fpl)); 3825 } 3826 if (!neg_hot) { 3827 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 3828 } 3829 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 3830 ncp->nc_name); 3831 counter_u64_add(numneghits, 1); 3832 cache_fpl_smr_exit(fpl); 3833 return (cache_fpl_handled(fpl, ENOENT)); 3834 } 3835 3836 if (__predict_false(!cache_ncp_canuse(ncp))) { 3837 return (cache_fpl_partial(fpl)); 3838 } 3839 3840 fpl->tvp = tvp; 3841 fpl->tvp_seqc = vn_seqc_read_any(tvp); 3842 if (seqc_in_modify(fpl->tvp_seqc)) { 3843 return (cache_fpl_partial(fpl)); 3844 } 3845 3846 if (!cache_fplookup_vnode_supported(tvp)) { 3847 return (cache_fpl_partial(fpl)); 3848 } 3849 3850 counter_u64_add(numposhits, 1); 3851 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 3852 return (0); 3853 } 3854 3855 static bool 3856 cache_fplookup_mp_supported(struct mount *mp) 3857 { 3858 3859 if (mp == NULL) 3860 return (false); 3861 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 3862 return (false); 3863 return (true); 3864 } 3865 3866 /* 3867 * Walk up the mount stack (if any). 3868 * 3869 * Correctness is provided in the following ways: 3870 * - all vnodes are protected from freeing with SMR 3871 * - struct mount objects are type stable making them always safe to access 3872 * - stability of the particular mount is provided by busying it 3873 * - relationship between the vnode which is mounted on and the mount is 3874 * verified with the vnode sequence counter after busying 3875 * - association between root vnode of the mount and the mount is protected 3876 * by busy 3877 * 3878 * From that point on we can read the sequence counter of the root vnode 3879 * and get the next mount on the stack (if any) using the same protection. 3880 * 3881 * By the end of successful walk we are guaranteed the reached state was 3882 * indeed present at least at some point which matches the regular lookup. 
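 *
 * Illustrative example (filesystem names are arbitrary): with a tmpfs
 * mounted on /mnt and a nullfs mounted on top of the tmpfs root, a lookup
 * crossing /mnt arrives here with tvp being the covered /mnt vnode; the loop
 * then busies the tmpfs, reads its root vnode, notices that vnode is covered
 * as well, busies the nullfs and finally hands back its root as the new tvp.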
3883 */ 3884 static int __noinline 3885 cache_fplookup_climb_mount(struct cache_fpl *fpl) 3886 { 3887 struct mount *mp, *prev_mp; 3888 struct vnode *vp; 3889 seqc_t vp_seqc; 3890 3891 vp = fpl->tvp; 3892 vp_seqc = fpl->tvp_seqc; 3893 3894 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 3895 mp = atomic_load_ptr(&vp->v_mountedhere); 3896 if (mp == NULL) 3897 return (0); 3898 3899 prev_mp = NULL; 3900 for (;;) { 3901 if (!vfs_op_thread_enter_crit(mp)) { 3902 if (prev_mp != NULL) 3903 vfs_op_thread_exit_crit(prev_mp); 3904 return (cache_fpl_partial(fpl)); 3905 } 3906 if (prev_mp != NULL) 3907 vfs_op_thread_exit_crit(prev_mp); 3908 if (!vn_seqc_consistent(vp, vp_seqc)) { 3909 vfs_op_thread_exit_crit(mp); 3910 return (cache_fpl_partial(fpl)); 3911 } 3912 if (!cache_fplookup_mp_supported(mp)) { 3913 vfs_op_thread_exit_crit(mp); 3914 return (cache_fpl_partial(fpl)); 3915 } 3916 vp = atomic_load_ptr(&mp->mnt_rootvnode); 3917 if (vp == NULL || VN_IS_DOOMED(vp)) { 3918 vfs_op_thread_exit_crit(mp); 3919 return (cache_fpl_partial(fpl)); 3920 } 3921 vp_seqc = vn_seqc_read_any(vp); 3922 if (seqc_in_modify(vp_seqc)) { 3923 vfs_op_thread_exit_crit(mp); 3924 return (cache_fpl_partial(fpl)); 3925 } 3926 prev_mp = mp; 3927 mp = atomic_load_ptr(&vp->v_mountedhere); 3928 if (mp == NULL) 3929 break; 3930 } 3931 3932 vfs_op_thread_exit_crit(prev_mp); 3933 fpl->tvp = vp; 3934 fpl->tvp_seqc = vp_seqc; 3935 return (0); 3936 } 3937 3938 static bool 3939 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 3940 { 3941 struct mount *mp; 3942 struct vnode *vp; 3943 3944 vp = fpl->tvp; 3945 3946 /* 3947 * Hack: while this is a union, the pointer tends to be NULL so save on 3948 * a branch. 3949 */ 3950 mp = atomic_load_ptr(&vp->v_mountedhere); 3951 if (mp == NULL) 3952 return (false); 3953 if (vp->v_type == VDIR) 3954 return (true); 3955 return (false); 3956 } 3957 3958 /* 3959 * Parse the path. 3960 * 3961 * The code is mostly copy-pasted from regular lookup, see lookup(). 3962 * The structure is maintained along with comments for easier maintenance. 3963 * Deduplicating the code will become feasible after fast path lookup 3964 * becomes more feature-complete. 3965 */ 3966 static int 3967 cache_fplookup_parse(struct cache_fpl *fpl) 3968 { 3969 struct nameidata *ndp; 3970 struct componentname *cnp; 3971 char *cp; 3972 3973 ndp = fpl->ndp; 3974 cnp = fpl->cnp; 3975 3976 /* 3977 * Search a new directory. 3978 * 3979 * The last component of the filename is left accessible via 3980 * cnp->cn_nameptr for callers that need the name. Callers needing 3981 * the name set the SAVENAME flag. When done, they assume 3982 * responsibility for freeing the pathname buffer. 3983 */ 3984 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 3985 continue; 3986 cnp->cn_namelen = cp - cnp->cn_nameptr; 3987 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 3988 cache_fpl_smr_exit(fpl); 3989 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 3990 } 3991 ndp->ni_pathlen -= cnp->cn_namelen; 3992 KASSERT(ndp->ni_pathlen <= PATH_MAX, 3993 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 3994 ndp->ni_next = cp; 3995 3996 /* 3997 * Replace multiple slashes by a single slash and trailing slashes 3998 * by a null. This must be done before VOP_LOOKUP() because some 3999 * fs's don't know about trailing slashes. Remember if there were 4000 * trailing slashes to handle symlinks, existing non-directories 4001 * and non-existing files that won't be directories specially later. 
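 *
 * For example "a//b" is consumed as if it were "a/b".  A trailing slash
 * ("a/b/") is not transformed here: the fast path punts that case to the
 * regular lookup (see the TODO below) rather than modifying the copied-in
 * pathname the way the regular lookup does.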
4002 */ 4003 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 4004 cp++; 4005 ndp->ni_pathlen--; 4006 if (*cp == '\0') { 4007 /* 4008 * TODO 4009 * Regular lookup performs the following: 4010 * *ndp->ni_next = '\0'; 4011 * cnp->cn_flags |= TRAILINGSLASH; 4012 * 4013 * Which is problematic since it modifies data read 4014 * from userspace. Then if fast path lookup was to 4015 * abort we would have to either restore it or convey 4016 * the flag. Since this is a corner case just ignore 4017 * it for simplicity. 4018 */ 4019 return (cache_fpl_partial(fpl)); 4020 } 4021 } 4022 ndp->ni_next = cp; 4023 4024 /* 4025 * Check for degenerate name (e.g. / or "") 4026 * which is a way of talking about a directory, 4027 * e.g. like "/." or ".". 4028 * 4029 * TODO 4030 * Another corner case handled by the regular lookup 4031 */ 4032 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 4033 return (cache_fpl_partial(fpl)); 4034 } 4035 return (0); 4036 } 4037 4038 static void 4039 cache_fplookup_parse_advance(struct cache_fpl *fpl) 4040 { 4041 struct nameidata *ndp; 4042 struct componentname *cnp; 4043 4044 ndp = fpl->ndp; 4045 cnp = fpl->cnp; 4046 4047 cnp->cn_nameptr = ndp->ni_next; 4048 while (*cnp->cn_nameptr == '/') { 4049 cnp->cn_nameptr++; 4050 ndp->ni_pathlen--; 4051 } 4052 } 4053 4054 static int __noinline 4055 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 4056 { 4057 4058 switch (error) { 4059 case EAGAIN: 4060 /* 4061 * Can happen when racing against vgone. 4062 * */ 4063 case EOPNOTSUPP: 4064 cache_fpl_partial(fpl); 4065 break; 4066 default: 4067 /* 4068 * See the API contract for VOP_FPLOOKUP_VEXEC. 4069 */ 4070 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 4071 error = cache_fpl_aborted(fpl); 4072 } else { 4073 cache_fpl_smr_exit(fpl); 4074 cache_fpl_handled(fpl, error); 4075 } 4076 break; 4077 } 4078 return (error); 4079 } 4080 4081 static int 4082 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 4083 { 4084 struct nameidata *ndp; 4085 struct componentname *cnp; 4086 struct mount *mp; 4087 int error; 4088 4089 error = CACHE_FPL_FAILED; 4090 ndp = fpl->ndp; 4091 cnp = fpl->cnp; 4092 4093 cache_fpl_checkpoint(fpl, &fpl->snd); 4094 4095 fpl->dvp = dvp; 4096 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 4097 if (seqc_in_modify(fpl->dvp_seqc)) { 4098 cache_fpl_aborted(fpl); 4099 goto out; 4100 } 4101 mp = atomic_load_ptr(&fpl->dvp->v_mount); 4102 if (!cache_fplookup_mp_supported(mp)) { 4103 cache_fpl_aborted(fpl); 4104 goto out; 4105 } 4106 4107 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 4108 4109 for (;;) { 4110 error = cache_fplookup_parse(fpl); 4111 if (__predict_false(error != 0)) { 4112 break; 4113 } 4114 4115 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 4116 4117 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); 4118 if (__predict_false(error != 0)) { 4119 error = cache_fplookup_failed_vexec(fpl, error); 4120 break; 4121 } 4122 4123 if (__predict_false(cache_fpl_isdotdot(cnp))) { 4124 error = cache_fplookup_dotdot(fpl); 4125 if (__predict_false(error != 0)) { 4126 break; 4127 } 4128 } else { 4129 error = cache_fplookup_next(fpl); 4130 if (__predict_false(error != 0)) { 4131 break; 4132 } 4133 4134 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 4135 4136 if (cache_fplookup_need_climb_mount(fpl)) { 4137 error = cache_fplookup_climb_mount(fpl); 4138 if (__predict_false(error != 0)) { 4139 break; 4140 } 4141 } 4142 } 4143 4144 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 4145 4146 if (cache_fpl_islastcn(ndp)) { 4147 
			error = cache_fplookup_final(fpl);
			break;
		}

		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
			break;
		}

		fpl->dvp = fpl->tvp;
		fpl->dvp_seqc = fpl->tvp_seqc;

		cache_fplookup_parse_advance(fpl);
		cache_fpl_checkpoint(fpl, &fpl->snd);
	}
out:
	switch (fpl->status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		cache_fpl_smr_assert_entered(fpl);
		return (cache_fplookup_partial_setup(fpl));
	case CACHE_FPL_STATUS_ABORTED:
		if (fpl->in_smr)
			cache_fpl_smr_exit(fpl);
		return (CACHE_FPL_FAILED);
	case CACHE_FPL_STATUS_HANDLED:
		MPASS(error != CACHE_FPL_FAILED);
		cache_fpl_smr_assert_not_entered(fpl);
		if (__predict_false(error != 0)) {
			ndp->ni_dvp = NULL;
			ndp->ni_vp = NULL;
			cache_fpl_cleanup_cnp(cnp);
			return (error);
		}
		ndp->ni_dvp = fpl->dvp;
		ndp->ni_vp = fpl->tvp;
		if (cnp->cn_flags & SAVENAME)
			cnp->cn_flags |= HASBUF;
		else
			cache_fpl_cleanup_cnp(cnp);
		return (error);
	}
}

/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the
 * criteria outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 * vn_lock(current);
 * for (;;) {
 *	next = find();
 *	vn_lock(next);
 *	vn_unlock(current);
 *	current = next;
 *	if (last)
 *	    break;
 * }
 * return (current);
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding the respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions still
 * held as we made the jump. This includes things like permissions, mount
 * points, etc. Counter modification is provided by enclosing relevant places
 * in vn_seqc_write_begin()/end() calls.
 *
 * Thus this translates to:
 *
 * vfs_smr_enter();
 * dvp_seqc = seqc_read_any(dvp);
 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 *     abort();
 * for (;;) {
 *	tvp = find();
 *	tvp_seqc = seqc_read_any(tvp);
 *	if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 *	    abort();
 *	if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 *	    abort();
 *	dvp = tvp; // we know nothing of importance has changed
 *	dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 *	if (last)
 *	    break;
 * }
 * vget(); // secure the vnode
 * if (!seqc_consistent(tvp, tvp_seqc)) // final check
 *     abort();
 * // at this point we know nothing has changed for any parent<->child pair
 * // crossed during the lookup, meaning we matched the guarantee of the
 * // locked variant
 * return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called from within vfs_smr protection, which they must never exit
 * - EAGAIN can be returned to denote that checking could not be performed; it
 *   is always valid to return it
 * - if the sequence counter has not changed, the result must be valid
 * - if the sequence counter has changed, both false positives and false
 *   negatives are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *   (an illustrative sketch follows cache_fplookup() below)
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning that, absent other
 *   means, it should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 */
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	struct nameidata_saved orig;
	int error;

	MPASS(ndp->ni_lcf == 0);

	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.ndp = ndp;
	fpl.cnp = &ndp->ni_cnd;
	MPASS(curthread == fpl.cnp->cn_thread);

	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
		MPASS(fpl.cnp->cn_nameiop != LOOKUP);

	if (!cache_can_fplookup(&fpl)) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		*status = fpl.status;
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint(&fpl, &orig);

	cache_fpl_smr_enter_initial(&fpl);
	pwd = pwd_get_smr();
	fpl.pwd = pwd;
	ndp->ni_rootdir = pwd->pwd_rdir;
	ndp->ni_topdir = pwd->pwd_jdir;

	cnp = fpl.cnp;
	cnp->cn_nameptr = cnp->cn_pnbuf;
	if (cnp->cn_pnbuf[0] == '/') {
		cache_fpl_handle_root(ndp, &dvp);
	} else {
		MPASS(ndp->ni_dirfd == AT_FDCWD);
		dvp = pwd->pwd_cdir;
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);

	error = cache_fplookup_impl(dvp, &fpl);
	cache_fpl_smr_assert_not_entered(&fpl);
	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);

	*status = fpl.status;
	switch (fpl.status) {
	case CACHE_FPL_STATUS_UNSET:
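		/*
		 * cache_fplookup_impl() always leaves the lookup in a
		 * terminal state, so this status cannot be observed here.
		 */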
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_HANDLED:
		SDT_PROBE3(vfs, namei, lookup, return, error,
		    (error == 0 ? ndp->ni_vp : NULL), true);
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		*pwdp = fpl.pwd;
		/*
		 * Status restored by cache_fplookup_partial_setup.
		 */
		break;
	case CACHE_FPL_STATUS_ABORTED:
		cache_fpl_restore(&fpl, &orig);
		break;
	}
	return (error);
}
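
/*
 * Illustrative sketch, not compiled in: one way a filesystem could satisfy
 * the VOP_FPLOOKUP_VEXEC contract documented above for plain unix permission
 * checks. "foofs", struct foonode and its fields are hypothetical stand-ins
 * for filesystem-specific code; vaccess_vexec_smr(), the atomic_load_ptr()
 * fetch of ->v_data and the EAGAIN convention are the pieces taken from the
 * contract and caveats above.
 *
 * static int
 * foofs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 * {
 *	struct foonode *np;
 *
 *	// ->v_data may be cleared by a concurrent VOP_RECLAIM; load it
 *	// atomically and fall back to the slow path if it is gone.
 *	np = atomic_load_ptr(&v->a_vp->v_data);
 *	if (__predict_false(np == NULL))
 *		return (EAGAIN);
 *	// A stale mode/uid/gid is acceptable: if the vnode changed, the
 *	// sequence counter check performed by the caller rejects the result.
 *	return (vaccess_vexec_smr(np->fn_mode, np->fn_uid, np->fn_gid,
 *	    v->a_cred));
 * }
 *
 * The routine never exits SMR protection and never blocks, matching the
 * caveats listed before cache_fplookup().
 */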