/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <sys/capsicum.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
		struct negstate nu_neg;/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct timespec nc_time;	/* timespec provided by fs */
	struct timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned.  Since other
 * architectures may be in the same spot, suffer a little bit and enforce
 * the alignment for everyone.  Note this is a nop for 64-bit platforms.
 */
#define	CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
#define	CACHE_PATH_CUTOFF	39

#define	CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define	CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define	CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
#define	CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

#define	nc_vp		n_un.nu_vp
#define	nc_neg		n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define	NEG_HOT		0x01

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
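 *
 * The check is meant to pair with cache_ncp_invalidate() above.  Roughly:
 *
 *	lockless reader			entry teardown
 *	---------------			--------------
 *	read nc_dvp, nc_name, ...	set NCF_INVALID, release fence
 *	acquire fence, test nc_flag	free/modify the entry
 *
 * so a reader which still observes NCF_INVALID and NCF_WIP as clear can
 * trust the fields it has already read.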
 */
static bool
cache_ncp_canuse(struct namecache *ncp)
{

	atomic_thread_fence_acq();
	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
}

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	mtx	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks.  The locking
 * order is lower address first.  Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be locked to provide safe operation against other threads modifying the
 * cache.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case, this could
 * deadlock.  As such, this is resolved by trylocking and if that fails unlocking
 * the first node, locking everything in order and revalidating the state.
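 *
 * For example (a sketch of the above): removing an entry typically requires
 * the vnode lock of the directory, the vnode lock of the target vnode (for a
 * positive entry) and the relevant bucketlock, with same-type locks taken
 * lower address first; see cache_zap_locked_vnode() and
 * cache_zap_unlocked_bucket() below for the trylock/relock fallback.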
 */

VFS_SMR_DECLARE;

/*
 * Structures associated with name caching.
 */
#define	NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");

static struct mtx __exclusive_cache_line	ncneg_shrink_lock;

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;
static struct neglist ncneg_hot;
static u_long numhotneg;

#define	ncneghash	3
#define	numneglists	(ncneghash + 1)
static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct mtx_padalign __read_mostly  *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
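 *
 * For example, with CACHE_PATH_CUTOFF at 39, a short component such as "lib"
 * looked up without timestamps is served from cache_zone_small, while a
 * 64-character name entered with timestamps comes from cache_zone_large_ts;
 * see cache_alloc() below for the selection.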
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

static struct namecache *
cache_alloc(int len, int ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	MPASS(ncp != NULL);
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	*tsp = ncp_ts->nc_time;
	*ticksp = ncp_ts->nc_ticks;
}

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");
#define	STATNODE_ULONG(name, descr)					\
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define	STATNODE_COUNTER(name, descr)					\
	static COUNTER_U64_DEFINE_EARLY(name);				\
	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
	    descr);
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
hits"); 449 STATNODE_COUNTER(nummiss, "Number of cache misses"); 450 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 451 STATNODE_COUNTER(numposzaps, 452 "Number of cache hits (positive) we do not want to cache"); 453 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 454 STATNODE_COUNTER(numnegzaps, 455 "Number of cache hits (negative) we do not want to cache"); 456 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 457 /* These count for vn_getcwd(), too. */ 458 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 459 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 460 STATNODE_COUNTER(numfullpathfail2, 461 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 462 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 463 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 464 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 465 "Number of successful removals after relocking"); 466 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 467 "Number of times zap_and_exit failed to lock"); 468 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 469 "Number of times zap_and_exit failed to lock"); 470 static long cache_lock_vnodes_cel_3_failures; 471 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 472 "Number of times 3-way vnode locking failed"); 473 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 474 STATNODE_COUNTER(numneg_evicted, 475 "Number of negative entries evicted when adding a new entry"); 476 STATNODE_COUNTER(shrinking_skipped, 477 "Number of times shrinking was already in progress"); 478 479 static void cache_zap_locked(struct namecache *ncp); 480 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, 481 char **freebuf, size_t *buflen); 482 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 483 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend); 484 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 485 char **retbuf, size_t *buflen); 486 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 487 char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 488 489 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 490 491 static inline void 492 cache_assert_vlp_locked(struct mtx *vlp) 493 { 494 495 if (vlp != NULL) 496 mtx_assert(vlp, MA_OWNED); 497 } 498 499 static inline void 500 cache_assert_vnode_locked(struct vnode *vp) 501 { 502 struct mtx *vlp; 503 504 vlp = VP2VNODELOCK(vp); 505 cache_assert_vlp_locked(vlp); 506 } 507 508 /* 509 * TODO: With the value stored we can do better than computing the hash based 510 * on the address. The choice of FNV should also be revisited. 
 */
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}

static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct mtx *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_OWNED);
}

static void
cache_assert_bucket_unlocked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_NOTOWNED);
}
#else
#define	cache_assert_bucket_locked(x) do { } while (0)
#define	cache_assert_bucket_unlocked(x) do { } while (0)
#endif

#define	cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_lock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_unlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * A variation of LRU scheme is used.  New entries are hashed into one of
 * numneglists cold lists.  Entries get promoted to the hot list on first hit.
 *
 * The shrinker will demote hot list head and evict from the cold list in a
 * round-robin manner.
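 *
 * For example, a freshly added negative entry starts out on one of the cold
 * lists (picked by NCP2NEGLIST()); the first lookup which hits it moves it to
 * ncneg_hot in cache_negative_hit(), and cache_negative_zap_one() later
 * demotes the head of the hot list back to its cold list before picking a
 * cold entry to evict.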
 */
static void
cache_negative_init(struct namecache *ncp)
{
	struct negstate *negstate;

	ncp->nc_flag |= NCF_NEGATIVE;
	negstate = NCP2NEGSTATE(ncp);
	negstate->neg_flag = 0;
}

static void
cache_negative_hit(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;

	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0)
		return;
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&ncneg_hot.nl_lock);
	mtx_lock(&neglist->nl_lock);
	if ((negstate->neg_flag & NEG_HOT) == 0) {
		numhotneg++;
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
		negstate->neg_flag |= NEG_HOT;
	}
	mtx_unlock(&neglist->nl_lock);
	mtx_unlock(&ncneg_hot.nl_lock);
}

static void
cache_negative_insert(struct namecache *ncp)
{
	struct neglist *neglist;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp);
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&neglist->nl_lock);
	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
	mtx_unlock(&neglist->nl_lock);
	atomic_add_long(&numneg, 1);
}

static void
cache_negative_remove(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;
	bool hot_locked = false;
	bool list_locked = false;

	cache_assert_bucket_locked(ncp);
	neglist = NCP2NEGLIST(ncp);
	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		hot_locked = true;
		mtx_lock(&ncneg_hot.nl_lock);
		if ((negstate->neg_flag & NEG_HOT) == 0) {
			list_locked = true;
			mtx_lock(&neglist->nl_lock);
		}
	} else {
		list_locked = true;
		mtx_lock(&neglist->nl_lock);
		/*
		 * We may be racing against promotion in lockless lookup.
		 */
		if ((negstate->neg_flag & NEG_HOT) != 0) {
			mtx_unlock(&neglist->nl_lock);
			hot_locked = true;
			mtx_lock(&ncneg_hot.nl_lock);
			mtx_lock(&neglist->nl_lock);
		}
	}
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		numhotneg--;
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
	}
	if (list_locked)
		mtx_unlock(&neglist->nl_lock);
	if (hot_locked)
		mtx_unlock(&ncneg_hot.nl_lock);
	atomic_subtract_long(&numneg, 1);
}

static void
cache_negative_shrink_select(struct namecache **ncpp,
    struct neglist **neglistpp)
{
	struct neglist *neglist;
	struct namecache *ncp;
	static u_int cycle;
	u_int i;

	*ncpp = ncp = NULL;

	for (i = 0; i < numneglists; i++) {
		neglist = &neglists[(cycle + i) % numneglists];
		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
			continue;
		mtx_lock(&neglist->nl_lock);
		ncp = TAILQ_FIRST(&neglist->nl_list);
		if (ncp != NULL)
			break;
		mtx_unlock(&neglist->nl_lock);
	}

	*neglistpp = neglist;
	*ncpp = ncp;
	cycle++;
}

static void
cache_negative_zap_one(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *neglist;
	struct negstate *negstate;
	struct mtx *dvlp;
	struct mtx *blp;

	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
	    !mtx_trylock(&ncneg_shrink_lock)) {
		counter_u64_add(shrinking_skipped, 1);
		return;
	}

	mtx_lock(&ncneg_hot.nl_lock);
	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
	if (ncp != NULL) {
		neglist = NCP2NEGLIST(ncp);
		negstate = NCP2NEGSTATE(ncp);
		mtx_lock(&neglist->nl_lock);
		MPASS((negstate->neg_flag & NEG_HOT) != 0);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
		negstate->neg_flag &= ~NEG_HOT;
		numhotneg--;
		mtx_unlock(&neglist->nl_lock);
	}
	mtx_unlock(&ncneg_hot.nl_lock);

	cache_negative_shrink_select(&ncp, &neglist);

	mtx_unlock(&ncneg_shrink_lock);
	if (ncp == NULL)
		return;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	blp = NCP2BUCKETLOCK(ncp);
	mtx_unlock(&neglist->nl_lock);
	mtx_lock(dvlp);
	mtx_lock(blp);
	/*
	 * Enter SMR to safely check the negative list.
	 * Even if the found pointer matches, the entry may now be reallocated
	 * and used by a different vnode.
	 */
	vfs_smr_enter();
	ncp2 = TAILQ_FIRST(&neglist->nl_list);
	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
	    blp != NCP2BUCKETLOCK(ncp2)) {
		vfs_smr_exit();
		ncp = NULL;
	} else {
		vfs_smr_exit();
		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(numneg_evicted, 1);
	}
	mtx_unlock(blp);
	mtx_unlock(dvlp);
	if (ncp != NULL)
		cache_free(ncp);
}

/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_vp);
			ncp->nc_vp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_vp);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_negative_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_dvp);
			ncp->nc_dvp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_dvp);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			counter_u64_add(numcachehv, -1);
		}
	}
	atomic_subtract_long(&numcache, 1);
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct mtx *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

static int __noinline
cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct mtx *blp;
	int error = 0;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	pvlp = VP2VNODELOCK(vp);
	if (ncp->nc_flag & NCF_NEGATIVE) {
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		goto out;
	}

	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);
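
	/*
	 * pvlp is already held and is one of the two vnode locks required
	 * below.  The other lock may only be blindly acquired if it sorts
	 * after pvlp; otherwise it is merely trylocked and, on failure,
	 * everything is dropped and EAGAIN is returned so the caller can
	 * retry.
	 */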
	cache_sort_vnodes(&vlp1, &vlp2);
	if (vlp1 == pvlp) {
		mtx_lock(vlp2);
		to_unlock = vlp2;
	} else {
		if (!mtx_trylock(vlp1)) {
			/*
			 * TODO: Very wasteful but rare.
			 */
			mtx_unlock(pvlp);
			mtx_lock(vlp1);
			mtx_lock(vlp2);
			mtx_unlock(vlp2);
			mtx_unlock(vlp1);
			return (EAGAIN);
		}
		to_unlock = vlp1;
	}
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
	mtx_unlock(to_unlock);
out:
	mtx_unlock(pvlp);
	return (error);
}

/*
 * If trylocking failed we can get here.  We know enough to take all needed locks
 * in the right order and re-lookup the entry.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct mtx *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_unlocked(ncp);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	mtx_lock(blp);
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
		return (0);
	}

	mtx_unlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct mtx *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	mtx_unlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct mtx *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			vn_seqc_write_begin(dvp);
			dvp->v_cache_dd = NULL;
			vn_seqc_write_end(dvp);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
		return (1);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (ncp == NULL) {
		mtx_unlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail++;
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
	cache_free(ncp);
	return (1);
out_no_entry:
	counter_u64_add(nummisszap, 1);
	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
	return (0);
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static int __noinline
cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct mtx *dvlp;
	enum vgetstate vs;
	int error, ltype;
	bool whiteout;

	MPASS((cnp->cn_flags & ISDOTDOT) != 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	counter_u64_add(dotdothits, 1);
retry:
	dvlp = VP2VNODELOCK(dvp);
	mtx_lock(dvlp);
	ncp = dvp->v_cache_dd;
	if (ncp == NULL) {
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
		mtx_unlock(dvlp);
		return (0);
	}
	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
		if (ncp->nc_flag & NCF_NEGATIVE)
			*vpp = NULL;
		else
			*vpp = ncp->nc_vp;
	} else
		*vpp = ncp->nc_dvp;
	if (*vpp == NULL)
		goto negative_success;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
	    NCF_DTS && tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		*tsp = ncp_ts->nc_dotdottime;
	}

	MPASS(dvp != *vpp);
	ltype = VOP_ISLOCKED(dvp);
	VOP_UNLOCK(dvp);
	vs = vget_prep(*vpp);
	mtx_unlock(dvlp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	vn_lock(dvp, ltype | LK_RETRY);
	if (VN_IS_DOOMED(dvp)) {
		if (error == 0)
			vput(*vpp);
		*vpp = NULL;
		return (ENOENT);
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			error = cache_zap_locked_vnode(ncp, dvp);
			if (__predict_false(error != 0)) {
				zap_and_exit_bucket_fail2++;
				goto retry;
			}
			cache_free(ncp);
			return (0);
		}
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_negative_hit(ncp);
	mtx_unlock(dvlp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it will
 *		be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * Either both tsp and ticks have to be provided or neither of them.
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  vpp will not be modified.  If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
static int __noinline
cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct mtx *blp;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout;

	MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY);

retry:
	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		mtx_unlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	vs = vget_prep(*vpp);
	mtx_unlock(blp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			error = cache_zap_locked_vnode(ncp, dvp);
			if (__predict_false(error != 0)) {
				zap_and_exit_bucket_fail2++;
				goto retry;
			}
			cache_free(ncp);
			return (0);
		}
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_negative_hit(ncp);
	mtx_unlock(blp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}
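
/*
 * Lockless variant of the lookup: the hash chain is walked under SMR
 * protection without taking any cache locks.  If the found entry cannot be
 * used safely (it is being modified, the SMR-aware vget fails, or a cold
 * negative entry would need promotion), the code falls back to the locked
 * cache_lookup_fallback() above.
 */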
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct negstate *negstate;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout;
	u_short nc_flag;

	MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
			return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
	}

	MPASS((cnp->cn_flags & ISDOTDOT) == 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	vfs_smr_enter();

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		vfs_smr_exit();
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	nc_flag = atomic_load_char(&ncp->nc_flag);
	if (nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	if (!cache_ncp_canuse(ncp)) {
		vfs_smr_exit();
		*vpp = NULL;
		goto out_fallback;
	}
	vs = vget_prep_smr(*vpp);
	vfs_smr_exit();
	if (__predict_false(vs == VGET_NONE)) {
		*vpp = NULL;
		goto out_fallback;
	}
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto out_fallback;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			vfs_smr_exit();
			goto out_fallback;
		}
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	/*
	 * TODO: We need to take locks to promote an entry.  Code doing it
	 * in SMR lookup can be modified to be shared.
	 */
	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) == 0 ||
	    !cache_ncp_canuse(ncp)) {
		vfs_smr_exit();
		goto out_fallback;
	}
	vfs_smr_exit();
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
out_fallback:
	return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
}

struct celockstate {
	struct	mtx *vlp[3];
	struct	mtx *blp[2];
};
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));

static inline void
cache_celockstate_init(struct celockstate *cel)
{

	bzero(cel, sizeof(*cel));
}

static void
cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
    struct vnode *dvp)
{
	struct mtx *vlp1, *vlp2;

	MPASS(cel->vlp[0] == NULL);
	MPASS(cel->vlp[1] == NULL);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL || dvp != NULL);

	vlp1 = VP2VNODELOCK(vp);
	vlp2 = VP2VNODELOCK(dvp);
	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		mtx_lock(vlp1);
		cel->vlp[0] = vlp1;
	}
	mtx_lock(vlp2);
	cel->vlp[1] = vlp2;
}

static void
cache_unlock_vnodes_cel(struct celockstate *cel)
{

	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);

	if (cel->vlp[0] != NULL)
		mtx_unlock(cel->vlp[0]);
	if (cel->vlp[1] != NULL)
		mtx_unlock(cel->vlp[1]);
	if (cel->vlp[2] != NULL)
		mtx_unlock(cel->vlp[2]);
}

static bool
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
{
	struct mtx *vlp;
	bool ret;

	cache_assert_vlp_locked(cel->vlp[0]);
	cache_assert_vlp_locked(cel->vlp[1]);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL);
	vlp = VP2VNODELOCK(vp);

	ret = true;
	if (vlp >= cel->vlp[1]) {
		mtx_lock(vlp);
	} else {
		if (mtx_trylock(vlp))
			goto out;
		cache_lock_vnodes_cel_3_failures++;
		cache_unlock_vnodes_cel(cel);
		if (vlp < cel->vlp[0]) {
			mtx_lock(vlp);
			mtx_lock(cel->vlp[0]);
			mtx_lock(cel->vlp[1]);
		} else {
			if (cel->vlp[0] != NULL)
				mtx_lock(cel->vlp[0]);
			mtx_lock(vlp);
			mtx_lock(cel->vlp[1]);
		}
		ret = false;
	}
out:
	cel->vlp[2] = vlp;
	return (ret);
}

static void
cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
    struct mtx *blp2)
{

	MPASS(cel->blp[0] == NULL);
	MPASS(cel->blp[1] == NULL);

	cache_sort_vnodes(&blp1, &blp2);

	if (blp1 != NULL) {
		mtx_lock(blp1);
		cel->blp[0] = blp1;
	}
	mtx_lock(blp2);
	cel->blp[1] = blp2;
}

static void
cache_unlock_buckets_cel(struct celockstate *cel)
{

	if (cel->blp[0] != NULL)
		mtx_unlock(cel->blp[0]);
	mtx_unlock(cel->blp[1]);
}

/*
 * Lock part of the cache affected by the insertion.
 *
 * This means vnodelocks for dvp, vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry.  In this
 * case we have an additional vnode and bucketlock pair to lock.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 * preserving the locking order (smaller address first).
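 *
 * Typical use of these helpers (as in cache_enter_time() below):
 *
 *	struct celockstate cel;
 *
 *	cache_celockstate_init(&cel);
 *	cache_enter_lock(&cel, dvp, vp, hash);
 *	... insert and/or zap entries ...
 *	cache_enter_unlock(&cel);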
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct mtx *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		if (vp == NULL || vp->v_type != VDIR)
			break;
		ncp = vp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == vp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		/*
		 * All vnodes got re-locked.  Re-validate the state and if
		 * nothing changed we are done.  Otherwise restart.
		 */
		if (ncp == vp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct mtx *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == dvp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		if (ncp == dvp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}

static void __noinline
cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp)
{
	struct celockstate cel;
	struct namecache *ncp;
	uint32_t hash;
	int len;

	if (dvp->v_cache_dd == NULL)
		return;
	len = cnp->cn_namelen;
	cache_celockstate_init(&cel);
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	cache_enter_lock_dd(&cel, dvp, vp, hash);
	vn_seqc_write_begin(dvp);
	ncp = dvp->v_cache_dd;
	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
		cache_zap_locked(ncp);
	} else {
		ncp = NULL;
	}
	dvp->v_cache_dd = NULL;
	vn_seqc_write_end(dvp);
	cache_enter_unlock(&cel);
	if (ncp != NULL)
		cache_free(ncp);
}

/*
 * Add an entry to the cache.
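 *
 * A NULL vp creates a negative entry; if ISWHITEOUT is set in the
 * componentname, the entry is additionally marked NCF_WHITE.  Filesystems
 * typically get here via the cache_enter() wrapper, which passes NULL
 * timestamps.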
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
	struct celockstate cel;
	struct namecache *ncp, *n2, *ndd;
	struct namecache_ts *ncp_ts;
	struct nchashhead *ncpp;
	uint32_t hash;
	int flag;
	int len;
	u_long lnumcache;

	VNPASS(!VN_IS_DOOMED(dvp), dvp);
	VNPASS(dvp->v_type != VNON, dvp);
	if (vp != NULL) {
		VNPASS(!VN_IS_DOOMED(vp), vp);
		VNPASS(vp->v_type != VNON, vp);
	}

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache))
		return;
#endif

	flag = 0;
	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return;
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			cache_enter_dotdot_prep(dvp, vp, cnp);
			flag = NCF_ISDOTDOT;
		}
	}

	/*
	 * Avoid blowout in namecache entries.
	 */
	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
	if (__predict_false(lnumcache >= ncsize)) {
		atomic_subtract_long(&numcache, 1);
		counter_u64_add(numdrops, 1);
		return;
	}

	cache_celockstate_init(&cel);
	ndd = NULL;
	ncp_ts = NULL;

	/*
	 * Calculate the hash key and setup as much of the new
	 * namecache entry as possible before acquiring the lock.
	 */
	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
	ncp->nc_flag = flag | NCF_WIP;
	ncp->nc_vp = vp;
	if (vp == NULL)
		cache_negative_init(ncp);
	ncp->nc_dvp = dvp;
	if (tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		ncp_ts->nc_time = *tsp;
		ncp_ts->nc_ticks = ticks;
		ncp_ts->nc_nc.nc_flag |= NCF_TS;
		if (dtsp != NULL) {
			ncp_ts->nc_dotdottime = *dtsp;
			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
		}
	}
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	memcpy(ncp->nc_name, cnp->cn_nameptr, len);
	ncp->nc_name[len] = '\0';
	cache_enter_lock(&cel, dvp, vp, hash);

	/*
	 * See if this vnode or negative entry is already in the cache
	 * with this name.  This can happen with concurrent lookups of
	 * the same path name.
	 */
	ncpp = NCHHASH(hash);
	CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
		if (n2->nc_dvp == dvp &&
		    n2->nc_nlen == cnp->cn_namelen &&
		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
			MPASS(cache_ncp_canuse(n2));
			if ((n2->nc_flag & NCF_NEGATIVE) != 0)
				KASSERT(vp == NULL,
				    ("%s: found entry pointing to a different vnode (%p != %p)",
				    __func__, NULL, vp));
			else
				KASSERT(n2->nc_vp == vp,
				    ("%s: found entry pointing to a different vnode (%p != %p)",
				    __func__, n2->nc_vp, vp));
			/*
			 * Entries are supposed to be immutable unless in the
			 * process of getting destroyed.  Accommodating for
			 * changing timestamps is possible but not worth it.
			 * This should be harmless in terms of correctness, in
			 * the worst case resulting in an earlier expiration.
			 * Alternatively, the found entry can be replaced
			 * altogether.
1962 */ 1963 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 1964 #if 0 1965 if (tsp != NULL) { 1966 KASSERT((n2->nc_flag & NCF_TS) != 0, 1967 ("no NCF_TS")); 1968 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1969 n2_ts->nc_time = ncp_ts->nc_time; 1970 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1971 if (dtsp != NULL) { 1972 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1973 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1974 } 1975 } 1976 #endif 1977 goto out_unlock_free; 1978 } 1979 } 1980 1981 if (flag == NCF_ISDOTDOT) { 1982 /* 1983 * See if we are trying to add .. entry, but some other lookup 1984 * has populated v_cache_dd pointer already. 1985 */ 1986 if (dvp->v_cache_dd != NULL) 1987 goto out_unlock_free; 1988 KASSERT(vp == NULL || vp->v_type == VDIR, 1989 ("wrong vnode type %p", vp)); 1990 vn_seqc_write_begin(dvp); 1991 dvp->v_cache_dd = ncp; 1992 vn_seqc_write_end(dvp); 1993 } 1994 1995 if (vp != NULL) { 1996 if (flag != NCF_ISDOTDOT) { 1997 /* 1998 * For this case, the cache entry maps both the 1999 * directory name in it and the name ".." for the 2000 * directory's parent. 2001 */ 2002 vn_seqc_write_begin(vp); 2003 if ((ndd = vp->v_cache_dd) != NULL) { 2004 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 2005 cache_zap_locked(ndd); 2006 else 2007 ndd = NULL; 2008 } 2009 vp->v_cache_dd = ncp; 2010 vn_seqc_write_end(vp); 2011 } else if (vp->v_type != VDIR) { 2012 if (vp->v_cache_dd != NULL) { 2013 vn_seqc_write_begin(vp); 2014 vp->v_cache_dd = NULL; 2015 vn_seqc_write_end(vp); 2016 } 2017 } 2018 } 2019 2020 if (flag != NCF_ISDOTDOT) { 2021 if (LIST_EMPTY(&dvp->v_cache_src)) { 2022 vhold(dvp); 2023 counter_u64_add(numcachehv, 1); 2024 } 2025 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 2026 } 2027 2028 /* 2029 * If the entry is "negative", we place it into the 2030 * "negative" cache queue, otherwise, we place it into the 2031 * destination vnode's cache entries queue. 2032 */ 2033 if (vp != NULL) { 2034 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2035 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2036 vp); 2037 } else { 2038 if (cnp->cn_flags & ISWHITEOUT) 2039 ncp->nc_flag |= NCF_WHITE; 2040 cache_negative_insert(ncp); 2041 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2042 ncp->nc_name); 2043 } 2044 2045 /* 2046 * Insert the new namecache entry into the appropriate chain 2047 * within the cache entries table. 2048 */ 2049 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2050 2051 atomic_thread_fence_rel(); 2052 /* 2053 * Mark the entry as fully constructed. 2054 * It is immutable past this point until its removal. 
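 *
 * The release fence above orders the stores initializing the entry (name,
 * vnode pointers, hash linkage) before NCF_WIP is cleared below; lockless
 * (SMR) readers pair with it via cache_ncp_canuse(), so they either skip
 * an entry still marked NCF_WIP or observe it fully constructed.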
2055 */ 2056 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2057 2058 cache_enter_unlock(&cel); 2059 if (numneg * ncnegfactor > lnumcache) 2060 cache_negative_zap_one(); 2061 if (ndd != NULL) 2062 cache_free(ndd); 2063 return; 2064 out_unlock_free: 2065 cache_enter_unlock(&cel); 2066 atomic_subtract_long(&numcache, 1); 2067 cache_free(ncp); 2068 return; 2069 } 2070 2071 static u_int 2072 cache_roundup_2(u_int val) 2073 { 2074 u_int res; 2075 2076 for (res = 1; res <= val; res <<= 1) 2077 continue; 2078 2079 return (res); 2080 } 2081 2082 static struct nchashhead * 2083 nchinittbl(u_long elements, u_long *hashmask) 2084 { 2085 struct nchashhead *hashtbl; 2086 u_long hashsize, i; 2087 2088 hashsize = cache_roundup_2(elements) / 2; 2089 2090 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2091 for (i = 0; i < hashsize; i++) 2092 CK_SLIST_INIT(&hashtbl[i]); 2093 *hashmask = hashsize - 1; 2094 return (hashtbl); 2095 } 2096 2097 static void 2098 ncfreetbl(struct nchashhead *hashtbl) 2099 { 2100 2101 free(hashtbl, M_VFSCACHE); 2102 } 2103 2104 /* 2105 * Name cache initialization, from vfs_init() when we are booting 2106 */ 2107 static void 2108 nchinit(void *dummy __unused) 2109 { 2110 u_int i; 2111 2112 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2113 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2114 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2115 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2116 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2117 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2118 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2119 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2120 2121 VFS_SMR_ZONE_SET(cache_zone_small); 2122 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2123 VFS_SMR_ZONE_SET(cache_zone_large); 2124 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2125 2126 ncsize = desiredvnodes * ncsizefactor; 2127 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2128 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2129 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2130 ncbuckethash = 7; 2131 if (ncbuckethash > nchash) 2132 ncbuckethash = nchash; 2133 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2134 M_WAITOK | M_ZERO); 2135 for (i = 0; i < numbucketlocks; i++) 2136 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2137 ncvnodehash = ncbuckethash; 2138 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2139 M_WAITOK | M_ZERO); 2140 for (i = 0; i < numvnodelocks; i++) 2141 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2142 2143 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 2144 M_WAITOK | M_ZERO); 2145 for (i = 0; i < numneglists; i++) { 2146 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2147 TAILQ_INIT(&neglists[i].nl_list); 2148 } 2149 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 2150 TAILQ_INIT(&ncneg_hot.nl_list); 2151 2152 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2153 } 2154 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2155 2156 void 2157 cache_vnode_init(struct vnode *vp) 2158 { 2159 2160 LIST_INIT(&vp->v_cache_src); 2161 TAILQ_INIT(&vp->v_cache_dst); 2162 vp->v_cache_dd = NULL; 2163 cache_prehash(vp); 2164 } 2165 2166 void 2167 cache_changesize(u_long newmaxvnodes) 2168 { 2169 struct 
nchashhead *new_nchashtbl, *old_nchashtbl; 2170 u_long new_nchash, old_nchash; 2171 struct namecache *ncp; 2172 uint32_t hash; 2173 u_long newncsize; 2174 int i; 2175 2176 newncsize = newmaxvnodes * ncsizefactor; 2177 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2178 if (newmaxvnodes < numbucketlocks) 2179 newmaxvnodes = numbucketlocks; 2180 2181 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2182 /* If same hash table size, nothing to do */ 2183 if (nchash == new_nchash) { 2184 ncfreetbl(new_nchashtbl); 2185 return; 2186 } 2187 /* 2188 * Move everything from the old hash table to the new table. 2189 * None of the namecache entries in the table can be removed 2190 * because to do so, they have to be removed from the hash table. 2191 */ 2192 cache_lock_all_vnodes(); 2193 cache_lock_all_buckets(); 2194 old_nchashtbl = nchashtbl; 2195 old_nchash = nchash; 2196 nchashtbl = new_nchashtbl; 2197 nchash = new_nchash; 2198 for (i = 0; i <= old_nchash; i++) { 2199 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2200 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2201 ncp->nc_dvp); 2202 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2203 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2204 } 2205 } 2206 ncsize = newncsize; 2207 cache_unlock_all_buckets(); 2208 cache_unlock_all_vnodes(); 2209 ncfreetbl(old_nchashtbl); 2210 } 2211 2212 /* 2213 * Invalidate all entries from and to a particular vnode. 2214 */ 2215 static void 2216 cache_purge_impl(struct vnode *vp) 2217 { 2218 TAILQ_HEAD(, namecache) ncps; 2219 struct namecache *ncp, *nnp; 2220 struct mtx *vlp, *vlp2; 2221 2222 TAILQ_INIT(&ncps); 2223 vlp = VP2VNODELOCK(vp); 2224 vlp2 = NULL; 2225 mtx_assert(vlp, MA_OWNED); 2226 retry: 2227 while (!LIST_EMPTY(&vp->v_cache_src)) { 2228 ncp = LIST_FIRST(&vp->v_cache_src); 2229 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2230 goto retry; 2231 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2232 } 2233 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2234 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2235 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2236 goto retry; 2237 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2238 } 2239 ncp = vp->v_cache_dd; 2240 if (ncp != NULL) { 2241 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2242 ("lost dotdot link")); 2243 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2244 goto retry; 2245 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2246 } 2247 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2248 mtx_unlock(vlp); 2249 if (vlp2 != NULL) 2250 mtx_unlock(vlp2); 2251 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2252 cache_free(ncp); 2253 } 2254 } 2255 2256 /* 2257 * Opportunistic check to see if there is anything to do. 2258 */ 2259 static bool 2260 cache_has_entries(struct vnode *vp) 2261 { 2262 2263 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2264 vp->v_cache_dd == NULL) 2265 return (false); 2266 return (true); 2267 } 2268 2269 void 2270 cache_purge(struct vnode *vp) 2271 { 2272 struct mtx *vlp; 2273 2274 SDT_PROBE1(vfs, namecache, purge, done, vp); 2275 if (!cache_has_entries(vp)) 2276 return; 2277 vlp = VP2VNODELOCK(vp); 2278 mtx_lock(vlp); 2279 cache_purge_impl(vp); 2280 } 2281 2282 /* 2283 * Only to be used by vgone. 
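 *
 * The vnode is already doomed, so no new entries can be added for it
 * (cache_enter_time() asserts as much).  That is what makes the unlocked
 * emptiness check below safe: an apparently-empty state may still be
 * transient, which is handled by waiting for the vnode lock to become
 * free and re-checking.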
2284 */ 2285 void 2286 cache_purge_vgone(struct vnode *vp) 2287 { 2288 struct mtx *vlp; 2289 2290 VNPASS(VN_IS_DOOMED(vp), vp); 2291 vlp = VP2VNODELOCK(vp); 2292 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2293 vp->v_cache_dd == NULL)) { 2294 mtx_lock(vlp); 2295 cache_purge_impl(vp); 2296 mtx_assert(vlp, MA_NOTOWNED); 2297 return; 2298 } 2299 2300 /* 2301 * All the NULL pointer state we found above may be transient. 2302 * Serialize against a possible thread doing cache_purge. 2303 */ 2304 mtx_wait_unlocked(vlp); 2305 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2306 vp->v_cache_dd == NULL)) { 2307 mtx_lock(vlp); 2308 cache_purge_impl(vp); 2309 mtx_assert(vlp, MA_NOTOWNED); 2310 return; 2311 } 2312 return; 2313 } 2314 2315 /* 2316 * Invalidate all negative entries for a particular directory vnode. 2317 */ 2318 void 2319 cache_purge_negative(struct vnode *vp) 2320 { 2321 TAILQ_HEAD(, namecache) ncps; 2322 struct namecache *ncp, *nnp; 2323 struct mtx *vlp; 2324 2325 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2326 if (LIST_EMPTY(&vp->v_cache_src)) 2327 return; 2328 TAILQ_INIT(&ncps); 2329 vlp = VP2VNODELOCK(vp); 2330 mtx_lock(vlp); 2331 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2332 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2333 continue; 2334 cache_zap_negative_locked_vnode_kl(ncp, vp); 2335 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2336 } 2337 mtx_unlock(vlp); 2338 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2339 cache_free(ncp); 2340 } 2341 } 2342 2343 void 2344 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2345 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2346 { 2347 2348 ASSERT_VOP_IN_SEQC(fdvp); 2349 ASSERT_VOP_IN_SEQC(fvp); 2350 ASSERT_VOP_IN_SEQC(tdvp); 2351 if (tvp != NULL) 2352 ASSERT_VOP_IN_SEQC(tvp); 2353 2354 cache_purge(fvp); 2355 if (tvp != NULL) { 2356 cache_purge(tvp); 2357 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2358 ("%s: lingering negative entry", __func__)); 2359 } else { 2360 cache_remove_cnp(tdvp, tcnp); 2361 } 2362 } 2363 2364 /* 2365 * Flush all entries referencing a particular filesystem. 2366 */ 2367 void 2368 cache_purgevfs(struct mount *mp) 2369 { 2370 struct vnode *vp, *mvp; 2371 2372 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2373 /* 2374 * Somewhat wasteful iteration over all vnodes. Would be better to 2375 * support filtering and avoid the interlock to begin with. 2376 */ 2377 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2378 if (!cache_has_entries(vp)) { 2379 VI_UNLOCK(vp); 2380 continue; 2381 } 2382 vholdl(vp); 2383 VI_UNLOCK(vp); 2384 cache_purge(vp); 2385 vdrop(vp); 2386 } 2387 } 2388 2389 /* 2390 * Perform canonical checks and cache lookup and pass on to filesystem 2391 * through the vop_cachedlookup only if needed. 
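 *
 * A filesystem typically opts in by routing vop_lookup here and providing
 * its real lookup as vop_cachedlookup; the names below are illustrative
 * only:
 *
 *	struct vop_vector foofs_vnodeops = {
 *		.vop_default		= &default_vnodeops,
 *		.vop_lookup		= vfs_cache_lookup,
 *		.vop_cachedlookup	= foofs_lookup,	// only called on a cache miss
 *		...
 *	};
 *
 * cache_lookup() returning 0 below means a miss (ask the filesystem),
 * -1 means a positive hit (*vpp is already set), and anything else
 * (e.g. ENOENT from a negative entry) is returned to the caller as-is.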
2392 */ 2393 2394 int 2395 vfs_cache_lookup(struct vop_lookup_args *ap) 2396 { 2397 struct vnode *dvp; 2398 int error; 2399 struct vnode **vpp = ap->a_vpp; 2400 struct componentname *cnp = ap->a_cnp; 2401 int flags = cnp->cn_flags; 2402 2403 *vpp = NULL; 2404 dvp = ap->a_dvp; 2405 2406 if (dvp->v_type != VDIR) 2407 return (ENOTDIR); 2408 2409 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2410 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2411 return (EROFS); 2412 2413 error = vn_dir_check_exec(dvp, cnp); 2414 if (error != 0) 2415 return (error); 2416 2417 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2418 if (error == 0) 2419 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2420 if (error == -1) 2421 return (0); 2422 return (error); 2423 } 2424 2425 /* Implementation of the getcwd syscall. */ 2426 int 2427 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2428 { 2429 char *buf, *retbuf; 2430 size_t buflen; 2431 int error; 2432 2433 buflen = uap->buflen; 2434 if (__predict_false(buflen < 2)) 2435 return (EINVAL); 2436 if (buflen > MAXPATHLEN) 2437 buflen = MAXPATHLEN; 2438 2439 buf = uma_zalloc(namei_zone, M_WAITOK); 2440 error = vn_getcwd(buf, &retbuf, &buflen); 2441 if (error == 0) 2442 error = copyout(retbuf, uap->buf, buflen); 2443 uma_zfree(namei_zone, buf); 2444 return (error); 2445 } 2446 2447 int 2448 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2449 { 2450 struct pwd *pwd; 2451 int error; 2452 2453 vfs_smr_enter(); 2454 pwd = pwd_get_smr(); 2455 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2456 buflen, false, 0); 2457 VFS_SMR_ASSERT_NOT_ENTERED(); 2458 if (error < 0) { 2459 pwd = pwd_hold(curthread); 2460 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2461 retbuf, buflen); 2462 pwd_drop(pwd); 2463 } 2464 2465 #ifdef KTRACE 2466 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2467 ktrnamei(*retbuf); 2468 #endif 2469 return (error); 2470 } 2471 2472 static int 2473 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2474 size_t size, int flags, enum uio_seg pathseg) 2475 { 2476 struct nameidata nd; 2477 char *retbuf, *freebuf; 2478 int error; 2479 2480 if (flags != 0) 2481 return (EINVAL); 2482 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2483 pathseg, path, fd, &cap_fstat_rights, td); 2484 if ((error = namei(&nd)) != 0) 2485 return (error); 2486 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2487 if (error == 0) { 2488 error = copyout(retbuf, buf, size); 2489 free(freebuf, M_TEMP); 2490 } 2491 NDFREE(&nd, 0); 2492 return (error); 2493 } 2494 2495 int 2496 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2497 { 2498 2499 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2500 uap->flags, UIO_USERSPACE)); 2501 } 2502 2503 /* 2504 * Retrieve the full filesystem path that correspond to a vnode from the name 2505 * cache (if available) 2506 */ 2507 int 2508 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2509 { 2510 struct pwd *pwd; 2511 char *buf; 2512 size_t buflen; 2513 int error; 2514 2515 if (__predict_false(vp == NULL)) 2516 return (EINVAL); 2517 2518 buflen = MAXPATHLEN; 2519 buf = malloc(buflen, M_TEMP, M_WAITOK); 2520 vfs_smr_enter(); 2521 pwd = pwd_get_smr(); 2522 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0); 2523 VFS_SMR_ASSERT_NOT_ENTERED(); 2524 if (error < 0) { 2525 pwd = pwd_hold(curthread); 2526 error = vn_fullpath_any(vp, pwd->pwd_rdir, 
buf, retbuf, &buflen); 2527 pwd_drop(pwd); 2528 } 2529 if (error == 0) 2530 *freebuf = buf; 2531 else 2532 free(buf, M_TEMP); 2533 return (error); 2534 } 2535 2536 /* 2537 * This function is similar to vn_fullpath, but it attempts to lookup the 2538 * pathname relative to the global root mount point. This is required for the 2539 * auditing sub-system, as audited pathnames must be absolute, relative to the 2540 * global root mount point. 2541 */ 2542 int 2543 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2544 { 2545 char *buf; 2546 size_t buflen; 2547 int error; 2548 2549 if (__predict_false(vp == NULL)) 2550 return (EINVAL); 2551 buflen = MAXPATHLEN; 2552 buf = malloc(buflen, M_TEMP, M_WAITOK); 2553 vfs_smr_enter(); 2554 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0); 2555 VFS_SMR_ASSERT_NOT_ENTERED(); 2556 if (error < 0) { 2557 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2558 } 2559 if (error == 0) 2560 *freebuf = buf; 2561 else 2562 free(buf, M_TEMP); 2563 return (error); 2564 } 2565 2566 static struct namecache * 2567 vn_dd_from_dst(struct vnode *vp) 2568 { 2569 struct namecache *ncp; 2570 2571 cache_assert_vnode_locked(vp); 2572 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2573 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2574 return (ncp); 2575 } 2576 return (NULL); 2577 } 2578 2579 int 2580 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2581 { 2582 struct vnode *dvp; 2583 struct namecache *ncp; 2584 struct mtx *vlp; 2585 int error; 2586 2587 vlp = VP2VNODELOCK(*vp); 2588 mtx_lock(vlp); 2589 ncp = (*vp)->v_cache_dd; 2590 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 2591 KASSERT(ncp == vn_dd_from_dst(*vp), 2592 ("%s: mismatch for dd entry (%p != %p)", __func__, 2593 ncp, vn_dd_from_dst(*vp))); 2594 } else { 2595 ncp = vn_dd_from_dst(*vp); 2596 } 2597 if (ncp != NULL) { 2598 if (*buflen < ncp->nc_nlen) { 2599 mtx_unlock(vlp); 2600 vrele(*vp); 2601 counter_u64_add(numfullpathfail4, 1); 2602 error = ENOMEM; 2603 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2604 vp, NULL); 2605 return (error); 2606 } 2607 *buflen -= ncp->nc_nlen; 2608 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2609 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2610 ncp->nc_name, vp); 2611 dvp = *vp; 2612 *vp = ncp->nc_dvp; 2613 vref(*vp); 2614 mtx_unlock(vlp); 2615 vrele(dvp); 2616 return (0); 2617 } 2618 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2619 2620 mtx_unlock(vlp); 2621 vn_lock(*vp, LK_SHARED | LK_RETRY); 2622 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2623 vput(*vp); 2624 if (error) { 2625 counter_u64_add(numfullpathfail2, 1); 2626 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2627 return (error); 2628 } 2629 2630 *vp = dvp; 2631 if (VN_IS_DOOMED(dvp)) { 2632 /* forced unmount */ 2633 vrele(dvp); 2634 error = ENOENT; 2635 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2636 return (error); 2637 } 2638 /* 2639 * *vp has its use count incremented still. 2640 */ 2641 2642 return (0); 2643 } 2644 2645 /* 2646 * Resolve a directory to a pathname. 2647 * 2648 * The name of the directory can always be found in the namecache or fetched 2649 * from the filesystem. There is also guaranteed to be only one parent, meaning 2650 * we can just follow vnodes up until we find the root. 2651 * 2652 * The vnode must be referenced. 
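 *
 * For illustration, the buffer is filled right to left.  Resolving a vnode
 * for /usr/src/sys would go through roughly these states ('.' marks unused
 * space):
 *
 *	[...........sys\0]
 *	[.......src/sys\0]
 *	[...usr/src/sys\0]
 *	[../usr/src/sys\0]	// leading '/' added once the root is reached
 *
 * *retbuf ends up pointing at the leading '/' inside the buffer rather
 * than at the start of the buffer itself.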
2653 */ 2654 static int 2655 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2656 size_t *len, bool slash_prefixed, size_t addend) 2657 { 2658 #ifdef KDTRACE_HOOKS 2659 struct vnode *startvp = vp; 2660 #endif 2661 struct vnode *vp1; 2662 size_t buflen; 2663 int error; 2664 2665 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2666 VNPASS(vp->v_usecount > 0, vp); 2667 2668 buflen = *len; 2669 2670 if (!slash_prefixed) { 2671 MPASS(*len >= 2); 2672 buflen--; 2673 buf[buflen] = '\0'; 2674 } 2675 2676 error = 0; 2677 2678 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2679 counter_u64_add(numfullpathcalls, 1); 2680 while (vp != rdir && vp != rootvnode) { 2681 /* 2682 * The vp vnode must be already fully constructed, 2683 * since it is either found in namecache or obtained 2684 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2685 * without obtaining the vnode lock. 2686 */ 2687 if ((vp->v_vflag & VV_ROOT) != 0) { 2688 vn_lock(vp, LK_RETRY | LK_SHARED); 2689 2690 /* 2691 * With the vnode locked, check for races with 2692 * unmount, forced or not. Note that we 2693 * already verified that vp is not equal to 2694 * the root vnode, which means that 2695 * mnt_vnodecovered can be NULL only for the 2696 * case of unmount. 2697 */ 2698 if (VN_IS_DOOMED(vp) || 2699 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2700 vp1->v_mountedhere != vp->v_mount) { 2701 vput(vp); 2702 error = ENOENT; 2703 SDT_PROBE3(vfs, namecache, fullpath, return, 2704 error, vp, NULL); 2705 break; 2706 } 2707 2708 vref(vp1); 2709 vput(vp); 2710 vp = vp1; 2711 continue; 2712 } 2713 if (vp->v_type != VDIR) { 2714 vrele(vp); 2715 counter_u64_add(numfullpathfail1, 1); 2716 error = ENOTDIR; 2717 SDT_PROBE3(vfs, namecache, fullpath, return, 2718 error, vp, NULL); 2719 break; 2720 } 2721 error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen); 2722 if (error) 2723 break; 2724 if (buflen == 0) { 2725 vrele(vp); 2726 error = ENOMEM; 2727 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2728 startvp, NULL); 2729 break; 2730 } 2731 buf[--buflen] = '/'; 2732 slash_prefixed = true; 2733 } 2734 if (error) 2735 return (error); 2736 if (!slash_prefixed) { 2737 if (buflen == 0) { 2738 vrele(vp); 2739 counter_u64_add(numfullpathfail4, 1); 2740 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2741 startvp, NULL); 2742 return (ENOMEM); 2743 } 2744 buf[--buflen] = '/'; 2745 } 2746 counter_u64_add(numfullpathfound, 1); 2747 vrele(vp); 2748 2749 *retbuf = buf + buflen; 2750 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2751 *len -= buflen; 2752 *len += addend; 2753 return (0); 2754 } 2755 2756 /* 2757 * Resolve an arbitrary vnode to a pathname. 
2758 * 2759 * Note 2 caveats: 2760 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2761 * resolve to a different path than the one used to find it 2762 * - namecache is not mandatory, meaning names are not guaranteed to be added 2763 * (in which case resolving fails) 2764 */ 2765 static void __inline 2766 cache_rev_failed_impl(int *reason, int line) 2767 { 2768 2769 *reason = line; 2770 } 2771 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 2772 2773 static int 2774 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 2775 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend) 2776 { 2777 #ifdef KDTRACE_HOOKS 2778 struct vnode *startvp = vp; 2779 #endif 2780 struct vnode *tvp; 2781 struct mount *mp; 2782 struct namecache *ncp; 2783 size_t orig_buflen; 2784 int reason; 2785 int error; 2786 #ifdef KDTRACE_HOOKS 2787 int i; 2788 #endif 2789 seqc_t vp_seqc, tvp_seqc; 2790 u_char nc_flag; 2791 2792 VFS_SMR_ASSERT_ENTERED(); 2793 2794 if (!cache_fast_revlookup) { 2795 vfs_smr_exit(); 2796 return (-1); 2797 } 2798 2799 orig_buflen = *buflen; 2800 2801 if (!slash_prefixed) { 2802 MPASS(*buflen >= 2); 2803 *buflen -= 1; 2804 buf[*buflen] = '\0'; 2805 } 2806 2807 if (vp == rdir || vp == rootvnode) { 2808 if (!slash_prefixed) { 2809 *buflen -= 1; 2810 buf[*buflen] = '/'; 2811 } 2812 goto out_ok; 2813 } 2814 2815 #ifdef KDTRACE_HOOKS 2816 i = 0; 2817 #endif 2818 error = -1; 2819 ncp = NULL; /* for sdt probe down below */ 2820 vp_seqc = vn_seqc_read_any(vp); 2821 if (seqc_in_modify(vp_seqc)) { 2822 cache_rev_failed(&reason); 2823 goto out_abort; 2824 } 2825 2826 for (;;) { 2827 #ifdef KDTRACE_HOOKS 2828 i++; 2829 #endif 2830 if ((vp->v_vflag & VV_ROOT) != 0) { 2831 mp = atomic_load_ptr(&vp->v_mount); 2832 if (mp == NULL) { 2833 cache_rev_failed(&reason); 2834 goto out_abort; 2835 } 2836 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 2837 tvp_seqc = vn_seqc_read_any(tvp); 2838 if (seqc_in_modify(tvp_seqc)) { 2839 cache_rev_failed(&reason); 2840 goto out_abort; 2841 } 2842 if (!vn_seqc_consistent(vp, vp_seqc)) { 2843 cache_rev_failed(&reason); 2844 goto out_abort; 2845 } 2846 vp = tvp; 2847 vp_seqc = tvp_seqc; 2848 continue; 2849 } 2850 ncp = atomic_load_ptr(&vp->v_cache_dd); 2851 if (ncp == NULL) { 2852 cache_rev_failed(&reason); 2853 goto out_abort; 2854 } 2855 nc_flag = atomic_load_char(&ncp->nc_flag); 2856 if ((nc_flag & NCF_ISDOTDOT) != 0) { 2857 cache_rev_failed(&reason); 2858 goto out_abort; 2859 } 2860 if (!cache_ncp_canuse(ncp)) { 2861 cache_rev_failed(&reason); 2862 goto out_abort; 2863 } 2864 if (ncp->nc_nlen >= *buflen) { 2865 cache_rev_failed(&reason); 2866 error = ENOMEM; 2867 goto out_abort; 2868 } 2869 *buflen -= ncp->nc_nlen; 2870 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2871 *buflen -= 1; 2872 buf[*buflen] = '/'; 2873 tvp = ncp->nc_dvp; 2874 tvp_seqc = vn_seqc_read_any(tvp); 2875 if (seqc_in_modify(tvp_seqc)) { 2876 cache_rev_failed(&reason); 2877 goto out_abort; 2878 } 2879 if (!vn_seqc_consistent(vp, vp_seqc)) { 2880 cache_rev_failed(&reason); 2881 goto out_abort; 2882 } 2883 vp = tvp; 2884 vp_seqc = tvp_seqc; 2885 if (vp == rdir || vp == rootvnode) 2886 break; 2887 } 2888 out_ok: 2889 vfs_smr_exit(); 2890 *retbuf = buf + *buflen; 2891 *buflen = orig_buflen - *buflen + addend; 2892 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 2893 return (0); 2894 2895 out_abort: 2896 *buflen = orig_buflen; 2897 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 2898 
vfs_smr_exit(); 2899 return (error); 2900 } 2901 2902 static int 2903 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2904 size_t *buflen) 2905 { 2906 size_t orig_buflen; 2907 bool slash_prefixed; 2908 int error; 2909 2910 if (*buflen < 2) 2911 return (EINVAL); 2912 2913 orig_buflen = *buflen; 2914 2915 vref(vp); 2916 slash_prefixed = false; 2917 if (vp->v_type != VDIR) { 2918 *buflen -= 1; 2919 buf[*buflen] = '\0'; 2920 error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen); 2921 if (error) 2922 return (error); 2923 if (*buflen == 0) { 2924 vrele(vp); 2925 return (ENOMEM); 2926 } 2927 *buflen -= 1; 2928 buf[*buflen] = '/'; 2929 slash_prefixed = true; 2930 } 2931 2932 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed, 2933 orig_buflen - *buflen)); 2934 } 2935 2936 /* 2937 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2938 * 2939 * Since the namecache does not track hardlinks, the caller is expected to first 2940 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 2941 * 2942 * Then we have 2 cases: 2943 * - if the found vnode is a directory, the path can be constructed just by 2944 * following names up the chain 2945 * - otherwise we populate the buffer with the saved name and start resolving 2946 * from the parent 2947 */ 2948 static int 2949 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 2950 size_t *buflen) 2951 { 2952 char *buf, *tmpbuf; 2953 struct pwd *pwd; 2954 struct componentname *cnp; 2955 struct vnode *vp; 2956 size_t addend; 2957 int error; 2958 bool slash_prefixed; 2959 enum vtype type; 2960 2961 if (*buflen < 2) 2962 return (EINVAL); 2963 if (*buflen > MAXPATHLEN) 2964 *buflen = MAXPATHLEN; 2965 2966 slash_prefixed = false; 2967 2968 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2969 2970 addend = 0; 2971 vp = ndp->ni_vp; 2972 /* 2973 * Check for VBAD to work around the vp_crossmp bug in lookup(). 2974 * 2975 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 2976 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 2977 * If the type is VDIR (like in this very case) we can skip looking 2978 * at ni_dvp in the first place. However, since vnodes get passed here 2979 * unlocked the target may transition to doomed state (type == VBAD) 2980 * before we get to evaluate the condition. If this happens, we will 2981 * populate part of the buffer and descend to vn_fullpath_dir with 2982 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 2983 * 2984 * This should be atomic_load(&vp->v_type) but it is illegal to take 2985 * an address of a bit field, even if said field is sized to char. 2986 * Work around the problem by reading the value into a full-sized enum 2987 * and then re-reading it with atomic_load which will still prevent 2988 * the compiler from re-reading down the road.
2989 */ 2990 type = vp->v_type; 2991 type = atomic_load_int(&type); 2992 if (type == VBAD) { 2993 error = ENOENT; 2994 goto out_bad; 2995 } 2996 if (type != VDIR) { 2997 cnp = &ndp->ni_cnd; 2998 addend = cnp->cn_namelen + 2; 2999 if (*buflen < addend) { 3000 error = ENOMEM; 3001 goto out_bad; 3002 } 3003 *buflen -= addend; 3004 tmpbuf = buf + *buflen; 3005 tmpbuf[0] = '/'; 3006 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 3007 tmpbuf[addend - 1] = '\0'; 3008 slash_prefixed = true; 3009 vp = ndp->ni_dvp; 3010 } 3011 3012 vfs_smr_enter(); 3013 pwd = pwd_get_smr(); 3014 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3015 slash_prefixed, addend); 3016 VFS_SMR_ASSERT_NOT_ENTERED(); 3017 if (error < 0) { 3018 pwd = pwd_hold(curthread); 3019 vref(vp); 3020 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 3021 slash_prefixed, addend); 3022 pwd_drop(pwd); 3023 if (error != 0) 3024 goto out_bad; 3025 } 3026 3027 *freebuf = buf; 3028 3029 return (0); 3030 out_bad: 3031 free(buf, M_TEMP); 3032 return (error); 3033 } 3034 3035 struct vnode * 3036 vn_dir_dd_ino(struct vnode *vp) 3037 { 3038 struct namecache *ncp; 3039 struct vnode *ddvp; 3040 struct mtx *vlp; 3041 enum vgetstate vs; 3042 3043 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 3044 vlp = VP2VNODELOCK(vp); 3045 mtx_lock(vlp); 3046 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 3047 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 3048 continue; 3049 ddvp = ncp->nc_dvp; 3050 vs = vget_prep(ddvp); 3051 mtx_unlock(vlp); 3052 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 3053 return (NULL); 3054 return (ddvp); 3055 } 3056 mtx_unlock(vlp); 3057 return (NULL); 3058 } 3059 3060 int 3061 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3062 { 3063 struct namecache *ncp; 3064 struct mtx *vlp; 3065 int l; 3066 3067 vlp = VP2VNODELOCK(vp); 3068 mtx_lock(vlp); 3069 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3070 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3071 break; 3072 if (ncp == NULL) { 3073 mtx_unlock(vlp); 3074 return (ENOENT); 3075 } 3076 l = min(ncp->nc_nlen, buflen - 1); 3077 memcpy(buf, ncp->nc_name, l); 3078 mtx_unlock(vlp); 3079 buf[l] = '\0'; 3080 return (0); 3081 } 3082 3083 /* 3084 * This function updates path string to vnode's full global path 3085 * and checks the size of the new path string against the pathlen argument. 3086 * 3087 * Requires a locked, referenced vnode. 3088 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3089 * 3090 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3091 * because it falls back to the ".." lookup if the namecache lookup fails. 3092 */ 3093 int 3094 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3095 u_int pathlen) 3096 { 3097 struct nameidata nd; 3098 struct vnode *vp1; 3099 char *rpath, *fbuf; 3100 int error; 3101 3102 ASSERT_VOP_ELOCKED(vp, __func__); 3103 3104 /* Construct global filesystem path from vp. */ 3105 VOP_UNLOCK(vp); 3106 error = vn_fullpath_global(vp, &rpath, &fbuf); 3107 3108 if (error != 0) { 3109 vrele(vp); 3110 return (error); 3111 } 3112 3113 if (strlen(rpath) >= pathlen) { 3114 vrele(vp); 3115 error = ENAMETOOLONG; 3116 goto out; 3117 } 3118 3119 /* 3120 * Re-lookup the vnode by path to detect a possible rename. 3121 * As a side effect, the vnode is relocked. 3122 * If vnode was renamed, return ENOENT. 
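 * A plain pointer comparison against the original vnode suffices for the
 * rename check below, since both vnodes are referenced across it.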
3123 */ 3124 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3125 UIO_SYSSPACE, path, td); 3126 error = namei(&nd); 3127 if (error != 0) { 3128 vrele(vp); 3129 goto out; 3130 } 3131 NDFREE(&nd, NDF_ONLY_PNBUF); 3132 vp1 = nd.ni_vp; 3133 vrele(vp); 3134 if (vp1 == vp) 3135 strcpy(path, rpath); 3136 else { 3137 vput(vp1); 3138 error = ENOENT; 3139 } 3140 3141 out: 3142 free(fbuf, M_TEMP); 3143 return (error); 3144 } 3145 3146 #ifdef DDB 3147 static void 3148 db_print_vpath(struct vnode *vp) 3149 { 3150 3151 while (vp != NULL) { 3152 db_printf("%p: ", vp); 3153 if (vp == rootvnode) { 3154 db_printf("/"); 3155 vp = NULL; 3156 } else { 3157 if (vp->v_vflag & VV_ROOT) { 3158 db_printf("<mount point>"); 3159 vp = vp->v_mount->mnt_vnodecovered; 3160 } else { 3161 struct namecache *ncp; 3162 char *ncn; 3163 int i; 3164 3165 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3166 if (ncp != NULL) { 3167 ncn = ncp->nc_name; 3168 for (i = 0; i < ncp->nc_nlen; i++) 3169 db_printf("%c", *ncn++); 3170 vp = ncp->nc_dvp; 3171 } else { 3172 vp = NULL; 3173 } 3174 } 3175 } 3176 db_printf("\n"); 3177 } 3178 3179 return; 3180 } 3181 3182 DB_SHOW_COMMAND(vpath, db_show_vpath) 3183 { 3184 struct vnode *vp; 3185 3186 if (!have_addr) { 3187 db_printf("usage: show vpath <struct vnode *>\n"); 3188 return; 3189 } 3190 3191 vp = (struct vnode *)addr; 3192 db_print_vpath(vp); 3193 } 3194 3195 #endif 3196 3197 static bool __read_frequently cache_fast_lookup = true; 3198 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3199 &cache_fast_lookup, 0, ""); 3200 3201 #define CACHE_FPL_FAILED -2020 3202 3203 static void 3204 cache_fpl_cleanup_cnp(struct componentname *cnp) 3205 { 3206 3207 uma_zfree(namei_zone, cnp->cn_pnbuf); 3208 #ifdef DIAGNOSTIC 3209 cnp->cn_pnbuf = NULL; 3210 cnp->cn_nameptr = NULL; 3211 #endif 3212 } 3213 3214 static void 3215 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3216 { 3217 struct componentname *cnp; 3218 3219 cnp = &ndp->ni_cnd; 3220 while (*(cnp->cn_nameptr) == '/') { 3221 cnp->cn_nameptr++; 3222 ndp->ni_pathlen--; 3223 } 3224 3225 *dpp = ndp->ni_rootdir; 3226 } 3227 3228 /* 3229 * Components of nameidata (or objects it can point to) which may 3230 * need restoring in case fast path lookup fails. 
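 *
 * The intended protocol, as used below: cache_fpl_checkpoint() snapshots
 * these fields before a component is consumed and cache_fpl_restore()
 * puts them back if the lookup has to be handed over to the regular
 * (locked) path, so that path re-parses the component from scratch.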
3231 */ 3232 struct nameidata_saved { 3233 long cn_namelen; 3234 char *cn_nameptr; 3235 size_t ni_pathlen; 3236 int cn_flags; 3237 }; 3238 3239 struct cache_fpl { 3240 struct nameidata *ndp; 3241 struct componentname *cnp; 3242 struct pwd *pwd; 3243 struct vnode *dvp; 3244 struct vnode *tvp; 3245 seqc_t dvp_seqc; 3246 seqc_t tvp_seqc; 3247 struct nameidata_saved snd; 3248 int line; 3249 enum cache_fpl_status status:8; 3250 bool in_smr; 3251 }; 3252 3253 static void 3254 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3255 { 3256 3257 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3258 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3259 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3260 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3261 } 3262 3263 static void 3264 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3265 { 3266 3267 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3268 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3269 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3270 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3271 } 3272 3273 #ifdef INVARIANTS 3274 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3275 struct cache_fpl *_fpl = (fpl); \ 3276 MPASS(_fpl->in_smr == true); \ 3277 VFS_SMR_ASSERT_ENTERED(); \ 3278 }) 3279 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3280 struct cache_fpl *_fpl = (fpl); \ 3281 MPASS(_fpl->in_smr == false); \ 3282 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3283 }) 3284 #else 3285 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3286 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3287 #endif 3288 3289 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3290 struct cache_fpl *_fpl = (fpl); \ 3291 vfs_smr_enter(); \ 3292 _fpl->in_smr = true; \ 3293 }) 3294 3295 #define cache_fpl_smr_enter(fpl) ({ \ 3296 struct cache_fpl *_fpl = (fpl); \ 3297 MPASS(_fpl->in_smr == false); \ 3298 vfs_smr_enter(); \ 3299 _fpl->in_smr = true; \ 3300 }) 3301 3302 #define cache_fpl_smr_exit(fpl) ({ \ 3303 struct cache_fpl *_fpl = (fpl); \ 3304 MPASS(_fpl->in_smr == true); \ 3305 vfs_smr_exit(); \ 3306 _fpl->in_smr = false; \ 3307 }) 3308 3309 static int 3310 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3311 { 3312 3313 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3314 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3315 ("%s: converting to abort from %d at %d, set at %d\n", 3316 __func__, fpl->status, line, fpl->line)); 3317 } 3318 fpl->status = CACHE_FPL_STATUS_ABORTED; 3319 fpl->line = line; 3320 return (CACHE_FPL_FAILED); 3321 } 3322 3323 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3324 3325 static int 3326 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3327 { 3328 3329 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3330 ("%s: setting to partial at %d, but already set to %d at %d\n", 3331 __func__, line, fpl->status, fpl->line)); 3332 cache_fpl_smr_assert_entered(fpl); 3333 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3334 fpl->line = line; 3335 return (CACHE_FPL_FAILED); 3336 } 3337 3338 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3339 3340 static int 3341 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3342 { 3343 3344 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3345 ("%s: setting to handled at %d, but already set to %d at %d\n", 3346 __func__, line, fpl->status, fpl->line)); 3347 cache_fpl_smr_assert_not_entered(fpl); 3348 MPASS(error != CACHE_FPL_FAILED); 3349 fpl->status = CACHE_FPL_STATUS_HANDLED; 3350 fpl->line = line; 3351 return (error); 
3352 } 3353 3354 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3355 3356 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3357 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ 3358 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3359 3360 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3361 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3362 3363 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3364 "supported and internal flags overlap"); 3365 3366 static bool 3367 cache_fpl_islastcn(struct nameidata *ndp) 3368 { 3369 3370 return (*ndp->ni_next == 0); 3371 } 3372 3373 static bool 3374 cache_fpl_isdotdot(struct componentname *cnp) 3375 { 3376 3377 if (cnp->cn_namelen == 2 && 3378 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3379 return (true); 3380 return (false); 3381 } 3382 3383 static bool 3384 cache_can_fplookup(struct cache_fpl *fpl) 3385 { 3386 struct nameidata *ndp; 3387 struct componentname *cnp; 3388 struct thread *td; 3389 3390 ndp = fpl->ndp; 3391 cnp = fpl->cnp; 3392 td = cnp->cn_thread; 3393 3394 if (!cache_fast_lookup) { 3395 cache_fpl_aborted(fpl); 3396 return (false); 3397 } 3398 #ifdef MAC 3399 if (mac_vnode_check_lookup_enabled()) { 3400 cache_fpl_aborted(fpl); 3401 return (false); 3402 } 3403 #endif 3404 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3405 cache_fpl_aborted(fpl); 3406 return (false); 3407 } 3408 if (ndp->ni_dirfd != AT_FDCWD) { 3409 cache_fpl_aborted(fpl); 3410 return (false); 3411 } 3412 if (IN_CAPABILITY_MODE(td)) { 3413 cache_fpl_aborted(fpl); 3414 return (false); 3415 } 3416 if (AUDITING_TD(td)) { 3417 cache_fpl_aborted(fpl); 3418 return (false); 3419 } 3420 if (ndp->ni_startdir != NULL) { 3421 cache_fpl_aborted(fpl); 3422 return (false); 3423 } 3424 return (true); 3425 } 3426 3427 static bool 3428 cache_fplookup_vnode_supported(struct vnode *vp) 3429 { 3430 3431 return (vp->v_type != VLNK); 3432 } 3433 3434 /* 3435 * Move a negative entry to the hot list. 3436 * 3437 * We have to take locks, but they may be contended and in the worst 3438 * case we may need to go off CPU. We don't want to spin within the 3439 * smr section and we can't block with it. Instead we are going to 3440 * look up the entry again. 3441 */ 3442 static int __noinline 3443 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3444 uint32_t hash) 3445 { 3446 struct componentname *cnp; 3447 struct namecache *ncp; 3448 struct neglist *neglist; 3449 struct negstate *negstate; 3450 struct vnode *dvp; 3451 u_char nc_flag; 3452 3453 cnp = fpl->cnp; 3454 dvp = fpl->dvp; 3455 3456 if (!vhold_smr(dvp)) 3457 return (cache_fpl_aborted(fpl)); 3458 3459 neglist = NCP2NEGLIST(oncp); 3460 cache_fpl_smr_exit(fpl); 3461 3462 mtx_lock(&ncneg_hot.nl_lock); 3463 mtx_lock(&neglist->nl_lock); 3464 /* 3465 * For hash iteration. 3466 */ 3467 cache_fpl_smr_enter(fpl); 3468 3469 /* 3470 * Avoid all surprises by only succeeding if we got the same entry and 3471 * bailing completely otherwise. 3472 * 3473 * In particular at this point there can be a new ncp which matches the 3474 * search but hashes to a different neglist. 3475 */ 3476 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3477 if (ncp == oncp) 3478 break; 3479 } 3480 3481 /* 3482 * No match to begin with. 3483 */ 3484 if (__predict_false(ncp == NULL)) { 3485 goto out_abort; 3486 } 3487 3488 /* 3489 * The newly found entry may be something different... 
3490 */ 3491 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3492 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 3493 goto out_abort; 3494 } 3495 3496 /* 3497 * ... and not even negative. 3498 */ 3499 nc_flag = atomic_load_char(&ncp->nc_flag); 3500 if ((nc_flag & NCF_NEGATIVE) == 0) { 3501 goto out_abort; 3502 } 3503 3504 if (__predict_false(!cache_ncp_canuse(ncp))) { 3505 goto out_abort; 3506 } 3507 3508 negstate = NCP2NEGSTATE(ncp); 3509 if ((negstate->neg_flag & NEG_HOT) == 0) { 3510 numhotneg++; 3511 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 3512 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 3513 negstate->neg_flag |= NEG_HOT; 3514 } 3515 3516 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 3517 counter_u64_add(numneghits, 1); 3518 cache_fpl_smr_exit(fpl); 3519 mtx_unlock(&neglist->nl_lock); 3520 mtx_unlock(&ncneg_hot.nl_lock); 3521 vdrop(dvp); 3522 return (cache_fpl_handled(fpl, ENOENT)); 3523 out_abort: 3524 cache_fpl_smr_exit(fpl); 3525 mtx_unlock(&neglist->nl_lock); 3526 mtx_unlock(&ncneg_hot.nl_lock); 3527 vdrop(dvp); 3528 return (cache_fpl_aborted(fpl)); 3529 } 3530 3531 /* 3532 * The target vnode is not supported, prepare for the slow path to take over. 3533 */ 3534 static int __noinline 3535 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3536 { 3537 struct nameidata *ndp; 3538 struct componentname *cnp; 3539 enum vgetstate dvs; 3540 struct vnode *dvp; 3541 struct pwd *pwd; 3542 seqc_t dvp_seqc; 3543 3544 ndp = fpl->ndp; 3545 cnp = fpl->cnp; 3546 dvp = fpl->dvp; 3547 dvp_seqc = fpl->dvp_seqc; 3548 3549 dvs = vget_prep_smr(dvp); 3550 if (__predict_false(dvs == VGET_NONE)) { 3551 cache_fpl_smr_exit(fpl); 3552 return (cache_fpl_aborted(fpl)); 3553 } 3554 3555 cache_fpl_smr_exit(fpl); 3556 3557 vget_finish_ref(dvp, dvs); 3558 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3559 vrele(dvp); 3560 return (cache_fpl_aborted(fpl)); 3561 } 3562 3563 pwd = pwd_hold(curthread); 3564 if (fpl->pwd != pwd) { 3565 vrele(dvp); 3566 pwd_drop(pwd); 3567 return (cache_fpl_aborted(fpl)); 3568 } 3569 3570 cache_fpl_restore(fpl, &fpl->snd); 3571 3572 ndp->ni_startdir = dvp; 3573 cnp->cn_flags |= MAKEENTRY; 3574 if (cache_fpl_islastcn(ndp)) 3575 cnp->cn_flags |= ISLASTCN; 3576 if (cache_fpl_isdotdot(cnp)) 3577 cnp->cn_flags |= ISDOTDOT; 3578 3579 return (0); 3580 } 3581 3582 static int 3583 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3584 { 3585 struct componentname *cnp; 3586 struct vnode *tvp; 3587 seqc_t tvp_seqc; 3588 int error, lkflags; 3589 3590 cnp = fpl->cnp; 3591 tvp = fpl->tvp; 3592 tvp_seqc = fpl->tvp_seqc; 3593 3594 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3595 lkflags = LK_SHARED; 3596 if ((cnp->cn_flags & LOCKSHARED) == 0) 3597 lkflags = LK_EXCLUSIVE; 3598 error = vget_finish(tvp, lkflags, tvs); 3599 if (__predict_false(error != 0)) { 3600 return (cache_fpl_aborted(fpl)); 3601 } 3602 } else { 3603 vget_finish_ref(tvp, tvs); 3604 } 3605 3606 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3607 if ((cnp->cn_flags & LOCKLEAF) != 0) 3608 vput(tvp); 3609 else 3610 vrele(tvp); 3611 return (cache_fpl_aborted(fpl)); 3612 } 3613 3614 return (cache_fpl_handled(fpl, 0)); 3615 } 3616 3617 /* 3618 * They want to possibly modify the state of the namecache. 3619 * 3620 * Don't try to match the API contract, just leave. 
3621 * TODO: this leaves scalability on the table 3622 */ 3623 static int 3624 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3625 { 3626 struct componentname *cnp; 3627 3628 cnp = fpl->cnp; 3629 MPASS(cnp->cn_nameiop != LOOKUP); 3630 return (cache_fpl_partial(fpl)); 3631 } 3632 3633 static int __noinline 3634 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3635 { 3636 struct componentname *cnp; 3637 enum vgetstate dvs, tvs; 3638 struct vnode *dvp, *tvp; 3639 seqc_t dvp_seqc; 3640 int error; 3641 3642 cnp = fpl->cnp; 3643 dvp = fpl->dvp; 3644 dvp_seqc = fpl->dvp_seqc; 3645 tvp = fpl->tvp; 3646 3647 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3648 3649 /* 3650 * This is less efficient than it can be for simplicity. 3651 */ 3652 dvs = vget_prep_smr(dvp); 3653 if (__predict_false(dvs == VGET_NONE)) { 3654 return (cache_fpl_aborted(fpl)); 3655 } 3656 tvs = vget_prep_smr(tvp); 3657 if (__predict_false(tvs == VGET_NONE)) { 3658 cache_fpl_smr_exit(fpl); 3659 vget_abort(dvp, dvs); 3660 return (cache_fpl_aborted(fpl)); 3661 } 3662 3663 cache_fpl_smr_exit(fpl); 3664 3665 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3666 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3667 if (__predict_false(error != 0)) { 3668 vget_abort(tvp, tvs); 3669 return (cache_fpl_aborted(fpl)); 3670 } 3671 } else { 3672 vget_finish_ref(dvp, dvs); 3673 } 3674 3675 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3676 vget_abort(tvp, tvs); 3677 if ((cnp->cn_flags & LOCKPARENT) != 0) 3678 vput(dvp); 3679 else 3680 vrele(dvp); 3681 return (cache_fpl_aborted(fpl)); 3682 } 3683 3684 error = cache_fplookup_final_child(fpl, tvs); 3685 if (__predict_false(error != 0)) { 3686 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3687 if ((cnp->cn_flags & LOCKPARENT) != 0) 3688 vput(dvp); 3689 else 3690 vrele(dvp); 3691 return (error); 3692 } 3693 3694 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3695 return (0); 3696 } 3697 3698 static int 3699 cache_fplookup_final(struct cache_fpl *fpl) 3700 { 3701 struct componentname *cnp; 3702 enum vgetstate tvs; 3703 struct vnode *dvp, *tvp; 3704 seqc_t dvp_seqc; 3705 3706 cnp = fpl->cnp; 3707 dvp = fpl->dvp; 3708 dvp_seqc = fpl->dvp_seqc; 3709 tvp = fpl->tvp; 3710 3711 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3712 3713 if (cnp->cn_nameiop != LOOKUP) { 3714 return (cache_fplookup_final_modifying(fpl)); 3715 } 3716 3717 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3718 return (cache_fplookup_final_withparent(fpl)); 3719 3720 tvs = vget_prep_smr(tvp); 3721 if (__predict_false(tvs == VGET_NONE)) { 3722 return (cache_fpl_partial(fpl)); 3723 } 3724 3725 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3726 cache_fpl_smr_exit(fpl); 3727 vget_abort(tvp, tvs); 3728 return (cache_fpl_aborted(fpl)); 3729 } 3730 3731 cache_fpl_smr_exit(fpl); 3732 return (cache_fplookup_final_child(fpl, tvs)); 3733 } 3734 3735 static int __noinline 3736 cache_fplookup_dot(struct cache_fpl *fpl) 3737 { 3738 struct vnode *dvp; 3739 3740 dvp = fpl->dvp; 3741 3742 fpl->tvp = dvp; 3743 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3744 if (seqc_in_modify(fpl->tvp_seqc)) { 3745 return (cache_fpl_aborted(fpl)); 3746 } 3747 3748 counter_u64_add(dothits, 1); 3749 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3750 3751 return (0); 3752 } 3753 3754 static int __noinline 3755 cache_fplookup_dotdot(struct cache_fpl *fpl) 3756 { 3757 struct nameidata *ndp; 3758 struct componentname *cnp; 3759 struct namecache *ncp; 3760 struct vnode *dvp; 3761 struct prison *pr; 3762 u_char nc_flag; 3763 3764 ndp = fpl->ndp; 3765 cnp 
= fpl->cnp; 3766 dvp = fpl->dvp; 3767 3768 /* 3769 * XXX this is racy the same way regular lookup is 3770 */ 3771 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3772 pr = pr->pr_parent) 3773 if (dvp == pr->pr_root) 3774 break; 3775 3776 if (dvp == ndp->ni_rootdir || 3777 dvp == ndp->ni_topdir || 3778 dvp == rootvnode || 3779 pr != NULL) { 3780 fpl->tvp = dvp; 3781 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3782 if (seqc_in_modify(fpl->tvp_seqc)) { 3783 return (cache_fpl_aborted(fpl)); 3784 } 3785 return (0); 3786 } 3787 3788 if ((dvp->v_vflag & VV_ROOT) != 0) { 3789 /* 3790 * TODO 3791 * The opposite of climb mount is needed here. 3792 */ 3793 return (cache_fpl_aborted(fpl)); 3794 } 3795 3796 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3797 if (ncp == NULL) { 3798 return (cache_fpl_aborted(fpl)); 3799 } 3800 3801 nc_flag = atomic_load_char(&ncp->nc_flag); 3802 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3803 if ((nc_flag & NCF_NEGATIVE) != 0) 3804 return (cache_fpl_aborted(fpl)); 3805 fpl->tvp = ncp->nc_vp; 3806 } else { 3807 fpl->tvp = ncp->nc_dvp; 3808 } 3809 3810 if (__predict_false(!cache_ncp_canuse(ncp))) { 3811 return (cache_fpl_aborted(fpl)); 3812 } 3813 3814 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 3815 if (seqc_in_modify(fpl->tvp_seqc)) { 3816 return (cache_fpl_partial(fpl)); 3817 } 3818 3819 counter_u64_add(dotdothits, 1); 3820 return (0); 3821 } 3822 3823 static int 3824 cache_fplookup_next(struct cache_fpl *fpl) 3825 { 3826 struct componentname *cnp; 3827 struct namecache *ncp; 3828 struct negstate *negstate; 3829 struct vnode *dvp, *tvp; 3830 u_char nc_flag; 3831 uint32_t hash; 3832 bool neg_hot; 3833 3834 cnp = fpl->cnp; 3835 dvp = fpl->dvp; 3836 3837 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 3838 return (cache_fplookup_dot(fpl)); 3839 } 3840 3841 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3842 3843 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3844 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3845 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 3846 break; 3847 } 3848 3849 /* 3850 * If there is no entry we have to punt to the slow path to perform 3851 * actual lookup. Should there be nothing with this name a negative 3852 * entry will be created. 3853 */ 3854 if (__predict_false(ncp == NULL)) { 3855 return (cache_fpl_partial(fpl)); 3856 } 3857 3858 tvp = atomic_load_ptr(&ncp->nc_vp); 3859 nc_flag = atomic_load_char(&ncp->nc_flag); 3860 if ((nc_flag & NCF_NEGATIVE) != 0) { 3861 /* 3862 * If they want to create an entry we need to replace this one. 
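 * That is, any nameiop other than LOOKUP is punted to the slow path below,
 * which knows how to purge and replace a negative entry.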
3863 */ 3864 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 3865 return (cache_fpl_partial(fpl)); 3866 } 3867 negstate = NCP2NEGSTATE(ncp); 3868 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0); 3869 if (__predict_false(!cache_ncp_canuse(ncp))) { 3870 return (cache_fpl_partial(fpl)); 3871 } 3872 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 3873 return (cache_fpl_partial(fpl)); 3874 } 3875 if (!neg_hot) { 3876 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 3877 } 3878 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 3879 ncp->nc_name); 3880 counter_u64_add(numneghits, 1); 3881 cache_fpl_smr_exit(fpl); 3882 return (cache_fpl_handled(fpl, ENOENT)); 3883 } 3884 3885 if (__predict_false(!cache_ncp_canuse(ncp))) { 3886 return (cache_fpl_partial(fpl)); 3887 } 3888 3889 fpl->tvp = tvp; 3890 fpl->tvp_seqc = vn_seqc_read_any(tvp); 3891 if (seqc_in_modify(fpl->tvp_seqc)) { 3892 return (cache_fpl_partial(fpl)); 3893 } 3894 3895 if (!cache_fplookup_vnode_supported(tvp)) { 3896 return (cache_fpl_partial(fpl)); 3897 } 3898 3899 counter_u64_add(numposhits, 1); 3900 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 3901 return (0); 3902 } 3903 3904 static bool 3905 cache_fplookup_mp_supported(struct mount *mp) 3906 { 3907 3908 if (mp == NULL) 3909 return (false); 3910 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 3911 return (false); 3912 return (true); 3913 } 3914 3915 /* 3916 * Walk up the mount stack (if any). 3917 * 3918 * Correctness is provided in the following ways: 3919 * - all vnodes are protected from freeing with SMR 3920 * - struct mount objects are type stable making them always safe to access 3921 * - stability of the particular mount is provided by busying it 3922 * - relationship between the vnode which is mounted on and the mount is 3923 * verified with the vnode sequence counter after busying 3924 * - association between root vnode of the mount and the mount is protected 3925 * by busy 3926 * 3927 * From that point on we can read the sequence counter of the root vnode 3928 * and get the next mount on the stack (if any) using the same protection. 3929 * 3930 * By the end of successful walk we are guaranteed the reached state was 3931 * indeed present at least at some point which matches the regular lookup. 
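 *
 * Roughly, the loop below is (simplified; the MNTK_FPLOOKUP check and
 * error handling are elided):
 *
 *	while ((mp = vp->v_mountedhere) != NULL) {
 *		enter mp;			// vfs_op_thread_enter_crit
 *		leave the previous mp, if any;
 *		if (!vn_seqc_consistent(vp, vp_seqc))
 *			punt to the slow path;	// mount point changed under us
 *		vp = mp->mnt_rootvnode;
 *		vp_seqc = vn_seqc_read_any(vp);
 *	}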
3932 */ 3933 static int __noinline 3934 cache_fplookup_climb_mount(struct cache_fpl *fpl) 3935 { 3936 struct mount *mp, *prev_mp; 3937 struct vnode *vp; 3938 seqc_t vp_seqc; 3939 3940 vp = fpl->tvp; 3941 vp_seqc = fpl->tvp_seqc; 3942 3943 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 3944 mp = atomic_load_ptr(&vp->v_mountedhere); 3945 if (mp == NULL) 3946 return (0); 3947 3948 prev_mp = NULL; 3949 for (;;) { 3950 if (!vfs_op_thread_enter_crit(mp)) { 3951 if (prev_mp != NULL) 3952 vfs_op_thread_exit_crit(prev_mp); 3953 return (cache_fpl_partial(fpl)); 3954 } 3955 if (prev_mp != NULL) 3956 vfs_op_thread_exit_crit(prev_mp); 3957 if (!vn_seqc_consistent(vp, vp_seqc)) { 3958 vfs_op_thread_exit_crit(mp); 3959 return (cache_fpl_partial(fpl)); 3960 } 3961 if (!cache_fplookup_mp_supported(mp)) { 3962 vfs_op_thread_exit_crit(mp); 3963 return (cache_fpl_partial(fpl)); 3964 } 3965 vp = atomic_load_ptr(&mp->mnt_rootvnode); 3966 if (vp == NULL || VN_IS_DOOMED(vp)) { 3967 vfs_op_thread_exit_crit(mp); 3968 return (cache_fpl_partial(fpl)); 3969 } 3970 vp_seqc = vn_seqc_read_any(vp); 3971 if (seqc_in_modify(vp_seqc)) { 3972 vfs_op_thread_exit_crit(mp); 3973 return (cache_fpl_partial(fpl)); 3974 } 3975 prev_mp = mp; 3976 mp = atomic_load_ptr(&vp->v_mountedhere); 3977 if (mp == NULL) 3978 break; 3979 } 3980 3981 vfs_op_thread_exit_crit(prev_mp); 3982 fpl->tvp = vp; 3983 fpl->tvp_seqc = vp_seqc; 3984 return (0); 3985 } 3986 3987 static bool 3988 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 3989 { 3990 struct mount *mp; 3991 struct vnode *vp; 3992 3993 vp = fpl->tvp; 3994 3995 /* 3996 * Hack: while this is a union, the pointer tends to be NULL so save on 3997 * a branch. 3998 */ 3999 mp = atomic_load_ptr(&vp->v_mountedhere); 4000 if (mp == NULL) 4001 return (false); 4002 if (vp->v_type == VDIR) 4003 return (true); 4004 return (false); 4005 } 4006 4007 /* 4008 * Parse the path. 4009 * 4010 * The code is mostly copy-pasted from regular lookup, see lookup(). 4011 * The structure is maintained along with comments for easier maintenance. 4012 * Deduplicating the code will become feasible after fast path lookup 4013 * becomes more feature-complete. 4014 */ 4015 static int 4016 cache_fplookup_parse(struct cache_fpl *fpl) 4017 { 4018 struct nameidata *ndp; 4019 struct componentname *cnp; 4020 char *cp; 4021 4022 ndp = fpl->ndp; 4023 cnp = fpl->cnp; 4024 4025 /* 4026 * Search a new directory. 4027 * 4028 * The last component of the filename is left accessible via 4029 * cnp->cn_nameptr for callers that need the name. Callers needing 4030 * the name set the SAVENAME flag. When done, they assume 4031 * responsibility for freeing the pathname buffer. 4032 */ 4033 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 4034 continue; 4035 cnp->cn_namelen = cp - cnp->cn_nameptr; 4036 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 4037 cache_fpl_smr_exit(fpl); 4038 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 4039 } 4040 ndp->ni_pathlen -= cnp->cn_namelen; 4041 KASSERT(ndp->ni_pathlen <= PATH_MAX, 4042 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 4043 ndp->ni_next = cp; 4044 4045 /* 4046 * Replace multiple slashes by a single slash and trailing slashes 4047 * by a null. This must be done before VOP_LOOKUP() because some 4048 * fs's don't know about trailing slashes. Remember if there were 4049 * trailing slashes to handle symlinks, existing non-directories 4050 * and non-existing files that won't be directories specially later. 
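 * For example, "a//b" is handled here as "a/b".  A trailing slash as in
 * "a/b/" would require TRAILINGSLASH handling, which the fast path simply
 * punts on below.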
4051 */ 4052 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 4053 cp++; 4054 ndp->ni_pathlen--; 4055 if (*cp == '\0') { 4056 /* 4057 * TODO 4058 * Regular lookup performs the following: 4059 * *ndp->ni_next = '\0'; 4060 * cnp->cn_flags |= TRAILINGSLASH; 4061 * 4062 * Which is problematic since it modifies data read 4063 * from userspace. Then if fast path lookup was to 4064 * abort we would have to either restore it or convey 4065 * the flag. Since this is a corner case just ignore 4066 * it for simplicity. 4067 */ 4068 return (cache_fpl_partial(fpl)); 4069 } 4070 } 4071 ndp->ni_next = cp; 4072 4073 /* 4074 * Check for degenerate name (e.g. / or "") 4075 * which is a way of talking about a directory, 4076 * e.g. like "/." or ".". 4077 * 4078 * TODO 4079 * Another corner case handled by the regular lookup 4080 */ 4081 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 4082 return (cache_fpl_partial(fpl)); 4083 } 4084 return (0); 4085 } 4086 4087 static void 4088 cache_fplookup_parse_advance(struct cache_fpl *fpl) 4089 { 4090 struct nameidata *ndp; 4091 struct componentname *cnp; 4092 4093 ndp = fpl->ndp; 4094 cnp = fpl->cnp; 4095 4096 cnp->cn_nameptr = ndp->ni_next; 4097 while (*cnp->cn_nameptr == '/') { 4098 cnp->cn_nameptr++; 4099 ndp->ni_pathlen--; 4100 } 4101 } 4102 4103 static int __noinline 4104 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 4105 { 4106 4107 switch (error) { 4108 case EAGAIN: 4109 /* 4110 * Can happen when racing against vgone. 4111 * */ 4112 case EOPNOTSUPP: 4113 cache_fpl_partial(fpl); 4114 break; 4115 default: 4116 /* 4117 * See the API contract for VOP_FPLOOKUP_VEXEC. 4118 */ 4119 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 4120 error = cache_fpl_aborted(fpl); 4121 } else { 4122 cache_fpl_smr_exit(fpl); 4123 cache_fpl_handled(fpl, error); 4124 } 4125 break; 4126 } 4127 return (error); 4128 } 4129 4130 static int 4131 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 4132 { 4133 struct nameidata *ndp; 4134 struct componentname *cnp; 4135 struct mount *mp; 4136 int error; 4137 4138 error = CACHE_FPL_FAILED; 4139 ndp = fpl->ndp; 4140 cnp = fpl->cnp; 4141 4142 cache_fpl_checkpoint(fpl, &fpl->snd); 4143 4144 fpl->dvp = dvp; 4145 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 4146 if (seqc_in_modify(fpl->dvp_seqc)) { 4147 cache_fpl_aborted(fpl); 4148 goto out; 4149 } 4150 mp = atomic_load_ptr(&fpl->dvp->v_mount); 4151 if (!cache_fplookup_mp_supported(mp)) { 4152 cache_fpl_aborted(fpl); 4153 goto out; 4154 } 4155 4156 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 4157 4158 for (;;) { 4159 error = cache_fplookup_parse(fpl); 4160 if (__predict_false(error != 0)) { 4161 break; 4162 } 4163 4164 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 4165 4166 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); 4167 if (__predict_false(error != 0)) { 4168 error = cache_fplookup_failed_vexec(fpl, error); 4169 break; 4170 } 4171 4172 if (__predict_false(cache_fpl_isdotdot(cnp))) { 4173 error = cache_fplookup_dotdot(fpl); 4174 if (__predict_false(error != 0)) { 4175 break; 4176 } 4177 } else { 4178 error = cache_fplookup_next(fpl); 4179 if (__predict_false(error != 0)) { 4180 break; 4181 } 4182 4183 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 4184 4185 if (cache_fplookup_need_climb_mount(fpl)) { 4186 error = cache_fplookup_climb_mount(fpl); 4187 if (__predict_false(error != 0)) { 4188 break; 4189 } 4190 } 4191 } 4192 4193 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 4194 4195 if (cache_fpl_islastcn(ndp)) { 4196 
			error = cache_fplookup_final(fpl);
			break;
		}

		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
			break;
		}

		fpl->dvp = fpl->tvp;
		fpl->dvp_seqc = fpl->tvp_seqc;

		cache_fplookup_parse_advance(fpl);
		cache_fpl_checkpoint(fpl, &fpl->snd);
	}
out:
	switch (fpl->status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		cache_fpl_smr_assert_entered(fpl);
		return (cache_fplookup_partial_setup(fpl));
	case CACHE_FPL_STATUS_ABORTED:
		if (fpl->in_smr)
			cache_fpl_smr_exit(fpl);
		return (CACHE_FPL_FAILED);
	case CACHE_FPL_STATUS_HANDLED:
		MPASS(error != CACHE_FPL_FAILED);
		cache_fpl_smr_assert_not_entered(fpl);
		if (__predict_false(error != 0)) {
			ndp->ni_dvp = NULL;
			ndp->ni_vp = NULL;
			cache_fpl_cleanup_cnp(cnp);
			return (error);
		}
		ndp->ni_dvp = fpl->dvp;
		ndp->ni_vp = fpl->tvp;
		if (cnp->cn_flags & SAVENAME)
			cnp->cn_flags |= HASBUF;
		else
			cache_fpl_cleanup_cnp(cnp);
		return (error);
	}
}

/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
 * outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 *	vn_lock(current);
 *	for (;;) {
 *		next = find();
 *		vn_lock(next);
 *		vn_unlock(current);
 *		current = next;
 *		if (last)
 *			break;
 *	}
 *	return (current);
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
 * we made the jump. This includes things like permissions, mount points etc.
 * Counter modification is provided by enclosing relevant places in
 * vn_seqc_write_begin()/end() calls.
 *
 * Thus this translates to:
 *
 *	vfs_smr_enter();
 *	dvp_seqc = seqc_read_any(dvp);
 *	if (seqc_in_modify(dvp_seqc))		// someone is altering the vnode
 *		abort();
 *	for (;;) {
 *		tvp = find();
 *		tvp_seqc = seqc_read_any(tvp);
 *		if (seqc_in_modify(tvp_seqc))	// someone is altering the target vnode
 *			abort();
 *		if (!seqc_consistent(dvp, dvp_seqc))	// someone is altering the vnode
 *			abort();
 *		dvp = tvp;		// we know nothing of importance has changed
 *		dvp_seqc = tvp_seqc;	// store the counter for the tvp iteration
 *		if (last)
 *			break;
 *	}
 *	vget();					// secure the vnode
 *	if (!seqc_consistent(tvp, tvp_seqc))	// final check
 *		abort();
 *	// at this point we know nothing has changed for any parent<->child pair
 *	// as they were crossed during the lookup, meaning we matched the guarantee
 *	// of the locked variant
 *	return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection which they must never exit
 * - EAGAIN can be returned to denote checking could not be performed; it is
 *   always valid to return it
 * - if the sequence counter has not changed the result must be valid
 * - if the sequence counter has changed both false positives and false
 *   negatives are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning absent other means it
 *   should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 *
 * An illustrative sketch of a conforming VOP_FPLOOKUP_VEXEC routine can be
 * found after cache_fplookup() below.
 */
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	struct nameidata_saved orig;
	int error;

	MPASS(ndp->ni_lcf == 0);

	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.ndp = ndp;
	fpl.cnp = &ndp->ni_cnd;
	MPASS(curthread == fpl.cnp->cn_thread);

	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
		MPASS(fpl.cnp->cn_nameiop != LOOKUP);

	if (!cache_can_fplookup(&fpl)) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		*status = fpl.status;
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint(&fpl, &orig);

	cache_fpl_smr_enter_initial(&fpl);
	pwd = pwd_get_smr();
	fpl.pwd = pwd;
	ndp->ni_rootdir = pwd->pwd_rdir;
	ndp->ni_topdir = pwd->pwd_jdir;

	cnp = fpl.cnp;
	cnp->cn_nameptr = cnp->cn_pnbuf;
	if (cnp->cn_pnbuf[0] == '/') {
		cache_fpl_handle_root(ndp, &dvp);
	} else {
		MPASS(ndp->ni_dirfd == AT_FDCWD);
		dvp = pwd->pwd_cdir;
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags,
	    true);

	error = cache_fplookup_impl(dvp, &fpl);
	cache_fpl_smr_assert_not_entered(&fpl);
	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);

	*status = fpl.status;
	switch (fpl.status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_HANDLED:
		SDT_PROBE3(vfs, namei, lookup, return, error,
		    (error == 0 ? ndp->ni_vp : NULL), true);
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		*pwdp = fpl.pwd;
		/*
		 * Status restored by cache_fplookup_partial_setup.
		 */
		break;
	case CACHE_FPL_STATUS_ABORTED:
		cache_fpl_restore(&fpl, &orig);
		break;
	}
	return (error);
}
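
/*
 * Illustrative sketch (kept inside a comment on purpose; not part of this
 * file's code): a minimal VOP_FPLOOKUP_VEXEC routine for a hypothetical
 * filesystem, following the contract documented above cache_fplookup().
 * The "xxfs" type and field names are placeholders.  The points being
 * demonstrated: stay within vfs_smr protection, fetch ->v_data with
 * atomic_load_ptr and treat NULL as "cannot check" (EAGAIN), and defer the
 * plain unix permission check to vaccess_vexec_smr().
 *
 *	static int
 *	xxfs_fplookup_vexec(struct vop_fplookup_vexec_args *ap)
 *	{
 *		struct vnode *vp;
 *		struct xxfs_node *np;
 *		mode_t mode;
 *
 *		vp = ap->a_vp;
 *		// ->v_data may be zeroed by a concurrent VOP_RECLAIM; load it
 *		// once and bail out if it is already gone.
 *		np = atomic_load_ptr(&vp->v_data);
 *		if (__predict_false(np == NULL))
 *			return (EAGAIN);
 *		mode = atomic_load_short(&np->xn_mode);
 *		// Returning EAGAIN is always valid if the check cannot be
 *		// resolved without blocking or exiting SMR.
 *		return (vaccess_vexec_smr(mode, np->xn_uid, np->xn_gid,
 *		    ap->a_cred));
 *	}
 *
 * A filesystem providing such a routine opts into the fast path by setting
 * MNTK_FPLOOKUP on its mount point and by bracketing every update which can
 * affect lookup (permissions, link/unlink, mount point changes) with
 * vn_seqc_write_begin()/vn_seqc_write_end(), so that cache_fplookup() can
 * detect and reject stale results.
 */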