/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <sys/capsicum.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
		struct negstate nu_neg;/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct	timespec nc_time;	/* timespec provided by fs */
	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned.  Since others
 * may be in the same spot suffer a little bit and enforce the
 * alignment for everyone.  Note this is a nop for 64-bit platforms.
 */
#define	CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
#define	CACHE_PATH_CUTOFF	39

#define	CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define	CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define	CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
#define	CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

#define	nc_vp		n_un.nu_vp
#define	nc_neg		n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define	NEG_HOT		0x01

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}

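/*
 * Descriptive note: the flag store and release fence above are issued before
 * the entry is torn down, while cache_ncp_canuse() below issues an acquire
 * fence before re-checking the flag.  Lockless consumers use the pair to
 * validate that the fields they just read did not come from an entry which
 * had already started deconstruction.
 */
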
/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
static bool
cache_ncp_canuse(struct namecache *ncp)
{

	atomic_thread_fence_acq();
	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
}

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	mtx	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks.  The locking
 * order is lower address first.  Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be locked to provide safe operation against other threads modifying the
 * cache.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case, this could
 * deadlock.  As such, this is resolved by trylocking and if that fails unlocking
 * the first node, locking everything in order and revalidating the state.
 */

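/*
 * A minimal sketch of the trylock-then-relock pattern described above.  The
 * function name is hypothetical; the real implementations of this idea are
 * cache_trylock_vnodes() and cache_zap_unlocked_bucket() further down.
 */
#if 0
static bool
example_lock_pair(struct mtx *held, struct mtx *wanted)
{

	if (wanted > held) {
		/* Lock order is lower address first, safe to just lock. */
		mtx_lock(wanted);
		return (true);
	}
	if (mtx_trylock(wanted))
		return (true);
	/*
	 * Trylock failed; back off, take both locks in address order and
	 * let the caller revalidate whatever state it inspected earlier.
	 */
	mtx_unlock(held);
	mtx_lock(wanted);
	mtx_lock(held);
	return (false);
}
#endif
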
VFS_SMR_DECLARE;

/*
 * Structures associated with name caching.
 */
#define	NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncsize;	/* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");

static struct mtx __exclusive_cache_line	ncneg_shrink_lock;

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;
static struct neglist ncneg_hot;
static u_long numhotneg;

#define	ncneghash	3
#define	numneglists	(ncneghash + 1)
static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct mtx_padalign __read_mostly  *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

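/*
 * Descriptive note: vnode locks and negative lists are selected by hashing
 * the object's address, so no per-object lock storage is needed, while
 * bucket locks are selected from the same (dvp, name) hash that picks the
 * hash chain.  The lock counts scale with the machine size in nchinit()
 * below; for example, with 8 CPUs cache_roundup_2(8 * 8) gives 128 bucket
 * locks and 128 vnode locks (subject to the clamp against nchash), while
 * the number of negative lists is fixed at numneglists (4).
 */
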
/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

static struct namecache *
cache_alloc(int len, int ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	MPASS(ncp != NULL);
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	*tsp = ncp_ts->nc_time;
	*ticksp = ncp_ts->nc_ticks;
}

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");
#define STATNODE_ULONG(name, descr)					\
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define STATNODE_COUNTER(name, descr)					\
	static COUNTER_U64_DEFINE_EARLY(name);				\
	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
	    descr);
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
hits"); 449 STATNODE_COUNTER(nummiss, "Number of cache misses"); 450 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 451 STATNODE_COUNTER(numposzaps, 452 "Number of cache hits (positive) we do not want to cache"); 453 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 454 STATNODE_COUNTER(numnegzaps, 455 "Number of cache hits (negative) we do not want to cache"); 456 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 457 /* These count for vn_getcwd(), too. */ 458 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 459 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 460 STATNODE_COUNTER(numfullpathfail2, 461 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 462 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 463 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 464 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 465 "Number of successful removals after relocking"); 466 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 467 "Number of times zap_and_exit failed to lock"); 468 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 469 "Number of times zap_and_exit failed to lock"); 470 static long cache_lock_vnodes_cel_3_failures; 471 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 472 "Number of times 3-way vnode locking failed"); 473 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 474 STATNODE_COUNTER(numneg_evicted, 475 "Number of negative entries evicted when adding a new entry"); 476 STATNODE_COUNTER(shrinking_skipped, 477 "Number of times shrinking was already in progress"); 478 479 static void cache_zap_locked(struct namecache *ncp); 480 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, 481 char **freebuf, size_t *buflen); 482 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 483 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend); 484 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 485 char **retbuf, size_t *buflen); 486 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 487 char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 488 489 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 490 491 static inline void 492 cache_assert_vlp_locked(struct mtx *vlp) 493 { 494 495 if (vlp != NULL) 496 mtx_assert(vlp, MA_OWNED); 497 } 498 499 static inline void 500 cache_assert_vnode_locked(struct vnode *vp) 501 { 502 struct mtx *vlp; 503 504 vlp = VP2VNODELOCK(vp); 505 cache_assert_vlp_locked(vlp); 506 } 507 508 /* 509 * TODO: With the value stored we can do better than computing the hash based 510 * on the address. The choice of FNV should also be revisited. 
/*
 * TODO: With the value stored we can do better than computing the hash based
 * on the address.  The choice of FNV should also be revisited.
 */
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}

static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct mtx *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_OWNED);
}

static void
cache_assert_bucket_unlocked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_NOTOWNED);
}
#else
#define cache_assert_bucket_locked(x) do { } while (0)
#define cache_assert_bucket_unlocked(x) do { } while (0)
#endif

#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_lock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_unlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * A variation of LRU scheme is used.  New entries are hashed into one of
 * numneglists cold lists.  Entries get promoted to the hot list on first hit.
 *
 * The shrinker will demote hot list head and evict from the cold list in a
 * round-robin manner.
 */
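
/*
 * In terms of the routines below: cache_negative_insert() appends a new
 * entry to the tail of its cold list, cache_negative_hit() promotes an
 * entry to the shared hot list, and cache_negative_zap_one() demotes the
 * hot list head back to a cold list and evicts one cold entry.  Eviction
 * is triggered from cache_enter_time() once numneg * ncnegfactor exceeds
 * the current number of cache entries.
 */
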
static void
cache_negative_init(struct namecache *ncp)
{
	struct negstate *negstate;

	ncp->nc_flag |= NCF_NEGATIVE;
	negstate = NCP2NEGSTATE(ncp);
	negstate->neg_flag = 0;
}

static void
cache_negative_hit(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;

	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0)
		return;
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&ncneg_hot.nl_lock);
	mtx_lock(&neglist->nl_lock);
	if ((negstate->neg_flag & NEG_HOT) == 0) {
		numhotneg++;
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
		negstate->neg_flag |= NEG_HOT;
	}
	mtx_unlock(&neglist->nl_lock);
	mtx_unlock(&ncneg_hot.nl_lock);
}

static void
cache_negative_insert(struct namecache *ncp)
{
	struct neglist *neglist;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp);
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&neglist->nl_lock);
	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
	mtx_unlock(&neglist->nl_lock);
	atomic_add_long(&numneg, 1);
}

static void
cache_negative_remove(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;
	bool hot_locked = false;
	bool list_locked = false;

	cache_assert_bucket_locked(ncp);
	neglist = NCP2NEGLIST(ncp);
	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		hot_locked = true;
		mtx_lock(&ncneg_hot.nl_lock);
		if ((negstate->neg_flag & NEG_HOT) == 0) {
			list_locked = true;
			mtx_lock(&neglist->nl_lock);
		}
	} else {
		list_locked = true;
		mtx_lock(&neglist->nl_lock);
		/*
		 * We may be racing against promotion in lockless lookup.
		 */
		if ((negstate->neg_flag & NEG_HOT) != 0) {
			mtx_unlock(&neglist->nl_lock);
			hot_locked = true;
			mtx_lock(&ncneg_hot.nl_lock);
			mtx_lock(&neglist->nl_lock);
		}
	}
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		numhotneg--;
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
	}
	if (list_locked)
		mtx_unlock(&neglist->nl_lock);
	if (hot_locked)
		mtx_unlock(&ncneg_hot.nl_lock);
	atomic_subtract_long(&numneg, 1);
}

static void
cache_negative_shrink_select(struct namecache **ncpp,
    struct neglist **neglistpp)
{
	struct neglist *neglist;
	struct namecache *ncp;
	static u_int cycle;
	u_int i;

	*ncpp = ncp = NULL;

	for (i = 0; i < numneglists; i++) {
		neglist = &neglists[(cycle + i) % numneglists];
		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
			continue;
		mtx_lock(&neglist->nl_lock);
		ncp = TAILQ_FIRST(&neglist->nl_list);
		if (ncp != NULL)
			break;
		mtx_unlock(&neglist->nl_lock);
	}

	*neglistpp = neglist;
	*ncpp = ncp;
	cycle++;
}

static void
cache_negative_zap_one(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *neglist;
	struct negstate *negstate;
	struct mtx *dvlp;
	struct mtx *blp;

	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
	    !mtx_trylock(&ncneg_shrink_lock)) {
		counter_u64_add(shrinking_skipped, 1);
		return;
	}

	mtx_lock(&ncneg_hot.nl_lock);
	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
	if (ncp != NULL) {
		neglist = NCP2NEGLIST(ncp);
		negstate = NCP2NEGSTATE(ncp);
		mtx_lock(&neglist->nl_lock);
		MPASS((negstate->neg_flag & NEG_HOT) != 0);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
		negstate->neg_flag &= ~NEG_HOT;
		numhotneg--;
		mtx_unlock(&neglist->nl_lock);
	}
	mtx_unlock(&ncneg_hot.nl_lock);

	cache_negative_shrink_select(&ncp, &neglist);

	mtx_unlock(&ncneg_shrink_lock);
	if (ncp == NULL)
		return;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	blp = NCP2BUCKETLOCK(ncp);
	mtx_unlock(&neglist->nl_lock);
	mtx_lock(dvlp);
	mtx_lock(blp);
	/*
	 * Enter SMR to safely check the negative list.
	 * Even if the found pointer matches, the entry may now be reallocated
	 * and used by a different vnode.
	 */
	vfs_smr_enter();
	ncp2 = TAILQ_FIRST(&neglist->nl_list);
	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
	    blp != NCP2BUCKETLOCK(ncp2)) {
		vfs_smr_exit();
		ncp = NULL;
	} else {
		vfs_smr_exit();
		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(numneg_evicted, 1);
	}
	mtx_unlock(blp);
	mtx_unlock(dvlp);
	if (ncp != NULL)
		cache_free(ncp);
}

/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_vp);
			ncp->nc_vp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_vp);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_negative_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_dvp);
			ncp->nc_dvp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_dvp);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			counter_u64_add(numcachehv, -1);
		}
	}
	atomic_subtract_long(&numcache, 1);
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct mtx *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

/*
 * If trylocking failed we can get here.  We know enough to take all needed locks
 * in the right order and re-lookup the entry.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct mtx *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_unlocked(ncp);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	mtx_lock(blp);
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
		return (0);
	}

	mtx_unlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct mtx *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	mtx_unlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static int __noinline
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct mtx *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			vn_seqc_write_begin(dvp);
			dvp->v_cache_dd = NULL;
			vn_seqc_write_end(dvp);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
		return (1);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (ncp == NULL) {
		mtx_unlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail++;
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
	cache_free(ncp);
	return (1);
out_no_entry:
	counter_u64_add(nummisszap, 1);
	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
	return (0);
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static int __noinline
cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct mtx *dvlp;
	enum vgetstate vs;
	int error, ltype;
	bool whiteout;

	MPASS((cnp->cn_flags & ISDOTDOT) != 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	counter_u64_add(dotdothits, 1);
retry:
	dvlp = VP2VNODELOCK(dvp);
	mtx_lock(dvlp);
	ncp = dvp->v_cache_dd;
	if (ncp == NULL) {
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
		mtx_unlock(dvlp);
		return (0);
	}
	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
		if (ncp->nc_flag & NCF_NEGATIVE)
			*vpp = NULL;
		else
			*vpp = ncp->nc_vp;
	} else
		*vpp = ncp->nc_dvp;
	if (*vpp == NULL)
		goto negative_success;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
	    NCF_DTS && tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		*tsp = ncp_ts->nc_dotdottime;
	}

	MPASS(dvp != *vpp);
	ltype = VOP_ISLOCKED(dvp);
	VOP_UNLOCK(dvp);
	vs = vget_prep(*vpp);
	mtx_unlock(dvlp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	vn_lock(dvp, ltype | LK_RETRY);
	if (VN_IS_DOOMED(dvp)) {
		if (error == 0)
			vput(*vpp);
		*vpp = NULL;
		return (ENOENT);
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			cache_zap_negative_locked_vnode_kl(ncp, dvp);
			mtx_unlock(dvlp);
			cache_free(ncp);
			return (0);
		}
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_negative_hit(ncp);
	mtx_unlock(dvlp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it will
 *		be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * Either both tsp and ticks have to be provided or neither of them.
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  vpp will not be modified.  If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
static int __noinline
cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct mtx *blp;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout;

	MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY);

retry:
	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		mtx_unlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	vs = vget_prep(*vpp);
	mtx_unlock(blp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
			if (__predict_false(error != 0)) {
				zap_and_exit_bucket_fail2++;
				goto retry;
			}
			cache_free(ncp);
			return (0);
		}
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_negative_hit(ncp);
	mtx_unlock(blp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

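/*
 * An illustrative, hypothetical consumer of cache_lookup() showing the
 * return value protocol documented above; real consumers are the
 * filesystems' VOP_LOOKUP implementations (typically via vfs_cache_lookup()).
 */
#if 0
static int
example_vop_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{
	int error;

	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
	if (error == 0)		/* miss: consult the filesystem proper */
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
	if (error == -1)	/* positive hit: *vpp is referenced and locked */
		return (0);
	return (error);		/* ENOENT: negative hit (or doomed dvp) */
}
#endif
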
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct negstate *negstate;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout;
	u_short nc_flag;

	MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
			return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
	}

	MPASS((cnp->cn_flags & ISDOTDOT) == 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	vfs_smr_enter();

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		vfs_smr_exit();
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	nc_flag = atomic_load_char(&ncp->nc_flag);
	if (nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	if (!cache_ncp_canuse(ncp)) {
		vfs_smr_exit();
		*vpp = NULL;
		goto out_fallback;
	}
	vs = vget_prep_smr(*vpp);
	vfs_smr_exit();
	if (__predict_false(vs == VGET_NONE)) {
		*vpp = NULL;
		goto out_fallback;
	}
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto out_fallback;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			vfs_smr_exit();
			goto out_fallback;
		}
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	/*
	 * TODO: We need to take locks to promote an entry.  Code doing it
	 * in SMR lookup can be modified to be shared.
	 */
	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) == 0 ||
	    !cache_ncp_canuse(ncp)) {
		vfs_smr_exit();
		goto out_fallback;
	}
	vfs_smr_exit();
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
out_fallback:
	return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
}

struct celockstate {
	struct mtx *vlp[3];
	struct mtx *blp[2];
};
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));

static inline void
cache_celockstate_init(struct celockstate *cel)
{

	bzero(cel, sizeof(*cel));
}

static void
cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
    struct vnode *dvp)
{
	struct mtx *vlp1, *vlp2;

	MPASS(cel->vlp[0] == NULL);
	MPASS(cel->vlp[1] == NULL);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL || dvp != NULL);

	vlp1 = VP2VNODELOCK(vp);
	vlp2 = VP2VNODELOCK(dvp);
	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		mtx_lock(vlp1);
		cel->vlp[0] = vlp1;
	}
	mtx_lock(vlp2);
	cel->vlp[1] = vlp2;
}

static void
cache_unlock_vnodes_cel(struct celockstate *cel)
{

	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);

	if (cel->vlp[0] != NULL)
		mtx_unlock(cel->vlp[0]);
	if (cel->vlp[1] != NULL)
		mtx_unlock(cel->vlp[1]);
	if (cel->vlp[2] != NULL)
		mtx_unlock(cel->vlp[2]);
}

static bool
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
{
	struct mtx *vlp;
	bool ret;

	cache_assert_vlp_locked(cel->vlp[0]);
	cache_assert_vlp_locked(cel->vlp[1]);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL);
	vlp = VP2VNODELOCK(vp);

	ret = true;
	if (vlp >= cel->vlp[1]) {
		mtx_lock(vlp);
	} else {
		if (mtx_trylock(vlp))
			goto out;
		cache_lock_vnodes_cel_3_failures++;
		cache_unlock_vnodes_cel(cel);
		if (vlp < cel->vlp[0]) {
			mtx_lock(vlp);
			mtx_lock(cel->vlp[0]);
			mtx_lock(cel->vlp[1]);
		} else {
			if (cel->vlp[0] != NULL)
				mtx_lock(cel->vlp[0]);
			mtx_lock(vlp);
			mtx_lock(cel->vlp[1]);
		}
		ret = false;
	}
out:
	cel->vlp[2] = vlp;
	return (ret);
}

static void
cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
    struct mtx *blp2)
{

	MPASS(cel->blp[0] == NULL);
	MPASS(cel->blp[1] == NULL);

	cache_sort_vnodes(&blp1, &blp2);

	if (blp1 != NULL) {
		mtx_lock(blp1);
		cel->blp[0] = blp1;
	}
	mtx_lock(blp2);
	cel->blp[1] = blp2;
}

static void
cache_unlock_buckets_cel(struct celockstate *cel)
{

	if (cel->blp[0] != NULL)
		mtx_unlock(cel->blp[0]);
	mtx_unlock(cel->blp[1]);
}

/*
 * Lock part of the cache affected by the insertion.
 *
 * This means vnodelocks for dvp, vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry.  In this
 * case we have an additional vnode and bucketlock pair to lock.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 * preserving the locking order (smaller address first).
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct mtx *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		if (vp == NULL || vp->v_type != VDIR)
			break;
		ncp = vp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == vp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		/*
		 * All vnodes got re-locked.  Re-validate the state and if
		 * nothing changed we are done.  Otherwise restart.
		 */
		if (ncp == vp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct mtx *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == dvp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		if (ncp == dvp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}

static void __noinline
cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp)
{
	struct celockstate cel;
	struct namecache *ncp;
	uint32_t hash;
	int len;

	if (dvp->v_cache_dd == NULL)
		return;
	len = cnp->cn_namelen;
	cache_celockstate_init(&cel);
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	cache_enter_lock_dd(&cel, dvp, vp, hash);
	vn_seqc_write_begin(dvp);
	ncp = dvp->v_cache_dd;
	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
		cache_zap_locked(ncp);
	} else {
		ncp = NULL;
	}
	dvp->v_cache_dd = NULL;
	vn_seqc_write_end(dvp);
	cache_enter_unlock(&cel);
	if (ncp != NULL)
		cache_free(ncp);
}

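/*
 * A hypothetical sketch of how a filesystem populates the cache after
 * resolving a component, using cache_enter_time() defined below; passing a
 * NULL vp records a negative entry and the timestamps are optional.
 */
#if 0
static void
example_cache_populate(struct vnode *dvp, struct vnode *found_vp,
    struct componentname *cnp)
{

	if ((cnp->cn_flags & MAKEENTRY) != 0)
		cache_enter_time(dvp, found_vp, cnp, NULL, NULL);
}
#endif
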
/*
 * Add an entry to the cache.
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
	struct celockstate cel;
	struct namecache *ncp, *n2, *ndd;
	struct namecache_ts *ncp_ts;
	struct nchashhead *ncpp;
	uint32_t hash;
	int flag;
	int len;
	u_long lnumcache;

	VNPASS(!VN_IS_DOOMED(dvp), dvp);
	VNPASS(dvp->v_type != VNON, dvp);
	if (vp != NULL) {
		VNPASS(!VN_IS_DOOMED(vp), vp);
		VNPASS(vp->v_type != VNON, vp);
	}

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache))
		return;
#endif

	flag = 0;
	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return;
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			cache_enter_dotdot_prep(dvp, vp, cnp);
			flag = NCF_ISDOTDOT;
		}
	}

	/*
	 * Avoid blowout in namecache entries.
	 */
	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
	if (__predict_false(lnumcache >= ncsize)) {
		atomic_subtract_long(&numcache, 1);
		counter_u64_add(numdrops, 1);
		return;
	}

	cache_celockstate_init(&cel);
	ndd = NULL;
	ncp_ts = NULL;

	/*
	 * Calculate the hash key and setup as much of the new
	 * namecache entry as possible before acquiring the lock.
	 */
	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
	ncp->nc_flag = flag | NCF_WIP;
	ncp->nc_vp = vp;
	if (vp == NULL)
		cache_negative_init(ncp);
	ncp->nc_dvp = dvp;
	if (tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		ncp_ts->nc_time = *tsp;
		ncp_ts->nc_ticks = ticks;
		ncp_ts->nc_nc.nc_flag |= NCF_TS;
		if (dtsp != NULL) {
			ncp_ts->nc_dotdottime = *dtsp;
			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
		}
	}
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	memcpy(ncp->nc_name, cnp->cn_nameptr, len);
	ncp->nc_name[len] = '\0';
	cache_enter_lock(&cel, dvp, vp, hash);

	/*
	 * See if this vnode or negative entry is already in the cache
	 * with this name.  This can happen with concurrent lookups of
	 * the same path name.
	 */
	ncpp = NCHHASH(hash);
	CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
		if (n2->nc_dvp == dvp &&
		    n2->nc_nlen == cnp->cn_namelen &&
		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
			MPASS(cache_ncp_canuse(n2));
			if ((n2->nc_flag & NCF_NEGATIVE) != 0)
				KASSERT(vp == NULL,
				    ("%s: found entry pointing to a different vnode (%p != %p)",
				    __func__, NULL, vp));
			else
				KASSERT(n2->nc_vp == vp,
				    ("%s: found entry pointing to a different vnode (%p != %p)",
				    __func__, n2->nc_vp, vp));
			/*
			 * Entries are supposed to be immutable unless in the
			 * process of getting destroyed.  Accommodating for
			 * changing timestamps is possible but not worth it.
			 * This should be harmless in terms of correctness, in
			 * the worst case resulting in an earlier expiration.
			 * Alternatively, the found entry can be replaced
			 * altogether.
1913 */ 1914 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 1915 #if 0 1916 if (tsp != NULL) { 1917 KASSERT((n2->nc_flag & NCF_TS) != 0, 1918 ("no NCF_TS")); 1919 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1920 n2_ts->nc_time = ncp_ts->nc_time; 1921 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1922 if (dtsp != NULL) { 1923 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1924 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1925 } 1926 } 1927 #endif 1928 goto out_unlock_free; 1929 } 1930 } 1931 1932 if (flag == NCF_ISDOTDOT) { 1933 /* 1934 * See if we are trying to add .. entry, but some other lookup 1935 * has populated v_cache_dd pointer already. 1936 */ 1937 if (dvp->v_cache_dd != NULL) 1938 goto out_unlock_free; 1939 KASSERT(vp == NULL || vp->v_type == VDIR, 1940 ("wrong vnode type %p", vp)); 1941 vn_seqc_write_begin(dvp); 1942 dvp->v_cache_dd = ncp; 1943 vn_seqc_write_end(dvp); 1944 } 1945 1946 if (vp != NULL) { 1947 if (flag != NCF_ISDOTDOT) { 1948 /* 1949 * For this case, the cache entry maps both the 1950 * directory name in it and the name ".." for the 1951 * directory's parent. 1952 */ 1953 vn_seqc_write_begin(vp); 1954 if ((ndd = vp->v_cache_dd) != NULL) { 1955 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 1956 cache_zap_locked(ndd); 1957 else 1958 ndd = NULL; 1959 } 1960 vp->v_cache_dd = ncp; 1961 vn_seqc_write_end(vp); 1962 } else if (vp->v_type != VDIR) { 1963 if (vp->v_cache_dd != NULL) { 1964 vn_seqc_write_begin(vp); 1965 vp->v_cache_dd = NULL; 1966 vn_seqc_write_end(vp); 1967 } 1968 } 1969 } 1970 1971 if (flag != NCF_ISDOTDOT) { 1972 if (LIST_EMPTY(&dvp->v_cache_src)) { 1973 vhold(dvp); 1974 counter_u64_add(numcachehv, 1); 1975 } 1976 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 1977 } 1978 1979 /* 1980 * If the entry is "negative", we place it into the 1981 * "negative" cache queue, otherwise, we place it into the 1982 * destination vnode's cache entries queue. 1983 */ 1984 if (vp != NULL) { 1985 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 1986 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 1987 vp); 1988 } else { 1989 if (cnp->cn_flags & ISWHITEOUT) 1990 ncp->nc_flag |= NCF_WHITE; 1991 cache_negative_insert(ncp); 1992 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 1993 ncp->nc_name); 1994 } 1995 1996 /* 1997 * Insert the new namecache entry into the appropriate chain 1998 * within the cache entries table. 1999 */ 2000 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2001 2002 atomic_thread_fence_rel(); 2003 /* 2004 * Mark the entry as fully constructed. 2005 * It is immutable past this point until its removal. 
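 * The release fence above publishes the name, length and vnode fields before
 * NCF_WIP is cleared, so lockless consumers which observe the flag clear
 * (see cache_ncp_canuse()) do not act on a partially constructed entry.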
2006 */ 2007 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2008 2009 cache_enter_unlock(&cel); 2010 if (numneg * ncnegfactor > lnumcache) 2011 cache_negative_zap_one(); 2012 if (ndd != NULL) 2013 cache_free(ndd); 2014 return; 2015 out_unlock_free: 2016 cache_enter_unlock(&cel); 2017 atomic_subtract_long(&numcache, 1); 2018 cache_free(ncp); 2019 return; 2020 } 2021 2022 static u_int 2023 cache_roundup_2(u_int val) 2024 { 2025 u_int res; 2026 2027 for (res = 1; res <= val; res <<= 1) 2028 continue; 2029 2030 return (res); 2031 } 2032 2033 static struct nchashhead * 2034 nchinittbl(u_long elements, u_long *hashmask) 2035 { 2036 struct nchashhead *hashtbl; 2037 u_long hashsize, i; 2038 2039 hashsize = cache_roundup_2(elements) / 2; 2040 2041 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2042 for (i = 0; i < hashsize; i++) 2043 CK_SLIST_INIT(&hashtbl[i]); 2044 *hashmask = hashsize - 1; 2045 return (hashtbl); 2046 } 2047 2048 static void 2049 ncfreetbl(struct nchashhead *hashtbl) 2050 { 2051 2052 free(hashtbl, M_VFSCACHE); 2053 } 2054 2055 /* 2056 * Name cache initialization, from vfs_init() when we are booting 2057 */ 2058 static void 2059 nchinit(void *dummy __unused) 2060 { 2061 u_int i; 2062 2063 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2064 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2065 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2066 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2067 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2068 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2069 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2070 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2071 2072 VFS_SMR_ZONE_SET(cache_zone_small); 2073 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2074 VFS_SMR_ZONE_SET(cache_zone_large); 2075 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2076 2077 ncsize = desiredvnodes * ncsizefactor; 2078 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2079 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2080 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2081 ncbuckethash = 7; 2082 if (ncbuckethash > nchash) 2083 ncbuckethash = nchash; 2084 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2085 M_WAITOK | M_ZERO); 2086 for (i = 0; i < numbucketlocks; i++) 2087 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2088 ncvnodehash = ncbuckethash; 2089 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2090 M_WAITOK | M_ZERO); 2091 for (i = 0; i < numvnodelocks; i++) 2092 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2093 2094 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 2095 M_WAITOK | M_ZERO); 2096 for (i = 0; i < numneglists; i++) { 2097 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2098 TAILQ_INIT(&neglists[i].nl_list); 2099 } 2100 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 2101 TAILQ_INIT(&ncneg_hot.nl_list); 2102 2103 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2104 } 2105 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2106 2107 void 2108 cache_vnode_init(struct vnode *vp) 2109 { 2110 2111 LIST_INIT(&vp->v_cache_src); 2112 TAILQ_INIT(&vp->v_cache_dst); 2113 vp->v_cache_dd = NULL; 2114 cache_prehash(vp); 2115 } 2116 2117 void 2118 cache_changesize(u_long newmaxvnodes) 2119 { 2120 struct 
nchashhead *new_nchashtbl, *old_nchashtbl; 2121 u_long new_nchash, old_nchash; 2122 struct namecache *ncp; 2123 uint32_t hash; 2124 u_long newncsize; 2125 int i; 2126 2127 newncsize = newmaxvnodes * ncsizefactor; 2128 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2129 if (newmaxvnodes < numbucketlocks) 2130 newmaxvnodes = numbucketlocks; 2131 2132 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2133 /* If same hash table size, nothing to do */ 2134 if (nchash == new_nchash) { 2135 ncfreetbl(new_nchashtbl); 2136 return; 2137 } 2138 /* 2139 * Move everything from the old hash table to the new table. 2140 * None of the namecache entries in the table can be removed 2141 * because to do so, they have to be removed from the hash table. 2142 */ 2143 cache_lock_all_vnodes(); 2144 cache_lock_all_buckets(); 2145 old_nchashtbl = nchashtbl; 2146 old_nchash = nchash; 2147 nchashtbl = new_nchashtbl; 2148 nchash = new_nchash; 2149 for (i = 0; i <= old_nchash; i++) { 2150 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2151 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2152 ncp->nc_dvp); 2153 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2154 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2155 } 2156 } 2157 ncsize = newncsize; 2158 cache_unlock_all_buckets(); 2159 cache_unlock_all_vnodes(); 2160 ncfreetbl(old_nchashtbl); 2161 } 2162 2163 /* 2164 * Invalidate all entries from and to a particular vnode. 2165 */ 2166 static void 2167 cache_purge_impl(struct vnode *vp) 2168 { 2169 TAILQ_HEAD(, namecache) ncps; 2170 struct namecache *ncp, *nnp; 2171 struct mtx *vlp, *vlp2; 2172 2173 TAILQ_INIT(&ncps); 2174 vlp = VP2VNODELOCK(vp); 2175 vlp2 = NULL; 2176 mtx_lock(vlp); 2177 retry: 2178 while (!LIST_EMPTY(&vp->v_cache_src)) { 2179 ncp = LIST_FIRST(&vp->v_cache_src); 2180 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2181 goto retry; 2182 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2183 } 2184 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2185 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2186 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2187 goto retry; 2188 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2189 } 2190 ncp = vp->v_cache_dd; 2191 if (ncp != NULL) { 2192 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2193 ("lost dotdot link")); 2194 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2195 goto retry; 2196 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2197 } 2198 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2199 mtx_unlock(vlp); 2200 if (vlp2 != NULL) 2201 mtx_unlock(vlp2); 2202 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2203 cache_free(ncp); 2204 } 2205 } 2206 2207 /* 2208 * Opportunistic check to see if there is anything to do. 2209 */ 2210 static bool 2211 cache_has_entries(struct vnode *vp) 2212 { 2213 2214 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2215 vp->v_cache_dd == NULL) 2216 return (false); 2217 return (true); 2218 } 2219 2220 void 2221 cache_purge(struct vnode *vp) 2222 { 2223 2224 SDT_PROBE1(vfs, namecache, purge, done, vp); 2225 if (!cache_has_entries(vp)) 2226 return; 2227 cache_purge_impl(vp); 2228 } 2229 2230 /* 2231 * Only to be used by vgone. 2232 */ 2233 void 2234 cache_purge_vgone(struct vnode *vp) 2235 { 2236 struct mtx *vlp; 2237 2238 VNPASS(VN_IS_DOOMED(vp), vp); 2239 if (cache_has_entries(vp)) { 2240 cache_purge_impl(vp); 2241 return; 2242 } 2243 2244 /* 2245 * Serialize against a potential thread doing cache_purge. 
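 * The vnode is already doomed, so no new entries can be added for it.
 * Waiting for the per-vnode lock to be released and re-checking is therefore
 * enough to catch a purge which was in flight when we looked the first time.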
2246 */ 2247 vlp = VP2VNODELOCK(vp); 2248 mtx_wait_unlocked(vlp); 2249 if (cache_has_entries(vp)) { 2250 cache_purge_impl(vp); 2251 return; 2252 } 2253 return; 2254 } 2255 2256 /* 2257 * Invalidate all negative entries for a particular directory vnode. 2258 */ 2259 void 2260 cache_purge_negative(struct vnode *vp) 2261 { 2262 TAILQ_HEAD(, namecache) ncps; 2263 struct namecache *ncp, *nnp; 2264 struct mtx *vlp; 2265 2266 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2267 if (LIST_EMPTY(&vp->v_cache_src)) 2268 return; 2269 TAILQ_INIT(&ncps); 2270 vlp = VP2VNODELOCK(vp); 2271 mtx_lock(vlp); 2272 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2273 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2274 continue; 2275 cache_zap_negative_locked_vnode_kl(ncp, vp); 2276 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2277 } 2278 mtx_unlock(vlp); 2279 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2280 cache_free(ncp); 2281 } 2282 } 2283 2284 void 2285 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2286 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2287 { 2288 2289 ASSERT_VOP_IN_SEQC(fdvp); 2290 ASSERT_VOP_IN_SEQC(fvp); 2291 ASSERT_VOP_IN_SEQC(tdvp); 2292 if (tvp != NULL) 2293 ASSERT_VOP_IN_SEQC(tvp); 2294 2295 cache_purge(fvp); 2296 if (tvp != NULL) { 2297 cache_purge(tvp); 2298 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2299 ("%s: lingering negative entry", __func__)); 2300 } else { 2301 cache_remove_cnp(tdvp, tcnp); 2302 } 2303 } 2304 2305 /* 2306 * Flush all entries referencing a particular filesystem. 2307 */ 2308 void 2309 cache_purgevfs(struct mount *mp) 2310 { 2311 struct vnode *vp, *mvp; 2312 2313 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2314 /* 2315 * Somewhat wasteful iteration over all vnodes. Would be better to 2316 * support filtering and avoid the interlock to begin with. 2317 */ 2318 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2319 if (!cache_has_entries(vp)) { 2320 VI_UNLOCK(vp); 2321 continue; 2322 } 2323 vholdl(vp); 2324 VI_UNLOCK(vp); 2325 cache_purge(vp); 2326 vdrop(vp); 2327 } 2328 } 2329 2330 /* 2331 * Perform canonical checks and cache lookup and pass on to filesystem 2332 * through the vop_cachedlookup only if needed. 2333 */ 2334 2335 int 2336 vfs_cache_lookup(struct vop_lookup_args *ap) 2337 { 2338 struct vnode *dvp; 2339 int error; 2340 struct vnode **vpp = ap->a_vpp; 2341 struct componentname *cnp = ap->a_cnp; 2342 int flags = cnp->cn_flags; 2343 2344 *vpp = NULL; 2345 dvp = ap->a_dvp; 2346 2347 if (dvp->v_type != VDIR) 2348 return (ENOTDIR); 2349 2350 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2351 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2352 return (EROFS); 2353 2354 error = vn_dir_check_exec(dvp, cnp); 2355 if (error != 0) 2356 return (error); 2357 2358 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2359 if (error == 0) 2360 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2361 if (error == -1) 2362 return (0); 2363 return (error); 2364 } 2365 2366 /* Implementation of the getcwd syscall. 
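 *
 * The syscall only manages the user-supplied buffer; the actual resolution is
 * done by vn_getcwd() below, which first attempts the lockless SMR walk
 * (vn_fullpath_any_smr()) and falls back to the vnode-locking
 * vn_fullpath_any() when the former reports it could not finish (negative
 * return value).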
*/ 2367 int 2368 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2369 { 2370 char *buf, *retbuf; 2371 size_t buflen; 2372 int error; 2373 2374 buflen = uap->buflen; 2375 if (__predict_false(buflen < 2)) 2376 return (EINVAL); 2377 if (buflen > MAXPATHLEN) 2378 buflen = MAXPATHLEN; 2379 2380 buf = uma_zalloc(namei_zone, M_WAITOK); 2381 error = vn_getcwd(buf, &retbuf, &buflen); 2382 if (error == 0) 2383 error = copyout(retbuf, uap->buf, buflen); 2384 uma_zfree(namei_zone, buf); 2385 return (error); 2386 } 2387 2388 int 2389 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2390 { 2391 struct pwd *pwd; 2392 int error; 2393 2394 vfs_smr_enter(); 2395 pwd = pwd_get_smr(); 2396 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2397 buflen, false, 0); 2398 VFS_SMR_ASSERT_NOT_ENTERED(); 2399 if (error < 0) { 2400 pwd = pwd_hold(curthread); 2401 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2402 retbuf, buflen); 2403 pwd_drop(pwd); 2404 } 2405 2406 #ifdef KTRACE 2407 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2408 ktrnamei(*retbuf); 2409 #endif 2410 return (error); 2411 } 2412 2413 static int 2414 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2415 size_t size, int flags, enum uio_seg pathseg) 2416 { 2417 struct nameidata nd; 2418 char *retbuf, *freebuf; 2419 int error; 2420 2421 if (flags != 0) 2422 return (EINVAL); 2423 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2424 pathseg, path, fd, &cap_fstat_rights, td); 2425 if ((error = namei(&nd)) != 0) 2426 return (error); 2427 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2428 if (error == 0) { 2429 error = copyout(retbuf, buf, size); 2430 free(freebuf, M_TEMP); 2431 } 2432 NDFREE(&nd, 0); 2433 return (error); 2434 } 2435 2436 int 2437 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2438 { 2439 2440 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2441 uap->flags, UIO_USERSPACE)); 2442 } 2443 2444 /* 2445 * Retrieve the full filesystem path that correspond to a vnode from the name 2446 * cache (if available) 2447 */ 2448 int 2449 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2450 { 2451 struct pwd *pwd; 2452 char *buf; 2453 size_t buflen; 2454 int error; 2455 2456 if (__predict_false(vp == NULL)) 2457 return (EINVAL); 2458 2459 buflen = MAXPATHLEN; 2460 buf = malloc(buflen, M_TEMP, M_WAITOK); 2461 vfs_smr_enter(); 2462 pwd = pwd_get_smr(); 2463 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0); 2464 VFS_SMR_ASSERT_NOT_ENTERED(); 2465 if (error < 0) { 2466 pwd = pwd_hold(curthread); 2467 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); 2468 pwd_drop(pwd); 2469 } 2470 if (error == 0) 2471 *freebuf = buf; 2472 else 2473 free(buf, M_TEMP); 2474 return (error); 2475 } 2476 2477 /* 2478 * This function is similar to vn_fullpath, but it attempts to lookup the 2479 * pathname relative to the global root mount point. This is required for the 2480 * auditing sub-system, as audited pathnames must be absolute, relative to the 2481 * global root mount point. 
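 *
 * The calling convention matches vn_fullpath(): on success *retbuf points
 * into a temporary buffer returned via *freebuf, which the caller releases
 * with free(*freebuf, M_TEMP), e.g. (local variable names arbitrary):
 *
 *	char *fullpath, *freepath;
 *
 *	if (vn_fullpath_global(vp, &fullpath, &freepath) == 0) {
 *		... use fullpath ...
 *		free(freepath, M_TEMP);
 *	}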
2482 */ 2483 int 2484 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2485 { 2486 char *buf; 2487 size_t buflen; 2488 int error; 2489 2490 if (__predict_false(vp == NULL)) 2491 return (EINVAL); 2492 buflen = MAXPATHLEN; 2493 buf = malloc(buflen, M_TEMP, M_WAITOK); 2494 vfs_smr_enter(); 2495 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0); 2496 VFS_SMR_ASSERT_NOT_ENTERED(); 2497 if (error < 0) { 2498 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2499 } 2500 if (error == 0) 2501 *freebuf = buf; 2502 else 2503 free(buf, M_TEMP); 2504 return (error); 2505 } 2506 2507 static struct namecache * 2508 vn_dd_from_dst(struct vnode *vp) 2509 { 2510 struct namecache *ncp; 2511 2512 cache_assert_vnode_locked(vp); 2513 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2514 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2515 return (ncp); 2516 } 2517 return (NULL); 2518 } 2519 2520 int 2521 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2522 { 2523 struct vnode *dvp; 2524 struct namecache *ncp; 2525 struct mtx *vlp; 2526 int error; 2527 2528 vlp = VP2VNODELOCK(*vp); 2529 mtx_lock(vlp); 2530 ncp = (*vp)->v_cache_dd; 2531 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 2532 KASSERT(ncp == vn_dd_from_dst(*vp), 2533 ("%s: mismatch for dd entry (%p != %p)", __func__, 2534 ncp, vn_dd_from_dst(*vp))); 2535 } else { 2536 ncp = vn_dd_from_dst(*vp); 2537 } 2538 if (ncp != NULL) { 2539 if (*buflen < ncp->nc_nlen) { 2540 mtx_unlock(vlp); 2541 vrele(*vp); 2542 counter_u64_add(numfullpathfail4, 1); 2543 error = ENOMEM; 2544 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2545 vp, NULL); 2546 return (error); 2547 } 2548 *buflen -= ncp->nc_nlen; 2549 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2550 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2551 ncp->nc_name, vp); 2552 dvp = *vp; 2553 *vp = ncp->nc_dvp; 2554 vref(*vp); 2555 mtx_unlock(vlp); 2556 vrele(dvp); 2557 return (0); 2558 } 2559 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2560 2561 mtx_unlock(vlp); 2562 vn_lock(*vp, LK_SHARED | LK_RETRY); 2563 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2564 vput(*vp); 2565 if (error) { 2566 counter_u64_add(numfullpathfail2, 1); 2567 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2568 return (error); 2569 } 2570 2571 *vp = dvp; 2572 if (VN_IS_DOOMED(dvp)) { 2573 /* forced unmount */ 2574 vrele(dvp); 2575 error = ENOENT; 2576 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2577 return (error); 2578 } 2579 /* 2580 * *vp has its use count incremented still. 2581 */ 2582 2583 return (0); 2584 } 2585 2586 /* 2587 * Resolve a directory to a pathname. 2588 * 2589 * The name of the directory can always be found in the namecache or fetched 2590 * from the filesystem. There is also guaranteed to be only one parent, meaning 2591 * we can just follow vnodes up until we find the root. 2592 * 2593 * The vnode must be referenced. 
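 * The reference is consumed: the vnode is vrele()d (or vput() on the locked
 * error paths) before the function returns, on success and failure alike.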
2594 */ 2595 static int 2596 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2597 size_t *len, bool slash_prefixed, size_t addend) 2598 { 2599 #ifdef KDTRACE_HOOKS 2600 struct vnode *startvp = vp; 2601 #endif 2602 struct vnode *vp1; 2603 size_t buflen; 2604 int error; 2605 2606 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2607 VNPASS(vp->v_usecount > 0, vp); 2608 2609 buflen = *len; 2610 2611 if (!slash_prefixed) { 2612 MPASS(*len >= 2); 2613 buflen--; 2614 buf[buflen] = '\0'; 2615 } 2616 2617 error = 0; 2618 2619 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2620 counter_u64_add(numfullpathcalls, 1); 2621 while (vp != rdir && vp != rootvnode) { 2622 /* 2623 * The vp vnode must be already fully constructed, 2624 * since it is either found in namecache or obtained 2625 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2626 * without obtaining the vnode lock. 2627 */ 2628 if ((vp->v_vflag & VV_ROOT) != 0) { 2629 vn_lock(vp, LK_RETRY | LK_SHARED); 2630 2631 /* 2632 * With the vnode locked, check for races with 2633 * unmount, forced or not. Note that we 2634 * already verified that vp is not equal to 2635 * the root vnode, which means that 2636 * mnt_vnodecovered can be NULL only for the 2637 * case of unmount. 2638 */ 2639 if (VN_IS_DOOMED(vp) || 2640 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2641 vp1->v_mountedhere != vp->v_mount) { 2642 vput(vp); 2643 error = ENOENT; 2644 SDT_PROBE3(vfs, namecache, fullpath, return, 2645 error, vp, NULL); 2646 break; 2647 } 2648 2649 vref(vp1); 2650 vput(vp); 2651 vp = vp1; 2652 continue; 2653 } 2654 if (vp->v_type != VDIR) { 2655 vrele(vp); 2656 counter_u64_add(numfullpathfail1, 1); 2657 error = ENOTDIR; 2658 SDT_PROBE3(vfs, namecache, fullpath, return, 2659 error, vp, NULL); 2660 break; 2661 } 2662 error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen); 2663 if (error) 2664 break; 2665 if (buflen == 0) { 2666 vrele(vp); 2667 error = ENOMEM; 2668 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2669 startvp, NULL); 2670 break; 2671 } 2672 buf[--buflen] = '/'; 2673 slash_prefixed = true; 2674 } 2675 if (error) 2676 return (error); 2677 if (!slash_prefixed) { 2678 if (buflen == 0) { 2679 vrele(vp); 2680 counter_u64_add(numfullpathfail4, 1); 2681 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2682 startvp, NULL); 2683 return (ENOMEM); 2684 } 2685 buf[--buflen] = '/'; 2686 } 2687 counter_u64_add(numfullpathfound, 1); 2688 vrele(vp); 2689 2690 *retbuf = buf + buflen; 2691 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2692 *len -= buflen; 2693 *len += addend; 2694 return (0); 2695 } 2696 2697 /* 2698 * Resolve an arbitrary vnode to a pathname. 
2699 * 2700 * Note 2 caveats: 2701 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2702 * resolve to a different path than the one used to find it 2703 * - namecache is not mandatory, meaning names are not guaranteed to be added 2704 * (in which case resolving fails) 2705 */ 2706 static void __inline 2707 cache_rev_failed_impl(int *reason, int line) 2708 { 2709 2710 *reason = line; 2711 } 2712 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 2713 2714 static int 2715 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 2716 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend) 2717 { 2718 #ifdef KDTRACE_HOOKS 2719 struct vnode *startvp = vp; 2720 #endif 2721 struct vnode *tvp; 2722 struct mount *mp; 2723 struct namecache *ncp; 2724 size_t orig_buflen; 2725 int reason; 2726 int error; 2727 #ifdef KDTRACE_HOOKS 2728 int i; 2729 #endif 2730 seqc_t vp_seqc, tvp_seqc; 2731 u_char nc_flag; 2732 2733 VFS_SMR_ASSERT_ENTERED(); 2734 2735 if (!cache_fast_revlookup) { 2736 vfs_smr_exit(); 2737 return (-1); 2738 } 2739 2740 orig_buflen = *buflen; 2741 2742 if (!slash_prefixed) { 2743 MPASS(*buflen >= 2); 2744 *buflen -= 1; 2745 buf[*buflen] = '\0'; 2746 } 2747 2748 if (vp == rdir || vp == rootvnode) { 2749 if (!slash_prefixed) { 2750 *buflen -= 1; 2751 buf[*buflen] = '/'; 2752 } 2753 goto out_ok; 2754 } 2755 2756 #ifdef KDTRACE_HOOKS 2757 i = 0; 2758 #endif 2759 error = -1; 2760 ncp = NULL; /* for sdt probe down below */ 2761 vp_seqc = vn_seqc_read_any(vp); 2762 if (seqc_in_modify(vp_seqc)) { 2763 cache_rev_failed(&reason); 2764 goto out_abort; 2765 } 2766 2767 for (;;) { 2768 #ifdef KDTRACE_HOOKS 2769 i++; 2770 #endif 2771 if ((vp->v_vflag & VV_ROOT) != 0) { 2772 mp = atomic_load_ptr(&vp->v_mount); 2773 if (mp == NULL) { 2774 cache_rev_failed(&reason); 2775 goto out_abort; 2776 } 2777 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 2778 tvp_seqc = vn_seqc_read_any(tvp); 2779 if (seqc_in_modify(tvp_seqc)) { 2780 cache_rev_failed(&reason); 2781 goto out_abort; 2782 } 2783 if (!vn_seqc_consistent(vp, vp_seqc)) { 2784 cache_rev_failed(&reason); 2785 goto out_abort; 2786 } 2787 vp = tvp; 2788 vp_seqc = tvp_seqc; 2789 continue; 2790 } 2791 ncp = atomic_load_ptr(&vp->v_cache_dd); 2792 if (ncp == NULL) { 2793 cache_rev_failed(&reason); 2794 goto out_abort; 2795 } 2796 nc_flag = atomic_load_char(&ncp->nc_flag); 2797 if ((nc_flag & NCF_ISDOTDOT) != 0) { 2798 cache_rev_failed(&reason); 2799 goto out_abort; 2800 } 2801 if (!cache_ncp_canuse(ncp)) { 2802 cache_rev_failed(&reason); 2803 goto out_abort; 2804 } 2805 if (ncp->nc_nlen >= *buflen) { 2806 cache_rev_failed(&reason); 2807 error = ENOMEM; 2808 goto out_abort; 2809 } 2810 *buflen -= ncp->nc_nlen; 2811 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2812 *buflen -= 1; 2813 buf[*buflen] = '/'; 2814 tvp = ncp->nc_dvp; 2815 tvp_seqc = vn_seqc_read_any(tvp); 2816 if (seqc_in_modify(tvp_seqc)) { 2817 cache_rev_failed(&reason); 2818 goto out_abort; 2819 } 2820 if (!vn_seqc_consistent(vp, vp_seqc)) { 2821 cache_rev_failed(&reason); 2822 goto out_abort; 2823 } 2824 vp = tvp; 2825 vp_seqc = tvp_seqc; 2826 if (vp == rdir || vp == rootvnode) 2827 break; 2828 } 2829 out_ok: 2830 vfs_smr_exit(); 2831 *retbuf = buf + *buflen; 2832 *buflen = orig_buflen - *buflen + addend; 2833 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 2834 return (0); 2835 2836 out_abort: 2837 *buflen = orig_buflen; 2838 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 2839 
vfs_smr_exit(); 2840 return (error); 2841 } 2842 2843 static int 2844 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2845 size_t *buflen) 2846 { 2847 size_t orig_buflen; 2848 bool slash_prefixed; 2849 int error; 2850 2851 if (*buflen < 2) 2852 return (EINVAL); 2853 2854 orig_buflen = *buflen; 2855 2856 vref(vp); 2857 slash_prefixed = false; 2858 if (vp->v_type != VDIR) { 2859 *buflen -= 1; 2860 buf[*buflen] = '\0'; 2861 error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen); 2862 if (error) 2863 return (error); 2864 if (*buflen == 0) { 2865 vrele(vp); 2866 return (ENOMEM); 2867 } 2868 *buflen -= 1; 2869 buf[*buflen] = '/'; 2870 slash_prefixed = true; 2871 } 2872 2873 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed, 2874 orig_buflen - *buflen)); 2875 } 2876 2877 /* 2878 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2879 * 2880 * Since the namecache does not track hardlinks, the caller is expected to first 2881 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 2882 * 2883 * Then we have 2 cases: 2884 * - if the found vnode is a directory, the path can be constructed just by 2885 * following names up the chain 2886 * - otherwise we populate the buffer with the saved name and start resolving 2887 * from the parent 2888 */ 2889 static int 2890 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 2891 size_t *buflen) 2892 { 2893 char *buf, *tmpbuf; 2894 struct pwd *pwd; 2895 struct componentname *cnp; 2896 struct vnode *vp; 2897 size_t addend; 2898 int error; 2899 bool slash_prefixed; 2900 enum vtype type; 2901 2902 if (*buflen < 2) 2903 return (EINVAL); 2904 if (*buflen > MAXPATHLEN) 2905 *buflen = MAXPATHLEN; 2906 2907 slash_prefixed = false; 2908 2909 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2910 2911 addend = 0; 2912 vp = ndp->ni_vp; 2913 /* 2914 * Check for VBAD to work around the vp_crossmp bug in lookup(). 2915 * 2916 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 2917 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 2918 * If the type is VDIR (like in this very case) we can skip looking 2919 * at ni_dvp in the first place. However, since vnodes get passed here 2920 * unlocked the target may transition to doomed state (type == VBAD) 2921 * before we get to evaluate the condition. If this happens, we will 2922 * populate part of the buffer and descend to vn_fullpath_dir with 2923 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 2924 * 2925 * This should be atomic_load(&vp->v_type) but it is illegal to take 2926 * an address of a bit field, even if said field is sized to char. 2927 * Work around the problem by reading the value into a full-sized enum 2928 * and then re-reading it with atomic_load which will still prevent 2929 * the compiler from re-reading down the road.
2930 */ 2931 type = vp->v_type; 2932 type = atomic_load_int(&type); 2933 if (type == VBAD) { 2934 error = ENOENT; 2935 goto out_bad; 2936 } 2937 if (type != VDIR) { 2938 cnp = &ndp->ni_cnd; 2939 addend = cnp->cn_namelen + 2; 2940 if (*buflen < addend) { 2941 error = ENOMEM; 2942 goto out_bad; 2943 } 2944 *buflen -= addend; 2945 tmpbuf = buf + *buflen; 2946 tmpbuf[0] = '/'; 2947 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 2948 tmpbuf[addend - 1] = '\0'; 2949 slash_prefixed = true; 2950 vp = ndp->ni_dvp; 2951 } 2952 2953 vfs_smr_enter(); 2954 pwd = pwd_get_smr(); 2955 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 2956 slash_prefixed, addend); 2957 VFS_SMR_ASSERT_NOT_ENTERED(); 2958 if (error < 0) { 2959 pwd = pwd_hold(curthread); 2960 vref(vp); 2961 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 2962 slash_prefixed, addend); 2963 pwd_drop(pwd); 2964 if (error != 0) 2965 goto out_bad; 2966 } 2967 2968 *freebuf = buf; 2969 2970 return (0); 2971 out_bad: 2972 free(buf, M_TEMP); 2973 return (error); 2974 } 2975 2976 struct vnode * 2977 vn_dir_dd_ino(struct vnode *vp) 2978 { 2979 struct namecache *ncp; 2980 struct vnode *ddvp; 2981 struct mtx *vlp; 2982 enum vgetstate vs; 2983 2984 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 2985 vlp = VP2VNODELOCK(vp); 2986 mtx_lock(vlp); 2987 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 2988 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 2989 continue; 2990 ddvp = ncp->nc_dvp; 2991 vs = vget_prep(ddvp); 2992 mtx_unlock(vlp); 2993 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 2994 return (NULL); 2995 return (ddvp); 2996 } 2997 mtx_unlock(vlp); 2998 return (NULL); 2999 } 3000 3001 int 3002 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3003 { 3004 struct namecache *ncp; 3005 struct mtx *vlp; 3006 int l; 3007 3008 vlp = VP2VNODELOCK(vp); 3009 mtx_lock(vlp); 3010 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3011 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3012 break; 3013 if (ncp == NULL) { 3014 mtx_unlock(vlp); 3015 return (ENOENT); 3016 } 3017 l = min(ncp->nc_nlen, buflen - 1); 3018 memcpy(buf, ncp->nc_name, l); 3019 mtx_unlock(vlp); 3020 buf[l] = '\0'; 3021 return (0); 3022 } 3023 3024 /* 3025 * This function updates path string to vnode's full global path 3026 * and checks the size of the new path string against the pathlen argument. 3027 * 3028 * Requires a locked, referenced vnode. 3029 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3030 * 3031 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3032 * because it falls back to the ".." lookup if the namecache lookup fails. 3033 */ 3034 int 3035 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3036 u_int pathlen) 3037 { 3038 struct nameidata nd; 3039 struct vnode *vp1; 3040 char *rpath, *fbuf; 3041 int error; 3042 3043 ASSERT_VOP_ELOCKED(vp, __func__); 3044 3045 /* Construct global filesystem path from vp. */ 3046 VOP_UNLOCK(vp); 3047 error = vn_fullpath_global(vp, &rpath, &fbuf); 3048 3049 if (error != 0) { 3050 vrele(vp); 3051 return (error); 3052 } 3053 3054 if (strlen(rpath) >= pathlen) { 3055 vrele(vp); 3056 error = ENAMETOOLONG; 3057 goto out; 3058 } 3059 3060 /* 3061 * Re-lookup the vnode by path to detect a possible rename. 3062 * As a side effect, the vnode is relocked. 3063 * If vnode was renamed, return ENOENT. 
3064 */ 3065 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3066 UIO_SYSSPACE, path, td); 3067 error = namei(&nd); 3068 if (error != 0) { 3069 vrele(vp); 3070 goto out; 3071 } 3072 NDFREE(&nd, NDF_ONLY_PNBUF); 3073 vp1 = nd.ni_vp; 3074 vrele(vp); 3075 if (vp1 == vp) 3076 strcpy(path, rpath); 3077 else { 3078 vput(vp1); 3079 error = ENOENT; 3080 } 3081 3082 out: 3083 free(fbuf, M_TEMP); 3084 return (error); 3085 } 3086 3087 #ifdef DDB 3088 static void 3089 db_print_vpath(struct vnode *vp) 3090 { 3091 3092 while (vp != NULL) { 3093 db_printf("%p: ", vp); 3094 if (vp == rootvnode) { 3095 db_printf("/"); 3096 vp = NULL; 3097 } else { 3098 if (vp->v_vflag & VV_ROOT) { 3099 db_printf("<mount point>"); 3100 vp = vp->v_mount->mnt_vnodecovered; 3101 } else { 3102 struct namecache *ncp; 3103 char *ncn; 3104 int i; 3105 3106 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3107 if (ncp != NULL) { 3108 ncn = ncp->nc_name; 3109 for (i = 0; i < ncp->nc_nlen; i++) 3110 db_printf("%c", *ncn++); 3111 vp = ncp->nc_dvp; 3112 } else { 3113 vp = NULL; 3114 } 3115 } 3116 } 3117 db_printf("\n"); 3118 } 3119 3120 return; 3121 } 3122 3123 DB_SHOW_COMMAND(vpath, db_show_vpath) 3124 { 3125 struct vnode *vp; 3126 3127 if (!have_addr) { 3128 db_printf("usage: show vpath <struct vnode *>\n"); 3129 return; 3130 } 3131 3132 vp = (struct vnode *)addr; 3133 db_print_vpath(vp); 3134 } 3135 3136 #endif 3137 3138 static bool __read_frequently cache_fast_lookup = true; 3139 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3140 &cache_fast_lookup, 0, ""); 3141 3142 #define CACHE_FPL_FAILED -2020 3143 3144 static void 3145 cache_fpl_cleanup_cnp(struct componentname *cnp) 3146 { 3147 3148 uma_zfree(namei_zone, cnp->cn_pnbuf); 3149 #ifdef DIAGNOSTIC 3150 cnp->cn_pnbuf = NULL; 3151 cnp->cn_nameptr = NULL; 3152 #endif 3153 } 3154 3155 static void 3156 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3157 { 3158 struct componentname *cnp; 3159 3160 cnp = &ndp->ni_cnd; 3161 while (*(cnp->cn_nameptr) == '/') { 3162 cnp->cn_nameptr++; 3163 ndp->ni_pathlen--; 3164 } 3165 3166 *dpp = ndp->ni_rootdir; 3167 } 3168 3169 /* 3170 * Components of nameidata (or objects it can point to) which may 3171 * need restoring in case fast path lookup fails. 
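 * A snapshot is taken with cache_fpl_checkpoint() before each component is
 * consumed and put back with cache_fpl_restore() when punting to the regular
 * lookup (see cache_fplookup_partial_setup()).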
3172 */ 3173 struct nameidata_saved { 3174 long cn_namelen; 3175 char *cn_nameptr; 3176 size_t ni_pathlen; 3177 int cn_flags; 3178 }; 3179 3180 struct cache_fpl { 3181 struct nameidata *ndp; 3182 struct componentname *cnp; 3183 struct pwd *pwd; 3184 struct vnode *dvp; 3185 struct vnode *tvp; 3186 seqc_t dvp_seqc; 3187 seqc_t tvp_seqc; 3188 struct nameidata_saved snd; 3189 int line; 3190 enum cache_fpl_status status:8; 3191 bool in_smr; 3192 }; 3193 3194 static void 3195 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3196 { 3197 3198 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3199 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3200 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3201 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3202 } 3203 3204 static void 3205 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3206 { 3207 3208 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3209 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3210 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3211 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3212 } 3213 3214 #ifdef INVARIANTS 3215 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3216 struct cache_fpl *_fpl = (fpl); \ 3217 MPASS(_fpl->in_smr == true); \ 3218 VFS_SMR_ASSERT_ENTERED(); \ 3219 }) 3220 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3221 struct cache_fpl *_fpl = (fpl); \ 3222 MPASS(_fpl->in_smr == false); \ 3223 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3224 }) 3225 #else 3226 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3227 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3228 #endif 3229 3230 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3231 struct cache_fpl *_fpl = (fpl); \ 3232 vfs_smr_enter(); \ 3233 _fpl->in_smr = true; \ 3234 }) 3235 3236 #define cache_fpl_smr_enter(fpl) ({ \ 3237 struct cache_fpl *_fpl = (fpl); \ 3238 MPASS(_fpl->in_smr == false); \ 3239 vfs_smr_enter(); \ 3240 _fpl->in_smr = true; \ 3241 }) 3242 3243 #define cache_fpl_smr_exit(fpl) ({ \ 3244 struct cache_fpl *_fpl = (fpl); \ 3245 MPASS(_fpl->in_smr == true); \ 3246 vfs_smr_exit(); \ 3247 _fpl->in_smr = false; \ 3248 }) 3249 3250 static int 3251 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3252 { 3253 3254 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3255 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3256 ("%s: converting to abort from %d at %d, set at %d\n", 3257 __func__, fpl->status, line, fpl->line)); 3258 } 3259 fpl->status = CACHE_FPL_STATUS_ABORTED; 3260 fpl->line = line; 3261 return (CACHE_FPL_FAILED); 3262 } 3263 3264 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3265 3266 static int 3267 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3268 { 3269 3270 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3271 ("%s: setting to partial at %d, but already set to %d at %d\n", 3272 __func__, line, fpl->status, fpl->line)); 3273 cache_fpl_smr_assert_entered(fpl); 3274 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3275 fpl->line = line; 3276 return (CACHE_FPL_FAILED); 3277 } 3278 3279 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3280 3281 static int 3282 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3283 { 3284 3285 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3286 ("%s: setting to handled at %d, but already set to %d at %d\n", 3287 __func__, line, fpl->status, fpl->line)); 3288 cache_fpl_smr_assert_not_entered(fpl); 3289 MPASS(error != CACHE_FPL_FAILED); 3290 fpl->status = CACHE_FPL_STATUS_HANDLED; 3291 fpl->line = line; 3292 return (error); 
3293 } 3294 3295 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3296 3297 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3298 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ 3299 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3300 3301 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3302 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3303 3304 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3305 "supported and internal flags overlap"); 3306 3307 static bool 3308 cache_fpl_islastcn(struct nameidata *ndp) 3309 { 3310 3311 return (*ndp->ni_next == 0); 3312 } 3313 3314 static bool 3315 cache_fpl_isdotdot(struct componentname *cnp) 3316 { 3317 3318 if (cnp->cn_namelen == 2 && 3319 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3320 return (true); 3321 return (false); 3322 } 3323 3324 static bool 3325 cache_can_fplookup(struct cache_fpl *fpl) 3326 { 3327 struct nameidata *ndp; 3328 struct componentname *cnp; 3329 struct thread *td; 3330 3331 ndp = fpl->ndp; 3332 cnp = fpl->cnp; 3333 td = cnp->cn_thread; 3334 3335 if (!cache_fast_lookup) { 3336 cache_fpl_aborted(fpl); 3337 return (false); 3338 } 3339 #ifdef MAC 3340 if (mac_vnode_check_lookup_enabled()) { 3341 cache_fpl_aborted(fpl); 3342 return (false); 3343 } 3344 #endif 3345 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3346 cache_fpl_aborted(fpl); 3347 return (false); 3348 } 3349 if (ndp->ni_dirfd != AT_FDCWD) { 3350 cache_fpl_aborted(fpl); 3351 return (false); 3352 } 3353 if (IN_CAPABILITY_MODE(td)) { 3354 cache_fpl_aborted(fpl); 3355 return (false); 3356 } 3357 if (AUDITING_TD(td)) { 3358 cache_fpl_aborted(fpl); 3359 return (false); 3360 } 3361 if (ndp->ni_startdir != NULL) { 3362 cache_fpl_aborted(fpl); 3363 return (false); 3364 } 3365 return (true); 3366 } 3367 3368 static bool 3369 cache_fplookup_vnode_supported(struct vnode *vp) 3370 { 3371 3372 return (vp->v_type != VLNK); 3373 } 3374 3375 /* 3376 * Move a negative entry to the hot list. 3377 * 3378 * We have to take locks, but they may be contended and in the worst 3379 * case we may need to go off CPU. We don't want to spin within the 3380 * smr section and we can't block with it. Instead we are going to 3381 * look up the entry again. 3382 */ 3383 static int __noinline 3384 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3385 uint32_t hash) 3386 { 3387 struct componentname *cnp; 3388 struct namecache *ncp; 3389 struct neglist *neglist; 3390 struct negstate *negstate; 3391 struct vnode *dvp; 3392 u_char nc_flag; 3393 3394 cnp = fpl->cnp; 3395 dvp = fpl->dvp; 3396 3397 if (!vhold_smr(dvp)) 3398 return (cache_fpl_aborted(fpl)); 3399 3400 neglist = NCP2NEGLIST(oncp); 3401 cache_fpl_smr_exit(fpl); 3402 3403 mtx_lock(&ncneg_hot.nl_lock); 3404 mtx_lock(&neglist->nl_lock); 3405 /* 3406 * For hash iteration. 3407 */ 3408 cache_fpl_smr_enter(fpl); 3409 3410 /* 3411 * Avoid all surprises by only succeeding if we got the same entry and 3412 * bailing completely otherwise. 3413 * 3414 * In particular at this point there can be a new ncp which matches the 3415 * search but hashes to a different neglist. 3416 */ 3417 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3418 if (ncp == oncp) 3419 break; 3420 } 3421 3422 /* 3423 * No match to begin with. 3424 */ 3425 if (__predict_false(ncp == NULL)) { 3426 goto out_abort; 3427 } 3428 3429 /* 3430 * The newly found entry may be something different... 
3431 */ 3432 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3433 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 3434 goto out_abort; 3435 } 3436 3437 /* 3438 * ... and not even negative. 3439 */ 3440 nc_flag = atomic_load_char(&ncp->nc_flag); 3441 if ((nc_flag & NCF_NEGATIVE) == 0) { 3442 goto out_abort; 3443 } 3444 3445 if (__predict_false(!cache_ncp_canuse(ncp))) { 3446 goto out_abort; 3447 } 3448 3449 negstate = NCP2NEGSTATE(ncp); 3450 if ((negstate->neg_flag & NEG_HOT) == 0) { 3451 numhotneg++; 3452 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 3453 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 3454 negstate->neg_flag |= NEG_HOT; 3455 } 3456 3457 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 3458 counter_u64_add(numneghits, 1); 3459 cache_fpl_smr_exit(fpl); 3460 mtx_unlock(&neglist->nl_lock); 3461 mtx_unlock(&ncneg_hot.nl_lock); 3462 vdrop(dvp); 3463 return (cache_fpl_handled(fpl, ENOENT)); 3464 out_abort: 3465 cache_fpl_smr_exit(fpl); 3466 mtx_unlock(&neglist->nl_lock); 3467 mtx_unlock(&ncneg_hot.nl_lock); 3468 vdrop(dvp); 3469 return (cache_fpl_aborted(fpl)); 3470 } 3471 3472 /* 3473 * The target vnode is not supported, prepare for the slow path to take over. 3474 */ 3475 static int __noinline 3476 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3477 { 3478 struct nameidata *ndp; 3479 struct componentname *cnp; 3480 enum vgetstate dvs; 3481 struct vnode *dvp; 3482 struct pwd *pwd; 3483 seqc_t dvp_seqc; 3484 3485 ndp = fpl->ndp; 3486 cnp = fpl->cnp; 3487 dvp = fpl->dvp; 3488 dvp_seqc = fpl->dvp_seqc; 3489 3490 dvs = vget_prep_smr(dvp); 3491 if (__predict_false(dvs == VGET_NONE)) { 3492 cache_fpl_smr_exit(fpl); 3493 return (cache_fpl_aborted(fpl)); 3494 } 3495 3496 cache_fpl_smr_exit(fpl); 3497 3498 vget_finish_ref(dvp, dvs); 3499 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3500 vrele(dvp); 3501 return (cache_fpl_aborted(fpl)); 3502 } 3503 3504 pwd = pwd_hold(curthread); 3505 if (fpl->pwd != pwd) { 3506 vrele(dvp); 3507 pwd_drop(pwd); 3508 return (cache_fpl_aborted(fpl)); 3509 } 3510 3511 cache_fpl_restore(fpl, &fpl->snd); 3512 3513 ndp->ni_startdir = dvp; 3514 cnp->cn_flags |= MAKEENTRY; 3515 if (cache_fpl_islastcn(ndp)) 3516 cnp->cn_flags |= ISLASTCN; 3517 if (cache_fpl_isdotdot(cnp)) 3518 cnp->cn_flags |= ISDOTDOT; 3519 3520 return (0); 3521 } 3522 3523 static int 3524 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3525 { 3526 struct componentname *cnp; 3527 struct vnode *tvp; 3528 seqc_t tvp_seqc; 3529 int error, lkflags; 3530 3531 cnp = fpl->cnp; 3532 tvp = fpl->tvp; 3533 tvp_seqc = fpl->tvp_seqc; 3534 3535 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3536 lkflags = LK_SHARED; 3537 if ((cnp->cn_flags & LOCKSHARED) == 0) 3538 lkflags = LK_EXCLUSIVE; 3539 error = vget_finish(tvp, lkflags, tvs); 3540 if (__predict_false(error != 0)) { 3541 return (cache_fpl_aborted(fpl)); 3542 } 3543 } else { 3544 vget_finish_ref(tvp, tvs); 3545 } 3546 3547 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3548 if ((cnp->cn_flags & LOCKLEAF) != 0) 3549 vput(tvp); 3550 else 3551 vrele(tvp); 3552 return (cache_fpl_aborted(fpl)); 3553 } 3554 3555 return (cache_fpl_handled(fpl, 0)); 3556 } 3557 3558 /* 3559 * They want to possibly modify the state of the namecache. 3560 * 3561 * Don't try to match the API contract, just leave. 
3562 * TODO: this leaves scalability on the table 3563 */ 3564 static int 3565 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3566 { 3567 struct componentname *cnp; 3568 3569 cnp = fpl->cnp; 3570 MPASS(cnp->cn_nameiop != LOOKUP); 3571 return (cache_fpl_partial(fpl)); 3572 } 3573 3574 static int __noinline 3575 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3576 { 3577 struct componentname *cnp; 3578 enum vgetstate dvs, tvs; 3579 struct vnode *dvp, *tvp; 3580 seqc_t dvp_seqc; 3581 int error; 3582 3583 cnp = fpl->cnp; 3584 dvp = fpl->dvp; 3585 dvp_seqc = fpl->dvp_seqc; 3586 tvp = fpl->tvp; 3587 3588 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3589 3590 /* 3591 * This is less efficient than it can be for simplicity. 3592 */ 3593 dvs = vget_prep_smr(dvp); 3594 if (__predict_false(dvs == VGET_NONE)) { 3595 return (cache_fpl_aborted(fpl)); 3596 } 3597 tvs = vget_prep_smr(tvp); 3598 if (__predict_false(tvs == VGET_NONE)) { 3599 cache_fpl_smr_exit(fpl); 3600 vget_abort(dvp, dvs); 3601 return (cache_fpl_aborted(fpl)); 3602 } 3603 3604 cache_fpl_smr_exit(fpl); 3605 3606 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3607 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3608 if (__predict_false(error != 0)) { 3609 vget_abort(tvp, tvs); 3610 return (cache_fpl_aborted(fpl)); 3611 } 3612 } else { 3613 vget_finish_ref(dvp, dvs); 3614 } 3615 3616 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3617 vget_abort(tvp, tvs); 3618 if ((cnp->cn_flags & LOCKPARENT) != 0) 3619 vput(dvp); 3620 else 3621 vrele(dvp); 3622 return (cache_fpl_aborted(fpl)); 3623 } 3624 3625 error = cache_fplookup_final_child(fpl, tvs); 3626 if (__predict_false(error != 0)) { 3627 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3628 if ((cnp->cn_flags & LOCKPARENT) != 0) 3629 vput(dvp); 3630 else 3631 vrele(dvp); 3632 return (error); 3633 } 3634 3635 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3636 return (0); 3637 } 3638 3639 static int 3640 cache_fplookup_final(struct cache_fpl *fpl) 3641 { 3642 struct componentname *cnp; 3643 enum vgetstate tvs; 3644 struct vnode *dvp, *tvp; 3645 seqc_t dvp_seqc; 3646 3647 cnp = fpl->cnp; 3648 dvp = fpl->dvp; 3649 dvp_seqc = fpl->dvp_seqc; 3650 tvp = fpl->tvp; 3651 3652 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3653 3654 if (cnp->cn_nameiop != LOOKUP) { 3655 return (cache_fplookup_final_modifying(fpl)); 3656 } 3657 3658 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3659 return (cache_fplookup_final_withparent(fpl)); 3660 3661 tvs = vget_prep_smr(tvp); 3662 if (__predict_false(tvs == VGET_NONE)) { 3663 return (cache_fpl_partial(fpl)); 3664 } 3665 3666 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3667 cache_fpl_smr_exit(fpl); 3668 vget_abort(tvp, tvs); 3669 return (cache_fpl_aborted(fpl)); 3670 } 3671 3672 cache_fpl_smr_exit(fpl); 3673 return (cache_fplookup_final_child(fpl, tvs)); 3674 } 3675 3676 static int __noinline 3677 cache_fplookup_dot(struct cache_fpl *fpl) 3678 { 3679 struct vnode *dvp; 3680 3681 dvp = fpl->dvp; 3682 3683 fpl->tvp = dvp; 3684 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3685 if (seqc_in_modify(fpl->tvp_seqc)) { 3686 return (cache_fpl_aborted(fpl)); 3687 } 3688 3689 counter_u64_add(dothits, 1); 3690 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3691 3692 return (0); 3693 } 3694 3695 static int __noinline 3696 cache_fplookup_dotdot(struct cache_fpl *fpl) 3697 { 3698 struct nameidata *ndp; 3699 struct componentname *cnp; 3700 struct namecache *ncp; 3701 struct vnode *dvp; 3702 struct prison *pr; 3703 u_char nc_flag; 3704 3705 ndp = fpl->ndp; 3706 cnp 
= fpl->cnp; 3707 dvp = fpl->dvp; 3708 3709 /* 3710 * XXX this is racy the same way regular lookup is 3711 */ 3712 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3713 pr = pr->pr_parent) 3714 if (dvp == pr->pr_root) 3715 break; 3716 3717 if (dvp == ndp->ni_rootdir || 3718 dvp == ndp->ni_topdir || 3719 dvp == rootvnode || 3720 pr != NULL) { 3721 fpl->tvp = dvp; 3722 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3723 if (seqc_in_modify(fpl->tvp_seqc)) { 3724 return (cache_fpl_aborted(fpl)); 3725 } 3726 return (0); 3727 } 3728 3729 if ((dvp->v_vflag & VV_ROOT) != 0) { 3730 /* 3731 * TODO 3732 * The opposite of climb mount is needed here. 3733 */ 3734 return (cache_fpl_aborted(fpl)); 3735 } 3736 3737 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3738 if (ncp == NULL) { 3739 return (cache_fpl_aborted(fpl)); 3740 } 3741 3742 nc_flag = atomic_load_char(&ncp->nc_flag); 3743 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3744 if ((nc_flag & NCF_NEGATIVE) != 0) 3745 return (cache_fpl_aborted(fpl)); 3746 fpl->tvp = ncp->nc_vp; 3747 } else { 3748 fpl->tvp = ncp->nc_dvp; 3749 } 3750 3751 if (__predict_false(!cache_ncp_canuse(ncp))) { 3752 return (cache_fpl_aborted(fpl)); 3753 } 3754 3755 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 3756 if (seqc_in_modify(fpl->tvp_seqc)) { 3757 return (cache_fpl_partial(fpl)); 3758 } 3759 3760 counter_u64_add(dotdothits, 1); 3761 return (0); 3762 } 3763 3764 static int 3765 cache_fplookup_next(struct cache_fpl *fpl) 3766 { 3767 struct componentname *cnp; 3768 struct namecache *ncp; 3769 struct negstate *negstate; 3770 struct vnode *dvp, *tvp; 3771 u_char nc_flag; 3772 uint32_t hash; 3773 bool neg_hot; 3774 3775 cnp = fpl->cnp; 3776 dvp = fpl->dvp; 3777 3778 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 3779 return (cache_fplookup_dot(fpl)); 3780 } 3781 3782 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3783 3784 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3785 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3786 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 3787 break; 3788 } 3789 3790 /* 3791 * If there is no entry we have to punt to the slow path to perform 3792 * actual lookup. Should there be nothing with this name a negative 3793 * entry will be created. 3794 */ 3795 if (__predict_false(ncp == NULL)) { 3796 return (cache_fpl_partial(fpl)); 3797 } 3798 3799 tvp = atomic_load_ptr(&ncp->nc_vp); 3800 nc_flag = atomic_load_char(&ncp->nc_flag); 3801 if ((nc_flag & NCF_NEGATIVE) != 0) { 3802 /* 3803 * If they want to create an entry we need to replace this one. 
3804 */ 3805 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 3806 return (cache_fpl_partial(fpl)); 3807 } 3808 negstate = NCP2NEGSTATE(ncp); 3809 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0); 3810 if (__predict_false(!cache_ncp_canuse(ncp))) { 3811 return (cache_fpl_partial(fpl)); 3812 } 3813 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 3814 return (cache_fpl_partial(fpl)); 3815 } 3816 if (!neg_hot) { 3817 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 3818 } 3819 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 3820 ncp->nc_name); 3821 counter_u64_add(numneghits, 1); 3822 cache_fpl_smr_exit(fpl); 3823 return (cache_fpl_handled(fpl, ENOENT)); 3824 } 3825 3826 if (__predict_false(!cache_ncp_canuse(ncp))) { 3827 return (cache_fpl_partial(fpl)); 3828 } 3829 3830 fpl->tvp = tvp; 3831 fpl->tvp_seqc = vn_seqc_read_any(tvp); 3832 if (seqc_in_modify(fpl->tvp_seqc)) { 3833 return (cache_fpl_partial(fpl)); 3834 } 3835 3836 if (!cache_fplookup_vnode_supported(tvp)) { 3837 return (cache_fpl_partial(fpl)); 3838 } 3839 3840 counter_u64_add(numposhits, 1); 3841 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 3842 return (0); 3843 } 3844 3845 static bool 3846 cache_fplookup_mp_supported(struct mount *mp) 3847 { 3848 3849 if (mp == NULL) 3850 return (false); 3851 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 3852 return (false); 3853 return (true); 3854 } 3855 3856 /* 3857 * Walk up the mount stack (if any). 3858 * 3859 * Correctness is provided in the following ways: 3860 * - all vnodes are protected from freeing with SMR 3861 * - struct mount objects are type stable making them always safe to access 3862 * - stability of the particular mount is provided by busying it 3863 * - relationship between the vnode which is mounted on and the mount is 3864 * verified with the vnode sequence counter after busying 3865 * - association between root vnode of the mount and the mount is protected 3866 * by busy 3867 * 3868 * From that point on we can read the sequence counter of the root vnode 3869 * and get the next mount on the stack (if any) using the same protection. 3870 * 3871 * By the end of successful walk we are guaranteed the reached state was 3872 * indeed present at least at some point which matches the regular lookup. 
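 *
 * In outline (unbusying on failure and the prev_mp bookkeeping are omitted):
 *
 *	while ((mp = vp->v_mountedhere) != NULL) {
 *		vfs_op_thread_enter_crit(mp);		// pin the mount
 *		if (!vn_seqc_consistent(vp, vp_seqc))	// vp still covered by mp?
 *			return (cache_fpl_partial(fpl));
 *		vp = mp->mnt_rootvnode;			// hop onto the root vnode
 *		vp_seqc = vn_seqc_read_any(vp);		// and snapshot its counter
 *	}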
3873 */ 3874 static int __noinline 3875 cache_fplookup_climb_mount(struct cache_fpl *fpl) 3876 { 3877 struct mount *mp, *prev_mp; 3878 struct vnode *vp; 3879 seqc_t vp_seqc; 3880 3881 vp = fpl->tvp; 3882 vp_seqc = fpl->tvp_seqc; 3883 3884 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 3885 mp = atomic_load_ptr(&vp->v_mountedhere); 3886 if (mp == NULL) 3887 return (0); 3888 3889 prev_mp = NULL; 3890 for (;;) { 3891 if (!vfs_op_thread_enter_crit(mp)) { 3892 if (prev_mp != NULL) 3893 vfs_op_thread_exit_crit(prev_mp); 3894 return (cache_fpl_partial(fpl)); 3895 } 3896 if (prev_mp != NULL) 3897 vfs_op_thread_exit_crit(prev_mp); 3898 if (!vn_seqc_consistent(vp, vp_seqc)) { 3899 vfs_op_thread_exit_crit(mp); 3900 return (cache_fpl_partial(fpl)); 3901 } 3902 if (!cache_fplookup_mp_supported(mp)) { 3903 vfs_op_thread_exit_crit(mp); 3904 return (cache_fpl_partial(fpl)); 3905 } 3906 vp = atomic_load_ptr(&mp->mnt_rootvnode); 3907 if (vp == NULL || VN_IS_DOOMED(vp)) { 3908 vfs_op_thread_exit_crit(mp); 3909 return (cache_fpl_partial(fpl)); 3910 } 3911 vp_seqc = vn_seqc_read_any(vp); 3912 if (seqc_in_modify(vp_seqc)) { 3913 vfs_op_thread_exit_crit(mp); 3914 return (cache_fpl_partial(fpl)); 3915 } 3916 prev_mp = mp; 3917 mp = atomic_load_ptr(&vp->v_mountedhere); 3918 if (mp == NULL) 3919 break; 3920 } 3921 3922 vfs_op_thread_exit_crit(prev_mp); 3923 fpl->tvp = vp; 3924 fpl->tvp_seqc = vp_seqc; 3925 return (0); 3926 } 3927 3928 static bool 3929 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 3930 { 3931 struct mount *mp; 3932 struct vnode *vp; 3933 3934 vp = fpl->tvp; 3935 3936 /* 3937 * Hack: while this is a union, the pointer tends to be NULL so save on 3938 * a branch. 3939 */ 3940 mp = atomic_load_ptr(&vp->v_mountedhere); 3941 if (mp == NULL) 3942 return (false); 3943 if (vp->v_type == VDIR) 3944 return (true); 3945 return (false); 3946 } 3947 3948 /* 3949 * Parse the path. 3950 * 3951 * The code is mostly copy-pasted from regular lookup, see lookup(). 3952 * The structure is maintained along with comments for easier maintenance. 3953 * Deduplicating the code will become feasible after fast path lookup 3954 * becomes more feature-complete. 3955 */ 3956 static int 3957 cache_fplookup_parse(struct cache_fpl *fpl) 3958 { 3959 struct nameidata *ndp; 3960 struct componentname *cnp; 3961 char *cp; 3962 3963 ndp = fpl->ndp; 3964 cnp = fpl->cnp; 3965 3966 /* 3967 * Search a new directory. 3968 * 3969 * The last component of the filename is left accessible via 3970 * cnp->cn_nameptr for callers that need the name. Callers needing 3971 * the name set the SAVENAME flag. When done, they assume 3972 * responsibility for freeing the pathname buffer. 3973 */ 3974 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 3975 continue; 3976 cnp->cn_namelen = cp - cnp->cn_nameptr; 3977 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 3978 cache_fpl_smr_exit(fpl); 3979 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 3980 } 3981 ndp->ni_pathlen -= cnp->cn_namelen; 3982 KASSERT(ndp->ni_pathlen <= PATH_MAX, 3983 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 3984 ndp->ni_next = cp; 3985 3986 /* 3987 * Replace multiple slashes by a single slash and trailing slashes 3988 * by a null. This must be done before VOP_LOOKUP() because some 3989 * fs's don't know about trailing slashes. Remember if there were 3990 * trailing slashes to handle symlinks, existing non-directories 3991 * and non-existing files that won't be directories specially later. 
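 * Note that unlike the regular lookup the loop below never writes to the
 * buffer: consecutive slashes are merely skipped over and a trailing slash
 * makes the fast path punt to the regular lookup (see the TODO below).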
3992 */ 3993 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 3994 cp++; 3995 ndp->ni_pathlen--; 3996 if (*cp == '\0') { 3997 /* 3998 * TODO 3999 * Regular lookup performs the following: 4000 * *ndp->ni_next = '\0'; 4001 * cnp->cn_flags |= TRAILINGSLASH; 4002 * 4003 * Which is problematic since it modifies data read 4004 * from userspace. Then if fast path lookup was to 4005 * abort we would have to either restore it or convey 4006 * the flag. Since this is a corner case just ignore 4007 * it for simplicity. 4008 */ 4009 return (cache_fpl_partial(fpl)); 4010 } 4011 } 4012 ndp->ni_next = cp; 4013 4014 /* 4015 * Check for degenerate name (e.g. / or "") 4016 * which is a way of talking about a directory, 4017 * e.g. like "/." or ".". 4018 * 4019 * TODO 4020 * Another corner case handled by the regular lookup 4021 */ 4022 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 4023 return (cache_fpl_partial(fpl)); 4024 } 4025 return (0); 4026 } 4027 4028 static void 4029 cache_fplookup_parse_advance(struct cache_fpl *fpl) 4030 { 4031 struct nameidata *ndp; 4032 struct componentname *cnp; 4033 4034 ndp = fpl->ndp; 4035 cnp = fpl->cnp; 4036 4037 cnp->cn_nameptr = ndp->ni_next; 4038 while (*cnp->cn_nameptr == '/') { 4039 cnp->cn_nameptr++; 4040 ndp->ni_pathlen--; 4041 } 4042 } 4043 4044 static int __noinline 4045 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 4046 { 4047 4048 /* 4049 * Hack: they may be looking up foo/bar, where foo is a 4050 * regular file. In such a case we need to turn ENOTDIR, 4051 * but we may happen to get here with a different error. 4052 */ 4053 if (fpl->dvp->v_type != VDIR) { 4054 error = ENOTDIR; 4055 } 4056 4057 switch (error) { 4058 case EAGAIN: 4059 /* 4060 * Can happen when racing against vgone. 4061 * */ 4062 case EOPNOTSUPP: 4063 cache_fpl_partial(fpl); 4064 break; 4065 default: 4066 /* 4067 * See the API contract for VOP_FPLOOKUP_VEXEC. 
static int __noinline
cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
{

	/*
	 * Hack: they may be looking up foo/bar, where foo is a
	 * regular file. In such a case we need to return ENOTDIR,
	 * but we may happen to get here with a different error.
	 */
	if (fpl->dvp->v_type != VDIR) {
		error = ENOTDIR;
	}

	switch (error) {
	case EAGAIN:
		/*
		 * Can happen when racing against vgone.
		 */
	case EOPNOTSUPP:
		cache_fpl_partial(fpl);
		break;
	default:
		/*
		 * See the API contract for VOP_FPLOOKUP_VEXEC.
		 */
		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_smr_exit(fpl);
			cache_fpl_handled(fpl, error);
		}
		break;
	}
	return (error);
}

static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct mount *mp;
	int error;

	error = CACHE_FPL_FAILED;
	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cache_fpl_checkpoint(fpl, &fpl->snd);

	fpl->dvp = dvp;
	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
	if (seqc_in_modify(fpl->dvp_seqc)) {
		cache_fpl_aborted(fpl);
		goto out;
	}
	mp = atomic_load_ptr(&fpl->dvp->v_mount);
	if (!cache_fplookup_mp_supported(mp)) {
		cache_fpl_aborted(fpl);
		goto out;
	}

	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

	for (;;) {
		error = cache_fplookup_parse(fpl);
		if (__predict_false(error != 0)) {
			break;
		}

		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
		if (__predict_false(error != 0)) {
			error = cache_fplookup_failed_vexec(fpl, error);
			break;
		}

		if (__predict_false(cache_fpl_isdotdot(cnp))) {
			error = cache_fplookup_dotdot(fpl);
			if (__predict_false(error != 0)) {
				break;
			}
		} else {
			error = cache_fplookup_next(fpl);
			if (__predict_false(error != 0)) {
				break;
			}

			VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

			if (cache_fplookup_need_climb_mount(fpl)) {
				error = cache_fplookup_climb_mount(fpl);
				if (__predict_false(error != 0)) {
					break;
				}
			}
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		if (cache_fpl_islastcn(ndp)) {
			error = cache_fplookup_final(fpl);
			break;
		}

		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
			break;
		}

		fpl->dvp = fpl->tvp;
		fpl->dvp_seqc = fpl->tvp_seqc;

		cache_fplookup_parse_advance(fpl);
		cache_fpl_checkpoint(fpl, &fpl->snd);
	}
out:
	switch (fpl->status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		cache_fpl_smr_assert_entered(fpl);
		return (cache_fplookup_partial_setup(fpl));
	case CACHE_FPL_STATUS_ABORTED:
		if (fpl->in_smr)
			cache_fpl_smr_exit(fpl);
		return (CACHE_FPL_FAILED);
	case CACHE_FPL_STATUS_HANDLED:
		MPASS(error != CACHE_FPL_FAILED);
		cache_fpl_smr_assert_not_entered(fpl);
		if (__predict_false(error != 0)) {
			ndp->ni_dvp = NULL;
			ndp->ni_vp = NULL;
			cache_fpl_cleanup_cnp(cnp);
			return (error);
		}
		ndp->ni_dvp = fpl->dvp;
		ndp->ni_vp = fpl->tvp;
		if (cnp->cn_flags & SAVENAME)
			cnp->cn_flags |= HASBUF;
		else
			cache_fpl_cleanup_cnp(cnp);
		return (error);
	}
}
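
/*
 * To summarize the terminal states handled by the switch above:
 *
 *	HANDLED	- the lookup was serviced in full, either successfully or
 *		  with a definitive error (e.g. ENAMETOOLONG); SMR protection
 *		  was already dropped and the result is final
 *	PARTIAL	- a prefix of the path was resolved but the rest requires the
 *		  regular lookup; cache_fplookup_partial_setup() is called
 *		  while still within the SMR section to hand over state
 *	ABORTED	- a consistency check failed; CACHE_FPL_FAILED is returned so
 *		  that the caller can redo the entire lookup with the locked
 *		  variant
 */
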
/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
 * outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 *	vn_lock(current);
 *	for (;;) {
 *		next = find();
 *		vn_lock(next);
 *		vn_unlock(current);
 *		current = next;
 *		if (last)
 *			break;
 *	}
 *	return (current);
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
 * we made the jump. This includes things like permissions, mount points etc.
 * Counter modification is provided by enclosing relevant places in
 * vn_seqc_write_begin()/end() calls.
 *
 * Thus this translates to:
 *
 *	vfs_smr_enter();
 *	dvp_seqc = seqc_read_any(dvp);
 *	if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 *		abort();
 *	for (;;) {
 *		tvp = find();
 *		tvp_seqc = seqc_read_any(tvp);
 *		if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 *			abort();
 *		if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 *			abort();
 *		dvp = tvp; // we know nothing of importance has changed
 *		dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 *		if (last)
 *			break;
 *	}
 *	vget(); // secure the vnode
 *	if (!seqc_consistent(tvp, tvp_seqc)) // final check
 *		abort();
 *	// at this point we know nothing has changed for any parent<->child pair
 *	// as they were crossed during the lookup, meaning we matched the guarantee
 *	// of the locked variant
 *	return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection which they must never exit
 * - EAGAIN can be returned to denote checking could not be performed; it is
 *   always valid to return it
 * - if the sequence counter has not changed the result must be valid
 * - if the sequence counter has changed both false positives and false negatives
 *   are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *   (a rough sketch of such a routine follows this comment)
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning absent other means it
 *   should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 */
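
/*
 * For reference, a minimal sketch of what an opt-in filesystem's
 * VOP_FPLOOKUP_VEXEC routine might look like when plain POSIX permission
 * bits are all that matters. "foofs_node" and its fn_* fields are made up
 * for illustration; the relevant points are the SMR-safe fetch of v_data
 * and the use of vaccess_vexec_smr() mentioned in the contract above:
 *
 *	static int
 *	foofs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct vnode *vp = v->a_vp;
 *		struct foofs_node *node;
 *
 *		node = atomic_load_ptr(&vp->v_data);
 *		if (__predict_false(node == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(node->fn_mode, node->fn_uid,
 *		    node->fn_gid, v->a_cred));
 *	}
 *
 * The routine never leaves vfs_smr protection and merely returns EAGAIN
 * when the backing object is gone, leaving the decision to the caller as
 * permitted by the contract.
 */
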
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	struct nameidata_saved orig;
	int error;

	MPASS(ndp->ni_lcf == 0);

	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.ndp = ndp;
	fpl.cnp = &ndp->ni_cnd;
	MPASS(curthread == fpl.cnp->cn_thread);

	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
		MPASS(fpl.cnp->cn_nameiop != LOOKUP);

	if (!cache_can_fplookup(&fpl)) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		*status = fpl.status;
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint(&fpl, &orig);

	cache_fpl_smr_enter_initial(&fpl);
	pwd = pwd_get_smr();
	fpl.pwd = pwd;
	ndp->ni_rootdir = pwd->pwd_rdir;
	ndp->ni_topdir = pwd->pwd_jdir;

	cnp = fpl.cnp;
	cnp->cn_nameptr = cnp->cn_pnbuf;
	if (cnp->cn_pnbuf[0] == '/') {
		cache_fpl_handle_root(ndp, &dvp);
	} else {
		MPASS(ndp->ni_dirfd == AT_FDCWD);
		dvp = pwd->pwd_cdir;
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);

	error = cache_fplookup_impl(dvp, &fpl);
	cache_fpl_smr_assert_not_entered(&fpl);
	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);

	*status = fpl.status;
	switch (fpl.status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_HANDLED:
		SDT_PROBE3(vfs, namei, lookup, return, error,
		    (error == 0 ? ndp->ni_vp : NULL), true);
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		*pwdp = fpl.pwd;
		/*
		 * Status restored by cache_fplookup_partial_setup.
		 */
		break;
	case CACHE_FPL_STATUS_ABORTED:
		cache_fpl_restore(&fpl, &orig);
		break;
	}
	return (error);
}
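
/*
 * Roughly how a caller such as namei() consumes the result; heavily
 * simplified, the real code performs additional bookkeeping:
 *
 *	error = cache_fplookup(ndp, &status, &pwd);
 *	switch (status) {
 *	case CACHE_FPL_STATUS_HANDLED:
 *		// fully serviced by the fast path, error may still be set
 *		return (error);
 *	case CACHE_FPL_STATUS_PARTIAL:
 *		// continue the regular lookup from the point reached so far
 *		break;
 *	case CACHE_FPL_STATUS_ABORTED:
 *		// restart from scratch with the regular, locked lookup
 *		break;
 *	}
 */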