/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <sys/capsicum.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
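 *
 * A regular entry is simultaneously linked on a hash chain keyed on
 * (directory vnode, name), on the nc_src list of its directory vnode and,
 * for positive entries, on the nc_dst list of the target vnode.  Negative
 * entries reuse nc_dst to sit on one of the negative entry LRU lists
 * instead.  ".." entries are special-cased and are reachable through the
 * directory's v_cache_dd pointer.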
 */
struct negstate {
	u_char neg_flag;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
		struct negstate nu_neg;/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct	timespec nc_time;	/* timespec provided by fs */
	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned.  Since others
 * may be in the same spot suffer a little bit and enforce the
 * alignment for everyone.  Note this is a nop for 64-bit platforms.
 */
#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
#define	CACHE_PATH_CUTOFF	39

#define CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
#define CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

#define	nc_vp		n_un.nu_vp
#define	nc_neg		n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define	NEG_HOT		0x01

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
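 *
 * The release fence in cache_ncp_invalidate() paired with the acquire
 * fence here guarantees that a reader which copied the fields of an entry
 * and only then observed neither NCF_INVALID nor NCF_WIP has seen a fully
 * constructed and still valid entry.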
 */
static bool
cache_ncp_canuse(struct namecache *ncp)
{

	atomic_thread_fence_acq();
	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
}

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	mtx	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks. The locking
 * order is lower address first. Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be locked to provide safe operation against other threads modifying the
 * cache.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock. However, in the opposite case, this could
 * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 * the first node, locking everything in order and revalidating the state.
 */

VFS_SMR_DECLARE;

/*
 * Structures associated with name caching.
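 *
 * The hash table is an array of SMR-safe singly-linked lists indexed by a
 * hash of (directory vnode, name).  Bucket locks and vnode locks are kept
 * in separate arrays and are picked by hashing the relevant name or
 * pointer, see HASH2BUCKETLOCK() and VP2VNODELOCK() below.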
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");

static struct mtx __exclusive_cache_line	ncneg_shrink_lock;

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;
static struct neglist ncneg_hot;
static u_long numhotneg;

#define	ncneghash	3
#define	numneglists	(ncneghash + 1)
static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct mtx_padalign __read_mostly  *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
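 *
 * The dividing line is CACHE_PATH_CUTOFF bytes of name.  Separate "_ts"
 * zones hold entries which additionally carry timestamps, i.e. the ones
 * allocated as struct namecache_ts.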
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

static struct namecache *
cache_alloc(int len, int ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	MPASS(ncp != NULL);
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	*tsp = ncp_ts->nc_time;
	*ticksp = ncp_ts->nc_ticks;
}

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");
#define STATNODE_ULONG(name, descr)					\
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define STATNODE_COUNTER(name, descr)					\
	static COUNTER_U64_DEFINE_EARLY(name);				\
	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
	    descr);
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
hits"); 449 STATNODE_COUNTER(nummiss, "Number of cache misses"); 450 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 451 STATNODE_COUNTER(numposzaps, 452 "Number of cache hits (positive) we do not want to cache"); 453 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 454 STATNODE_COUNTER(numnegzaps, 455 "Number of cache hits (negative) we do not want to cache"); 456 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 457 /* These count for vn_getcwd(), too. */ 458 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 459 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 460 STATNODE_COUNTER(numfullpathfail2, 461 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 462 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 463 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 464 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 465 "Number of successful removals after relocking"); 466 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 467 "Number of times zap_and_exit failed to lock"); 468 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 469 "Number of times zap_and_exit failed to lock"); 470 static long cache_lock_vnodes_cel_3_failures; 471 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 472 "Number of times 3-way vnode locking failed"); 473 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 474 STATNODE_COUNTER(numneg_evicted, 475 "Number of negative entries evicted when adding a new entry"); 476 STATNODE_COUNTER(shrinking_skipped, 477 "Number of times shrinking was already in progress"); 478 479 static void cache_zap_locked(struct namecache *ncp); 480 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, 481 char **freebuf, size_t *buflen); 482 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 483 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend); 484 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, 485 char **retbuf, size_t *buflen); 486 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, 487 char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 488 489 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 490 491 static inline void 492 cache_assert_vlp_locked(struct mtx *vlp) 493 { 494 495 if (vlp != NULL) 496 mtx_assert(vlp, MA_OWNED); 497 } 498 499 static inline void 500 cache_assert_vnode_locked(struct vnode *vp) 501 { 502 struct mtx *vlp; 503 504 vlp = VP2VNODELOCK(vp); 505 cache_assert_vlp_locked(vlp); 506 } 507 508 /* 509 * TODO: With the value stored we can do better than computing the hash based 510 * on the address. The choice of FNV should also be revisited. 
 */
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}

static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct mtx *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_OWNED);
}

static void
cache_assert_bucket_unlocked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_NOTOWNED);
}
#else
#define cache_assert_bucket_locked(x) do { } while (0)
#define cache_assert_bucket_unlocked(x) do { } while (0)
#endif

#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_lock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_unlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * A variation of LRU scheme is used. New entries are hashed into one of
 * numneglists cold lists. Entries get promoted to the hot list on first hit.
 *
 * The shrinker will demote hot list head and evict from the cold list in a
 * round-robin manner.
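 *
 * numneg tracks the total number of negative entries.  When a new entry is
 * added and numneg * ncnegfactor exceeds the number of cache entries,
 * cache_enter_time() calls cache_negative_zap_one() to demote the head of
 * the hot list and evict one entry from a cold list.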
 */
static void
cache_negative_init(struct namecache *ncp)
{
	struct negstate *negstate;

	ncp->nc_flag |= NCF_NEGATIVE;
	negstate = NCP2NEGSTATE(ncp);
	negstate->neg_flag = 0;
}

static void
cache_negative_hit(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;

	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0)
		return;
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&ncneg_hot.nl_lock);
	mtx_lock(&neglist->nl_lock);
	if ((negstate->neg_flag & NEG_HOT) == 0) {
		numhotneg++;
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
		negstate->neg_flag |= NEG_HOT;
	}
	mtx_unlock(&neglist->nl_lock);
	mtx_unlock(&ncneg_hot.nl_lock);
}

static void
cache_negative_insert(struct namecache *ncp)
{
	struct neglist *neglist;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp);
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&neglist->nl_lock);
	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
	mtx_unlock(&neglist->nl_lock);
	atomic_add_long(&numneg, 1);
}

static void
cache_negative_remove(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;
	bool hot_locked = false;
	bool list_locked = false;

	cache_assert_bucket_locked(ncp);
	neglist = NCP2NEGLIST(ncp);
	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		hot_locked = true;
		mtx_lock(&ncneg_hot.nl_lock);
		if ((negstate->neg_flag & NEG_HOT) == 0) {
			list_locked = true;
			mtx_lock(&neglist->nl_lock);
		}
	} else {
		list_locked = true;
		mtx_lock(&neglist->nl_lock);
		/*
		 * We may be racing against promotion in lockless lookup.
		 */
		if ((negstate->neg_flag & NEG_HOT) != 0) {
			mtx_unlock(&neglist->nl_lock);
			hot_locked = true;
			mtx_lock(&ncneg_hot.nl_lock);
			mtx_lock(&neglist->nl_lock);
		}
	}
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		numhotneg--;
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
	}
	if (list_locked)
		mtx_unlock(&neglist->nl_lock);
	if (hot_locked)
		mtx_unlock(&ncneg_hot.nl_lock);
	atomic_subtract_long(&numneg, 1);
}

static void
cache_negative_shrink_select(struct namecache **ncpp,
    struct neglist **neglistpp)
{
	struct neglist *neglist;
	struct namecache *ncp;
	static u_int cycle;
	u_int i;

	*ncpp = ncp = NULL;

	for (i = 0; i < numneglists; i++) {
		neglist = &neglists[(cycle + i) % numneglists];
		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
			continue;
		mtx_lock(&neglist->nl_lock);
		ncp = TAILQ_FIRST(&neglist->nl_list);
		if (ncp != NULL)
			break;
		mtx_unlock(&neglist->nl_lock);
	}

	*neglistpp = neglist;
	*ncpp = ncp;
	cycle++;
}

static void
cache_negative_zap_one(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *neglist;
	struct negstate *negstate;
	struct mtx *dvlp;
	struct mtx *blp;

	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
	    !mtx_trylock(&ncneg_shrink_lock)) {
		counter_u64_add(shrinking_skipped, 1);
		return;
	}

	mtx_lock(&ncneg_hot.nl_lock);
	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
	if (ncp != NULL) {
		neglist = NCP2NEGLIST(ncp);
		negstate = NCP2NEGSTATE(ncp);
		mtx_lock(&neglist->nl_lock);
		MPASS((negstate->neg_flag & NEG_HOT) != 0);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
		negstate->neg_flag &= ~NEG_HOT;
		numhotneg--;
		mtx_unlock(&neglist->nl_lock);
	}
	mtx_unlock(&ncneg_hot.nl_lock);

	cache_negative_shrink_select(&ncp, &neglist);

	mtx_unlock(&ncneg_shrink_lock);
	if (ncp == NULL)
		return;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	blp = NCP2BUCKETLOCK(ncp);
	mtx_unlock(&neglist->nl_lock);
	mtx_lock(dvlp);
	mtx_lock(blp);
	/*
	 * Enter SMR to safely check the negative list.
	 * Even if the found pointer matches, the entry may now be reallocated
	 * and used by a different vnode.
	 */
	vfs_smr_enter();
	ncp2 = TAILQ_FIRST(&neglist->nl_list);
	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
	    blp != NCP2BUCKETLOCK(ncp2)) {
		vfs_smr_exit();
		ncp = NULL;
	} else {
		vfs_smr_exit();
		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(numneg_evicted, 1);
	}
	mtx_unlock(blp);
	mtx_unlock(dvlp);
	if (ncp != NULL)
		cache_free(ncp);
}

/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
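 *
 *   The caller must hold the vnode lock(s) covering nc_dvp (and nc_vp for
 *   positive entries) as well as the relevant bucket lock.  The entry is
 *   unlinked from the hash chain, the per-vnode lists (or the negative
 *   lists) and v_cache_dd; freeing it is left to the caller via
 *   cache_free().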
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_vp);
			ncp->nc_vp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_vp);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_negative_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_dvp);
			ncp->nc_dvp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_dvp);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			counter_u64_add(numcachehv, -1);
		}
	}
	atomic_subtract_long(&numcache, 1);
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct mtx *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

/*
 * If trylocking failed we can get here. We know enough to take all needed locks
 * in the right order and re-lookup the entry.
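 *
 * Returns 0 if the entry was found again and zapped, or EAGAIN if it
 * disappeared (or changed identity) while the locks were dropped, in which
 * case the caller is expected to retry.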
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct mtx *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_unlocked(ncp);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	mtx_lock(blp);
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
		return (0);
	}

	mtx_unlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct mtx *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	mtx_unlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct mtx *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			vn_seqc_write_begin(dvp);
			dvp->v_cache_dd = NULL;
			vn_seqc_write_end(dvp);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
		return (1);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (ncp == NULL) {
		mtx_unlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail++;
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
	cache_free(ncp);
	return (1);
out_no_entry:
	counter_u64_add(nummisszap, 1);
	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
	return (0);
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static int __noinline
cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct mtx *dvlp;
	enum vgetstate vs;
	int error, ltype;
	bool whiteout;

	MPASS((cnp->cn_flags & ISDOTDOT) != 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	counter_u64_add(dotdothits, 1);
retry:
	dvlp = VP2VNODELOCK(dvp);
	mtx_lock(dvlp);
	ncp = dvp->v_cache_dd;
	if (ncp == NULL) {
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
		mtx_unlock(dvlp);
		return (0);
	}
	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
		if (ncp->nc_flag & NCF_NEGATIVE)
			*vpp = NULL;
		else
			*vpp = ncp->nc_vp;
	} else
		*vpp = ncp->nc_dvp;
	if (*vpp == NULL)
		goto negative_success;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
	    NCF_DTS && tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		*tsp = ncp_ts->nc_dotdottime;
	}

	MPASS(dvp != *vpp);
	ltype = VOP_ISLOCKED(dvp);
	VOP_UNLOCK(dvp);
	vs = vget_prep(*vpp);
	mtx_unlock(dvlp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	vn_lock(dvp, ltype | LK_RETRY);
	if (VN_IS_DOOMED(dvp)) {
		if (error == 0)
			vput(*vpp);
		*vpp = NULL;
		return (ENOENT);
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			cache_zap_negative_locked_vnode_kl(ncp, dvp);
			mtx_unlock(dvlp);
			cache_free(ncp);
			return (0);
		}
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_negative_hit(ncp);
	mtx_unlock(dvlp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it will
 *		be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * Either both tsp and ticks have to be provided or neither of them.
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  vpp will not be modified.  If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
static int __noinline
cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct mtx *blp;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout;

	MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY);

retry:
	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		mtx_unlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	vs = vget_prep(*vpp);
	mtx_unlock(blp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
			if (__predict_false(error != 0)) {
				zap_and_exit_bucket_fail2++;
				goto retry;
			}
			cache_free(ncp);
			return (0);
		}
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_negative_hit(ncp);
	mtx_unlock(blp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct negstate *negstate;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout;
	u_short nc_flag;

	MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
			return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
	}

	MPASS((cnp->cn_flags & ISDOTDOT) == 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	vfs_smr_enter();

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	if (__predict_false(ncp == NULL)) {
		vfs_smr_exit();
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	nc_flag = atomic_load_char(&ncp->nc_flag);
	if (nc_flag & NCF_NEGATIVE)
		goto negative_success;

	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(dvp != *vpp);
	if (!cache_ncp_canuse(ncp)) {
		vfs_smr_exit();
		*vpp = NULL;
		goto out_fallback;
	}
	vs = vget_prep_smr(*vpp);
	vfs_smr_exit();
	if (__predict_false(vs == VGET_NONE)) {
		*vpp = NULL;
		goto out_fallback;
	}
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto out_fallback;
	}
	return (-1);
negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			vfs_smr_exit();
			goto out_fallback;
		}
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	/*
	 * TODO: We need to take locks to promote an entry. Code doing it
	 * in SMR lookup can be modified to be shared.
	 */
	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) == 0 ||
	    !cache_ncp_canuse(ncp)) {
		vfs_smr_exit();
		goto out_fallback;
	}
	vfs_smr_exit();
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
out_fallback:
	return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
}

struct celockstate {
	struct mtx *vlp[3];
	struct mtx *blp[2];
};
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));

static inline void
cache_celockstate_init(struct celockstate *cel)
{

	bzero(cel, sizeof(*cel));
}

static void
cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
    struct vnode *dvp)
{
	struct mtx *vlp1, *vlp2;

	MPASS(cel->vlp[0] == NULL);
	MPASS(cel->vlp[1] == NULL);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL || dvp != NULL);

	vlp1 = VP2VNODELOCK(vp);
	vlp2 = VP2VNODELOCK(dvp);
	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		mtx_lock(vlp1);
		cel->vlp[0] = vlp1;
	}
	mtx_lock(vlp2);
	cel->vlp[1] = vlp2;
}

static void
cache_unlock_vnodes_cel(struct celockstate *cel)
{

	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);

	if (cel->vlp[0] != NULL)
		mtx_unlock(cel->vlp[0]);
	if (cel->vlp[1] != NULL)
		mtx_unlock(cel->vlp[1]);
	if (cel->vlp[2] != NULL)
		mtx_unlock(cel->vlp[2]);
}

static bool
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
{
	struct mtx *vlp;
	bool ret;

	cache_assert_vlp_locked(cel->vlp[0]);
	cache_assert_vlp_locked(cel->vlp[1]);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL);
	vlp = VP2VNODELOCK(vp);

	ret = true;
	if (vlp >= cel->vlp[1]) {
		mtx_lock(vlp);
	} else {
		if (mtx_trylock(vlp))
			goto out;
		cache_lock_vnodes_cel_3_failures++;
		cache_unlock_vnodes_cel(cel);
		if (vlp < cel->vlp[0]) {
			mtx_lock(vlp);
			mtx_lock(cel->vlp[0]);
			mtx_lock(cel->vlp[1]);
		} else {
			if (cel->vlp[0] != NULL)
				mtx_lock(cel->vlp[0]);
			mtx_lock(vlp);
			mtx_lock(cel->vlp[1]);
		}
		ret = false;
	}
out:
	cel->vlp[2] = vlp;
	return (ret);
}

static void
cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
    struct mtx *blp2)
{

	MPASS(cel->blp[0] == NULL);
	MPASS(cel->blp[1] == NULL);

	cache_sort_vnodes(&blp1, &blp2);

	if (blp1 != NULL) {
		mtx_lock(blp1);
		cel->blp[0] = blp1;
	}
	mtx_lock(blp2);
	cel->blp[1] = blp2;
}

static void
cache_unlock_buckets_cel(struct celockstate *cel)
{

	if (cel->blp[0] != NULL)
		mtx_unlock(cel->blp[0]);
	mtx_unlock(cel->blp[1]);
}

/*
 * Lock part of the cache affected by the insertion.
 *
 * This means vnodelocks for dvp, vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry. In this
 * case we have an additional vnode and bucketlock pair to lock.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 * preserving the locking order (smaller address first).
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct mtx *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		if (vp == NULL || vp->v_type != VDIR)
			break;
		ncp = vp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == vp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		/*
		 * All vnodes got re-locked. Re-validate the state and if
		 * nothing changed we are done. Otherwise restart.
		 */
		if (ncp == vp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct mtx *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == dvp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		if (ncp == dvp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}

static void __noinline
cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp)
{
	struct celockstate cel;
	struct namecache *ncp;
	uint32_t hash;
	int len;

	if (dvp->v_cache_dd == NULL)
		return;
	len = cnp->cn_namelen;
	cache_celockstate_init(&cel);
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	cache_enter_lock_dd(&cel, dvp, vp, hash);
	vn_seqc_write_begin(dvp);
	ncp = dvp->v_cache_dd;
	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
		cache_zap_locked(ncp);
	} else {
		ncp = NULL;
	}
	dvp->v_cache_dd = NULL;
	vn_seqc_write_end(dvp);
	cache_enter_unlock(&cel);
	if (ncp != NULL)
		cache_free(ncp);
}

/*
 * Add an entry to the cache.
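 *
 * A positive entry links the name to vp; passing a NULL vp creates a
 * negative entry instead.  The new entry is hashed on (dvp, name), added
 * to dvp's v_cache_src list (unless it is a ".." entry) and to either
 * vp's v_cache_dst list or one of the negative lists.  If a conflicting
 * entry (same dvp and name) is already present, the new one is discarded.
 * ".." entries are tracked through dvp->v_cache_dd and may replace an
 * older ".." entry.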
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
	struct celockstate cel;
	struct namecache *ncp, *n2, *ndd;
	struct namecache_ts *ncp_ts;
	struct nchashhead *ncpp;
	uint32_t hash;
	int flag;
	int len;
	u_long lnumcache;

	VNPASS(!VN_IS_DOOMED(dvp), dvp);
	VNPASS(dvp->v_type != VNON, dvp);
	if (vp != NULL) {
		VNPASS(!VN_IS_DOOMED(vp), vp);
		VNPASS(vp->v_type != VNON, vp);
	}

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache))
		return;
#endif

	flag = 0;
	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return;
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			cache_enter_dotdot_prep(dvp, vp, cnp);
			flag = NCF_ISDOTDOT;
		}
	}

	/*
	 * Avoid blowout in namecache entries.
	 */
	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
	if (__predict_false(lnumcache >= ncsize)) {
		atomic_subtract_long(&numcache, 1);
		counter_u64_add(numdrops, 1);
		return;
	}

	cache_celockstate_init(&cel);
	ndd = NULL;
	ncp_ts = NULL;

	/*
	 * Calculate the hash key and setup as much of the new
	 * namecache entry as possible before acquiring the lock.
	 */
	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
	ncp->nc_flag = flag | NCF_WIP;
	ncp->nc_vp = vp;
	if (vp == NULL)
		cache_negative_init(ncp);
	ncp->nc_dvp = dvp;
	if (tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		ncp_ts->nc_time = *tsp;
		ncp_ts->nc_ticks = ticks;
		ncp_ts->nc_nc.nc_flag |= NCF_TS;
		if (dtsp != NULL) {
			ncp_ts->nc_dotdottime = *dtsp;
			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
		}
	}
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	memcpy(ncp->nc_name, cnp->cn_nameptr, len);
	ncp->nc_name[len] = '\0';
	cache_enter_lock(&cel, dvp, vp, hash);

	/*
	 * See if this vnode or negative entry is already in the cache
	 * with this name.  This can happen with concurrent lookups of
	 * the same path name.
	 */
	ncpp = NCHHASH(hash);
	CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
		if (n2->nc_dvp == dvp &&
		    n2->nc_nlen == cnp->cn_namelen &&
		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
			MPASS(cache_ncp_canuse(n2));
			if ((n2->nc_flag & NCF_NEGATIVE) != 0)
				KASSERT(vp == NULL,
				    ("%s: found entry pointing to a different vnode (%p != %p)",
				    __func__, NULL, vp));
			else
				KASSERT(n2->nc_vp == vp,
				    ("%s: found entry pointing to a different vnode (%p != %p)",
				    __func__, n2->nc_vp, vp));
			/*
			 * Entries are supposed to be immutable unless in the
			 * process of getting destroyed. Accommodating for
			 * changing timestamps is possible but not worth it.
			 * This should be harmless in terms of correctness, in
			 * the worst case resulting in an earlier expiration.
			 * Alternatively, the found entry can be replaced
			 * altogether.
1913 */ 1914 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); 1915 #if 0 1916 if (tsp != NULL) { 1917 KASSERT((n2->nc_flag & NCF_TS) != 0, 1918 ("no NCF_TS")); 1919 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1920 n2_ts->nc_time = ncp_ts->nc_time; 1921 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1922 if (dtsp != NULL) { 1923 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1924 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1925 } 1926 } 1927 #endif 1928 goto out_unlock_free; 1929 } 1930 } 1931 1932 if (flag == NCF_ISDOTDOT) { 1933 /* 1934 * See if we are trying to add .. entry, but some other lookup 1935 * has populated v_cache_dd pointer already. 1936 */ 1937 if (dvp->v_cache_dd != NULL) 1938 goto out_unlock_free; 1939 KASSERT(vp == NULL || vp->v_type == VDIR, 1940 ("wrong vnode type %p", vp)); 1941 vn_seqc_write_begin(dvp); 1942 dvp->v_cache_dd = ncp; 1943 vn_seqc_write_end(dvp); 1944 } 1945 1946 if (vp != NULL) { 1947 if (flag != NCF_ISDOTDOT) { 1948 /* 1949 * For this case, the cache entry maps both the 1950 * directory name in it and the name ".." for the 1951 * directory's parent. 1952 */ 1953 vn_seqc_write_begin(vp); 1954 if ((ndd = vp->v_cache_dd) != NULL) { 1955 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 1956 cache_zap_locked(ndd); 1957 else 1958 ndd = NULL; 1959 } 1960 vp->v_cache_dd = ncp; 1961 vn_seqc_write_end(vp); 1962 } else if (vp->v_type != VDIR) { 1963 if (vp->v_cache_dd != NULL) { 1964 vn_seqc_write_begin(vp); 1965 vp->v_cache_dd = NULL; 1966 vn_seqc_write_end(vp); 1967 } 1968 } 1969 } 1970 1971 if (flag != NCF_ISDOTDOT) { 1972 if (LIST_EMPTY(&dvp->v_cache_src)) { 1973 vhold(dvp); 1974 counter_u64_add(numcachehv, 1); 1975 } 1976 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 1977 } 1978 1979 /* 1980 * If the entry is "negative", we place it into the 1981 * "negative" cache queue, otherwise, we place it into the 1982 * destination vnode's cache entries queue. 1983 */ 1984 if (vp != NULL) { 1985 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 1986 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 1987 vp); 1988 } else { 1989 if (cnp->cn_flags & ISWHITEOUT) 1990 ncp->nc_flag |= NCF_WHITE; 1991 cache_negative_insert(ncp); 1992 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 1993 ncp->nc_name); 1994 } 1995 1996 /* 1997 * Insert the new namecache entry into the appropriate chain 1998 * within the cache entries table. 1999 */ 2000 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2001 2002 atomic_thread_fence_rel(); 2003 /* 2004 * Mark the entry as fully constructed. 2005 * It is immutable past this point until its removal. 
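 *
 * The atomic_thread_fence_rel() above guarantees that all stores filling in
 * the entry are visible before the NCF_WIP flag is cleared below, so a
 * lockless consumer which still observes NCF_WIP (see cache_ncp_canuse())
 * can simply treat the entry as not present yet.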
2006 */ 2007 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2008 2009 cache_enter_unlock(&cel); 2010 if (numneg * ncnegfactor > lnumcache) 2011 cache_negative_zap_one(); 2012 if (ndd != NULL) 2013 cache_free(ndd); 2014 return; 2015 out_unlock_free: 2016 cache_enter_unlock(&cel); 2017 atomic_subtract_long(&numcache, 1); 2018 cache_free(ncp); 2019 return; 2020 } 2021 2022 static u_int 2023 cache_roundup_2(u_int val) 2024 { 2025 u_int res; 2026 2027 for (res = 1; res <= val; res <<= 1) 2028 continue; 2029 2030 return (res); 2031 } 2032 2033 static struct nchashhead * 2034 nchinittbl(u_long elements, u_long *hashmask) 2035 { 2036 struct nchashhead *hashtbl; 2037 u_long hashsize, i; 2038 2039 hashsize = cache_roundup_2(elements) / 2; 2040 2041 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2042 for (i = 0; i < hashsize; i++) 2043 CK_SLIST_INIT(&hashtbl[i]); 2044 *hashmask = hashsize - 1; 2045 return (hashtbl); 2046 } 2047 2048 static void 2049 ncfreetbl(struct nchashhead *hashtbl) 2050 { 2051 2052 free(hashtbl, M_VFSCACHE); 2053 } 2054 2055 /* 2056 * Name cache initialization, from vfs_init() when we are booting 2057 */ 2058 static void 2059 nchinit(void *dummy __unused) 2060 { 2061 u_int i; 2062 2063 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, 2064 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2065 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, 2066 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2067 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, 2068 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2069 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, 2070 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); 2071 2072 VFS_SMR_ZONE_SET(cache_zone_small); 2073 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2074 VFS_SMR_ZONE_SET(cache_zone_large); 2075 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2076 2077 ncsize = desiredvnodes * ncsizefactor; 2078 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2079 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2080 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2081 ncbuckethash = 7; 2082 if (ncbuckethash > nchash) 2083 ncbuckethash = nchash; 2084 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2085 M_WAITOK | M_ZERO); 2086 for (i = 0; i < numbucketlocks; i++) 2087 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); 2088 ncvnodehash = ncbuckethash; 2089 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2090 M_WAITOK | M_ZERO); 2091 for (i = 0; i < numvnodelocks; i++) 2092 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2093 2094 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 2095 M_WAITOK | M_ZERO); 2096 for (i = 0; i < numneglists; i++) { 2097 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2098 TAILQ_INIT(&neglists[i].nl_list); 2099 } 2100 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 2101 TAILQ_INIT(&ncneg_hot.nl_list); 2102 2103 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2104 } 2105 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2106 2107 void 2108 cache_vnode_init(struct vnode *vp) 2109 { 2110 2111 LIST_INIT(&vp->v_cache_src); 2112 TAILQ_INIT(&vp->v_cache_dst); 2113 vp->v_cache_dd = NULL; 2114 cache_prehash(vp); 2115 } 2116 2117 void 2118 cache_changesize(u_long newmaxvnodes) 2119 { 2120 struct 
nchashhead *new_nchashtbl, *old_nchashtbl; 2121 u_long new_nchash, old_nchash; 2122 struct namecache *ncp; 2123 uint32_t hash; 2124 u_long newncsize; 2125 int i; 2126 2127 newncsize = newmaxvnodes * ncsizefactor; 2128 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2129 if (newmaxvnodes < numbucketlocks) 2130 newmaxvnodes = numbucketlocks; 2131 2132 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2133 /* If same hash table size, nothing to do */ 2134 if (nchash == new_nchash) { 2135 ncfreetbl(new_nchashtbl); 2136 return; 2137 } 2138 /* 2139 * Move everything from the old hash table to the new table. 2140 * None of the namecache entries in the table can be removed 2141 * because to do so, they have to be removed from the hash table. 2142 */ 2143 cache_lock_all_vnodes(); 2144 cache_lock_all_buckets(); 2145 old_nchashtbl = nchashtbl; 2146 old_nchash = nchash; 2147 nchashtbl = new_nchashtbl; 2148 nchash = new_nchash; 2149 for (i = 0; i <= old_nchash; i++) { 2150 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2151 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2152 ncp->nc_dvp); 2153 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2154 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2155 } 2156 } 2157 ncsize = newncsize; 2158 cache_unlock_all_buckets(); 2159 cache_unlock_all_vnodes(); 2160 ncfreetbl(old_nchashtbl); 2161 } 2162 2163 /* 2164 * Invalidate all entries from and to a particular vnode. 2165 */ 2166 static void 2167 cache_purge_impl(struct vnode *vp) 2168 { 2169 TAILQ_HEAD(, namecache) ncps; 2170 struct namecache *ncp, *nnp; 2171 struct mtx *vlp, *vlp2; 2172 2173 TAILQ_INIT(&ncps); 2174 vlp = VP2VNODELOCK(vp); 2175 vlp2 = NULL; 2176 mtx_lock(vlp); 2177 retry: 2178 while (!LIST_EMPTY(&vp->v_cache_src)) { 2179 ncp = LIST_FIRST(&vp->v_cache_src); 2180 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2181 goto retry; 2182 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2183 } 2184 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2185 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2186 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2187 goto retry; 2188 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2189 } 2190 ncp = vp->v_cache_dd; 2191 if (ncp != NULL) { 2192 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2193 ("lost dotdot link")); 2194 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2195 goto retry; 2196 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2197 } 2198 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2199 mtx_unlock(vlp); 2200 if (vlp2 != NULL) 2201 mtx_unlock(vlp2); 2202 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2203 cache_free(ncp); 2204 } 2205 } 2206 2207 /* 2208 * Opportunistic check to see if there is anything to do. 2209 */ 2210 static bool 2211 cache_has_entries(struct vnode *vp) 2212 { 2213 2214 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2215 vp->v_cache_dd == NULL) 2216 return (false); 2217 return (true); 2218 } 2219 2220 void 2221 cache_purge(struct vnode *vp) 2222 { 2223 2224 SDT_PROBE1(vfs, namecache, purge, done, vp); 2225 if (!cache_has_entries(vp)) 2226 return; 2227 cache_purge_impl(vp); 2228 } 2229 2230 /* 2231 * Only to be used by vgone. 2232 */ 2233 void 2234 cache_purge_vgone(struct vnode *vp) 2235 { 2236 struct mtx *vlp; 2237 2238 VNPASS(VN_IS_DOOMED(vp), vp); 2239 if (cache_has_entries(vp)) { 2240 cache_purge_impl(vp); 2241 return; 2242 } 2243 2244 /* 2245 * Serialize against a potential thread doing cache_purge. 
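	 *
	 * The vnode is doomed, so no new entries can legitimately appear for
	 * it (cache_enter_time() asserts against doomed vnodes).  Waiting for
	 * VP2VNODELOCK(vp) to be released is therefore enough to let a
	 * concurrent cache_purge(), which runs with that lock held, finish
	 * before the re-check below.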
2246 */ 2247 vlp = VP2VNODELOCK(vp); 2248 mtx_wait_unlocked(vlp); 2249 if (cache_has_entries(vp)) { 2250 cache_purge_impl(vp); 2251 return; 2252 } 2253 return; 2254 } 2255 2256 /* 2257 * Invalidate all negative entries for a particular directory vnode. 2258 */ 2259 void 2260 cache_purge_negative(struct vnode *vp) 2261 { 2262 TAILQ_HEAD(, namecache) ncps; 2263 struct namecache *ncp, *nnp; 2264 struct mtx *vlp; 2265 2266 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2267 if (LIST_EMPTY(&vp->v_cache_src)) 2268 return; 2269 TAILQ_INIT(&ncps); 2270 vlp = VP2VNODELOCK(vp); 2271 mtx_lock(vlp); 2272 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2273 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2274 continue; 2275 cache_zap_negative_locked_vnode_kl(ncp, vp); 2276 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2277 } 2278 mtx_unlock(vlp); 2279 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2280 cache_free(ncp); 2281 } 2282 } 2283 2284 void 2285 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, 2286 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) 2287 { 2288 2289 ASSERT_VOP_IN_SEQC(fdvp); 2290 ASSERT_VOP_IN_SEQC(fvp); 2291 ASSERT_VOP_IN_SEQC(tdvp); 2292 if (tvp != NULL) 2293 ASSERT_VOP_IN_SEQC(tvp); 2294 2295 cache_purge(fvp); 2296 if (tvp != NULL) { 2297 cache_purge(tvp); 2298 KASSERT(!cache_remove_cnp(tdvp, tcnp), 2299 ("%s: lingering negative entry", __func__)); 2300 } else { 2301 cache_remove_cnp(tdvp, tcnp); 2302 } 2303 } 2304 2305 /* 2306 * Flush all entries referencing a particular filesystem. 2307 */ 2308 void 2309 cache_purgevfs(struct mount *mp) 2310 { 2311 struct vnode *vp, *mvp; 2312 2313 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2314 /* 2315 * Somewhat wasteful iteration over all vnodes. Would be better to 2316 * support filtering and avoid the interlock to begin with. 2317 */ 2318 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2319 if (!cache_has_entries(vp)) { 2320 VI_UNLOCK(vp); 2321 continue; 2322 } 2323 vholdl(vp); 2324 VI_UNLOCK(vp); 2325 cache_purge(vp); 2326 vdrop(vp); 2327 } 2328 } 2329 2330 /* 2331 * Perform canonical checks and cache lookup and pass on to filesystem 2332 * through the vop_cachedlookup only if needed. 2333 */ 2334 2335 int 2336 vfs_cache_lookup(struct vop_lookup_args *ap) 2337 { 2338 struct vnode *dvp; 2339 int error; 2340 struct vnode **vpp = ap->a_vpp; 2341 struct componentname *cnp = ap->a_cnp; 2342 int flags = cnp->cn_flags; 2343 2344 *vpp = NULL; 2345 dvp = ap->a_dvp; 2346 2347 if (dvp->v_type != VDIR) 2348 return (ENOTDIR); 2349 2350 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2351 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2352 return (EROFS); 2353 2354 error = vn_dir_check_exec(dvp, cnp); 2355 if (error != 0) 2356 return (error); 2357 2358 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2359 if (error == 0) 2360 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2361 if (error == -1) 2362 return (0); 2363 return (error); 2364 } 2365 2366 /* Implementation of the getcwd syscall. 
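 *
 * sys___getcwd() below clamps the buffer size, allocates a temporary buffer
 * and copies the result out to userspace; the actual resolution happens in
 * vn_getcwd(), which first attempts the lockless vn_fullpath_any_smr() walk
 * and only falls back to the locked vn_fullpath_any() when that reports
 * failure with a negative value.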
*/ 2367 int 2368 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2369 { 2370 char *buf, *retbuf; 2371 size_t buflen; 2372 int error; 2373 2374 buflen = uap->buflen; 2375 if (__predict_false(buflen < 2)) 2376 return (EINVAL); 2377 if (buflen > MAXPATHLEN) 2378 buflen = MAXPATHLEN; 2379 2380 buf = uma_zalloc(namei_zone, M_WAITOK); 2381 error = vn_getcwd(buf, &retbuf, &buflen); 2382 if (error == 0) 2383 error = copyout(retbuf, uap->buf, buflen); 2384 uma_zfree(namei_zone, buf); 2385 return (error); 2386 } 2387 2388 int 2389 vn_getcwd(char *buf, char **retbuf, size_t *buflen) 2390 { 2391 struct pwd *pwd; 2392 int error; 2393 2394 vfs_smr_enter(); 2395 pwd = pwd_get_smr(); 2396 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, 2397 buflen, false, 0); 2398 VFS_SMR_ASSERT_NOT_ENTERED(); 2399 if (error < 0) { 2400 pwd = pwd_hold(curthread); 2401 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, 2402 retbuf, buflen); 2403 pwd_drop(pwd); 2404 } 2405 2406 #ifdef KTRACE 2407 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2408 ktrnamei(*retbuf); 2409 #endif 2410 return (error); 2411 } 2412 2413 static int 2414 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2415 size_t size, int flags, enum uio_seg pathseg) 2416 { 2417 struct nameidata nd; 2418 char *retbuf, *freebuf; 2419 int error; 2420 2421 if (flags != 0) 2422 return (EINVAL); 2423 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2424 pathseg, path, fd, &cap_fstat_rights, td); 2425 if ((error = namei(&nd)) != 0) 2426 return (error); 2427 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); 2428 if (error == 0) { 2429 error = copyout(retbuf, buf, size); 2430 free(freebuf, M_TEMP); 2431 } 2432 NDFREE(&nd, 0); 2433 return (error); 2434 } 2435 2436 int 2437 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2438 { 2439 2440 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2441 uap->flags, UIO_USERSPACE)); 2442 } 2443 2444 /* 2445 * Retrieve the full filesystem path that correspond to a vnode from the name 2446 * cache (if available) 2447 */ 2448 int 2449 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) 2450 { 2451 struct pwd *pwd; 2452 char *buf; 2453 size_t buflen; 2454 int error; 2455 2456 if (__predict_false(vp == NULL)) 2457 return (EINVAL); 2458 2459 buflen = MAXPATHLEN; 2460 buf = malloc(buflen, M_TEMP, M_WAITOK); 2461 vfs_smr_enter(); 2462 pwd = pwd_get_smr(); 2463 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0); 2464 VFS_SMR_ASSERT_NOT_ENTERED(); 2465 if (error < 0) { 2466 pwd = pwd_hold(curthread); 2467 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); 2468 pwd_drop(pwd); 2469 } 2470 if (error == 0) 2471 *freebuf = buf; 2472 else 2473 free(buf, M_TEMP); 2474 return (error); 2475 } 2476 2477 /* 2478 * This function is similar to vn_fullpath, but it attempts to lookup the 2479 * pathname relative to the global root mount point. This is required for the 2480 * auditing sub-system, as audited pathnames must be absolute, relative to the 2481 * global root mount point. 
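 *
 * As with vn_fullpath(), on success *retbuf points into a temporary buffer
 * returned via *freebuf which the caller must release.  A minimal usage
 * sketch (hypothetical caller):
 *
 *	char *fullpath, *freepath;
 *
 *	if (vn_fullpath_global(vp, &fullpath, &freepath) == 0) {
 *		printf("%s\n", fullpath);
 *		free(freepath, M_TEMP);
 *	}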
2482 */ 2483 int 2484 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) 2485 { 2486 char *buf; 2487 size_t buflen; 2488 int error; 2489 2490 if (__predict_false(vp == NULL)) 2491 return (EINVAL); 2492 buflen = MAXPATHLEN; 2493 buf = malloc(buflen, M_TEMP, M_WAITOK); 2494 vfs_smr_enter(); 2495 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0); 2496 VFS_SMR_ASSERT_NOT_ENTERED(); 2497 if (error < 0) { 2498 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); 2499 } 2500 if (error == 0) 2501 *freebuf = buf; 2502 else 2503 free(buf, M_TEMP); 2504 return (error); 2505 } 2506 2507 static struct namecache * 2508 vn_dd_from_dst(struct vnode *vp) 2509 { 2510 struct namecache *ncp; 2511 2512 cache_assert_vnode_locked(vp); 2513 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { 2514 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2515 return (ncp); 2516 } 2517 return (NULL); 2518 } 2519 2520 int 2521 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2522 { 2523 struct vnode *dvp; 2524 struct namecache *ncp; 2525 struct mtx *vlp; 2526 int error; 2527 2528 vlp = VP2VNODELOCK(*vp); 2529 mtx_lock(vlp); 2530 ncp = (*vp)->v_cache_dd; 2531 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { 2532 KASSERT(ncp == vn_dd_from_dst(*vp), 2533 ("%s: mismatch for dd entry (%p != %p)", __func__, 2534 ncp, vn_dd_from_dst(*vp))); 2535 } else { 2536 ncp = vn_dd_from_dst(*vp); 2537 } 2538 if (ncp != NULL) { 2539 if (*buflen < ncp->nc_nlen) { 2540 mtx_unlock(vlp); 2541 vrele(*vp); 2542 counter_u64_add(numfullpathfail4, 1); 2543 error = ENOMEM; 2544 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2545 vp, NULL); 2546 return (error); 2547 } 2548 *buflen -= ncp->nc_nlen; 2549 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2550 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2551 ncp->nc_name, vp); 2552 dvp = *vp; 2553 *vp = ncp->nc_dvp; 2554 vref(*vp); 2555 mtx_unlock(vlp); 2556 vrele(dvp); 2557 return (0); 2558 } 2559 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2560 2561 mtx_unlock(vlp); 2562 vn_lock(*vp, LK_SHARED | LK_RETRY); 2563 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2564 vput(*vp); 2565 if (error) { 2566 counter_u64_add(numfullpathfail2, 1); 2567 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2568 return (error); 2569 } 2570 2571 *vp = dvp; 2572 if (VN_IS_DOOMED(dvp)) { 2573 /* forced unmount */ 2574 vrele(dvp); 2575 error = ENOENT; 2576 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2577 return (error); 2578 } 2579 /* 2580 * *vp has its use count incremented still. 2581 */ 2582 2583 return (0); 2584 } 2585 2586 /* 2587 * Resolve a directory to a pathname. 2588 * 2589 * The name of the directory can always be found in the namecache or fetched 2590 * from the filesystem. There is also guaranteed to be only one parent, meaning 2591 * we can just follow vnodes up until we find the root. 2592 * 2593 * The vnode must be referenced. 
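 *
 * The buffer is populated from the end towards the beginning: each iteration
 * below prepends one "/<name>" component obtained from the cache (or from
 * VOP_VPTOCNP()) and swaps the vnode for its referenced parent, so on success
 * *retbuf points into the middle of the caller-supplied buffer rather than at
 * its start.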
2594 */ 2595 static int 2596 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2597 size_t *len, bool slash_prefixed, size_t addend) 2598 { 2599 #ifdef KDTRACE_HOOKS 2600 struct vnode *startvp = vp; 2601 #endif 2602 struct vnode *vp1; 2603 size_t buflen; 2604 int error; 2605 2606 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2607 VNPASS(vp->v_usecount > 0, vp); 2608 2609 buflen = *len; 2610 2611 if (!slash_prefixed) { 2612 MPASS(*len >= 2); 2613 buflen--; 2614 buf[buflen] = '\0'; 2615 } 2616 2617 error = 0; 2618 2619 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2620 counter_u64_add(numfullpathcalls, 1); 2621 while (vp != rdir && vp != rootvnode) { 2622 /* 2623 * The vp vnode must be already fully constructed, 2624 * since it is either found in namecache or obtained 2625 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2626 * without obtaining the vnode lock. 2627 */ 2628 if ((vp->v_vflag & VV_ROOT) != 0) { 2629 vn_lock(vp, LK_RETRY | LK_SHARED); 2630 2631 /* 2632 * With the vnode locked, check for races with 2633 * unmount, forced or not. Note that we 2634 * already verified that vp is not equal to 2635 * the root vnode, which means that 2636 * mnt_vnodecovered can be NULL only for the 2637 * case of unmount. 2638 */ 2639 if (VN_IS_DOOMED(vp) || 2640 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2641 vp1->v_mountedhere != vp->v_mount) { 2642 vput(vp); 2643 error = ENOENT; 2644 SDT_PROBE3(vfs, namecache, fullpath, return, 2645 error, vp, NULL); 2646 break; 2647 } 2648 2649 vref(vp1); 2650 vput(vp); 2651 vp = vp1; 2652 continue; 2653 } 2654 if (vp->v_type != VDIR) { 2655 vrele(vp); 2656 counter_u64_add(numfullpathfail1, 1); 2657 error = ENOTDIR; 2658 SDT_PROBE3(vfs, namecache, fullpath, return, 2659 error, vp, NULL); 2660 break; 2661 } 2662 error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen); 2663 if (error) 2664 break; 2665 if (buflen == 0) { 2666 vrele(vp); 2667 error = ENOMEM; 2668 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2669 startvp, NULL); 2670 break; 2671 } 2672 buf[--buflen] = '/'; 2673 slash_prefixed = true; 2674 } 2675 if (error) 2676 return (error); 2677 if (!slash_prefixed) { 2678 if (buflen == 0) { 2679 vrele(vp); 2680 counter_u64_add(numfullpathfail4, 1); 2681 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2682 startvp, NULL); 2683 return (ENOMEM); 2684 } 2685 buf[--buflen] = '/'; 2686 } 2687 counter_u64_add(numfullpathfound, 1); 2688 vrele(vp); 2689 2690 *retbuf = buf + buflen; 2691 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2692 *len -= buflen; 2693 *len += addend; 2694 return (0); 2695 } 2696 2697 /* 2698 * Resolve an arbitrary vnode to a pathname. 
2699 * 2700 * Note 2 caveats: 2701 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2702 * resolve to a different path than the one used to find it 2703 * - namecache is not mandatory, meaning names are not guaranteed to be added 2704 * (in which case resolving fails) 2705 */ 2706 static void __inline 2707 cache_rev_failed_impl(int *reason, int line) 2708 { 2709 2710 *reason = line; 2711 } 2712 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) 2713 2714 static int 2715 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, 2716 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend) 2717 { 2718 #ifdef KDTRACE_HOOKS 2719 struct vnode *startvp = vp; 2720 #endif 2721 struct vnode *tvp; 2722 struct mount *mp; 2723 struct namecache *ncp; 2724 size_t orig_buflen; 2725 int reason; 2726 int error; 2727 #ifdef KDTRACE_HOOKS 2728 int i; 2729 #endif 2730 seqc_t vp_seqc, tvp_seqc; 2731 u_char nc_flag; 2732 2733 VFS_SMR_ASSERT_ENTERED(); 2734 2735 if (!cache_fast_revlookup) { 2736 vfs_smr_exit(); 2737 return (-1); 2738 } 2739 2740 orig_buflen = *buflen; 2741 2742 if (!slash_prefixed) { 2743 MPASS(*buflen >= 2); 2744 *buflen -= 1; 2745 buf[*buflen] = '\0'; 2746 } 2747 2748 if (vp == rdir || vp == rootvnode) { 2749 if (!slash_prefixed) { 2750 *buflen -= 1; 2751 buf[*buflen] = '/'; 2752 } 2753 goto out_ok; 2754 } 2755 2756 #ifdef KDTRACE_HOOKS 2757 i = 0; 2758 #endif 2759 error = -1; 2760 ncp = NULL; /* for sdt probe down below */ 2761 vp_seqc = vn_seqc_read_any(vp); 2762 if (seqc_in_modify(vp_seqc)) { 2763 cache_rev_failed(&reason); 2764 goto out_abort; 2765 } 2766 2767 for (;;) { 2768 #ifdef KDTRACE_HOOKS 2769 i++; 2770 #endif 2771 if ((vp->v_vflag & VV_ROOT) != 0) { 2772 mp = atomic_load_ptr(&vp->v_mount); 2773 if (mp == NULL) { 2774 cache_rev_failed(&reason); 2775 goto out_abort; 2776 } 2777 tvp = atomic_load_ptr(&mp->mnt_vnodecovered); 2778 tvp_seqc = vn_seqc_read_any(tvp); 2779 if (seqc_in_modify(tvp_seqc)) { 2780 cache_rev_failed(&reason); 2781 goto out_abort; 2782 } 2783 if (!vn_seqc_consistent(vp, vp_seqc)) { 2784 cache_rev_failed(&reason); 2785 goto out_abort; 2786 } 2787 vp = tvp; 2788 vp_seqc = tvp_seqc; 2789 continue; 2790 } 2791 ncp = atomic_load_ptr(&vp->v_cache_dd); 2792 if (ncp == NULL) { 2793 cache_rev_failed(&reason); 2794 goto out_abort; 2795 } 2796 nc_flag = atomic_load_char(&ncp->nc_flag); 2797 if ((nc_flag & NCF_ISDOTDOT) != 0) { 2798 cache_rev_failed(&reason); 2799 goto out_abort; 2800 } 2801 if (!cache_ncp_canuse(ncp)) { 2802 cache_rev_failed(&reason); 2803 goto out_abort; 2804 } 2805 if (ncp->nc_nlen >= *buflen) { 2806 cache_rev_failed(&reason); 2807 error = ENOMEM; 2808 goto out_abort; 2809 } 2810 *buflen -= ncp->nc_nlen; 2811 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2812 *buflen -= 1; 2813 buf[*buflen] = '/'; 2814 tvp = ncp->nc_dvp; 2815 tvp_seqc = vn_seqc_read_any(tvp); 2816 if (seqc_in_modify(tvp_seqc)) { 2817 cache_rev_failed(&reason); 2818 goto out_abort; 2819 } 2820 if (!vn_seqc_consistent(vp, vp_seqc)) { 2821 cache_rev_failed(&reason); 2822 goto out_abort; 2823 } 2824 vp = tvp; 2825 vp_seqc = tvp_seqc; 2826 if (vp == rdir || vp == rootvnode) 2827 break; 2828 } 2829 out_ok: 2830 vfs_smr_exit(); 2831 *retbuf = buf + *buflen; 2832 *buflen = orig_buflen - *buflen + addend; 2833 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); 2834 return (0); 2835 2836 out_abort: 2837 *buflen = orig_buflen; 2838 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); 2839 
vfs_smr_exit(); 2840 return (error); 2841 } 2842 2843 static int 2844 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, 2845 size_t *buflen) 2846 { 2847 size_t orig_buflen; 2848 bool slash_prefixed; 2849 int error; 2850 2851 if (*buflen < 2) 2852 return (EINVAL); 2853 2854 orig_buflen = *buflen; 2855 2856 vref(vp); 2857 slash_prefixed = false; 2858 if (vp->v_type != VDIR) { 2859 *buflen -= 1; 2860 buf[*buflen] = '\0'; 2861 error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen); 2862 if (error) 2863 return (error); 2864 if (*buflen == 0) { 2865 vrele(vp); 2866 return (ENOMEM); 2867 } 2868 *buflen -= 1; 2869 buf[*buflen] = '/'; 2870 slash_prefixed = true; 2871 } 2872 2873 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed, 2874 orig_buflen - *buflen)); 2875 } 2876 2877 /* 2878 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2879 * 2880 * Since the namecache does not track hardlinks, the caller is expected to first 2881 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 2882 * 2883 * Then we have 2 cases: 2884 * - if the found vnode is a directory, the path can be constructed just by 2885 * following names up the chain 2886 * - otherwise we populate the buffer with the saved name and start resolving 2887 * from the parent 2888 */ 2889 static int 2890 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, 2891 size_t *buflen) 2892 { 2893 char *buf, *tmpbuf; 2894 struct pwd *pwd; 2895 struct componentname *cnp; 2896 struct vnode *vp; 2897 size_t addend; 2898 int error; 2899 bool slash_prefixed; 2900 enum vtype type; 2901 2902 if (*buflen < 2) 2903 return (EINVAL); 2904 if (*buflen > MAXPATHLEN) 2905 *buflen = MAXPATHLEN; 2906 2907 slash_prefixed = false; 2908 2909 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2910 2911 addend = 0; 2912 vp = ndp->ni_vp; 2913 /* 2914 * Check for VBAD to work around the vp_crossmp bug in lookup(). 2915 * 2916 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be 2917 * set to mount point's root vnode while ni_dvp will be vp_crossmp. 2918 * If the type is VDIR (like in this very case) we can skip looking 2919 * at ni_dvp in the first place. However, since vnodes get passed here 2920 * unlocked the target may transition to doomed state (type == VBAD) 2921 * before we get to evaluate the condition. If this happens, we will 2922 * populate part of the buffer and descend to vn_fullpath_dir with 2923 * vp == vp_crossmp. Prevent the problem by checking for VBAD. 2924 * 2925 * This should be atomic_load(&vp->v_type) but it is illegal to take 2926 * an address of a bit field, even if said field is sized to char. 2927 * Work around the problem by reading the value into a full-sized enum 2928 * and then re-reading it with atomic_load which will still prevent 2929 * the compiler from re-reading down the road.
2930 */ 2931 type = vp->v_type; 2932 type = atomic_load_int(&type); 2933 if (type == VBAD) { 2934 error = ENOENT; 2935 goto out_bad; 2936 } 2937 if (type != VDIR) { 2938 cnp = &ndp->ni_cnd; 2939 addend = cnp->cn_namelen + 2; 2940 if (*buflen < addend) { 2941 error = ENOMEM; 2942 goto out_bad; 2943 } 2944 *buflen -= addend; 2945 tmpbuf = buf + *buflen; 2946 tmpbuf[0] = '/'; 2947 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 2948 tmpbuf[addend - 1] = '\0'; 2949 slash_prefixed = true; 2950 vp = ndp->ni_dvp; 2951 } 2952 2953 vfs_smr_enter(); 2954 pwd = pwd_get_smr(); 2955 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, 2956 slash_prefixed, addend); 2957 VFS_SMR_ASSERT_NOT_ENTERED(); 2958 if (error < 0) { 2959 pwd = pwd_hold(curthread); 2960 vref(vp); 2961 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, 2962 slash_prefixed, addend); 2963 pwd_drop(pwd); 2964 if (error != 0) 2965 goto out_bad; 2966 } 2967 2968 *freebuf = buf; 2969 2970 return (0); 2971 out_bad: 2972 free(buf, M_TEMP); 2973 return (error); 2974 } 2975 2976 struct vnode * 2977 vn_dir_dd_ino(struct vnode *vp) 2978 { 2979 struct namecache *ncp; 2980 struct vnode *ddvp; 2981 struct mtx *vlp; 2982 enum vgetstate vs; 2983 2984 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 2985 vlp = VP2VNODELOCK(vp); 2986 mtx_lock(vlp); 2987 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 2988 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 2989 continue; 2990 ddvp = ncp->nc_dvp; 2991 vs = vget_prep(ddvp); 2992 mtx_unlock(vlp); 2993 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 2994 return (NULL); 2995 return (ddvp); 2996 } 2997 mtx_unlock(vlp); 2998 return (NULL); 2999 } 3000 3001 int 3002 vn_commname(struct vnode *vp, char *buf, u_int buflen) 3003 { 3004 struct namecache *ncp; 3005 struct mtx *vlp; 3006 int l; 3007 3008 vlp = VP2VNODELOCK(vp); 3009 mtx_lock(vlp); 3010 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 3011 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 3012 break; 3013 if (ncp == NULL) { 3014 mtx_unlock(vlp); 3015 return (ENOENT); 3016 } 3017 l = min(ncp->nc_nlen, buflen - 1); 3018 memcpy(buf, ncp->nc_name, l); 3019 mtx_unlock(vlp); 3020 buf[l] = '\0'; 3021 return (0); 3022 } 3023 3024 /* 3025 * This function updates path string to vnode's full global path 3026 * and checks the size of the new path string against the pathlen argument. 3027 * 3028 * Requires a locked, referenced vnode. 3029 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 3030 * 3031 * If vp is a directory, the call to vn_fullpath_global() always succeeds 3032 * because it falls back to the ".." lookup if the namecache lookup fails. 3033 */ 3034 int 3035 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 3036 u_int pathlen) 3037 { 3038 struct nameidata nd; 3039 struct vnode *vp1; 3040 char *rpath, *fbuf; 3041 int error; 3042 3043 ASSERT_VOP_ELOCKED(vp, __func__); 3044 3045 /* Construct global filesystem path from vp. */ 3046 VOP_UNLOCK(vp); 3047 error = vn_fullpath_global(vp, &rpath, &fbuf); 3048 3049 if (error != 0) { 3050 vrele(vp); 3051 return (error); 3052 } 3053 3054 if (strlen(rpath) >= pathlen) { 3055 vrele(vp); 3056 error = ENAMETOOLONG; 3057 goto out; 3058 } 3059 3060 /* 3061 * Re-lookup the vnode by path to detect a possible rename. 3062 * As a side effect, the vnode is relocked. 3063 * If vnode was renamed, return ENOENT. 
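 *
 * If the lookup comes back with the same vnode, the freshly computed global
 * path is still valid and is copied into the caller's buffer; any other
 * result means the object was renamed or replaced in the meantime.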
3064 */ 3065 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 3066 UIO_SYSSPACE, path, td); 3067 error = namei(&nd); 3068 if (error != 0) { 3069 vrele(vp); 3070 goto out; 3071 } 3072 NDFREE(&nd, NDF_ONLY_PNBUF); 3073 vp1 = nd.ni_vp; 3074 vrele(vp); 3075 if (vp1 == vp) 3076 strcpy(path, rpath); 3077 else { 3078 vput(vp1); 3079 error = ENOENT; 3080 } 3081 3082 out: 3083 free(fbuf, M_TEMP); 3084 return (error); 3085 } 3086 3087 #ifdef DDB 3088 static void 3089 db_print_vpath(struct vnode *vp) 3090 { 3091 3092 while (vp != NULL) { 3093 db_printf("%p: ", vp); 3094 if (vp == rootvnode) { 3095 db_printf("/"); 3096 vp = NULL; 3097 } else { 3098 if (vp->v_vflag & VV_ROOT) { 3099 db_printf("<mount point>"); 3100 vp = vp->v_mount->mnt_vnodecovered; 3101 } else { 3102 struct namecache *ncp; 3103 char *ncn; 3104 int i; 3105 3106 ncp = TAILQ_FIRST(&vp->v_cache_dst); 3107 if (ncp != NULL) { 3108 ncn = ncp->nc_name; 3109 for (i = 0; i < ncp->nc_nlen; i++) 3110 db_printf("%c", *ncn++); 3111 vp = ncp->nc_dvp; 3112 } else { 3113 vp = NULL; 3114 } 3115 } 3116 } 3117 db_printf("\n"); 3118 } 3119 3120 return; 3121 } 3122 3123 DB_SHOW_COMMAND(vpath, db_show_vpath) 3124 { 3125 struct vnode *vp; 3126 3127 if (!have_addr) { 3128 db_printf("usage: show vpath <struct vnode *>\n"); 3129 return; 3130 } 3131 3132 vp = (struct vnode *)addr; 3133 db_print_vpath(vp); 3134 } 3135 3136 #endif 3137 3138 static bool __read_frequently cache_fast_lookup = true; 3139 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 3140 &cache_fast_lookup, 0, ""); 3141 3142 #define CACHE_FPL_FAILED -2020 3143 3144 static void 3145 cache_fpl_cleanup_cnp(struct componentname *cnp) 3146 { 3147 3148 uma_zfree(namei_zone, cnp->cn_pnbuf); 3149 #ifdef DIAGNOSTIC 3150 cnp->cn_pnbuf = NULL; 3151 cnp->cn_nameptr = NULL; 3152 #endif 3153 } 3154 3155 static void 3156 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3157 { 3158 struct componentname *cnp; 3159 3160 cnp = &ndp->ni_cnd; 3161 while (*(cnp->cn_nameptr) == '/') { 3162 cnp->cn_nameptr++; 3163 ndp->ni_pathlen--; 3164 } 3165 3166 *dpp = ndp->ni_rootdir; 3167 } 3168 3169 /* 3170 * Components of nameidata (or objects it can point to) which may 3171 * need restoring in case fast path lookup fails. 
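 *
 * These fields are captured with cache_fpl_checkpoint() before a component is
 * consumed and put back with cache_fpl_restore() when punting, so the regular
 * (locked) lookup resumes from the same state it would have seen originally.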
3172 */ 3173 struct nameidata_saved { 3174 long cn_namelen; 3175 char *cn_nameptr; 3176 size_t ni_pathlen; 3177 int cn_flags; 3178 }; 3179 3180 struct cache_fpl { 3181 struct nameidata *ndp; 3182 struct componentname *cnp; 3183 struct pwd *pwd; 3184 struct vnode *dvp; 3185 struct vnode *tvp; 3186 seqc_t dvp_seqc; 3187 seqc_t tvp_seqc; 3188 struct nameidata_saved snd; 3189 int line; 3190 enum cache_fpl_status status:8; 3191 bool in_smr; 3192 bool fsearch; 3193 }; 3194 3195 static void 3196 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3197 { 3198 3199 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3200 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3201 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3202 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3203 } 3204 3205 static void 3206 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3207 { 3208 3209 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3210 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3211 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3212 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3213 } 3214 3215 #ifdef INVARIANTS 3216 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3217 struct cache_fpl *_fpl = (fpl); \ 3218 MPASS(_fpl->in_smr == true); \ 3219 VFS_SMR_ASSERT_ENTERED(); \ 3220 }) 3221 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3222 struct cache_fpl *_fpl = (fpl); \ 3223 MPASS(_fpl->in_smr == false); \ 3224 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3225 }) 3226 #else 3227 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3228 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3229 #endif 3230 3231 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3232 struct cache_fpl *_fpl = (fpl); \ 3233 vfs_smr_enter(); \ 3234 _fpl->in_smr = true; \ 3235 }) 3236 3237 #define cache_fpl_smr_enter(fpl) ({ \ 3238 struct cache_fpl *_fpl = (fpl); \ 3239 MPASS(_fpl->in_smr == false); \ 3240 vfs_smr_enter(); \ 3241 _fpl->in_smr = true; \ 3242 }) 3243 3244 #define cache_fpl_smr_exit(fpl) ({ \ 3245 struct cache_fpl *_fpl = (fpl); \ 3246 MPASS(_fpl->in_smr == true); \ 3247 vfs_smr_exit(); \ 3248 _fpl->in_smr = false; \ 3249 }) 3250 3251 static int 3252 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3253 { 3254 3255 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3256 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3257 ("%s: converting to abort from %d at %d, set at %d\n", 3258 __func__, fpl->status, line, fpl->line)); 3259 } 3260 fpl->status = CACHE_FPL_STATUS_ABORTED; 3261 fpl->line = line; 3262 return (CACHE_FPL_FAILED); 3263 } 3264 3265 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3266 3267 static int 3268 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3269 { 3270 3271 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3272 ("%s: setting to partial at %d, but already set to %d at %d\n", 3273 __func__, line, fpl->status, fpl->line)); 3274 cache_fpl_smr_assert_entered(fpl); 3275 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3276 fpl->line = line; 3277 return (CACHE_FPL_FAILED); 3278 } 3279 3280 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3281 3282 static int 3283 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3284 { 3285 3286 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3287 ("%s: setting to handled at %d, but already set to %d at %d\n", 3288 __func__, line, fpl->status, fpl->line)); 3289 cache_fpl_smr_assert_not_entered(fpl); 3290 MPASS(error != CACHE_FPL_FAILED); 3291 fpl->status = CACHE_FPL_STATUS_HANDLED; 3292 fpl->line = line; 
3293 return (error); 3294 } 3295 3296 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3297 3298 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3299 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ 3300 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) 3301 3302 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3303 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3304 3305 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3306 "supported and internal flags overlap"); 3307 3308 static bool 3309 cache_fpl_islastcn(struct nameidata *ndp) 3310 { 3311 3312 return (*ndp->ni_next == 0); 3313 } 3314 3315 static bool 3316 cache_fpl_isdotdot(struct componentname *cnp) 3317 { 3318 3319 if (cnp->cn_namelen == 2 && 3320 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3321 return (true); 3322 return (false); 3323 } 3324 3325 static bool 3326 cache_can_fplookup(struct cache_fpl *fpl) 3327 { 3328 struct nameidata *ndp; 3329 struct componentname *cnp; 3330 struct thread *td; 3331 3332 ndp = fpl->ndp; 3333 cnp = fpl->cnp; 3334 td = cnp->cn_thread; 3335 3336 if (!cache_fast_lookup) { 3337 cache_fpl_aborted(fpl); 3338 return (false); 3339 } 3340 #ifdef MAC 3341 if (mac_vnode_check_lookup_enabled()) { 3342 cache_fpl_aborted(fpl); 3343 return (false); 3344 } 3345 #endif 3346 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3347 cache_fpl_aborted(fpl); 3348 return (false); 3349 } 3350 if (IN_CAPABILITY_MODE(td)) { 3351 cache_fpl_aborted(fpl); 3352 return (false); 3353 } 3354 if (AUDITING_TD(td)) { 3355 cache_fpl_aborted(fpl); 3356 return (false); 3357 } 3358 if (ndp->ni_startdir != NULL) { 3359 cache_fpl_aborted(fpl); 3360 return (false); 3361 } 3362 return (true); 3363 } 3364 3365 static int 3366 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) 3367 { 3368 struct nameidata *ndp; 3369 int error; 3370 bool fsearch; 3371 3372 ndp = fpl->ndp; 3373 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); 3374 if (__predict_false(error != 0)) { 3375 cache_fpl_smr_exit(fpl); 3376 return (cache_fpl_aborted(fpl)); 3377 } 3378 fpl->fsearch = fsearch; 3379 return (0); 3380 } 3381 3382 static bool 3383 cache_fplookup_vnode_supported(struct vnode *vp) 3384 { 3385 3386 return (vp->v_type != VLNK); 3387 } 3388 3389 /* 3390 * Move a negative entry to the hot list. 3391 * 3392 * We have to take locks, but they may be contended and in the worst 3393 * case we may need to go off CPU. We don't want to spin within the 3394 * smr section and we can't block with it. Instead we are going to 3395 * look up the entry again. 3396 */ 3397 static int __noinline 3398 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3399 uint32_t hash) 3400 { 3401 struct componentname *cnp; 3402 struct namecache *ncp; 3403 struct neglist *neglist; 3404 struct negstate *negstate; 3405 struct vnode *dvp; 3406 u_char nc_flag; 3407 3408 cnp = fpl->cnp; 3409 dvp = fpl->dvp; 3410 3411 if (!vhold_smr(dvp)) 3412 return (cache_fpl_aborted(fpl)); 3413 3414 neglist = NCP2NEGLIST(oncp); 3415 cache_fpl_smr_exit(fpl); 3416 3417 mtx_lock(&ncneg_hot.nl_lock); 3418 mtx_lock(&neglist->nl_lock); 3419 /* 3420 * For hash iteration. 3421 */ 3422 cache_fpl_smr_enter(fpl); 3423 3424 /* 3425 * Avoid all surprises by only succeeding if we got the same entry and 3426 * bailing completely otherwise. 
3427 * 3428 * In particular at this point there can be a new ncp which matches the 3429 * search but hashes to a different neglist. 3430 */ 3431 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3432 if (ncp == oncp) 3433 break; 3434 } 3435 3436 /* 3437 * No match to begin with. 3438 */ 3439 if (__predict_false(ncp == NULL)) { 3440 goto out_abort; 3441 } 3442 3443 /* 3444 * The newly found entry may be something different... 3445 */ 3446 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3447 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 3448 goto out_abort; 3449 } 3450 3451 /* 3452 * ... and not even negative. 3453 */ 3454 nc_flag = atomic_load_char(&ncp->nc_flag); 3455 if ((nc_flag & NCF_NEGATIVE) == 0) { 3456 goto out_abort; 3457 } 3458 3459 if (__predict_false(!cache_ncp_canuse(ncp))) { 3460 goto out_abort; 3461 } 3462 3463 negstate = NCP2NEGSTATE(ncp); 3464 if ((negstate->neg_flag & NEG_HOT) == 0) { 3465 numhotneg++; 3466 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 3467 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 3468 negstate->neg_flag |= NEG_HOT; 3469 } 3470 3471 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 3472 counter_u64_add(numneghits, 1); 3473 cache_fpl_smr_exit(fpl); 3474 mtx_unlock(&neglist->nl_lock); 3475 mtx_unlock(&ncneg_hot.nl_lock); 3476 vdrop(dvp); 3477 return (cache_fpl_handled(fpl, ENOENT)); 3478 out_abort: 3479 cache_fpl_smr_exit(fpl); 3480 mtx_unlock(&neglist->nl_lock); 3481 mtx_unlock(&ncneg_hot.nl_lock); 3482 vdrop(dvp); 3483 return (cache_fpl_aborted(fpl)); 3484 } 3485 3486 /* 3487 * The target vnode is not supported, prepare for the slow path to take over. 3488 */ 3489 static int __noinline 3490 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3491 { 3492 struct nameidata *ndp; 3493 struct componentname *cnp; 3494 enum vgetstate dvs; 3495 struct vnode *dvp; 3496 struct pwd *pwd; 3497 seqc_t dvp_seqc; 3498 3499 ndp = fpl->ndp; 3500 cnp = fpl->cnp; 3501 pwd = fpl->pwd; 3502 dvp = fpl->dvp; 3503 dvp_seqc = fpl->dvp_seqc; 3504 3505 if (!pwd_hold_smr(pwd)) { 3506 cache_fpl_smr_exit(fpl); 3507 return (cache_fpl_aborted(fpl)); 3508 } 3509 3510 dvs = vget_prep_smr(dvp); 3511 cache_fpl_smr_exit(fpl); 3512 if (__predict_false(dvs == VGET_NONE)) { 3513 pwd_drop(pwd); 3514 return (cache_fpl_aborted(fpl)); 3515 } 3516 3517 vget_finish_ref(dvp, dvs); 3518 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3519 vrele(dvp); 3520 pwd_drop(pwd); 3521 return (cache_fpl_aborted(fpl)); 3522 } 3523 3524 cache_fpl_restore(fpl, &fpl->snd); 3525 3526 ndp->ni_startdir = dvp; 3527 cnp->cn_flags |= MAKEENTRY; 3528 if (cache_fpl_islastcn(ndp)) 3529 cnp->cn_flags |= ISLASTCN; 3530 if (cache_fpl_isdotdot(cnp)) 3531 cnp->cn_flags |= ISDOTDOT; 3532 3533 return (0); 3534 } 3535 3536 static int 3537 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3538 { 3539 struct componentname *cnp; 3540 struct vnode *tvp; 3541 seqc_t tvp_seqc; 3542 int error, lkflags; 3543 3544 cnp = fpl->cnp; 3545 tvp = fpl->tvp; 3546 tvp_seqc = fpl->tvp_seqc; 3547 3548 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3549 lkflags = LK_SHARED; 3550 if ((cnp->cn_flags & LOCKSHARED) == 0) 3551 lkflags = LK_EXCLUSIVE; 3552 error = vget_finish(tvp, lkflags, tvs); 3553 if (__predict_false(error != 0)) { 3554 return (cache_fpl_aborted(fpl)); 3555 } 3556 } else { 3557 vget_finish_ref(tvp, tvs); 3558 } 3559 3560 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3561 if ((cnp->cn_flags & LOCKLEAF) != 0) 3562 vput(tvp); 3563 else 3564 vrele(tvp); 3565 return 
(cache_fpl_aborted(fpl)); 3566 } 3567 3568 return (cache_fpl_handled(fpl, 0)); 3569 } 3570 3571 /* 3572 * They want to possibly modify the state of the namecache. 3573 * 3574 * Don't try to match the API contract, just leave. 3575 * TODO: this leaves scalability on the table 3576 */ 3577 static int 3578 cache_fplookup_final_modifying(struct cache_fpl *fpl) 3579 { 3580 struct componentname *cnp; 3581 3582 cnp = fpl->cnp; 3583 MPASS(cnp->cn_nameiop != LOOKUP); 3584 return (cache_fpl_partial(fpl)); 3585 } 3586 3587 static int __noinline 3588 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3589 { 3590 struct componentname *cnp; 3591 enum vgetstate dvs, tvs; 3592 struct vnode *dvp, *tvp; 3593 seqc_t dvp_seqc; 3594 int error; 3595 3596 cnp = fpl->cnp; 3597 dvp = fpl->dvp; 3598 dvp_seqc = fpl->dvp_seqc; 3599 tvp = fpl->tvp; 3600 3601 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3602 3603 /* 3604 * This is less efficient than it can be for simplicity. 3605 */ 3606 dvs = vget_prep_smr(dvp); 3607 if (__predict_false(dvs == VGET_NONE)) { 3608 return (cache_fpl_aborted(fpl)); 3609 } 3610 tvs = vget_prep_smr(tvp); 3611 if (__predict_false(tvs == VGET_NONE)) { 3612 cache_fpl_smr_exit(fpl); 3613 vget_abort(dvp, dvs); 3614 return (cache_fpl_aborted(fpl)); 3615 } 3616 3617 cache_fpl_smr_exit(fpl); 3618 3619 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3620 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3621 if (__predict_false(error != 0)) { 3622 vget_abort(tvp, tvs); 3623 return (cache_fpl_aborted(fpl)); 3624 } 3625 } else { 3626 vget_finish_ref(dvp, dvs); 3627 } 3628 3629 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3630 vget_abort(tvp, tvs); 3631 if ((cnp->cn_flags & LOCKPARENT) != 0) 3632 vput(dvp); 3633 else 3634 vrele(dvp); 3635 return (cache_fpl_aborted(fpl)); 3636 } 3637 3638 error = cache_fplookup_final_child(fpl, tvs); 3639 if (__predict_false(error != 0)) { 3640 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3641 if ((cnp->cn_flags & LOCKPARENT) != 0) 3642 vput(dvp); 3643 else 3644 vrele(dvp); 3645 return (error); 3646 } 3647 3648 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3649 return (0); 3650 } 3651 3652 static int 3653 cache_fplookup_final(struct cache_fpl *fpl) 3654 { 3655 struct componentname *cnp; 3656 enum vgetstate tvs; 3657 struct vnode *dvp, *tvp; 3658 seqc_t dvp_seqc; 3659 3660 cnp = fpl->cnp; 3661 dvp = fpl->dvp; 3662 dvp_seqc = fpl->dvp_seqc; 3663 tvp = fpl->tvp; 3664 3665 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3666 3667 if (cnp->cn_nameiop != LOOKUP) { 3668 return (cache_fplookup_final_modifying(fpl)); 3669 } 3670 3671 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3672 return (cache_fplookup_final_withparent(fpl)); 3673 3674 tvs = vget_prep_smr(tvp); 3675 if (__predict_false(tvs == VGET_NONE)) { 3676 return (cache_fpl_partial(fpl)); 3677 } 3678 3679 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3680 cache_fpl_smr_exit(fpl); 3681 vget_abort(tvp, tvs); 3682 return (cache_fpl_aborted(fpl)); 3683 } 3684 3685 cache_fpl_smr_exit(fpl); 3686 return (cache_fplookup_final_child(fpl, tvs)); 3687 } 3688 3689 static int __noinline 3690 cache_fplookup_dot(struct cache_fpl *fpl) 3691 { 3692 struct vnode *dvp; 3693 3694 dvp = fpl->dvp; 3695 3696 fpl->tvp = dvp; 3697 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3698 if (seqc_in_modify(fpl->tvp_seqc)) { 3699 return (cache_fpl_aborted(fpl)); 3700 } 3701 3702 counter_u64_add(dothits, 1); 3703 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3704 3705 return (0); 3706 } 3707 3708 static int __noinline 3709 
cache_fplookup_dotdot(struct cache_fpl *fpl) 3710 { 3711 struct nameidata *ndp; 3712 struct componentname *cnp; 3713 struct namecache *ncp; 3714 struct vnode *dvp; 3715 struct prison *pr; 3716 u_char nc_flag; 3717 3718 ndp = fpl->ndp; 3719 cnp = fpl->cnp; 3720 dvp = fpl->dvp; 3721 3722 /* 3723 * XXX this is racy the same way regular lookup is 3724 */ 3725 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3726 pr = pr->pr_parent) 3727 if (dvp == pr->pr_root) 3728 break; 3729 3730 if (dvp == ndp->ni_rootdir || 3731 dvp == ndp->ni_topdir || 3732 dvp == rootvnode || 3733 pr != NULL) { 3734 fpl->tvp = dvp; 3735 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3736 if (seqc_in_modify(fpl->tvp_seqc)) { 3737 return (cache_fpl_aborted(fpl)); 3738 } 3739 return (0); 3740 } 3741 3742 if ((dvp->v_vflag & VV_ROOT) != 0) { 3743 /* 3744 * TODO 3745 * The opposite of climb mount is needed here. 3746 */ 3747 return (cache_fpl_aborted(fpl)); 3748 } 3749 3750 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3751 if (ncp == NULL) { 3752 return (cache_fpl_aborted(fpl)); 3753 } 3754 3755 nc_flag = atomic_load_char(&ncp->nc_flag); 3756 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3757 if ((nc_flag & NCF_NEGATIVE) != 0) 3758 return (cache_fpl_aborted(fpl)); 3759 fpl->tvp = ncp->nc_vp; 3760 } else { 3761 fpl->tvp = ncp->nc_dvp; 3762 } 3763 3764 if (__predict_false(!cache_ncp_canuse(ncp))) { 3765 return (cache_fpl_aborted(fpl)); 3766 } 3767 3768 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 3769 if (seqc_in_modify(fpl->tvp_seqc)) { 3770 return (cache_fpl_partial(fpl)); 3771 } 3772 3773 counter_u64_add(dotdothits, 1); 3774 return (0); 3775 } 3776 3777 static int 3778 cache_fplookup_next(struct cache_fpl *fpl) 3779 { 3780 struct componentname *cnp; 3781 struct namecache *ncp; 3782 struct negstate *negstate; 3783 struct vnode *dvp, *tvp; 3784 u_char nc_flag; 3785 uint32_t hash; 3786 bool neg_hot; 3787 3788 cnp = fpl->cnp; 3789 dvp = fpl->dvp; 3790 3791 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 3792 return (cache_fplookup_dot(fpl)); 3793 } 3794 3795 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3796 3797 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3798 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3799 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 3800 break; 3801 } 3802 3803 /* 3804 * If there is no entry we have to punt to the slow path to perform 3805 * actual lookup. Should there be nothing with this name a negative 3806 * entry will be created. 3807 */ 3808 if (__predict_false(ncp == NULL)) { 3809 return (cache_fpl_partial(fpl)); 3810 } 3811 3812 tvp = atomic_load_ptr(&ncp->nc_vp); 3813 nc_flag = atomic_load_char(&ncp->nc_flag); 3814 if ((nc_flag & NCF_NEGATIVE) != 0) { 3815 /* 3816 * If they want to create an entry we need to replace this one. 
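 *
 * Replacing an entry means modifying the cache, which the lockless
 * variant never does, hence the fallback to the slow path below.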
3817 */ 3818 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { 3819 return (cache_fpl_partial(fpl)); 3820 } 3821 negstate = NCP2NEGSTATE(ncp); 3822 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0); 3823 if (__predict_false(!cache_ncp_canuse(ncp))) { 3824 return (cache_fpl_partial(fpl)); 3825 } 3826 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 3827 return (cache_fpl_partial(fpl)); 3828 } 3829 if (!neg_hot) { 3830 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 3831 } 3832 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 3833 ncp->nc_name); 3834 counter_u64_add(numneghits, 1); 3835 cache_fpl_smr_exit(fpl); 3836 return (cache_fpl_handled(fpl, ENOENT)); 3837 } 3838 3839 if (__predict_false(!cache_ncp_canuse(ncp))) { 3840 return (cache_fpl_partial(fpl)); 3841 } 3842 3843 fpl->tvp = tvp; 3844 fpl->tvp_seqc = vn_seqc_read_any(tvp); 3845 if (seqc_in_modify(fpl->tvp_seqc)) { 3846 return (cache_fpl_partial(fpl)); 3847 } 3848 3849 if (!cache_fplookup_vnode_supported(tvp)) { 3850 return (cache_fpl_partial(fpl)); 3851 } 3852 3853 counter_u64_add(numposhits, 1); 3854 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 3855 return (0); 3856 } 3857 3858 static bool 3859 cache_fplookup_mp_supported(struct mount *mp) 3860 { 3861 3862 if (mp == NULL) 3863 return (false); 3864 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 3865 return (false); 3866 return (true); 3867 } 3868 3869 /* 3870 * Walk up the mount stack (if any). 3871 * 3872 * Correctness is provided in the following ways: 3873 * - all vnodes are protected from freeing with SMR 3874 * - struct mount objects are type stable making them always safe to access 3875 * - stability of the particular mount is provided by busying it 3876 * - relationship between the vnode which is mounted on and the mount is 3877 * verified with the vnode sequence counter after busying 3878 * - association between root vnode of the mount and the mount is protected 3879 * by busy 3880 * 3881 * From that point on we can read the sequence counter of the root vnode 3882 * and get the next mount on the stack (if any) using the same protection. 3883 * 3884 * By the end of successful walk we are guaranteed the reached state was 3885 * indeed present at least at some point which matches the regular lookup. 
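 *
 * Roughly, each level of the walk below amounts to (a sketch of the code
 * that follows, not a literal transcript):
 *
 *	vfs_op_thread_enter_crit(mp);		// pin the mount
 *	vn_seqc_consistent(vp, vp_seqc);	// still the covered vnode?
 *	vp = mp->mnt_rootvnode;			// descend into the mount
 *	vp_seqc = vn_seqc_read_any(vp);
 *	// repeat if the root is itself mounted on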
3886 */ 3887 static int __noinline 3888 cache_fplookup_climb_mount(struct cache_fpl *fpl) 3889 { 3890 struct mount *mp, *prev_mp; 3891 struct vnode *vp; 3892 seqc_t vp_seqc; 3893 3894 vp = fpl->tvp; 3895 vp_seqc = fpl->tvp_seqc; 3896 3897 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 3898 mp = atomic_load_ptr(&vp->v_mountedhere); 3899 if (mp == NULL) 3900 return (0); 3901 3902 prev_mp = NULL; 3903 for (;;) { 3904 if (!vfs_op_thread_enter_crit(mp)) { 3905 if (prev_mp != NULL) 3906 vfs_op_thread_exit_crit(prev_mp); 3907 return (cache_fpl_partial(fpl)); 3908 } 3909 if (prev_mp != NULL) 3910 vfs_op_thread_exit_crit(prev_mp); 3911 if (!vn_seqc_consistent(vp, vp_seqc)) { 3912 vfs_op_thread_exit_crit(mp); 3913 return (cache_fpl_partial(fpl)); 3914 } 3915 if (!cache_fplookup_mp_supported(mp)) { 3916 vfs_op_thread_exit_crit(mp); 3917 return (cache_fpl_partial(fpl)); 3918 } 3919 vp = atomic_load_ptr(&mp->mnt_rootvnode); 3920 if (vp == NULL || VN_IS_DOOMED(vp)) { 3921 vfs_op_thread_exit_crit(mp); 3922 return (cache_fpl_partial(fpl)); 3923 } 3924 vp_seqc = vn_seqc_read_any(vp); 3925 if (seqc_in_modify(vp_seqc)) { 3926 vfs_op_thread_exit_crit(mp); 3927 return (cache_fpl_partial(fpl)); 3928 } 3929 prev_mp = mp; 3930 mp = atomic_load_ptr(&vp->v_mountedhere); 3931 if (mp == NULL) 3932 break; 3933 } 3934 3935 vfs_op_thread_exit_crit(prev_mp); 3936 fpl->tvp = vp; 3937 fpl->tvp_seqc = vp_seqc; 3938 return (0); 3939 } 3940 3941 static bool 3942 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 3943 { 3944 struct mount *mp; 3945 struct vnode *vp; 3946 3947 vp = fpl->tvp; 3948 3949 /* 3950 * Hack: while this is a union, the pointer tends to be NULL so save on 3951 * a branch. 3952 */ 3953 mp = atomic_load_ptr(&vp->v_mountedhere); 3954 if (mp == NULL) 3955 return (false); 3956 if (vp->v_type == VDIR) 3957 return (true); 3958 return (false); 3959 } 3960 3961 /* 3962 * Parse the path. 3963 * 3964 * The code was originally copy-pasted from regular lookup and despite 3965 * clean ups leaves performance on the table. Any modifications here 3966 * must take into account that in case off fallback the resulting 3967 * nameidata state has to be compatible with the original. 3968 */ 3969 static int 3970 cache_fplookup_parse(struct cache_fpl *fpl) 3971 { 3972 struct nameidata *ndp; 3973 struct componentname *cnp; 3974 char *cp; 3975 3976 ndp = fpl->ndp; 3977 cnp = fpl->cnp; 3978 3979 /* 3980 * Search a new directory. 3981 * 3982 * The last component of the filename is left accessible via 3983 * cnp->cn_nameptr for callers that need the name. Callers needing 3984 * the name set the SAVENAME flag. When done, they assume 3985 * responsibility for freeing the pathname buffer. 3986 */ 3987 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 3988 continue; 3989 cnp->cn_namelen = cp - cnp->cn_nameptr; 3990 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 3991 cache_fpl_smr_exit(fpl); 3992 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 3993 } 3994 ndp->ni_pathlen -= cnp->cn_namelen; 3995 KASSERT(ndp->ni_pathlen <= PATH_MAX, 3996 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 3997 ndp->ni_next = cp; 3998 3999 /* 4000 * Replace multiple slashes by a single slash and trailing slashes 4001 * by a null. This must be done before VOP_LOOKUP() because some 4002 * fs's don't know about trailing slashes. Remember if there were 4003 * trailing slashes to handle symlinks, existing non-directories 4004 * and non-existing files that won't be directories specially later. 
4005 */ 4006 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 4007 cp++; 4008 ndp->ni_pathlen--; 4009 if (*cp == '\0') { 4010 /* 4011 * TODO 4012 * Regular lookup performs the following: 4013 * *ndp->ni_next = '\0'; 4014 * cnp->cn_flags |= TRAILINGSLASH; 4015 * 4016 * Which is problematic since it modifies data read 4017 * from userspace. Then if fast path lookup was to 4018 * abort we would have to either restore it or convey 4019 * the flag. Since this is a corner case just ignore 4020 * it for simplicity. 4021 */ 4022 return (cache_fpl_partial(fpl)); 4023 } 4024 } 4025 ndp->ni_next = cp; 4026 4027 /* 4028 * Check for degenerate name (e.g. / or "") 4029 * which is a way of talking about a directory, 4030 * e.g. like "/." or ".". 4031 * 4032 * TODO 4033 * Another corner case handled by the regular lookup 4034 */ 4035 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 4036 return (cache_fpl_partial(fpl)); 4037 } 4038 return (0); 4039 } 4040 4041 static void 4042 cache_fplookup_parse_advance(struct cache_fpl *fpl) 4043 { 4044 struct nameidata *ndp; 4045 struct componentname *cnp; 4046 4047 ndp = fpl->ndp; 4048 cnp = fpl->cnp; 4049 4050 cnp->cn_nameptr = ndp->ni_next; 4051 while (*cnp->cn_nameptr == '/') { 4052 cnp->cn_nameptr++; 4053 ndp->ni_pathlen--; 4054 } 4055 } 4056 4057 /* 4058 * See the API contract for VOP_FPLOOKUP_VEXEC. 4059 */ 4060 static int __noinline 4061 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 4062 { 4063 struct componentname *cnp; 4064 struct vnode *dvp; 4065 seqc_t dvp_seqc; 4066 4067 cnp = fpl->cnp; 4068 dvp = fpl->dvp; 4069 dvp_seqc = fpl->dvp_seqc; 4070 4071 /* 4072 * Hack: they may be looking up foo/bar, where foo is a 4073 * regular file. In such a case we need to turn ENOTDIR, 4074 * but we may happen to get here with a different error. 4075 */ 4076 if (dvp->v_type != VDIR) { 4077 /* 4078 * The check here is predominantly to catch 4079 * EOPNOTSUPP from dead_vnodeops. If the vnode 4080 * gets doomed past this point it is going to 4081 * fail seqc verification. 4082 */ 4083 if (VN_IS_DOOMED(dvp)) { 4084 return (cache_fpl_aborted(fpl)); 4085 } 4086 error = ENOTDIR; 4087 } 4088 4089 /* 4090 * Hack: handle O_SEARCH. 4091 * 4092 * Open Group Base Specifications Issue 7, 2018 edition states: 4093 * If the access mode of the open file description associated with the 4094 * file descriptor is not O_SEARCH, the function shall check whether 4095 * directory searches are permitted using the current permissions of 4096 * the directory underlying the file descriptor. If the access mode is 4097 * O_SEARCH, the function shall not perform the check. 4098 * 4099 * Regular lookup tests for the NOEXECCHECK flag for every path 4100 * component to decide whether to do the permission check. However, 4101 * since most lookups never have the flag (and when they do it is only 4102 * present for the first path component), lockless lookup only acts on 4103 * it if there is a permission problem. Here the flag is represented 4104 * with a boolean so that we don't have to clear it on the way out. 4105 * 4106 * For simplicity this always aborts. 4107 * TODO: check if this is the first lookup and ignore the permission 4108 * problem. Note the flag has to survive fallback (if it happens to be 4109 * performed). 
/*
 * See the API contract for VOP_FPLOOKUP_VEXEC.
 */
static int __noinline
cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
{
	struct componentname *cnp;
	struct vnode *dvp;
	seqc_t dvp_seqc;

	cnp = fpl->cnp;
	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;

	/*
	 * Hack: they may be looking up foo/bar, where foo is a
	 * regular file. In such a case we need to return ENOTDIR,
	 * but we may happen to get here with a different error.
	 */
	if (dvp->v_type != VDIR) {
		/*
		 * The check here is predominantly to catch
		 * EOPNOTSUPP from dead_vnodeops. If the vnode
		 * gets doomed past this point it is going to
		 * fail seqc verification.
		 */
		if (VN_IS_DOOMED(dvp)) {
			return (cache_fpl_aborted(fpl));
		}
		error = ENOTDIR;
	}

	/*
	 * Hack: handle O_SEARCH.
	 *
	 * Open Group Base Specifications Issue 7, 2018 edition states:
	 * If the access mode of the open file description associated with the
	 * file descriptor is not O_SEARCH, the function shall check whether
	 * directory searches are permitted using the current permissions of
	 * the directory underlying the file descriptor. If the access mode is
	 * O_SEARCH, the function shall not perform the check.
	 *
	 * Regular lookup tests for the NOEXECCHECK flag for every path
	 * component to decide whether to do the permission check. However,
	 * since most lookups never have the flag (and when they do it is only
	 * present for the first path component), lockless lookup only acts on
	 * it if there is a permission problem. Here the flag is represented
	 * with a boolean so that we don't have to clear it on the way out.
	 *
	 * For simplicity this always aborts.
	 * TODO: check if this is the first lookup and ignore the permission
	 * problem. Note the flag has to survive fallback (if it happens to be
	 * performed).
	 */
	if (fpl->fsearch) {
		return (cache_fpl_aborted(fpl));
	}

	switch (error) {
	case EAGAIN:
		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_partial(fpl);
		}
		break;
	default:
		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_smr_exit(fpl);
			cache_fpl_handled(fpl, error);
		}
		break;
	}
	return (error);
}
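
/*
 * For reference, a minimal userspace sketch (not kernel code) of the
 * O_SEARCH semantics quoted above, assuming O_SEARCH is available from
 * <fcntl.h> (on FreeBSD it is an alias of O_EXEC); the names are made up:
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	open_under_dir(void)
 *	{
 *		int dfd, fd;
 *
 *		// Search permission on "dir" is checked here, at open time.
 *		dfd = open("dir", O_SEARCH | O_DIRECTORY);
 *		if (dfd < 0)
 *			return (-1);
 *		// Search permission on the directory behind dfd is not
 *		// checked again by openat(), even if the directory's
 *		// permissions changed in the meantime.
 *		fd = openat(dfd, "file", O_RDONLY);
 *		close(dfd);
 *		return (fd);
 *	}
 *
 * The fsearch boolean handled above is what carries this state into the
 * lockless lookup.
 */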
static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct mount *mp;
	int error;

	error = CACHE_FPL_FAILED;
	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cache_fpl_checkpoint(fpl, &fpl->snd);

	fpl->dvp = dvp;
	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
	if (seqc_in_modify(fpl->dvp_seqc)) {
		cache_fpl_aborted(fpl);
		goto out;
	}
	mp = atomic_load_ptr(&fpl->dvp->v_mount);
	if (!cache_fplookup_mp_supported(mp)) {
		cache_fpl_aborted(fpl);
		goto out;
	}

	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

	for (;;) {
		error = cache_fplookup_parse(fpl);
		if (__predict_false(error != 0)) {
			break;
		}

		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
		if (__predict_false(error != 0)) {
			error = cache_fplookup_failed_vexec(fpl, error);
			break;
		}

		if (__predict_false(cache_fpl_isdotdot(cnp))) {
			error = cache_fplookup_dotdot(fpl);
			if (__predict_false(error != 0)) {
				break;
			}
		} else {
			error = cache_fplookup_next(fpl);
			if (__predict_false(error != 0)) {
				break;
			}

			VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

			if (cache_fplookup_need_climb_mount(fpl)) {
				error = cache_fplookup_climb_mount(fpl);
				if (__predict_false(error != 0)) {
					break;
				}
			}
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		if (cache_fpl_islastcn(ndp)) {
			error = cache_fplookup_final(fpl);
			break;
		}

		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
			break;
		}

		fpl->dvp = fpl->tvp;
		fpl->dvp_seqc = fpl->tvp_seqc;

		cache_fplookup_parse_advance(fpl);
		cache_fpl_checkpoint(fpl, &fpl->snd);
	}
out:
	switch (fpl->status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		cache_fpl_smr_assert_entered(fpl);
		return (cache_fplookup_partial_setup(fpl));
	case CACHE_FPL_STATUS_ABORTED:
		if (fpl->in_smr)
			cache_fpl_smr_exit(fpl);
		return (CACHE_FPL_FAILED);
	case CACHE_FPL_STATUS_HANDLED:
		MPASS(error != CACHE_FPL_FAILED);
		cache_fpl_smr_assert_not_entered(fpl);
		if (__predict_false(error != 0)) {
			ndp->ni_dvp = NULL;
			ndp->ni_vp = NULL;
			cache_fpl_cleanup_cnp(cnp);
			return (error);
		}
		ndp->ni_dvp = fpl->dvp;
		ndp->ni_vp = fpl->tvp;
		if (cnp->cn_flags & SAVENAME)
			cnp->cn_flags |= HASBUF;
		else
			cache_fpl_cleanup_cnp(cnp);
		return (error);
	}
}

/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
 * outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 *	vn_lock(current);
 *	for (;;) {
 *		next = find();
 *		vn_lock(next);
 *		vn_unlock(current);
 *		current = next;
 *		if (last)
 *			break;
 *	}
 *	return (current);
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
 * we made the jump. This includes things like permissions, mount points etc.
 * Counter modification is provided by enclosing relevant places in
 * vn_seqc_write_begin()/end() calls.
 *
 * Thus this translates to:
 *
 *	vfs_smr_enter();
 *	dvp_seqc = seqc_read_any(dvp);
 *	if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 *		abort();
 *	for (;;) {
 *		tvp = find();
 *		tvp_seqc = seqc_read_any(tvp);
 *		if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 *			abort();
 *		if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 *			abort();
 *		dvp = tvp; // we know nothing of importance has changed
 *		dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 *		if (last)
 *			break;
 *	}
 *	vget(); // secure the vnode
 *	if (!seqc_consistent(tvp, tvp_seqc)) // final check
 *		abort();
 *	// at this point we know nothing has changed for any parent<->child pair
 *	// as they were crossed during the lookup, meaning we matched the guarantee
 *	// of the locked variant
 *	return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection which they must never exit
 * - EAGAIN can be returned to denote checking could not be performed, it is
 *   always valid to return it
 * - if the sequence counter has not changed the result must be valid
 * - if the sequence counter has changed both false positives and false negatives
 *   are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning absent other means it
 *   should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 */
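
/*
 * By way of illustration, a minimal sketch of what an opted-in filesystem's
 * VOP_FPLOOKUP_VEXEC routine might look like for plain unix permissions.
 * "examplefs" and its node layout are hypothetical; the in-tree
 * implementations (e.g. ufs, tmpfs) differ in detail:
 *
 *	static int
 *	examplefs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct vnode *vp = v->a_vp;
 *		struct examplefs_node *node;
 *
 *		// v_data may be cleared by VOP_RECLAIM at any time, so fetch
 *		// it once; the backing memory has to be vfs_smr-protected
 *		// (e.g. an SMR-enabled UMA zone) for this to be safe.
 *		node = atomic_load_ptr(&vp->v_data);
 *		if (__predict_false(node == NULL))
 *			return (EAGAIN);
 *		// Either these fields only change with the vnode seqc in
 *		// modify, or stale results here are acceptable as described
 *		// in the contract above.
 *		return (vaccess_vexec_smr(node->tn_mode, node->tn_uid,
 *		    node->tn_gid, v->a_cred));
 *	}
 */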
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	struct nameidata_saved orig;
	int error;

	MPASS(ndp->ni_lcf == 0);

	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.ndp = ndp;
	fpl.cnp = &ndp->ni_cnd;
	MPASS(curthread == fpl.cnp->cn_thread);

	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
		MPASS(fpl.cnp->cn_nameiop != LOOKUP);

	if (!cache_can_fplookup(&fpl)) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		*status = fpl.status;
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint(&fpl, &orig);

	cache_fpl_smr_enter_initial(&fpl);
	fpl.fsearch = false;
	pwd = pwd_get_smr();
	fpl.pwd = pwd;
	ndp->ni_rootdir = pwd->pwd_rdir;
	ndp->ni_topdir = pwd->pwd_jdir;

	cnp = fpl.cnp;
	cnp->cn_nameptr = cnp->cn_pnbuf;
	if (cnp->cn_pnbuf[0] == '/') {
		cache_fpl_handle_root(ndp, &dvp);
	} else {
		if (ndp->ni_dirfd == AT_FDCWD) {
			dvp = pwd->pwd_cdir;
		} else {
			error = cache_fplookup_dirfd(&fpl, &dvp);
			if (__predict_false(error != 0)) {
				goto out;
			}
		}
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);

	error = cache_fplookup_impl(dvp, &fpl);
out:
	cache_fpl_smr_assert_not_entered(&fpl);
	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);

	*status = fpl.status;
	switch (fpl.status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_HANDLED:
		SDT_PROBE3(vfs, namei, lookup, return, error,
		    (error == 0 ? ndp->ni_vp : NULL), true);
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		*pwdp = fpl.pwd;
		/*
		 * Status restored by cache_fplookup_partial_setup.
		 */
		break;
	case CACHE_FPL_STATUS_ABORTED:
		cache_fpl_restore(&fpl, &orig);
		break;
	}
	return (error);
}
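
/*
 * As noted in the comment above cache_fplookup(), a filesystem opts into the
 * fast path by setting MNTK_FPLOOKUP once it satisfies the contract. A rough
 * sketch of the opt-in as it would appear in a mount routine (exact placement
 * and surrounding code are filesystem-specific):
 *
 *	MNT_ILOCK(mp);
 *	mp->mnt_kern_flag |= MNTK_FPLOOKUP;
 *	MNT_IUNLOCK(mp);
 */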