/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <sys/capsicum.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct	vnode *nu_vp;	/* vnode the name refers to */
		struct	negstate nu_neg;/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry maps
 * both a non-dotdot directory name and dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct namecache_ts {
	struct	timespec nc_time;	/* timespec provided by fs */
	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned.  Since others
 * may be in the same spot, suffer a little bit and enforce the
 * alignment for everyone.  Note this is a nop for 64-bit platforms.
 */
#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)

#define	nc_vp		n_un.nu_vp
#define	nc_neg		n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define	NEG_HOT		0x01

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
static bool
cache_ncp_canuse(struct namecache *ncp)
{

	atomic_thread_fence_acq();
	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
}

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	rwlock	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks.  The locking
 * order is lower address first.  Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be write-locked to prevent other threads from seeing the entry.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the same state.  Similarly, two
 * different threads can purge two different vnodes and try to remove the
 * same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case this could
 * deadlock.  This is resolved by trylocking; if that fails, the first lock
 * is dropped, everything is locked in order and the state is revalidated.
 */

VFS_SMR_DECLARE;

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncpurgeminvnodes;
SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
    "Number of vnodes below which purgevfs ignores the request");
static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static struct mtx __exclusive_cache_line	ncneg_shrink_lock;

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;
static struct neglist ncneg_hot;
static u_long numhotneg;

#define	ncneghash	3
#define	numneglists	(ncneghash + 1)
static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct rwlock_padalign __read_mostly  *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly  *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.
 * The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

#define	CACHE_PATH_CUTOFF	35

static struct namecache *
cache_alloc(int len, int ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	if (ncp == NULL)
		return;
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL && ticksp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	if (tsp != NULL)
		*tsp = ncp_ts->nc_time;
	if (ticksp != NULL)
		*ticksp = ncp_ts->nc_ticks;
}

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");
#define STATNODE_ULONG(name, descr)					\
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define STATNODE_COUNTER(name, descr)					\
	static COUNTER_U64_DEFINE_EARLY(name);				\
	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
	    descr);
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
hits"); 434 STATNODE_COUNTER(nummiss, "Number of cache misses"); 435 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 436 STATNODE_COUNTER(numposzaps, 437 "Number of cache hits (positive) we do not want to cache"); 438 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 439 STATNODE_COUNTER(numnegzaps, 440 "Number of cache hits (negative) we do not want to cache"); 441 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 442 /* These count for vn_getcwd(), too. */ 443 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 444 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 445 STATNODE_COUNTER(numfullpathfail2, 446 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 447 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 448 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 449 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 450 "Number of successful removals after relocking"); 451 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 452 "Number of times zap_and_exit failed to lock"); 453 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 454 "Number of times zap_and_exit failed to lock"); 455 static long cache_lock_vnodes_cel_3_failures; 456 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 457 "Number of times 3-way vnode locking failed"); 458 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 459 STATNODE_COUNTER(numneg_evicted, 460 "Number of negative entries evicted when adding a new entry"); 461 STATNODE_COUNTER(shrinking_skipped, 462 "Number of times shrinking was already in progress"); 463 464 static void cache_zap_locked(struct namecache *ncp); 465 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 466 char **freebuf, size_t *buflen); 467 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 468 char *buf, char **retbuf, size_t *buflen); 469 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 470 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 471 472 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 473 474 static int cache_yield; 475 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, 476 "Number of times cache called yield"); 477 478 static void __noinline 479 cache_maybe_yield(void) 480 { 481 482 if (should_yield()) { 483 cache_yield++; 484 kern_yield(PRI_USER); 485 } 486 } 487 488 static inline void 489 cache_assert_vlp_locked(struct mtx *vlp) 490 { 491 492 if (vlp != NULL) 493 mtx_assert(vlp, MA_OWNED); 494 } 495 496 static inline void 497 cache_assert_vnode_locked(struct vnode *vp) 498 { 499 struct mtx *vlp; 500 501 vlp = VP2VNODELOCK(vp); 502 cache_assert_vlp_locked(vlp); 503 } 504 505 /* 506 * TODO: With the value stored we can do better than computing the hash based 507 * on the address and the choice of FNV should also be revisisted. 
 */
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}

static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct rwlock *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp, int mode)
{
	struct rwlock *blp;

	blp = NCP2BUCKETLOCK(ncp);
	rw_assert(blp, mode);
}
#else
#define cache_assert_bucket_locked(x, y) do { } while (0)
#endif

#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wlock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wunlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * A variation of LRU scheme is used.  New entries are hashed into one of
 * numneglists cold lists.  Entries get promoted to the hot list on first hit.
 *
 * The shrinker will demote hot list head and evict from the cold list in a
 * round-robin manner.
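 *
 * Each negative entry carries a struct negstate whose NEG_HOT flag tracks
 * which list it is on; promotion and demotion are done with both the hot
 * list lock and the relevant cold list lock held.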
 */
static void
cache_negative_init(struct namecache *ncp)
{
	struct negstate *negstate;

	ncp->nc_flag |= NCF_NEGATIVE;
	negstate = NCP2NEGSTATE(ncp);
	negstate->neg_flag = 0;
}

static void
cache_negative_hit(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;

	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0)
		return;
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&ncneg_hot.nl_lock);
	mtx_lock(&neglist->nl_lock);
	if ((negstate->neg_flag & NEG_HOT) == 0) {
		numhotneg++;
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
		negstate->neg_flag |= NEG_HOT;
	}
	mtx_unlock(&neglist->nl_lock);
	mtx_unlock(&ncneg_hot.nl_lock);
}

static void
cache_negative_insert(struct namecache *ncp)
{
	struct neglist *neglist;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&neglist->nl_lock);
	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
	mtx_unlock(&neglist->nl_lock);
	atomic_add_rel_long(&numneg, 1);
}

static void
cache_negative_remove(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;
	bool hot_locked = false;
	bool list_locked = false;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	neglist = NCP2NEGLIST(ncp);
	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		hot_locked = true;
		mtx_lock(&ncneg_hot.nl_lock);
		if ((negstate->neg_flag & NEG_HOT) == 0) {
			list_locked = true;
			mtx_lock(&neglist->nl_lock);
		}
	} else {
		list_locked = true;
		mtx_lock(&neglist->nl_lock);
		/*
		 * We may be racing against promotion in lockless lookup.
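		 * (cache_negative_hit() itself only takes the hot list and
		 * per-list locks, so the flag has to be re-checked once the
		 * list lock is owned.)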
		 */
		if ((negstate->neg_flag & NEG_HOT) != 0) {
			mtx_unlock(&neglist->nl_lock);
			hot_locked = true;
			mtx_lock(&ncneg_hot.nl_lock);
			mtx_lock(&neglist->nl_lock);
		}
	}
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		numhotneg--;
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
	}
	if (list_locked)
		mtx_unlock(&neglist->nl_lock);
	if (hot_locked)
		mtx_unlock(&ncneg_hot.nl_lock);
	atomic_subtract_rel_long(&numneg, 1);
}

static void
cache_negative_shrink_select(struct namecache **ncpp,
    struct neglist **neglistpp)
{
	struct neglist *neglist;
	struct namecache *ncp;
	static u_int cycle;
	u_int i;

	*ncpp = ncp = NULL;

	for (i = 0; i < numneglists; i++) {
		neglist = &neglists[(cycle + i) % numneglists];
		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
			continue;
		mtx_lock(&neglist->nl_lock);
		ncp = TAILQ_FIRST(&neglist->nl_list);
		if (ncp != NULL)
			break;
		mtx_unlock(&neglist->nl_lock);
	}

	*neglistpp = neglist;
	*ncpp = ncp;
	cycle++;
}

static void
cache_negative_zap_one(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *neglist;
	struct negstate *negstate;
	struct mtx *dvlp;
	struct rwlock *blp;

	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
	    !mtx_trylock(&ncneg_shrink_lock)) {
		counter_u64_add(shrinking_skipped, 1);
		return;
	}

	mtx_lock(&ncneg_hot.nl_lock);
	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
	if (ncp != NULL) {
		neglist = NCP2NEGLIST(ncp);
		negstate = NCP2NEGSTATE(ncp);
		mtx_lock(&neglist->nl_lock);
		MPASS((negstate->neg_flag & NEG_HOT) != 0);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
		negstate->neg_flag &= ~NEG_HOT;
		numhotneg--;
		mtx_unlock(&neglist->nl_lock);
	}
	mtx_unlock(&ncneg_hot.nl_lock);

	cache_negative_shrink_select(&ncp, &neglist);

	mtx_unlock(&ncneg_shrink_lock);
	if (ncp == NULL)
		return;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	blp = NCP2BUCKETLOCK(ncp);
	mtx_unlock(&neglist->nl_lock);
	mtx_lock(dvlp);
	rw_wlock(blp);
	/*
	 * Enter SMR to safely check the negative list.
	 * Even if the found pointer matches, the entry may now be reallocated
	 * and used by a different vnode.
	 */
	vfs_smr_enter();
	ncp2 = TAILQ_FIRST(&neglist->nl_list);
	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
	    blp != NCP2BUCKETLOCK(ncp2)) {
		vfs_smr_exit();
		ncp = NULL;
	} else {
		vfs_smr_exit();
		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(numneg_evicted, 1);
	}
	rw_wunlock(blp);
	mtx_unlock(dvlp);
	cache_free(ncp);
}

/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
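 *
 *   The bucket lock and the relevant vnode lock(s) must be write-locked by
 *   the caller; the entry is marked NCF_INVALID before being unhashed so
 *   that lockless lookup can detect the removal.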
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_vp);
			ncp->nc_vp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_vp);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_negative_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_dvp);
			ncp->nc_dvp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_dvp);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			counter_u64_add(numcachehv, -1);
		}
	}
	atomic_subtract_rel_long(&numcache, 1);
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct rwlock *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	rw_wlock(blp);
	cache_zap_locked(ncp);
	rw_wunlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	rw_wlock(blp);
	cache_zap_locked(ncp);
	rw_wunlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

static int __noinline
cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;
	int error = 0;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	pvlp = VP2VNODELOCK(vp);
	if (ncp->nc_flag & NCF_NEGATIVE) {
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		goto out;
	}

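	/*
	 * Positive entry: the bucket lock and both vnode locks are needed.
	 * The remaining vnode lock is taken in address order; if the held
	 * lock sorts higher, only a trylock is attempted so the ordering is
	 * never violated (the caller retries on EAGAIN).
	 */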
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);
	cache_sort_vnodes(&vlp1, &vlp2);
	if (vlp1 == pvlp) {
		mtx_lock(vlp2);
		to_unlock = vlp2;
	} else {
		if (!mtx_trylock(vlp1)) {
			error = EAGAIN;
			goto out;
		}
		to_unlock = vlp1;
	}
	rw_wlock(blp);
	cache_zap_locked(ncp);
	rw_wunlock(blp);
	mtx_unlock(to_unlock);
out:
	mtx_unlock(pvlp);
	return (error);
}

/*
 * If trylocking failed we can get here.  We know enough to take all needed
 * locks in the right order and re-lookup the entry.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct rwlock *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_locked(ncp, RA_UNLOCKED);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	rw_wlock(blp);
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
		return (0);
	}

	rw_wunlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct rwlock *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	rw_wunlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static int __noinline
cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct rwlock *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp, RA_RLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		rw_runlock(blp);
		rw_wlock(blp);
		cache_zap_locked(ncp);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	rw_runlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static int
cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
    struct mtx **vlpp1, struct mtx **vlpp2)
{
	struct mtx *dvlp, *vlp;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	cache_sort_vnodes(&dvlp, &vlp);

	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
		cache_zap_locked(ncp);
		cache_unlock_vnodes(dvlp, vlp);
		*vlpp1 = NULL;
		*vlpp2 = NULL;
		return (0);
	}

	if (*vlpp1 != NULL)
		mtx_unlock(*vlpp1);
	if (*vlpp2 != NULL)
		mtx_unlock(*vlpp2);
	*vlpp1 = NULL;
	*vlpp2 = NULL;

	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	rw_wunlock(blp);
	*vlpp1 = dvlp;
	*vlpp2 = vlp;
	if (*vlpp1 != NULL)
		mtx_lock(*vlpp1);
	mtx_lock(*vlpp2);
	rw_wlock(blp);
	return (EAGAIN);
}

static void
cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
{

	if (blp != NULL) {
		rw_runlock(blp);
	} else {
		mtx_unlock(vlp);
	}
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
	    dvp, cnp->cn_nameptr);
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static __noinline int
cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp, struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct rwlock *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		counter_u64_add(dotdothits, 1);
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
			    "..", NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_dvp != dvp)
				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
			if (!cache_zap_locked_vnode_kl2(ncp,
			    dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			vn_seqc_write_begin(dvp);
			dvp->v_cache_dd = NULL;
			vn_seqc_write_end(dvp);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	rw_wlock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		rw_wunlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail++;
		cache_maybe_yield();
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	cache_free(ncp);
	return (0);
out_no_entry:
	SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
	counter_u64_add(nummisszap, 1);
	return (0);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than
 *			look it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it
 *		will be clear for "." entries.
 * - ticksp:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us
 *		due to a forced unmount.  vpp will not be modified.  If the
 *		entry is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
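 *
 * The lookup is first attempted locklessly (under vfs_smr); if the entry
 * cannot be safely used or the vnode cannot be acquired that way, the
 * lookup falls back to the locked path and is retried.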
 */
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct negstate *negstate;
	struct rwlock *blp;
	struct mtx *dvlp;
	uint32_t hash;
	enum vgetstate vs;
	int error, ltype;
	bool try_smr, doing_smr, whiteout;

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));

	if ((cnp->cn_flags & MAKEENTRY) == 0)
		return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));

	try_smr = true;
	if (cnp->cn_nameiop == CREATE)
		try_smr = false;
retry:
	doing_smr = false;
	blp = NULL;
	dvlp = NULL;
	error = 0;
	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		counter_u64_add(dotdothits, 1);
		dvlp = VP2VNODELOCK(dvp);
		mtx_lock(dvlp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
			    "..", NULL);
			mtx_unlock(dvlp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_flag & NCF_NEGATIVE)
				*vpp = NULL;
			else
				*vpp = ncp->nc_vp;
		} else
			*vpp = ncp->nc_dvp;
		/* Return failure if negative entry was found. */
		if (*vpp == NULL)
			goto negative_success;
		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
		    dvp, cnp->cn_nameptr, *vpp);
		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
		    *vpp);
		cache_out_ts(ncp, tsp, ticksp);
		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
		    NCF_DTS && tsp != NULL) {
			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
			*tsp = ncp_ts->nc_dotdottime;
		}
		goto success;
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
retry_hashed:
	if (try_smr) {
		vfs_smr_enter();
		doing_smr = true;
		try_smr = false;
	} else {
		blp = HASH2BUCKETLOCK(hash);
		rw_rlock(blp);
	}

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (__predict_false(ncp == NULL)) {
		if (doing_smr)
			vfs_smr_exit();
		else
			rw_runlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	/* We found a "positive" match, return the vnode */
	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
	    dvp, cnp->cn_nameptr, *vpp, ncp);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
	    *vpp);
	cache_out_ts(ncp, tsp, ticksp);
success:
	/*
	 * On success we return a locked and ref'd vnode as per the lookup
	 * protocol.
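	 *
	 * For ".." the directory vnode is temporarily unlocked before
	 * acquiring the target and relocked (and re-checked for doom)
	 * afterwards.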
	 */
	MPASS(dvp != *vpp);
	ltype = 0;	/* silence gcc warning */
	if (cnp->cn_flags & ISDOTDOT) {
		ltype = VOP_ISLOCKED(dvp);
		VOP_UNLOCK(dvp);
	}
	if (doing_smr) {
		if (!cache_ncp_canuse(ncp)) {
			vfs_smr_exit();
			*vpp = NULL;
			goto retry;
		}
		vs = vget_prep_smr(*vpp);
		vfs_smr_exit();
		if (__predict_false(vs == VGET_NONE)) {
			*vpp = NULL;
			goto retry;
		}
	} else {
		vs = vget_prep(*vpp);
		cache_lookup_unlock(blp, dvlp);
	}
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (cnp->cn_flags & ISDOTDOT) {
		vn_lock(dvp, ltype | LK_RETRY);
		if (VN_IS_DOOMED(dvp)) {
			if (error == 0)
				vput(*vpp);
			*vpp = NULL;
			return (ENOENT);
		}
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
	}
	return (-1);

negative_success:
	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		MPASS(!doing_smr);
		counter_u64_add(numnegzaps, 1);
		goto zap_and_exit;
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);

	if (doing_smr) {
		/*
		 * We need to take locks to promote an entry.
		 */
		negstate = NCP2NEGSTATE(ncp);
		if ((negstate->neg_flag & NEG_HOT) == 0 ||
		    !cache_ncp_canuse(ncp)) {
			vfs_smr_exit();
			doing_smr = false;
			goto retry_hashed;
		}
		vfs_smr_exit();
	} else {
		cache_negative_hit(ncp);
		cache_lookup_unlock(blp, dvlp);
	}
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);

zap_and_exit:
	MPASS(!doing_smr);
	if (blp != NULL)
		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
	else
		error = cache_zap_locked_vnode(ncp, dvp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail2++;
		cache_maybe_yield();
		goto retry;
	}
	cache_free(ncp);
	return (0);
}

struct celockstate {
	struct	mtx *vlp[3];
	struct	rwlock *blp[2];
};
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));

static inline void
cache_celockstate_init(struct celockstate *cel)
{

	bzero(cel, sizeof(*cel));
}

static void
cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
    struct vnode *dvp)
{
	struct mtx *vlp1, *vlp2;

	MPASS(cel->vlp[0] == NULL);
	MPASS(cel->vlp[1] == NULL);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL || dvp != NULL);

	vlp1 = VP2VNODELOCK(vp);
	vlp2 = VP2VNODELOCK(dvp);
	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		mtx_lock(vlp1);
		cel->vlp[0] = vlp1;
	}
	mtx_lock(vlp2);
	cel->vlp[1] = vlp2;
}

static void
cache_unlock_vnodes_cel(struct celockstate *cel)
{

	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);

	if (cel->vlp[0] != NULL)
		mtx_unlock(cel->vlp[0]);
	if (cel->vlp[1] != NULL)
		mtx_unlock(cel->vlp[1]);
	if (cel->vlp[2] != NULL)
		mtx_unlock(cel->vlp[2]);
}

static bool
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
{
	struct mtx *vlp;
	bool ret;

	cache_assert_vlp_locked(cel->vlp[0]);
	cache_assert_vlp_locked(cel->vlp[1]);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL);
	vlp = VP2VNODELOCK(vp);

	ret = true;
	if (vlp >= cel->vlp[1]) {
		mtx_lock(vlp);
	} else {
		if (mtx_trylock(vlp))
			goto out;
		cache_lock_vnodes_cel_3_failures++;
		cache_unlock_vnodes_cel(cel);
		if (vlp < cel->vlp[0]) {
			mtx_lock(vlp);
			mtx_lock(cel->vlp[0]);
			mtx_lock(cel->vlp[1]);
		} else {
			if (cel->vlp[0] != NULL)
				mtx_lock(cel->vlp[0]);
			mtx_lock(vlp);
			mtx_lock(cel->vlp[1]);
		}
		ret = false;
	}
out:
	cel->vlp[2] = vlp;
	return (ret);
}

static void
cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
    struct rwlock *blp2)
{

	MPASS(cel->blp[0] == NULL);
	MPASS(cel->blp[1] == NULL);

	cache_sort_vnodes(&blp1, &blp2);

	if (blp1 != NULL) {
		rw_wlock(blp1);
		cel->blp[0] = blp1;
	}
	rw_wlock(blp2);
	cel->blp[1] = blp2;
}

static void
cache_unlock_buckets_cel(struct celockstate *cel)
{

	if (cel->blp[0] != NULL)
		rw_wunlock(cel->blp[0]);
	rw_wunlock(cel->blp[1]);
}

/*
 * Lock part of the cache affected by the insertion.
 *
 * This means vnodelocks for dvp, vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry.  In this
 * case we have an additional vnode and bucketlock pair to lock.  If the
 * entry is negative, ncelock is locked instead of the vnode.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 * preserving the locking order (smaller address first).
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct rwlock *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		if (vp == NULL || vp->v_type != VDIR)
			break;
		ncp = vp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == vp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		/*
		 * All vnodes got re-locked.  Re-validate the state and if
		 * nothing changed we are done.  Otherwise restart.
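		 * (While the locks were dropped the ".." entry hanging off
		 * vp may have been replaced or zapped, hence the re-check.)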
		 */
		if (ncp == vp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct rwlock *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == dvp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		if (ncp == dvp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}

static void __noinline
cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp)
{
	struct celockstate cel;
	struct namecache *ncp;
	uint32_t hash;
	int len;

	if (dvp->v_cache_dd == NULL)
		return;
	len = cnp->cn_namelen;
	cache_celockstate_init(&cel);
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	cache_enter_lock_dd(&cel, dvp, vp, hash);
	vn_seqc_write_begin(dvp);
	ncp = dvp->v_cache_dd;
	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
		cache_zap_locked(ncp);
	} else {
		ncp = NULL;
	}
	dvp->v_cache_dd = NULL;
	vn_seqc_write_end(dvp);
	cache_enter_unlock(&cel);
	cache_free(ncp);
}

/*
 * Add an entry to the cache.
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
	struct celockstate cel;
	struct namecache *ncp, *n2, *ndd;
	struct namecache_ts *ncp_ts, *n2_ts;
	struct nchashhead *ncpp;
	uint32_t hash;
	int flag;
	int len;
	u_long lnumcache;

	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
	VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp,
	    ("cache_enter: Adding a doomed vnode"));
	VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp,
	    ("cache_enter: Doomed vnode used as src"));

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache))
		return;
#endif

	flag = 0;
	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return;
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			cache_enter_dotdot_prep(dvp, vp, cnp);
			flag = NCF_ISDOTDOT;
		}
	}

	/*
	 * Avoid blowout in namecache entries.
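	 * The limit is ncsize (desiredvnodes * ncsizefactor, recomputed on
	 * resize); entries over the limit are dropped and accounted for in
	 * numdrops.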
	 */
	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
	if (__predict_false(lnumcache >= ncsize)) {
		atomic_add_long(&numcache, -1);
		counter_u64_add(numdrops, 1);
		return;
	}

	cache_celockstate_init(&cel);
	ndd = NULL;
	ncp_ts = NULL;

	/*
	 * Calculate the hash key and setup as much of the new
	 * namecache entry as possible before acquiring the lock.
	 */
	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
	ncp->nc_flag = flag | NCF_WIP;
	ncp->nc_vp = vp;
	if (vp == NULL)
		cache_negative_init(ncp);
	ncp->nc_dvp = dvp;
	if (tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		ncp_ts->nc_time = *tsp;
		ncp_ts->nc_ticks = ticks;
		ncp_ts->nc_nc.nc_flag |= NCF_TS;
		if (dtsp != NULL) {
			ncp_ts->nc_dotdottime = *dtsp;
			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
		}
	}
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
	cache_enter_lock(&cel, dvp, vp, hash);

	/*
	 * See if this vnode or negative entry is already in the cache
	 * with this name.  This can happen with concurrent lookups of
	 * the same path name.
	 */
	ncpp = NCHHASH(hash);
	CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
		if (n2->nc_dvp == dvp &&
		    n2->nc_nlen == cnp->cn_namelen &&
		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
			if (tsp != NULL) {
				KASSERT((n2->nc_flag & NCF_TS) != 0,
				    ("no NCF_TS"));
				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
				n2_ts->nc_time = ncp_ts->nc_time;
				n2_ts->nc_ticks = ncp_ts->nc_ticks;
				if (dtsp != NULL) {
					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
					n2_ts->nc_nc.nc_flag |= NCF_DTS;
				}
			}
			goto out_unlock_free;
		}
	}

	if (flag == NCF_ISDOTDOT) {
		/*
		 * See if we are trying to add .. entry, but some other lookup
		 * has populated v_cache_dd pointer already.
		 */
		if (dvp->v_cache_dd != NULL)
			goto out_unlock_free;
		KASSERT(vp == NULL || vp->v_type == VDIR,
		    ("wrong vnode type %p", vp));
		vn_seqc_write_begin(dvp);
		dvp->v_cache_dd = ncp;
		vn_seqc_write_end(dvp);
	}

	if (vp != NULL) {
		if (vp->v_type == VDIR) {
			if (flag != NCF_ISDOTDOT) {
				/*
				 * For this case, the cache entry maps both the
				 * directory name in it and the name ".." for the
				 * directory's parent.
				 */
				vn_seqc_write_begin(vp);
				if ((ndd = vp->v_cache_dd) != NULL) {
					if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
						cache_zap_locked(ndd);
					else
						ndd = NULL;
				}
				vp->v_cache_dd = ncp;
				vn_seqc_write_end(vp);
			}
		} else {
			if (vp->v_cache_dd != NULL) {
				vn_seqc_write_begin(vp);
				vp->v_cache_dd = NULL;
				vn_seqc_write_end(vp);
			}
		}
	}

	if (flag != NCF_ISDOTDOT) {
		if (LIST_EMPTY(&dvp->v_cache_src)) {
			vhold(dvp);
			counter_u64_add(numcachehv, 1);
		}
		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	}

	/*
	 * If the entry is "negative", we place it into the
	 * "negative" cache queue, otherwise, we place it into the
	 * destination vnode's cache entries queue.
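	 * Negative entries always start out on one of the cold LRU lists;
	 * see the negative entry management comment above.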
2018 */ 2019 if (vp != NULL) { 2020 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 2021 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 2022 vp); 2023 } else { 2024 if (cnp->cn_flags & ISWHITEOUT) 2025 ncp->nc_flag |= NCF_WHITE; 2026 cache_negative_insert(ncp); 2027 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 2028 ncp->nc_name); 2029 } 2030 2031 /* 2032 * Insert the new namecache entry into the appropriate chain 2033 * within the cache entries table. 2034 */ 2035 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); 2036 2037 atomic_thread_fence_rel(); 2038 /* 2039 * Mark the entry as fully constructed. 2040 * It is immutable past this point until its removal. 2041 */ 2042 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); 2043 2044 cache_enter_unlock(&cel); 2045 if (numneg * ncnegfactor > lnumcache) 2046 cache_negative_zap_one(); 2047 cache_free(ndd); 2048 return; 2049 out_unlock_free: 2050 cache_enter_unlock(&cel); 2051 atomic_add_long(&numcache, -1); 2052 cache_free(ncp); 2053 return; 2054 } 2055 2056 static u_int 2057 cache_roundup_2(u_int val) 2058 { 2059 u_int res; 2060 2061 for (res = 1; res <= val; res <<= 1) 2062 continue; 2063 2064 return (res); 2065 } 2066 2067 static struct nchashhead * 2068 nchinittbl(u_long elements, u_long *hashmask) 2069 { 2070 struct nchashhead *hashtbl; 2071 u_long hashsize, i; 2072 2073 hashsize = cache_roundup_2(elements) / 2; 2074 2075 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); 2076 for (i = 0; i < hashsize; i++) 2077 CK_SLIST_INIT(&hashtbl[i]); 2078 *hashmask = hashsize - 1; 2079 return (hashtbl); 2080 } 2081 2082 static void 2083 ncfreetbl(struct nchashhead *hashtbl) 2084 { 2085 2086 free(hashtbl, M_VFSCACHE); 2087 } 2088 2089 /* 2090 * Name cache initialization, from vfs_init() when we are booting 2091 */ 2092 static void 2093 nchinit(void *dummy __unused) 2094 { 2095 u_int i; 2096 2097 cache_zone_small = uma_zcreate("S VFS Cache", 2098 sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, 2099 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, 2100 UMA_ZONE_ZINIT); 2101 cache_zone_small_ts = uma_zcreate("STS VFS Cache", 2102 sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, 2103 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, 2104 UMA_ZONE_ZINIT); 2105 cache_zone_large = uma_zcreate("L VFS Cache", 2106 sizeof(struct namecache) + NAME_MAX + 1, 2107 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, 2108 UMA_ZONE_ZINIT); 2109 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", 2110 sizeof(struct namecache_ts) + NAME_MAX + 1, 2111 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, 2112 UMA_ZONE_ZINIT); 2113 2114 VFS_SMR_ZONE_SET(cache_zone_small); 2115 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2116 VFS_SMR_ZONE_SET(cache_zone_large); 2117 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2118 2119 ncsize = desiredvnodes * ncsizefactor; 2120 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); 2121 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2122 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2123 ncbuckethash = 7; 2124 if (ncbuckethash > nchash) 2125 ncbuckethash = nchash; 2126 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2127 M_WAITOK | M_ZERO); 2128 for (i = 0; i < numbucketlocks; i++) 2129 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 2130 ncvnodehash = ncbuckethash; 2131 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2132 M_WAITOK | M_ZERO); 2133 for (i = 0; i < numvnodelocks; i++) 2134 mtx_init(&vnodelocks[i], "ncvn", NULL, 
MTX_DUPOK | MTX_RECURSE); 2135 ncpurgeminvnodes = numbucketlocks * 2; 2136 2137 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 2138 M_WAITOK | M_ZERO); 2139 for (i = 0; i < numneglists; i++) { 2140 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2141 TAILQ_INIT(&neglists[i].nl_list); 2142 } 2143 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 2144 TAILQ_INIT(&ncneg_hot.nl_list); 2145 2146 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2147 } 2148 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2149 2150 void 2151 cache_vnode_init(struct vnode *vp) 2152 { 2153 2154 LIST_INIT(&vp->v_cache_src); 2155 TAILQ_INIT(&vp->v_cache_dst); 2156 vp->v_cache_dd = NULL; 2157 cache_prehash(vp); 2158 } 2159 2160 void 2161 cache_changesize(u_long newmaxvnodes) 2162 { 2163 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2164 u_long new_nchash, old_nchash; 2165 struct namecache *ncp; 2166 uint32_t hash; 2167 u_long newncsize; 2168 int i; 2169 2170 newncsize = newmaxvnodes * ncsizefactor; 2171 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2172 if (newmaxvnodes < numbucketlocks) 2173 newmaxvnodes = numbucketlocks; 2174 2175 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); 2176 /* If same hash table size, nothing to do */ 2177 if (nchash == new_nchash) { 2178 ncfreetbl(new_nchashtbl); 2179 return; 2180 } 2181 /* 2182 * Move everything from the old hash table to the new table. 2183 * None of the namecache entries in the table can be removed 2184 * because to do so, they have to be removed from the hash table. 2185 */ 2186 cache_lock_all_vnodes(); 2187 cache_lock_all_buckets(); 2188 old_nchashtbl = nchashtbl; 2189 old_nchash = nchash; 2190 nchashtbl = new_nchashtbl; 2191 nchash = new_nchash; 2192 for (i = 0; i <= old_nchash; i++) { 2193 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { 2194 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2195 ncp->nc_dvp); 2196 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); 2197 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2198 } 2199 } 2200 ncsize = newncsize; 2201 cache_unlock_all_buckets(); 2202 cache_unlock_all_vnodes(); 2203 ncfreetbl(old_nchashtbl); 2204 } 2205 2206 /* 2207 * Invalidate all entries from and to a particular vnode. 
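 * "From" entries are the names cached under this directory (v_cache_src),
 * "to" entries are the names resolving to this vnode (v_cache_dst), plus
 * the ".." entry hanging off v_cache_dd.  A caller typically purges once
 * the object is gone, e.g. (hypothetical filesystem code, sketched):
 *
 *	if (xxx_remove_on_disk(dvp, vp, cnp) == 0)
 *		cache_purge(vp);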
2208 */ 2209 static void 2210 cache_purge_impl(struct vnode *vp) 2211 { 2212 TAILQ_HEAD(, namecache) ncps; 2213 struct namecache *ncp, *nnp; 2214 struct mtx *vlp, *vlp2; 2215 2216 TAILQ_INIT(&ncps); 2217 vlp = VP2VNODELOCK(vp); 2218 vlp2 = NULL; 2219 mtx_assert(vlp, MA_OWNED); 2220 retry: 2221 while (!LIST_EMPTY(&vp->v_cache_src)) { 2222 ncp = LIST_FIRST(&vp->v_cache_src); 2223 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2224 goto retry; 2225 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2226 } 2227 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2228 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2229 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2230 goto retry; 2231 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2232 } 2233 ncp = vp->v_cache_dd; 2234 if (ncp != NULL) { 2235 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2236 ("lost dotdot link")); 2237 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2238 goto retry; 2239 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2240 } 2241 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2242 mtx_unlock(vlp); 2243 if (vlp2 != NULL) 2244 mtx_unlock(vlp2); 2245 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2246 cache_free(ncp); 2247 } 2248 } 2249 2250 void 2251 cache_purge(struct vnode *vp) 2252 { 2253 struct mtx *vlp; 2254 2255 SDT_PROBE1(vfs, namecache, purge, done, vp); 2256 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2257 vp->v_cache_dd == NULL) 2258 return; 2259 vlp = VP2VNODELOCK(vp); 2260 mtx_lock(vlp); 2261 cache_purge_impl(vp); 2262 } 2263 2264 /* 2265 * Only to be used by vgone. 2266 */ 2267 void 2268 cache_purge_vgone(struct vnode *vp) 2269 { 2270 struct mtx *vlp; 2271 2272 VNPASS(VN_IS_DOOMED(vp), vp); 2273 vlp = VP2VNODELOCK(vp); 2274 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2275 vp->v_cache_dd == NULL)) { 2276 mtx_lock(vlp); 2277 cache_purge_impl(vp); 2278 mtx_assert(vlp, MA_NOTOWNED); 2279 return; 2280 } 2281 2282 /* 2283 * All the NULL pointer state we found above may be transient. 2284 * Serialize against a possible thread doing cache_purge. 2285 */ 2286 mtx_wait_unlocked(vlp); 2287 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2288 vp->v_cache_dd == NULL)) { 2289 mtx_lock(vlp); 2290 cache_purge_impl(vp); 2291 mtx_assert(vlp, MA_NOTOWNED); 2292 return; 2293 } 2294 return; 2295 } 2296 2297 /* 2298 * Invalidate all negative entries for a particular directory vnode. 2299 */ 2300 void 2301 cache_purge_negative(struct vnode *vp) 2302 { 2303 TAILQ_HEAD(, namecache) ncps; 2304 struct namecache *ncp, *nnp; 2305 struct mtx *vlp; 2306 2307 CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); 2308 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2309 if (LIST_EMPTY(&vp->v_cache_src)) 2310 return; 2311 TAILQ_INIT(&ncps); 2312 vlp = VP2VNODELOCK(vp); 2313 mtx_lock(vlp); 2314 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2315 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2316 continue; 2317 cache_zap_negative_locked_vnode_kl(ncp, vp); 2318 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2319 } 2320 mtx_unlock(vlp); 2321 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2322 cache_free(ncp); 2323 } 2324 } 2325 2326 /* 2327 * Flush all entries referencing a particular filesystem. 
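 * Entries are found by scanning every hash chain, which is expensive, so
 * unless "force" is requested mounts holding no more than ncpurgeminvnodes
 * vnodes are skipped and their entries are simply left to age out.  This
 * is typically invoked during unmount.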
2328 */ 2329 void 2330 cache_purgevfs(struct mount *mp, bool force) 2331 { 2332 TAILQ_HEAD(, namecache) ncps; 2333 struct mtx *vlp1, *vlp2; 2334 struct rwlock *blp; 2335 struct nchashhead *bucket; 2336 struct namecache *ncp, *nnp; 2337 u_long i, j, n_nchash; 2338 int error; 2339 2340 /* Scan hash tables for applicable entries */ 2341 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2342 if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) 2343 return; 2344 TAILQ_INIT(&ncps); 2345 n_nchash = nchash + 1; 2346 vlp1 = vlp2 = NULL; 2347 for (i = 0; i < numbucketlocks; i++) { 2348 blp = (struct rwlock *)&bucketlocks[i]; 2349 rw_wlock(blp); 2350 for (j = i; j < n_nchash; j += numbucketlocks) { 2351 retry: 2352 bucket = &nchashtbl[j]; 2353 CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { 2354 cache_assert_bucket_locked(ncp, RA_WLOCKED); 2355 if (ncp->nc_dvp->v_mount != mp) 2356 continue; 2357 error = cache_zap_wlocked_bucket_kl(ncp, blp, 2358 &vlp1, &vlp2); 2359 if (error != 0) 2360 goto retry; 2361 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); 2362 } 2363 } 2364 rw_wunlock(blp); 2365 if (vlp1 == NULL && vlp2 == NULL) 2366 cache_maybe_yield(); 2367 } 2368 if (vlp1 != NULL) 2369 mtx_unlock(vlp1); 2370 if (vlp2 != NULL) 2371 mtx_unlock(vlp2); 2372 2373 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2374 cache_free(ncp); 2375 } 2376 } 2377 2378 /* 2379 * Perform canonical checks and cache lookup and pass on to filesystem 2380 * through the vop_cachedlookup only if needed. 2381 */ 2382 2383 int 2384 vfs_cache_lookup(struct vop_lookup_args *ap) 2385 { 2386 struct vnode *dvp; 2387 int error; 2388 struct vnode **vpp = ap->a_vpp; 2389 struct componentname *cnp = ap->a_cnp; 2390 int flags = cnp->cn_flags; 2391 2392 *vpp = NULL; 2393 dvp = ap->a_dvp; 2394 2395 if (dvp->v_type != VDIR) 2396 return (ENOTDIR); 2397 2398 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2399 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2400 return (EROFS); 2401 2402 error = vn_dir_check_exec(dvp, cnp); 2403 if (error != 0) 2404 return (error); 2405 2406 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2407 if (error == 0) 2408 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2409 if (error == -1) 2410 return (0); 2411 return (error); 2412 } 2413 2414 /* Implementation of the getcwd syscall. 
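 * The path is reconstructed by walking the namecache backwards from the
 * current directory towards the process root, falling back to
 * VOP_VPTOCNP() where an entry is missing:
 *
 *	sys___getcwd() -> vn_getcwd() -> vn_fullpath_any(pwd_cdir, pwd_rdir, ...)
 *
 * libc's getcwd(3) is normally a thin wrapper around this system call.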
*/ 2415 int 2416 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2417 { 2418 char *buf, *retbuf; 2419 size_t buflen; 2420 int error; 2421 2422 buflen = uap->buflen; 2423 if (__predict_false(buflen < 2)) 2424 return (EINVAL); 2425 if (buflen > MAXPATHLEN) 2426 buflen = MAXPATHLEN; 2427 2428 buf = malloc(buflen, M_TEMP, M_WAITOK); 2429 error = vn_getcwd(td, buf, &retbuf, &buflen); 2430 if (error == 0) 2431 error = copyout(retbuf, uap->buf, buflen); 2432 free(buf, M_TEMP); 2433 return (error); 2434 } 2435 2436 int 2437 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen) 2438 { 2439 struct pwd *pwd; 2440 int error; 2441 2442 pwd = pwd_hold(td); 2443 error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen); 2444 pwd_drop(pwd); 2445 2446 #ifdef KTRACE 2447 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2448 ktrnamei(*retbuf); 2449 #endif 2450 return (error); 2451 } 2452 2453 static int 2454 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2455 size_t size, int flags, enum uio_seg pathseg) 2456 { 2457 struct nameidata nd; 2458 char *retbuf, *freebuf; 2459 int error; 2460 2461 if (flags != 0) 2462 return (EINVAL); 2463 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2464 pathseg, path, fd, &cap_fstat_rights, td); 2465 if ((error = namei(&nd)) != 0) 2466 return (error); 2467 error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size); 2468 if (error == 0) { 2469 error = copyout(retbuf, buf, size); 2470 free(freebuf, M_TEMP); 2471 } 2472 NDFREE(&nd, 0); 2473 return (error); 2474 } 2475 2476 int 2477 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2478 { 2479 2480 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2481 uap->flags, UIO_USERSPACE)); 2482 } 2483 2484 /* 2485 * Retrieve the full filesystem path that correspond to a vnode from the name 2486 * cache (if available) 2487 */ 2488 int 2489 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) 2490 { 2491 struct pwd *pwd; 2492 char *buf; 2493 size_t buflen; 2494 int error; 2495 2496 if (__predict_false(vn == NULL)) 2497 return (EINVAL); 2498 2499 buflen = MAXPATHLEN; 2500 buf = malloc(buflen, M_TEMP, M_WAITOK); 2501 pwd = pwd_hold(td); 2502 error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen); 2503 pwd_drop(pwd); 2504 2505 if (!error) 2506 *freebuf = buf; 2507 else 2508 free(buf, M_TEMP); 2509 return (error); 2510 } 2511 2512 /* 2513 * This function is similar to vn_fullpath, but it attempts to lookup the 2514 * pathname relative to the global root mount point. This is required for the 2515 * auditing sub-system, as audited pathnames must be absolute, relative to the 2516 * global root mount point. 
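 * The only difference from vn_fullpath() is that the walk terminates at
 * rootvnode instead of the process root directory, so for a chrooted or
 * jailed process the result is the path as seen from the host root.
 * A minimal caller might look like this (sketch, surrounding code assumed):
 *
 *	char *fullpath, *freebuf;
 *
 *	if (vn_fullpath_global(td, vp, &fullpath, &freebuf) == 0) {
 *		... use fullpath ...
 *		free(freebuf, M_TEMP);
 *	}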
2517 */ 2518 int 2519 vn_fullpath_global(struct thread *td, struct vnode *vn, 2520 char **retbuf, char **freebuf) 2521 { 2522 char *buf; 2523 size_t buflen; 2524 int error; 2525 2526 if (__predict_false(vn == NULL)) 2527 return (EINVAL); 2528 buflen = MAXPATHLEN; 2529 buf = malloc(buflen, M_TEMP, M_WAITOK); 2530 error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen); 2531 if (!error) 2532 *freebuf = buf; 2533 else 2534 free(buf, M_TEMP); 2535 return (error); 2536 } 2537 2538 int 2539 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2540 { 2541 struct vnode *dvp; 2542 struct namecache *ncp; 2543 struct mtx *vlp; 2544 int error; 2545 2546 vlp = VP2VNODELOCK(*vp); 2547 mtx_lock(vlp); 2548 TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { 2549 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2550 break; 2551 } 2552 if (ncp != NULL) { 2553 if (*buflen < ncp->nc_nlen) { 2554 mtx_unlock(vlp); 2555 vrele(*vp); 2556 counter_u64_add(numfullpathfail4, 1); 2557 error = ENOMEM; 2558 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2559 vp, NULL); 2560 return (error); 2561 } 2562 *buflen -= ncp->nc_nlen; 2563 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2564 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2565 ncp->nc_name, vp); 2566 dvp = *vp; 2567 *vp = ncp->nc_dvp; 2568 vref(*vp); 2569 mtx_unlock(vlp); 2570 vrele(dvp); 2571 return (0); 2572 } 2573 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2574 2575 mtx_unlock(vlp); 2576 vn_lock(*vp, LK_SHARED | LK_RETRY); 2577 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2578 vput(*vp); 2579 if (error) { 2580 counter_u64_add(numfullpathfail2, 1); 2581 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2582 return (error); 2583 } 2584 2585 *vp = dvp; 2586 if (VN_IS_DOOMED(dvp)) { 2587 /* forced unmount */ 2588 vrele(dvp); 2589 error = ENOENT; 2590 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2591 return (error); 2592 } 2593 /* 2594 * *vp has its use count incremented still. 2595 */ 2596 2597 return (0); 2598 } 2599 2600 /* 2601 * Resolve a directory to a pathname. 2602 * 2603 * The name of the directory can always be found in the namecache or fetched 2604 * from the filesystem. There is also guaranteed to be only one parent, meaning 2605 * we can just follow vnodes up until we find the root. 2606 * 2607 * The vnode must be referenced. 2608 */ 2609 static int 2610 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 2611 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend) 2612 { 2613 #ifdef KDTRACE_HOOKS 2614 struct vnode *startvp = vp; 2615 #endif 2616 struct vnode *vp1; 2617 size_t buflen; 2618 int error; 2619 2620 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2621 VNPASS(vp->v_usecount > 0, vp); 2622 2623 buflen = *len; 2624 2625 if (!slash_prefixed) { 2626 MPASS(*len >= 2); 2627 buflen--; 2628 buf[buflen] = '\0'; 2629 } 2630 2631 error = 0; 2632 2633 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2634 counter_u64_add(numfullpathcalls, 1); 2635 while (vp != rdir && vp != rootvnode) { 2636 /* 2637 * The vp vnode must be already fully constructed, 2638 * since it is either found in namecache or obtained 2639 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2640 * without obtaining the vnode lock. 2641 */ 2642 if ((vp->v_vflag & VV_ROOT) != 0) { 2643 vn_lock(vp, LK_RETRY | LK_SHARED); 2644 2645 /* 2646 * With the vnode locked, check for races with 2647 * unmount, forced or not. 
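 * The crossing goes from the root vnode of the mounted filesystem to the
 * vnode it covers in the filesystem below (vp->v_mount->mnt_vnodecovered).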
Note that we 2648 * already verified that vp is not equal to 2649 * the root vnode, which means that 2650 * mnt_vnodecovered can be NULL only for the 2651 * case of unmount. 2652 */ 2653 if (VN_IS_DOOMED(vp) || 2654 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2655 vp1->v_mountedhere != vp->v_mount) { 2656 vput(vp); 2657 error = ENOENT; 2658 SDT_PROBE3(vfs, namecache, fullpath, return, 2659 error, vp, NULL); 2660 break; 2661 } 2662 2663 vref(vp1); 2664 vput(vp); 2665 vp = vp1; 2666 continue; 2667 } 2668 if (vp->v_type != VDIR) { 2669 vrele(vp); 2670 counter_u64_add(numfullpathfail1, 1); 2671 error = ENOTDIR; 2672 SDT_PROBE3(vfs, namecache, fullpath, return, 2673 error, vp, NULL); 2674 break; 2675 } 2676 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); 2677 if (error) 2678 break; 2679 if (buflen == 0) { 2680 vrele(vp); 2681 error = ENOMEM; 2682 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2683 startvp, NULL); 2684 break; 2685 } 2686 buf[--buflen] = '/'; 2687 slash_prefixed = true; 2688 } 2689 if (error) 2690 return (error); 2691 if (!slash_prefixed) { 2692 if (buflen == 0) { 2693 vrele(vp); 2694 counter_u64_add(numfullpathfail4, 1); 2695 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2696 startvp, NULL); 2697 return (ENOMEM); 2698 } 2699 buf[--buflen] = '/'; 2700 } 2701 counter_u64_add(numfullpathfound, 1); 2702 vrele(vp); 2703 2704 *retbuf = buf + buflen; 2705 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2706 *len -= buflen; 2707 *len += addend; 2708 return (0); 2709 } 2710 2711 /* 2712 * Resolve an arbitrary vnode to a pathname. 2713 * 2714 * Note 2 caveats: 2715 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2716 * resolve to a different path than the one used to find it 2717 * - namecache is not mandatory, meaning names are not guaranteed to be added 2718 * (in which case resolving fails) 2719 */ 2720 static int 2721 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 2722 char *buf, char **retbuf, size_t *buflen) 2723 { 2724 size_t orig_buflen; 2725 bool slash_prefixed; 2726 int error; 2727 2728 if (*buflen < 2) 2729 return (EINVAL); 2730 2731 orig_buflen = *buflen; 2732 2733 vref(vp); 2734 slash_prefixed = false; 2735 if (vp->v_type != VDIR) { 2736 *buflen -= 1; 2737 buf[*buflen] = '\0'; 2738 error = vn_vptocnp(&vp, td->td_ucred, buf, buflen); 2739 if (error) 2740 return (error); 2741 if (*buflen == 0) { 2742 vrele(vp); 2743 return (ENOMEM); 2744 } 2745 *buflen -= 1; 2746 buf[*buflen] = '/'; 2747 slash_prefixed = true; 2748 } 2749 2750 return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed, 2751 orig_buflen - *buflen)); 2752 } 2753 2754 /* 2755 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2756 * 2757 * Since the namecache does not track handlings, the caller is expected to first 2758 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 
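 *
 * For example, kern___realpathat() above does essentially:
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT, UIO_SYSSPACE,
 *	    path, td);
 *	if ((error = namei(&nd)) == 0)
 *		error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size);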
2759 * 2760 * Then we have 2 cases: 2761 * - if the found vnode is a directory, the path can be constructed just by 2762 * fullowing names up the chain 2763 * - otherwise we populate the buffer with the saved name and start resolving 2764 * from the parent 2765 */ 2766 static int 2767 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 2768 char **freebuf, size_t *buflen) 2769 { 2770 char *buf, *tmpbuf; 2771 struct pwd *pwd; 2772 struct componentname *cnp; 2773 struct vnode *vp; 2774 size_t addend; 2775 int error; 2776 bool slash_prefixed; 2777 2778 if (*buflen < 2) 2779 return (EINVAL); 2780 if (*buflen > MAXPATHLEN) 2781 *buflen = MAXPATHLEN; 2782 2783 slash_prefixed = false; 2784 2785 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2786 pwd = pwd_hold(td); 2787 2788 addend = 0; 2789 vp = ndp->ni_vp; 2790 if (vp->v_type != VDIR) { 2791 cnp = &ndp->ni_cnd; 2792 addend = cnp->cn_namelen + 2; 2793 if (*buflen < addend) { 2794 error = ENOMEM; 2795 goto out_bad; 2796 } 2797 *buflen -= addend; 2798 tmpbuf = buf + *buflen; 2799 tmpbuf[0] = '/'; 2800 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 2801 tmpbuf[addend - 1] = '\0'; 2802 slash_prefixed = true; 2803 vp = ndp->ni_dvp; 2804 } 2805 2806 vref(vp); 2807 error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen, 2808 slash_prefixed, addend); 2809 if (error != 0) 2810 goto out_bad; 2811 2812 pwd_drop(pwd); 2813 *freebuf = buf; 2814 2815 return (0); 2816 out_bad: 2817 pwd_drop(pwd); 2818 free(buf, M_TEMP); 2819 return (error); 2820 } 2821 2822 struct vnode * 2823 vn_dir_dd_ino(struct vnode *vp) 2824 { 2825 struct namecache *ncp; 2826 struct vnode *ddvp; 2827 struct mtx *vlp; 2828 enum vgetstate vs; 2829 2830 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 2831 vlp = VP2VNODELOCK(vp); 2832 mtx_lock(vlp); 2833 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 2834 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 2835 continue; 2836 ddvp = ncp->nc_dvp; 2837 vs = vget_prep(ddvp); 2838 mtx_unlock(vlp); 2839 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 2840 return (NULL); 2841 return (ddvp); 2842 } 2843 mtx_unlock(vlp); 2844 return (NULL); 2845 } 2846 2847 int 2848 vn_commname(struct vnode *vp, char *buf, u_int buflen) 2849 { 2850 struct namecache *ncp; 2851 struct mtx *vlp; 2852 int l; 2853 2854 vlp = VP2VNODELOCK(vp); 2855 mtx_lock(vlp); 2856 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 2857 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2858 break; 2859 if (ncp == NULL) { 2860 mtx_unlock(vlp); 2861 return (ENOENT); 2862 } 2863 l = min(ncp->nc_nlen, buflen - 1); 2864 memcpy(buf, ncp->nc_name, l); 2865 mtx_unlock(vlp); 2866 buf[l] = '\0'; 2867 return (0); 2868 } 2869 2870 /* 2871 * This function updates path string to vnode's full global path 2872 * and checks the size of the new path string against the pathlen argument. 2873 * 2874 * Requires a locked, referenced vnode. 2875 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 2876 * 2877 * If vp is a directory, the call to vn_fullpath_global() always succeeds 2878 * because it falls back to the ".." lookup if the namecache lookup fails. 2879 */ 2880 int 2881 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 2882 u_int pathlen) 2883 { 2884 struct nameidata nd; 2885 struct vnode *vp1; 2886 char *rpath, *fbuf; 2887 int error; 2888 2889 ASSERT_VOP_ELOCKED(vp, __func__); 2890 2891 /* Construct global filesystem path from vp. 
*/ 2892 VOP_UNLOCK(vp); 2893 error = vn_fullpath_global(td, vp, &rpath, &fbuf); 2894 2895 if (error != 0) { 2896 vrele(vp); 2897 return (error); 2898 } 2899 2900 if (strlen(rpath) >= pathlen) { 2901 vrele(vp); 2902 error = ENAMETOOLONG; 2903 goto out; 2904 } 2905 2906 /* 2907 * Re-lookup the vnode by path to detect a possible rename. 2908 * As a side effect, the vnode is relocked. 2909 * If vnode was renamed, return ENOENT. 2910 */ 2911 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 2912 UIO_SYSSPACE, path, td); 2913 error = namei(&nd); 2914 if (error != 0) { 2915 vrele(vp); 2916 goto out; 2917 } 2918 NDFREE(&nd, NDF_ONLY_PNBUF); 2919 vp1 = nd.ni_vp; 2920 vrele(vp); 2921 if (vp1 == vp) 2922 strcpy(path, rpath); 2923 else { 2924 vput(vp1); 2925 error = ENOENT; 2926 } 2927 2928 out: 2929 free(fbuf, M_TEMP); 2930 return (error); 2931 } 2932 2933 #ifdef DDB 2934 static void 2935 db_print_vpath(struct vnode *vp) 2936 { 2937 2938 while (vp != NULL) { 2939 db_printf("%p: ", vp); 2940 if (vp == rootvnode) { 2941 db_printf("/"); 2942 vp = NULL; 2943 } else { 2944 if (vp->v_vflag & VV_ROOT) { 2945 db_printf("<mount point>"); 2946 vp = vp->v_mount->mnt_vnodecovered; 2947 } else { 2948 struct namecache *ncp; 2949 char *ncn; 2950 int i; 2951 2952 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2953 if (ncp != NULL) { 2954 ncn = ncp->nc_name; 2955 for (i = 0; i < ncp->nc_nlen; i++) 2956 db_printf("%c", *ncn++); 2957 vp = ncp->nc_dvp; 2958 } else { 2959 vp = NULL; 2960 } 2961 } 2962 } 2963 db_printf("\n"); 2964 } 2965 2966 return; 2967 } 2968 2969 DB_SHOW_COMMAND(vpath, db_show_vpath) 2970 { 2971 struct vnode *vp; 2972 2973 if (!have_addr) { 2974 db_printf("usage: show vpath <struct vnode *>\n"); 2975 return; 2976 } 2977 2978 vp = (struct vnode *)addr; 2979 db_print_vpath(vp); 2980 } 2981 2982 #endif 2983 2984 extern uma_zone_t namei_zone; 2985 2986 static bool __read_frequently cache_fast_lookup = true; 2987 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, 2988 &cache_fast_lookup, 0, ""); 2989 2990 #define CACHE_FPL_FAILED -2020 2991 2992 static void 2993 cache_fpl_cleanup_cnp(struct componentname *cnp) 2994 { 2995 2996 uma_zfree(namei_zone, cnp->cn_pnbuf); 2997 #ifdef DIAGNOSTIC 2998 cnp->cn_pnbuf = NULL; 2999 cnp->cn_nameptr = NULL; 3000 #endif 3001 } 3002 3003 static void 3004 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) 3005 { 3006 struct componentname *cnp; 3007 3008 cnp = &ndp->ni_cnd; 3009 while (*(cnp->cn_nameptr) == '/') { 3010 cnp->cn_nameptr++; 3011 ndp->ni_pathlen--; 3012 } 3013 3014 *dpp = ndp->ni_rootdir; 3015 } 3016 3017 /* 3018 * Components of nameidata (or objects it can point to) which may 3019 * need restoring in case fast path lookup fails. 
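 * cache_fpl_checkpoint() snapshots these fields before each component is
 * consumed and cache_fpl_restore() rolls them back when the lookup is
 * handed over to the regular, locked code (see cache_fplookup_partial_setup()
 * and the CACHE_FPL_STATUS_ABORTED handling in cache_fplookup()).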
3020 */ 3021 struct nameidata_saved { 3022 long cn_namelen; 3023 char *cn_nameptr; 3024 size_t ni_pathlen; 3025 int cn_flags; 3026 }; 3027 3028 struct cache_fpl { 3029 struct nameidata *ndp; 3030 struct componentname *cnp; 3031 struct pwd *pwd; 3032 struct vnode *dvp; 3033 struct vnode *tvp; 3034 seqc_t dvp_seqc; 3035 seqc_t tvp_seqc; 3036 struct nameidata_saved snd; 3037 int line; 3038 enum cache_fpl_status status:8; 3039 bool in_smr; 3040 }; 3041 3042 static void 3043 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) 3044 { 3045 3046 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; 3047 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; 3048 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; 3049 snd->ni_pathlen = fpl->ndp->ni_pathlen; 3050 } 3051 3052 static void 3053 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) 3054 { 3055 3056 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; 3057 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; 3058 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; 3059 fpl->ndp->ni_pathlen = snd->ni_pathlen; 3060 } 3061 3062 #ifdef INVARIANTS 3063 #define cache_fpl_smr_assert_entered(fpl) ({ \ 3064 struct cache_fpl *_fpl = (fpl); \ 3065 MPASS(_fpl->in_smr == true); \ 3066 VFS_SMR_ASSERT_ENTERED(); \ 3067 }) 3068 #define cache_fpl_smr_assert_not_entered(fpl) ({ \ 3069 struct cache_fpl *_fpl = (fpl); \ 3070 MPASS(_fpl->in_smr == false); \ 3071 VFS_SMR_ASSERT_NOT_ENTERED(); \ 3072 }) 3073 #else 3074 #define cache_fpl_smr_assert_entered(fpl) do { } while (0) 3075 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) 3076 #endif 3077 3078 #define cache_fpl_smr_enter_initial(fpl) ({ \ 3079 struct cache_fpl *_fpl = (fpl); \ 3080 vfs_smr_enter(); \ 3081 _fpl->in_smr = true; \ 3082 }) 3083 3084 #define cache_fpl_smr_enter(fpl) ({ \ 3085 struct cache_fpl *_fpl = (fpl); \ 3086 MPASS(_fpl->in_smr == false); \ 3087 vfs_smr_enter(); \ 3088 _fpl->in_smr = true; \ 3089 }) 3090 3091 #define cache_fpl_smr_exit(fpl) ({ \ 3092 struct cache_fpl *_fpl = (fpl); \ 3093 MPASS(_fpl->in_smr == true); \ 3094 vfs_smr_exit(); \ 3095 _fpl->in_smr = false; \ 3096 }) 3097 3098 static int 3099 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) 3100 { 3101 3102 if (fpl->status != CACHE_FPL_STATUS_UNSET) { 3103 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, 3104 ("%s: converting to abort from %d at %d, set at %d\n", 3105 __func__, fpl->status, line, fpl->line)); 3106 } 3107 fpl->status = CACHE_FPL_STATUS_ABORTED; 3108 fpl->line = line; 3109 return (CACHE_FPL_FAILED); 3110 } 3111 3112 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) 3113 3114 static int 3115 cache_fpl_partial_impl(struct cache_fpl *fpl, int line) 3116 { 3117 3118 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3119 ("%s: setting to partial at %d, but already set to %d at %d\n", 3120 __func__, line, fpl->status, fpl->line)); 3121 cache_fpl_smr_assert_entered(fpl); 3122 fpl->status = CACHE_FPL_STATUS_PARTIAL; 3123 fpl->line = line; 3124 return (CACHE_FPL_FAILED); 3125 } 3126 3127 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) 3128 3129 static int 3130 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) 3131 { 3132 3133 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, 3134 ("%s: setting to handled at %d, but already set to %d at %d\n", 3135 __func__, line, fpl->status, fpl->line)); 3136 cache_fpl_smr_assert_not_entered(fpl); 3137 MPASS(error != CACHE_FPL_FAILED); 3138 fpl->status = CACHE_FPL_STATUS_HANDLED; 3139 fpl->line = line; 3140 return (error); 
3141 } 3142 3143 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) 3144 3145 #define CACHE_FPL_SUPPORTED_CN_FLAGS \ 3146 (LOCKLEAF | LOCKPARENT | WANTPARENT | FOLLOW | LOCKSHARED | SAVENAME | \ 3147 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2) 3148 3149 #define CACHE_FPL_INTERNAL_CN_FLAGS \ 3150 (ISDOTDOT | MAKEENTRY | ISLASTCN) 3151 3152 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, 3153 "supported and internal flags overlap"); 3154 3155 static bool 3156 cache_fpl_islastcn(struct nameidata *ndp) 3157 { 3158 3159 return (*ndp->ni_next == 0); 3160 } 3161 3162 static bool 3163 cache_fpl_isdotdot(struct componentname *cnp) 3164 { 3165 3166 if (cnp->cn_namelen == 2 && 3167 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') 3168 return (true); 3169 return (false); 3170 } 3171 3172 static bool 3173 cache_can_fplookup(struct cache_fpl *fpl) 3174 { 3175 struct nameidata *ndp; 3176 struct componentname *cnp; 3177 struct thread *td; 3178 3179 ndp = fpl->ndp; 3180 cnp = fpl->cnp; 3181 td = cnp->cn_thread; 3182 3183 if (!cache_fast_lookup) { 3184 cache_fpl_aborted(fpl); 3185 return (false); 3186 } 3187 #ifdef MAC 3188 if (mac_vnode_check_lookup_enabled()) { 3189 cache_fpl_aborted(fpl); 3190 return (false); 3191 } 3192 #endif 3193 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { 3194 cache_fpl_aborted(fpl); 3195 return (false); 3196 } 3197 if (cnp->cn_nameiop != LOOKUP) { 3198 cache_fpl_aborted(fpl); 3199 return (false); 3200 } 3201 if (ndp->ni_dirfd != AT_FDCWD) { 3202 cache_fpl_aborted(fpl); 3203 return (false); 3204 } 3205 if (IN_CAPABILITY_MODE(td)) { 3206 cache_fpl_aborted(fpl); 3207 return (false); 3208 } 3209 if (AUDITING_TD(td)) { 3210 cache_fpl_aborted(fpl); 3211 return (false); 3212 } 3213 if (ndp->ni_startdir != NULL) { 3214 cache_fpl_aborted(fpl); 3215 return (false); 3216 } 3217 return (true); 3218 } 3219 3220 static bool 3221 cache_fplookup_vnode_supported(struct vnode *vp) 3222 { 3223 3224 return (vp->v_type != VLNK); 3225 } 3226 3227 /* 3228 * Move a negative entry to the hot list. 3229 * 3230 * We have to take locks, but they may be contended and in the worst 3231 * case we may need to go off CPU. We don't want to spin within the 3232 * smr section and we can't block with it. Instead we are going to 3233 * look up the entry again. 3234 */ 3235 static int __noinline 3236 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, 3237 uint32_t hash) 3238 { 3239 struct componentname *cnp; 3240 struct namecache *ncp; 3241 struct neglist *neglist; 3242 struct negstate *negstate; 3243 struct vnode *dvp; 3244 u_char nc_flag; 3245 3246 cnp = fpl->cnp; 3247 dvp = fpl->dvp; 3248 3249 if (!vhold_smr(dvp)) 3250 return (cache_fpl_aborted(fpl)); 3251 3252 neglist = NCP2NEGLIST(oncp); 3253 cache_fpl_smr_exit(fpl); 3254 3255 mtx_lock(&ncneg_hot.nl_lock); 3256 mtx_lock(&neglist->nl_lock); 3257 /* 3258 * For hash iteration. 3259 */ 3260 cache_fpl_smr_enter(fpl); 3261 3262 /* 3263 * Avoid all surprises by only succeeding if we got the same entry and 3264 * bailing completely otherwise. 3265 * 3266 * In particular at this point there can be a new ncp which matches the 3267 * search but hashes to a different neglist. 3268 */ 3269 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3270 if (ncp == oncp) 3271 break; 3272 } 3273 3274 /* 3275 * No match to begin with. 
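 * That is, the entry seen while under SMR protection was removed before we
 * got the locks; abort so the regular lookup can redo the work from scratch.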
3276 */ 3277 if (__predict_false(ncp == NULL)) { 3278 goto out_abort; 3279 } 3280 3281 /* 3282 * The newly found entry may be something different... 3283 */ 3284 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3285 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { 3286 goto out_abort; 3287 } 3288 3289 /* 3290 * ... and not even negative. 3291 */ 3292 nc_flag = atomic_load_char(&ncp->nc_flag); 3293 if ((nc_flag & NCF_NEGATIVE) == 0) { 3294 goto out_abort; 3295 } 3296 3297 if (__predict_false(!cache_ncp_canuse(ncp))) { 3298 goto out_abort; 3299 } 3300 3301 negstate = NCP2NEGSTATE(ncp); 3302 if ((negstate->neg_flag & NEG_HOT) == 0) { 3303 numhotneg++; 3304 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 3305 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 3306 negstate->neg_flag |= NEG_HOT; 3307 } 3308 3309 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 3310 counter_u64_add(numneghits, 1); 3311 cache_fpl_smr_exit(fpl); 3312 mtx_unlock(&neglist->nl_lock); 3313 mtx_unlock(&ncneg_hot.nl_lock); 3314 vdrop(dvp); 3315 return (cache_fpl_handled(fpl, ENOENT)); 3316 out_abort: 3317 cache_fpl_smr_exit(fpl); 3318 mtx_unlock(&neglist->nl_lock); 3319 mtx_unlock(&ncneg_hot.nl_lock); 3320 vdrop(dvp); 3321 return (cache_fpl_aborted(fpl)); 3322 } 3323 3324 /* 3325 * The target vnode is not supported, prepare for the slow path to take over. 3326 */ 3327 static int __noinline 3328 cache_fplookup_partial_setup(struct cache_fpl *fpl) 3329 { 3330 struct nameidata *ndp; 3331 struct componentname *cnp; 3332 enum vgetstate dvs; 3333 struct vnode *dvp; 3334 struct pwd *pwd; 3335 seqc_t dvp_seqc; 3336 3337 ndp = fpl->ndp; 3338 cnp = fpl->cnp; 3339 dvp = fpl->dvp; 3340 dvp_seqc = fpl->dvp_seqc; 3341 3342 dvs = vget_prep_smr(dvp); 3343 if (__predict_false(dvs == VGET_NONE)) { 3344 cache_fpl_smr_exit(fpl); 3345 return (cache_fpl_aborted(fpl)); 3346 } 3347 3348 cache_fpl_smr_exit(fpl); 3349 3350 vget_finish_ref(dvp, dvs); 3351 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3352 vrele(dvp); 3353 return (cache_fpl_aborted(fpl)); 3354 } 3355 3356 pwd = pwd_hold(curthread); 3357 if (fpl->pwd != pwd) { 3358 vrele(dvp); 3359 pwd_drop(pwd); 3360 return (cache_fpl_aborted(fpl)); 3361 } 3362 3363 cache_fpl_restore(fpl, &fpl->snd); 3364 3365 ndp->ni_startdir = dvp; 3366 cnp->cn_flags |= MAKEENTRY; 3367 if (cache_fpl_islastcn(ndp)) 3368 cnp->cn_flags |= ISLASTCN; 3369 if (cache_fpl_isdotdot(cnp)) 3370 cnp->cn_flags |= ISDOTDOT; 3371 3372 return (0); 3373 } 3374 3375 static int 3376 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) 3377 { 3378 struct componentname *cnp; 3379 struct vnode *tvp; 3380 seqc_t tvp_seqc; 3381 int error, lkflags; 3382 3383 cnp = fpl->cnp; 3384 tvp = fpl->tvp; 3385 tvp_seqc = fpl->tvp_seqc; 3386 3387 if ((cnp->cn_flags & LOCKLEAF) != 0) { 3388 lkflags = LK_SHARED; 3389 if ((cnp->cn_flags & LOCKSHARED) == 0) 3390 lkflags = LK_EXCLUSIVE; 3391 error = vget_finish(tvp, lkflags, tvs); 3392 if (error != 0) { 3393 return (cache_fpl_aborted(fpl)); 3394 } 3395 } else { 3396 vget_finish_ref(tvp, tvs); 3397 } 3398 3399 if (!vn_seqc_consistent(tvp, tvp_seqc)) { 3400 if ((cnp->cn_flags & LOCKLEAF) != 0) 3401 vput(tvp); 3402 else 3403 vrele(tvp); 3404 return (cache_fpl_aborted(fpl)); 3405 } 3406 3407 return (cache_fpl_handled(fpl, 0)); 3408 } 3409 3410 static int __noinline 3411 cache_fplookup_final_withparent(struct cache_fpl *fpl) 3412 { 3413 struct componentname *cnp; 3414 enum vgetstate dvs, tvs; 3415 struct vnode *dvp, *tvp; 3416 seqc_t dvp_seqc, tvp_seqc; 
3417 int error; 3418 3419 cnp = fpl->cnp; 3420 dvp = fpl->dvp; 3421 dvp_seqc = fpl->dvp_seqc; 3422 tvp = fpl->tvp; 3423 tvp_seqc = fpl->tvp_seqc; 3424 3425 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); 3426 3427 /* 3428 * This is less efficient than it can be for simplicity. 3429 */ 3430 dvs = vget_prep_smr(dvp); 3431 if (__predict_false(dvs == VGET_NONE)) { 3432 return (cache_fpl_aborted(fpl)); 3433 } 3434 tvs = vget_prep_smr(tvp); 3435 if (__predict_false(tvs == VGET_NONE)) { 3436 cache_fpl_smr_exit(fpl); 3437 vget_abort(dvp, dvs); 3438 return (cache_fpl_aborted(fpl)); 3439 } 3440 3441 cache_fpl_smr_exit(fpl); 3442 3443 if ((cnp->cn_flags & LOCKPARENT) != 0) { 3444 error = vget_finish(dvp, LK_EXCLUSIVE, dvs); 3445 if (error != 0) { 3446 vget_abort(tvp, tvs); 3447 return (cache_fpl_aborted(fpl)); 3448 } 3449 } else { 3450 vget_finish_ref(dvp, dvs); 3451 } 3452 3453 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3454 vget_abort(tvp, tvs); 3455 if ((cnp->cn_flags & LOCKPARENT) != 0) 3456 vput(dvp); 3457 else 3458 vrele(dvp); 3459 cache_fpl_aborted(fpl); 3460 return (error); 3461 } 3462 3463 error = cache_fplookup_final_child(fpl, tvs); 3464 if (error != 0) { 3465 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); 3466 if ((cnp->cn_flags & LOCKPARENT) != 0) 3467 vput(dvp); 3468 else 3469 vrele(dvp); 3470 return (error); 3471 } 3472 3473 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); 3474 return (0); 3475 } 3476 3477 static int 3478 cache_fplookup_final(struct cache_fpl *fpl) 3479 { 3480 struct componentname *cnp; 3481 enum vgetstate tvs; 3482 struct vnode *dvp, *tvp; 3483 seqc_t dvp_seqc, tvp_seqc; 3484 3485 cnp = fpl->cnp; 3486 dvp = fpl->dvp; 3487 dvp_seqc = fpl->dvp_seqc; 3488 tvp = fpl->tvp; 3489 tvp_seqc = fpl->tvp_seqc; 3490 3491 VNPASS(cache_fplookup_vnode_supported(dvp), dvp); 3492 3493 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) 3494 return (cache_fplookup_final_withparent(fpl)); 3495 3496 tvs = vget_prep_smr(tvp); 3497 if (__predict_false(tvs == VGET_NONE)) { 3498 return (cache_fpl_partial(fpl)); 3499 } 3500 3501 if (!vn_seqc_consistent(dvp, dvp_seqc)) { 3502 cache_fpl_smr_exit(fpl); 3503 vget_abort(tvp, tvs); 3504 return (cache_fpl_aborted(fpl)); 3505 } 3506 3507 cache_fpl_smr_exit(fpl); 3508 return (cache_fplookup_final_child(fpl, tvs)); 3509 } 3510 3511 static int __noinline 3512 cache_fplookup_dot(struct cache_fpl *fpl) 3513 { 3514 struct vnode *dvp; 3515 3516 dvp = fpl->dvp; 3517 3518 fpl->tvp = dvp; 3519 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3520 if (seqc_in_modify(fpl->tvp_seqc)) { 3521 return (cache_fpl_aborted(fpl)); 3522 } 3523 3524 counter_u64_add(dothits, 1); 3525 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); 3526 3527 return (0); 3528 } 3529 3530 static int __noinline 3531 cache_fplookup_dotdot(struct cache_fpl *fpl) 3532 { 3533 struct nameidata *ndp; 3534 struct componentname *cnp; 3535 struct namecache *ncp; 3536 struct vnode *dvp; 3537 struct prison *pr; 3538 u_char nc_flag; 3539 3540 ndp = fpl->ndp; 3541 cnp = fpl->cnp; 3542 dvp = fpl->dvp; 3543 3544 /* 3545 * XXX this is racy the same way regular lookup is 3546 */ 3547 for (pr = cnp->cn_cred->cr_prison; pr != NULL; 3548 pr = pr->pr_parent) 3549 if (dvp == pr->pr_root) 3550 break; 3551 3552 if (dvp == ndp->ni_rootdir || 3553 dvp == ndp->ni_topdir || 3554 dvp == rootvnode || 3555 pr != NULL) { 3556 fpl->tvp = dvp; 3557 fpl->tvp_seqc = vn_seqc_read_any(dvp); 3558 if (seqc_in_modify(fpl->tvp_seqc)) { 3559 return (cache_fpl_aborted(fpl)); 3560 } 3561 return (0); 3562 } 3563 3564 if ((dvp->v_vflag & 
VV_ROOT) != 0) { 3565 /* 3566 * TODO 3567 * The opposite of climb mount is needed here. 3568 */ 3569 return (cache_fpl_aborted(fpl)); 3570 } 3571 3572 ncp = atomic_load_ptr(&dvp->v_cache_dd); 3573 if (ncp == NULL) { 3574 return (cache_fpl_aborted(fpl)); 3575 } 3576 3577 nc_flag = atomic_load_char(&ncp->nc_flag); 3578 if ((nc_flag & NCF_ISDOTDOT) != 0) { 3579 if ((nc_flag & NCF_NEGATIVE) != 0) 3580 return (cache_fpl_aborted(fpl)); 3581 fpl->tvp = ncp->nc_vp; 3582 } else { 3583 fpl->tvp = ncp->nc_dvp; 3584 } 3585 3586 if (__predict_false(!cache_ncp_canuse(ncp))) { 3587 return (cache_fpl_aborted(fpl)); 3588 } 3589 3590 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); 3591 if (seqc_in_modify(fpl->tvp_seqc)) { 3592 return (cache_fpl_partial(fpl)); 3593 } 3594 3595 counter_u64_add(dotdothits, 1); 3596 return (0); 3597 } 3598 3599 static int 3600 cache_fplookup_next(struct cache_fpl *fpl) 3601 { 3602 struct componentname *cnp; 3603 struct namecache *ncp; 3604 struct negstate *negstate; 3605 struct vnode *dvp, *tvp; 3606 u_char nc_flag; 3607 uint32_t hash; 3608 bool neg_hot; 3609 3610 cnp = fpl->cnp; 3611 dvp = fpl->dvp; 3612 3613 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { 3614 return (cache_fplookup_dot(fpl)); 3615 } 3616 3617 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 3618 3619 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 3620 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 3621 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 3622 break; 3623 } 3624 3625 /* 3626 * If there is no entry we have to punt to the slow path to perform 3627 * actual lookup. Should there be nothing with this name a negative 3628 * entry will be created. 3629 */ 3630 if (__predict_false(ncp == NULL)) { 3631 return (cache_fpl_partial(fpl)); 3632 } 3633 3634 tvp = atomic_load_ptr(&ncp->nc_vp); 3635 nc_flag = atomic_load_char(&ncp->nc_flag); 3636 if ((nc_flag & NCF_NEGATIVE) != 0) { 3637 negstate = NCP2NEGSTATE(ncp); 3638 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0); 3639 if (__predict_false(!cache_ncp_canuse(ncp))) { 3640 return (cache_fpl_partial(fpl)); 3641 } 3642 if (__predict_false((nc_flag & NCF_WHITE) != 0)) { 3643 return (cache_fpl_partial(fpl)); 3644 } 3645 if (!neg_hot) { 3646 return (cache_fplookup_negative_promote(fpl, ncp, hash)); 3647 } 3648 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 3649 ncp->nc_name); 3650 counter_u64_add(numneghits, 1); 3651 cache_fpl_smr_exit(fpl); 3652 return (cache_fpl_handled(fpl, ENOENT)); 3653 } 3654 3655 if (__predict_false(!cache_ncp_canuse(ncp))) { 3656 return (cache_fpl_partial(fpl)); 3657 } 3658 3659 fpl->tvp = tvp; 3660 fpl->tvp_seqc = vn_seqc_read_any(tvp); 3661 if (seqc_in_modify(fpl->tvp_seqc)) { 3662 return (cache_fpl_partial(fpl)); 3663 } 3664 3665 if (!cache_fplookup_vnode_supported(tvp)) { 3666 return (cache_fpl_partial(fpl)); 3667 } 3668 3669 counter_u64_add(numposhits, 1); 3670 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); 3671 return (0); 3672 } 3673 3674 static bool 3675 cache_fplookup_mp_supported(struct mount *mp) 3676 { 3677 3678 if (mp == NULL) 3679 return (false); 3680 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) 3681 return (false); 3682 if ((mp->mnt_flag & MNT_UNION) != 0) 3683 return (false); 3684 return (true); 3685 } 3686 3687 /* 3688 * Walk up the mount stack (if any). 
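 * For example, a lookup landing on a vnode with a filesystem mounted on it
 * (say /mnt) must continue from the root vnode of that filesystem, and keep
 * climbing if something is mounted on that root in turn.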
3689 * 3690 * Correctness is provided in the following ways: 3691 * - all vnodes are protected from freeing with SMR 3692 * - struct mount objects are type stable making them always safe to access 3693 * - stability of the particular mount is provided by busying it 3694 * - relationship between the vnode which is mounted on and the mount is 3695 * verified with the vnode sequence counter after busying 3696 * - association between root vnode of the mount and the mount is protected 3697 * by busy 3698 * 3699 * From that point on we can read the sequence counter of the root vnode 3700 * and get the next mount on the stack (if any) using the same protection. 3701 * 3702 * By the end of successful walk we are guaranteed the reached state was 3703 * indeed present at least at some point which matches the regular lookup. 3704 */ 3705 static int __noinline 3706 cache_fplookup_climb_mount(struct cache_fpl *fpl) 3707 { 3708 struct mount *mp, *prev_mp; 3709 struct vnode *vp; 3710 seqc_t vp_seqc; 3711 3712 vp = fpl->tvp; 3713 vp_seqc = fpl->tvp_seqc; 3714 3715 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); 3716 mp = atomic_load_ptr(&vp->v_mountedhere); 3717 if (mp == NULL) 3718 return (0); 3719 3720 prev_mp = NULL; 3721 for (;;) { 3722 if (!vfs_op_thread_enter_crit(mp)) { 3723 if (prev_mp != NULL) 3724 vfs_op_thread_exit_crit(prev_mp); 3725 return (cache_fpl_partial(fpl)); 3726 } 3727 if (prev_mp != NULL) 3728 vfs_op_thread_exit_crit(prev_mp); 3729 if (!vn_seqc_consistent(vp, vp_seqc)) { 3730 vfs_op_thread_exit_crit(mp); 3731 return (cache_fpl_partial(fpl)); 3732 } 3733 if (!cache_fplookup_mp_supported(mp)) { 3734 vfs_op_thread_exit_crit(mp); 3735 return (cache_fpl_partial(fpl)); 3736 } 3737 vp = atomic_load_ptr(&mp->mnt_rootvnode); 3738 if (vp == NULL || VN_IS_DOOMED(vp)) { 3739 vfs_op_thread_exit_crit(mp); 3740 return (cache_fpl_partial(fpl)); 3741 } 3742 vp_seqc = vn_seqc_read_any(vp); 3743 if (seqc_in_modify(vp_seqc)) { 3744 vfs_op_thread_exit_crit(mp); 3745 return (cache_fpl_partial(fpl)); 3746 } 3747 prev_mp = mp; 3748 mp = atomic_load_ptr(&vp->v_mountedhere); 3749 if (mp == NULL) 3750 break; 3751 } 3752 3753 vfs_op_thread_exit_crit(prev_mp); 3754 fpl->tvp = vp; 3755 fpl->tvp_seqc = vp_seqc; 3756 return (0); 3757 } 3758 3759 static bool 3760 cache_fplookup_need_climb_mount(struct cache_fpl *fpl) 3761 { 3762 struct mount *mp; 3763 struct vnode *vp; 3764 3765 vp = fpl->tvp; 3766 3767 /* 3768 * Hack: while this is a union, the pointer tends to be NULL so save on 3769 * a branch. 3770 */ 3771 mp = atomic_load_ptr(&vp->v_mountedhere); 3772 if (mp == NULL) 3773 return (false); 3774 if (vp->v_type == VDIR) 3775 return (true); 3776 return (false); 3777 } 3778 3779 /* 3780 * Parse the path. 3781 * 3782 * The code is mostly copy-pasted from regular lookup, see lookup(). 3783 * The structure is maintained along with comments for easier maintenance. 3784 * Deduplicating the code will become feasible after fast path lookup 3785 * becomes more feature-complete. 3786 */ 3787 static int 3788 cache_fplookup_parse(struct cache_fpl *fpl) 3789 { 3790 struct nameidata *ndp; 3791 struct componentname *cnp; 3792 char *cp; 3793 char *prev_ni_next; /* saved ndp->ni_next */ 3794 size_t prev_ni_pathlen; /* saved ndp->ni_pathlen */ 3795 3796 ndp = fpl->ndp; 3797 cnp = fpl->cnp; 3798 3799 /* 3800 * Search a new directory. 3801 * 3802 * The last component of the filename is left accessible via 3803 * cnp->cn_nameptr for callers that need the name. Callers needing 3804 * the name set the SAVENAME flag. 
When done, they assume 3805 * responsibility for freeing the pathname buffer. 3806 */ 3807 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) 3808 continue; 3809 cnp->cn_namelen = cp - cnp->cn_nameptr; 3810 if (__predict_false(cnp->cn_namelen > NAME_MAX)) { 3811 cache_fpl_smr_exit(fpl); 3812 return (cache_fpl_handled(fpl, ENAMETOOLONG)); 3813 } 3814 prev_ni_pathlen = ndp->ni_pathlen; 3815 ndp->ni_pathlen -= cnp->cn_namelen; 3816 KASSERT(ndp->ni_pathlen <= PATH_MAX, 3817 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); 3818 prev_ni_next = ndp->ni_next; 3819 ndp->ni_next = cp; 3820 3821 /* 3822 * Replace multiple slashes by a single slash and trailing slashes 3823 * by a null. This must be done before VOP_LOOKUP() because some 3824 * fs's don't know about trailing slashes. Remember if there were 3825 * trailing slashes to handle symlinks, existing non-directories 3826 * and non-existing files that won't be directories specially later. 3827 */ 3828 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { 3829 cp++; 3830 ndp->ni_pathlen--; 3831 if (*cp == '\0') { 3832 /* 3833 * TODO 3834 * Regular lookup performs the following: 3835 * *ndp->ni_next = '\0'; 3836 * cnp->cn_flags |= TRAILINGSLASH; 3837 * 3838 * Which is problematic since it modifies data read 3839 * from userspace. Then if fast path lookup was to 3840 * abort we would have to either restore it or convey 3841 * the flag. Since this is a corner case just ignore 3842 * it for simplicity. 3843 */ 3844 return (cache_fpl_partial(fpl)); 3845 } 3846 } 3847 ndp->ni_next = cp; 3848 3849 /* 3850 * Check for degenerate name (e.g. / or "") 3851 * which is a way of talking about a directory, 3852 * e.g. like "/." or ".". 3853 * 3854 * TODO 3855 * Another corner case handled by the regular lookup 3856 */ 3857 if (__predict_false(cnp->cn_nameptr[0] == '\0')) { 3858 return (cache_fpl_partial(fpl)); 3859 } 3860 return (0); 3861 } 3862 3863 static void 3864 cache_fplookup_parse_advance(struct cache_fpl *fpl) 3865 { 3866 struct nameidata *ndp; 3867 struct componentname *cnp; 3868 3869 ndp = fpl->ndp; 3870 cnp = fpl->cnp; 3871 3872 cnp->cn_nameptr = ndp->ni_next; 3873 while (*cnp->cn_nameptr == '/') { 3874 cnp->cn_nameptr++; 3875 ndp->ni_pathlen--; 3876 } 3877 } 3878 3879 static int __noinline 3880 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) 3881 { 3882 3883 switch (error) { 3884 case EAGAIN: 3885 /* 3886 * Can happen when racing against vgone. 3887 * */ 3888 case EOPNOTSUPP: 3889 cache_fpl_partial(fpl); 3890 break; 3891 default: 3892 /* 3893 * See the API contract for VOP_FPLOOKUP_VEXEC. 
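 * In short: a hard error (e.g. EACCES) is only trusted if dvp's sequence
 * counter is still consistent; otherwise it may be a false negative and
 * the lookup is aborted so the locked path can redo the check.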
3894 */ 3895 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 3896 error = cache_fpl_aborted(fpl); 3897 } else { 3898 cache_fpl_smr_exit(fpl); 3899 cache_fpl_handled(fpl, error); 3900 } 3901 break; 3902 } 3903 return (error); 3904 } 3905 3906 static int 3907 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) 3908 { 3909 struct nameidata *ndp; 3910 struct componentname *cnp; 3911 struct mount *mp; 3912 int error; 3913 3914 error = CACHE_FPL_FAILED; 3915 ndp = fpl->ndp; 3916 cnp = fpl->cnp; 3917 3918 cache_fpl_checkpoint(fpl, &fpl->snd); 3919 3920 fpl->dvp = dvp; 3921 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); 3922 if (seqc_in_modify(fpl->dvp_seqc)) { 3923 cache_fpl_aborted(fpl); 3924 goto out; 3925 } 3926 mp = atomic_load_ptr(&fpl->dvp->v_mount); 3927 if (!cache_fplookup_mp_supported(mp)) { 3928 cache_fpl_aborted(fpl); 3929 goto out; 3930 } 3931 3932 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 3933 3934 for (;;) { 3935 error = cache_fplookup_parse(fpl); 3936 if (__predict_false(error != 0)) { 3937 break; 3938 } 3939 3940 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); 3941 3942 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread); 3943 if (__predict_false(error != 0)) { 3944 error = cache_fplookup_failed_vexec(fpl, error); 3945 break; 3946 } 3947 3948 if (__predict_false(cache_fpl_isdotdot(cnp))) { 3949 error = cache_fplookup_dotdot(fpl); 3950 if (__predict_false(error != 0)) { 3951 break; 3952 } 3953 } else { 3954 error = cache_fplookup_next(fpl); 3955 if (__predict_false(error != 0)) { 3956 break; 3957 } 3958 3959 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 3960 3961 if (cache_fplookup_need_climb_mount(fpl)) { 3962 error = cache_fplookup_climb_mount(fpl); 3963 if (__predict_false(error != 0)) { 3964 break; 3965 } 3966 } 3967 } 3968 3969 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); 3970 3971 if (cache_fpl_islastcn(ndp)) { 3972 error = cache_fplookup_final(fpl); 3973 break; 3974 } 3975 3976 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { 3977 error = cache_fpl_aborted(fpl); 3978 break; 3979 } 3980 3981 fpl->dvp = fpl->tvp; 3982 fpl->dvp_seqc = fpl->tvp_seqc; 3983 3984 cache_fplookup_parse_advance(fpl); 3985 cache_fpl_checkpoint(fpl, &fpl->snd); 3986 } 3987 out: 3988 switch (fpl->status) { 3989 case CACHE_FPL_STATUS_UNSET: 3990 __assert_unreachable(); 3991 break; 3992 case CACHE_FPL_STATUS_PARTIAL: 3993 cache_fpl_smr_assert_entered(fpl); 3994 return (cache_fplookup_partial_setup(fpl)); 3995 case CACHE_FPL_STATUS_ABORTED: 3996 if (fpl->in_smr) 3997 cache_fpl_smr_exit(fpl); 3998 return (CACHE_FPL_FAILED); 3999 case CACHE_FPL_STATUS_HANDLED: 4000 MPASS(error != CACHE_FPL_FAILED); 4001 cache_fpl_smr_assert_not_entered(fpl); 4002 if (__predict_false(error != 0)) { 4003 ndp->ni_dvp = NULL; 4004 ndp->ni_vp = NULL; 4005 cache_fpl_cleanup_cnp(cnp); 4006 return (error); 4007 } 4008 ndp->ni_dvp = fpl->dvp; 4009 ndp->ni_vp = fpl->tvp; 4010 if (cnp->cn_flags & SAVENAME) 4011 cnp->cn_flags |= HASBUF; 4012 else 4013 cache_fpl_cleanup_cnp(cnp); 4014 return (error); 4015 } 4016 } 4017 4018 /* 4019 * Fast path lookup protected with SMR and sequence counters. 4020 * 4021 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. 4022 * 4023 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria 4024 * outlined below. 
4025 * 4026 * Traditional vnode lookup conceptually looks like this: 4027 * 4028 * vn_lock(current); 4029 * for (;;) { 4030 * next = find(); 4031 * vn_lock(next); 4032 * vn_unlock(current); 4033 * current = next; 4034 * if (last) 4035 * break; 4036 * } 4037 * return (current); 4038 * 4039 * Each jump to the next vnode is safe memory-wise and atomic with respect to 4040 * any modifications thanks to holding respective locks. 4041 * 4042 * The same guarantee can be provided with a combination of safe memory 4043 * reclamation and sequence counters instead. If all operations which affect 4044 * the relationship between the current vnode and the one we are looking for 4045 * also modify the counter, we can verify whether all the conditions held as 4046 * we made the jump. This includes things like permissions, mount points etc. 4047 * Counter modification is provided by enclosing relevant places in 4048 * vn_seqc_write_begin()/end() calls. 4049 * 4050 * Thus this translates to: 4051 * 4052 * vfs_smr_enter(); 4053 * dvp_seqc = seqc_read_any(dvp); 4054 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode 4055 * abort(); 4056 * for (;;) { 4057 * tvp = find(); 4058 * tvp_seqc = seqc_read_any(tvp); 4059 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode 4060 * abort(); 4061 * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode 4062 * abort(); 4063 * dvp = tvp; // we know nothing of importance has changed 4064 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration 4065 * if (last) 4066 * break; 4067 * } 4068 * vget(); // secure the vnode 4069 * if (!seqc_consistent(tvp, tvp_seqc) // final check 4070 * abort(); 4071 * // at this point we know nothing has changed for any parent<->child pair 4072 * // as they were crossed during the lookup, meaning we matched the guarantee 4073 * // of the locked variant 4074 * return (tvp); 4075 * 4076 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: 4077 * - they are called while within vfs_smr protection which they must never exit 4078 * - EAGAIN can be returned to denote checking could not be performed, it is 4079 * always valid to return it 4080 * - if the sequence counter has not changed the result must be valid 4081 * - if the sequence counter has changed both false positives and false negatives 4082 * are permitted (since the result will be rejected later) 4083 * - for simple cases of unix permission checks vaccess_vexec_smr can be used 4084 * 4085 * Caveats to watch out for: 4086 * - vnodes are passed unlocked and unreferenced with nothing stopping 4087 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised 4088 * to use atomic_load_ptr to fetch it. 
4089 * - the aforementioned object can also get freed, meaning absent other means it 4090 * should be protected with vfs_smr 4091 * - either safely checking permissions as they are modified or guaranteeing 4092 * their stability is left to the routine 4093 */ 4094 int 4095 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, 4096 struct pwd **pwdp) 4097 { 4098 struct cache_fpl fpl; 4099 struct pwd *pwd; 4100 struct vnode *dvp; 4101 struct componentname *cnp; 4102 struct nameidata_saved orig; 4103 int error; 4104 4105 MPASS(ndp->ni_lcf == 0); 4106 4107 fpl.status = CACHE_FPL_STATUS_UNSET; 4108 fpl.ndp = ndp; 4109 fpl.cnp = &ndp->ni_cnd; 4110 MPASS(curthread == fpl.cnp->cn_thread); 4111 4112 if (!cache_can_fplookup(&fpl)) { 4113 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 4114 *status = fpl.status; 4115 return (EOPNOTSUPP); 4116 } 4117 4118 cache_fpl_checkpoint(&fpl, &orig); 4119 4120 cache_fpl_smr_enter_initial(&fpl); 4121 pwd = pwd_get_smr(); 4122 fpl.pwd = pwd; 4123 ndp->ni_rootdir = pwd->pwd_rdir; 4124 ndp->ni_topdir = pwd->pwd_jdir; 4125 4126 cnp = fpl.cnp; 4127 cnp->cn_nameptr = cnp->cn_pnbuf; 4128 if (cnp->cn_pnbuf[0] == '/') { 4129 cache_fpl_handle_root(ndp, &dvp); 4130 } else { 4131 MPASS(ndp->ni_dirfd == AT_FDCWD); 4132 dvp = pwd->pwd_cdir; 4133 } 4134 4135 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); 4136 4137 error = cache_fplookup_impl(dvp, &fpl); 4138 cache_fpl_smr_assert_not_entered(&fpl); 4139 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); 4140 4141 *status = fpl.status; 4142 switch (fpl.status) { 4143 case CACHE_FPL_STATUS_UNSET: 4144 __assert_unreachable(); 4145 break; 4146 case CACHE_FPL_STATUS_HANDLED: 4147 SDT_PROBE3(vfs, namei, lookup, return, error, 4148 (error == 0 ? ndp->ni_vp : NULL), true); 4149 break; 4150 case CACHE_FPL_STATUS_PARTIAL: 4151 *pwdp = fpl.pwd; 4152 /* 4153 * Status restored by cache_fplookup_partial_setup. 4154 */ 4155 break; 4156 case CACHE_FPL_STATUS_ABORTED: 4157 cache_fpl_restore(&fpl, &orig); 4158 break; 4159 } 4160 return (error); 4161 } 4162
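/*
 * Illustration of the filesystem side of the contract described above.  This
 * is a hypothetical sketch: the "xxx" filesystem, its node structure and
 * field names are made up, and the exact argument lists of the helpers are
 * assumed rather than taken from this file.
 *
 * Opting in is typically done at mount time once the filesystem is ready:
 *
 *	MNT_ILOCK(mp);
 *	mp->mnt_kern_flag |= MNTK_FPLOOKUP;
 *	MNT_IUNLOCK(mp);
 *
 * and a matching VOP_FPLOOKUP_VEXEC method could look like:
 *
 *	static int
 *	xxx_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct vnode *vp = v->a_vp;
 *		struct xxxnode *xp;
 *
 *		xp = atomic_load_ptr(&vp->v_data);	// may be ripped out by VOP_RECLAIM
 *		if (__predict_false(xp == NULL))
 *			return (EAGAIN);		// let the slow path decide
 *		return (vaccess_vexec_smr(xp->x_mode, xp->x_uid, xp->x_gid,
 *		    v->a_cred));
 *	}
 */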