1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ddb.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/counter.h> 46 #include <sys/filedesc.h> 47 #include <sys/fnv_hash.h> 48 #include <sys/kernel.h> 49 #include <sys/ktr.h> 50 #include <sys/lock.h> 51 #include <sys/malloc.h> 52 #include <sys/fcntl.h> 53 #include <sys/mount.h> 54 #include <sys/namei.h> 55 #include <sys/proc.h> 56 #include <sys/rwlock.h> 57 #include <sys/sdt.h> 58 #include <sys/smp.h> 59 #include <sys/syscallsubr.h> 60 #include <sys/sysctl.h> 61 #include <sys/sysproto.h> 62 #include <sys/vnode.h> 63 #ifdef KTRACE 64 #include <sys/ktrace.h> 65 #endif 66 67 #ifdef DDB 68 #include <ddb/ddb.h> 69 #endif 70 71 #include <vm/uma.h> 72 73 SDT_PROVIDER_DECLARE(vfs); 74 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 75 "struct vnode *"); 76 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 77 "char *"); 78 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 79 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 80 "char *", "struct vnode *"); 81 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 82 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 83 "struct vnode *", "char *"); 84 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 85 "struct vnode *"); 86 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 87 "struct vnode *", "char *"); 88 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 89 "char *"); 90 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 91 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 92 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 93 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 94 "struct vnode *"); 95 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 96 "char *"); 97 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *", 98 "char *"); 99 100 /* 101 * This structure describes the elements in the cache of recent 102 * names looked up by namei. 103 */ 104 105 struct namecache { 106 LIST_ENTRY(namecache) nc_hash; /* hash chain */ 107 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 108 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 109 struct vnode *nc_dvp; /* vnode of parent of name */ 110 union { 111 struct vnode *nu_vp; /* vnode the name refers to */ 112 } n_un; 113 u_char nc_flag; /* flag bits */ 114 u_char nc_nlen; /* length of name */ 115 char nc_name[0]; /* segment name + nul */ 116 }; 117 118 /* 119 * struct namecache_ts repeats struct namecache layout up to the 120 * nc_nlen member. 121 * struct namecache_ts is used in place of struct namecache when time(s) need 122 * to be stored. The nc_dotdottime field is used when a cache entry is mapping 123 * both a non-dotdot directory name plus dotdot for the directory's 124 * parent. 125 */ 126 struct namecache_ts { 127 struct timespec nc_time; /* timespec provided by fs */ 128 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 129 int nc_ticks; /* ticks value when entry was added */ 130 struct namecache nc_nc; 131 }; 132 133 #define nc_vp n_un.nu_vp 134 135 /* 136 * Flags in namecache.nc_flag 137 */ 138 #define NCF_WHITE 0x01 139 #define NCF_ISDOTDOT 0x02 140 #define NCF_TS 0x04 141 #define NCF_DTS 0x08 142 #define NCF_DVDROP 0x10 143 #define NCF_NEGATIVE 0x20 144 #define NCF_HOTNEGATIVE 0x40 145 146 /* 147 * Name caching works as follows: 148 * 149 * Names found by directory scans are retained in a cache 150 * for future reference. It is managed LRU, so frequently 151 * used names will hang around. Cache is indexed by hash value 152 * obtained from (dvp, name) where dvp refers to the directory 153 * containing name. 154 * 155 * If it is a "negative" entry, (i.e. for a name that is known NOT to 156 * exist) the vnode pointer will be NULL. 157 * 158 * Upon reaching the last segment of a path, if the reference 159 * is for DELETE, or NOCACHE is set (rewrite), and the 160 * name is located in the cache, it will be dropped. 161 * 162 * These locks are used (in the order in which they can be taken): 163 * NAME TYPE ROLE 164 * vnodelock mtx vnode lists and v_cache_dd field protection 165 * bucketlock rwlock for access to given set of hash buckets 166 * neglist mtx negative entry LRU management 167 * 168 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread 169 * shrinking the LRU list. 170 * 171 * It is legal to take multiple vnodelock and bucketlock locks. The locking 172 * order is lower address first. Both are recursive. 173 * 174 * "." lookups are lockless. 175 * 176 * ".." and vnode -> name lookups require vnodelock. 177 * 178 * name -> vnode lookup requires the relevant bucketlock to be held for reading. 179 * 180 * Insertions and removals of entries require involved vnodes and bucketlocks 181 * to be write-locked to prevent other threads from seeing the entry. 182 * 183 * Some lookups result in removal of the found entry (e.g. getting rid of a 184 * negative entry with the intent to create a positive one), which poses a 185 * problem when multiple threads reach the state. Similarly, two different 186 * threads can purge two different vnodes and try to remove the same name. 187 * 188 * If the already held vnode lock is lower than the second required lock, we 189 * can just take the other lock. However, in the opposite case, this could 190 * deadlock. As such, this is resolved by trylocking and if that fails unlocking 191 * the first node, locking everything in order and revalidating the state. 192 */ 193 194 /* 195 * Structures associated with name caching. 196 */ 197 #define NCHHASH(hash) \ 198 (&nchashtbl[(hash) & nchash]) 199 static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 200 static u_long __read_mostly nchash; /* size of hash table */ 201 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 202 "Size of namecache hash table"); 203 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ 204 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 205 "Ratio of negative namecache entries"); 206 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 207 SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, 208 "Number of negative entries in namecache"); 209 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 210 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, 211 "Number of namecache entries"); 212 static u_long __exclusive_cache_line numcachehv;/* number of cache entries with vnodes held */ 213 SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, 214 "Number of namecache entries with vnodes held"); 215 u_int ncsizefactor = 2; 216 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, 217 "Size factor for namecache"); 218 static u_int __read_mostly ncpurgeminvnodes; 219 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0, 220 "Number of vnodes below which purgevfs ignores the request"); 221 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ 222 223 struct nchstats nchstats; /* cache effectiveness statistics */ 224 225 static struct mtx __exclusive_cache_line ncneg_shrink_lock; 226 static int shrink_list_turn; 227 228 struct neglist { 229 struct mtx nl_lock; 230 TAILQ_HEAD(, namecache) nl_list; 231 } __aligned(CACHE_LINE_SIZE); 232 233 static struct neglist __read_mostly *neglists; 234 static struct neglist ncneg_hot; 235 static u_long numhotneg; 236 237 #define numneglists (ncneghash + 1) 238 static u_int __read_mostly ncneghash; 239 static inline struct neglist * 240 NCP2NEGLIST(struct namecache *ncp) 241 { 242 243 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 244 } 245 246 #define numbucketlocks (ncbuckethash + 1) 247 static u_int __read_mostly ncbuckethash; 248 static struct rwlock_padalign __read_mostly *bucketlocks; 249 #define HASH2BUCKETLOCK(hash) \ 250 ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)])) 251 252 #define numvnodelocks (ncvnodehash + 1) 253 static u_int __read_mostly ncvnodehash; 254 static struct mtx __read_mostly *vnodelocks; 255 static inline struct mtx * 256 VP2VNODELOCK(struct vnode *vp) 257 { 258 259 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 260 } 261 262 /* 263 * UMA zones for the VFS cache. 264 * 265 * The small cache is used for entries with short names, which are the 266 * most common. The large cache is used for entries which are too big to 267 * fit in the small cache. 268 */ 269 static uma_zone_t __read_mostly cache_zone_small; 270 static uma_zone_t __read_mostly cache_zone_small_ts; 271 static uma_zone_t __read_mostly cache_zone_large; 272 static uma_zone_t __read_mostly cache_zone_large_ts; 273 274 #define CACHE_PATH_CUTOFF 35 275 276 static struct namecache * 277 cache_alloc(int len, int ts) 278 { 279 struct namecache_ts *ncp_ts; 280 struct namecache *ncp; 281 282 if (__predict_false(ts)) { 283 if (len <= CACHE_PATH_CUTOFF) 284 ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK); 285 else 286 ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK); 287 ncp = &ncp_ts->nc_nc; 288 } else { 289 if (len <= CACHE_PATH_CUTOFF) 290 ncp = uma_zalloc(cache_zone_small, M_WAITOK); 291 else 292 ncp = uma_zalloc(cache_zone_large, M_WAITOK); 293 } 294 return (ncp); 295 } 296 297 static void 298 cache_free(struct namecache *ncp) 299 { 300 struct namecache_ts *ncp_ts; 301 302 if (ncp == NULL) 303 return; 304 if ((ncp->nc_flag & NCF_DVDROP) != 0) 305 vdrop(ncp->nc_dvp); 306 if (__predict_false(ncp->nc_flag & NCF_TS)) { 307 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 308 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 309 uma_zfree(cache_zone_small_ts, ncp_ts); 310 else 311 uma_zfree(cache_zone_large_ts, ncp_ts); 312 } else { 313 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 314 uma_zfree(cache_zone_small, ncp); 315 else 316 uma_zfree(cache_zone_large, ncp); 317 } 318 } 319 320 static void 321 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 322 { 323 struct namecache_ts *ncp_ts; 324 325 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 326 (tsp == NULL && ticksp == NULL), 327 ("No NCF_TS")); 328 329 if (tsp == NULL && ticksp == NULL) 330 return; 331 332 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 333 if (tsp != NULL) 334 *tsp = ncp_ts->nc_time; 335 if (ticksp != NULL) 336 *ticksp = ncp_ts->nc_ticks; 337 } 338 339 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 340 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 341 "VFS namecache enabled"); 342 343 /* Export size information to userland */ 344 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 345 sizeof(struct namecache), "sizeof(struct namecache)"); 346 347 /* 348 * The new name cache statistics 349 */ 350 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, 351 "Name cache statistics"); 352 #define STATNODE_ULONG(name, descr) \ 353 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); 354 #define STATNODE_COUNTER(name, descr) \ 355 static counter_u64_t __read_mostly name; \ 356 SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr); 357 STATNODE_ULONG(numneg, "Number of negative cache entries"); 358 STATNODE_ULONG(numcache, "Number of cache entries"); 359 STATNODE_COUNTER(numcalls, "Number of cache lookups"); 360 STATNODE_COUNTER(dothits, "Number of '.' hits"); 361 STATNODE_COUNTER(dotdothits, "Number of '..' hits"); 362 STATNODE_COUNTER(numchecks, "Number of checks in lookup"); 363 STATNODE_COUNTER(nummiss, "Number of cache misses"); 364 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 365 STATNODE_COUNTER(numposzaps, 366 "Number of cache hits (positive) we do not want to cache"); 367 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 368 STATNODE_COUNTER(numnegzaps, 369 "Number of cache hits (negative) we do not want to cache"); 370 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 371 /* These count for kern___getcwd(), too. */ 372 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 373 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 374 STATNODE_COUNTER(numfullpathfail2, 375 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 376 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 377 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 378 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 379 "Number of successful removals after relocking"); 380 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 381 "Number of times zap_and_exit failed to lock"); 382 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 383 "Number of times zap_and_exit failed to lock"); 384 static long cache_lock_vnodes_cel_3_failures; 385 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 386 "Number of times 3-way vnode locking failed"); 387 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 388 STATNODE_COUNTER(numneg_evicted, 389 "Number of negative entries evicted when adding a new entry"); 390 STATNODE_COUNTER(shrinking_skipped, 391 "Number of times shrinking was already in progress"); 392 393 static void cache_zap_locked(struct namecache *ncp, bool neg_locked); 394 static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, 395 char *buf, char **retbuf, u_int buflen); 396 397 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 398 399 static int cache_yield; 400 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, 401 "Number of times cache called yield"); 402 403 static void __noinline 404 cache_maybe_yield(void) 405 { 406 407 if (should_yield()) { 408 cache_yield++; 409 kern_yield(PRI_USER); 410 } 411 } 412 413 static inline void 414 cache_assert_vlp_locked(struct mtx *vlp) 415 { 416 417 if (vlp != NULL) 418 mtx_assert(vlp, MA_OWNED); 419 } 420 421 static inline void 422 cache_assert_vnode_locked(struct vnode *vp) 423 { 424 struct mtx *vlp; 425 426 vlp = VP2VNODELOCK(vp); 427 cache_assert_vlp_locked(vlp); 428 } 429 430 static uint32_t 431 cache_get_hash(char *name, u_char len, struct vnode *dvp) 432 { 433 uint32_t hash; 434 435 hash = fnv_32_buf(name, len, FNV1_32_INIT); 436 hash = fnv_32_buf(&dvp, sizeof(dvp), hash); 437 return (hash); 438 } 439 440 static inline struct rwlock * 441 NCP2BUCKETLOCK(struct namecache *ncp) 442 { 443 uint32_t hash; 444 445 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 446 return (HASH2BUCKETLOCK(hash)); 447 } 448 449 #ifdef INVARIANTS 450 static void 451 cache_assert_bucket_locked(struct namecache *ncp, int mode) 452 { 453 struct rwlock *blp; 454 455 blp = NCP2BUCKETLOCK(ncp); 456 rw_assert(blp, mode); 457 } 458 #else 459 #define cache_assert_bucket_locked(x, y) do { } while (0) 460 #endif 461 462 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 463 static void 464 _cache_sort_vnodes(void **p1, void **p2) 465 { 466 void *tmp; 467 468 MPASS(*p1 != NULL || *p2 != NULL); 469 470 if (*p1 > *p2) { 471 tmp = *p2; 472 *p2 = *p1; 473 *p1 = tmp; 474 } 475 } 476 477 static void 478 cache_lock_all_buckets(void) 479 { 480 u_int i; 481 482 for (i = 0; i < numbucketlocks; i++) 483 rw_wlock(&bucketlocks[i]); 484 } 485 486 static void 487 cache_unlock_all_buckets(void) 488 { 489 u_int i; 490 491 for (i = 0; i < numbucketlocks; i++) 492 rw_wunlock(&bucketlocks[i]); 493 } 494 495 static void 496 cache_lock_all_vnodes(void) 497 { 498 u_int i; 499 500 for (i = 0; i < numvnodelocks; i++) 501 mtx_lock(&vnodelocks[i]); 502 } 503 504 static void 505 cache_unlock_all_vnodes(void) 506 { 507 u_int i; 508 509 for (i = 0; i < numvnodelocks; i++) 510 mtx_unlock(&vnodelocks[i]); 511 } 512 513 static int 514 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 515 { 516 517 cache_sort_vnodes(&vlp1, &vlp2); 518 519 if (vlp1 != NULL) { 520 if (!mtx_trylock(vlp1)) 521 return (EAGAIN); 522 } 523 if (!mtx_trylock(vlp2)) { 524 if (vlp1 != NULL) 525 mtx_unlock(vlp1); 526 return (EAGAIN); 527 } 528 529 return (0); 530 } 531 532 static void 533 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 534 { 535 536 MPASS(vlp1 != NULL || vlp2 != NULL); 537 MPASS(vlp1 <= vlp2); 538 539 if (vlp1 != NULL) 540 mtx_lock(vlp1); 541 if (vlp2 != NULL) 542 mtx_lock(vlp2); 543 } 544 545 static void 546 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 547 { 548 549 MPASS(vlp1 != NULL || vlp2 != NULL); 550 551 if (vlp1 != NULL) 552 mtx_unlock(vlp1); 553 if (vlp2 != NULL) 554 mtx_unlock(vlp2); 555 } 556 557 static int 558 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 559 { 560 struct nchstats snap; 561 562 if (req->oldptr == NULL) 563 return (SYSCTL_OUT(req, 0, sizeof(snap))); 564 565 snap = nchstats; 566 snap.ncs_goodhits = counter_u64_fetch(numposhits); 567 snap.ncs_neghits = counter_u64_fetch(numneghits); 568 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 569 counter_u64_fetch(numnegzaps); 570 snap.ncs_miss = counter_u64_fetch(nummisszap) + 571 counter_u64_fetch(nummiss); 572 573 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 574 } 575 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 576 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 577 "VFS cache effectiveness statistics"); 578 579 #ifdef DIAGNOSTIC 580 /* 581 * Grab an atomic snapshot of the name cache hash chain lengths 582 */ 583 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, 584 "hash table stats"); 585 586 static int 587 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 588 { 589 struct nchashhead *ncpp; 590 struct namecache *ncp; 591 int i, error, n_nchash, *cntbuf; 592 593 retry: 594 n_nchash = nchash + 1; /* nchash is max index, not count */ 595 if (req->oldptr == NULL) 596 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 597 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 598 cache_lock_all_buckets(); 599 if (n_nchash != nchash + 1) { 600 cache_unlock_all_buckets(); 601 free(cntbuf, M_TEMP); 602 goto retry; 603 } 604 /* Scan hash tables counting entries */ 605 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 606 LIST_FOREACH(ncp, ncpp, nc_hash) 607 cntbuf[i]++; 608 cache_unlock_all_buckets(); 609 for (error = 0, i = 0; i < n_nchash; i++) 610 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 611 break; 612 free(cntbuf, M_TEMP); 613 return (error); 614 } 615 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 616 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 617 "nchash chain lengths"); 618 619 static int 620 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 621 { 622 int error; 623 struct nchashhead *ncpp; 624 struct namecache *ncp; 625 int n_nchash; 626 int count, maxlength, used, pct; 627 628 if (!req->oldptr) 629 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 630 631 cache_lock_all_buckets(); 632 n_nchash = nchash + 1; /* nchash is max index, not count */ 633 used = 0; 634 maxlength = 0; 635 636 /* Scan hash tables for applicable entries */ 637 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 638 count = 0; 639 LIST_FOREACH(ncp, ncpp, nc_hash) { 640 count++; 641 } 642 if (count) 643 used++; 644 if (maxlength < count) 645 maxlength = count; 646 } 647 n_nchash = nchash + 1; 648 cache_unlock_all_buckets(); 649 pct = (used * 100) / (n_nchash / 100); 650 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 651 if (error) 652 return (error); 653 error = SYSCTL_OUT(req, &used, sizeof(used)); 654 if (error) 655 return (error); 656 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 657 if (error) 658 return (error); 659 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 660 if (error) 661 return (error); 662 return (0); 663 } 664 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 665 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 666 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 667 #endif 668 669 /* 670 * Negative entries management 671 * 672 * A variation of LRU scheme is used. New entries are hashed into one of 673 * numneglists cold lists. Entries get promoted to the hot list on first hit. 674 * 675 * The shrinker will demote hot list head and evict from the cold list in a 676 * round-robin manner. 677 */ 678 static void 679 cache_negative_hit(struct namecache *ncp) 680 { 681 struct neglist *neglist; 682 683 MPASS(ncp->nc_flag & NCF_NEGATIVE); 684 if (ncp->nc_flag & NCF_HOTNEGATIVE) 685 return; 686 neglist = NCP2NEGLIST(ncp); 687 mtx_lock(&ncneg_hot.nl_lock); 688 mtx_lock(&neglist->nl_lock); 689 if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { 690 numhotneg++; 691 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 692 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 693 ncp->nc_flag |= NCF_HOTNEGATIVE; 694 } 695 mtx_unlock(&neglist->nl_lock); 696 mtx_unlock(&ncneg_hot.nl_lock); 697 } 698 699 static void 700 cache_negative_insert(struct namecache *ncp, bool neg_locked) 701 { 702 struct neglist *neglist; 703 704 MPASS(ncp->nc_flag & NCF_NEGATIVE); 705 cache_assert_bucket_locked(ncp, RA_WLOCKED); 706 neglist = NCP2NEGLIST(ncp); 707 if (!neg_locked) { 708 mtx_lock(&neglist->nl_lock); 709 } else { 710 mtx_assert(&neglist->nl_lock, MA_OWNED); 711 } 712 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 713 if (!neg_locked) 714 mtx_unlock(&neglist->nl_lock); 715 atomic_add_rel_long(&numneg, 1); 716 } 717 718 static void 719 cache_negative_remove(struct namecache *ncp, bool neg_locked) 720 { 721 struct neglist *neglist; 722 bool hot_locked = false; 723 bool list_locked = false; 724 725 MPASS(ncp->nc_flag & NCF_NEGATIVE); 726 cache_assert_bucket_locked(ncp, RA_WLOCKED); 727 neglist = NCP2NEGLIST(ncp); 728 if (!neg_locked) { 729 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 730 hot_locked = true; 731 mtx_lock(&ncneg_hot.nl_lock); 732 if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { 733 list_locked = true; 734 mtx_lock(&neglist->nl_lock); 735 } 736 } else { 737 list_locked = true; 738 mtx_lock(&neglist->nl_lock); 739 } 740 } 741 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 742 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); 743 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 744 numhotneg--; 745 } else { 746 mtx_assert(&neglist->nl_lock, MA_OWNED); 747 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 748 } 749 if (list_locked) 750 mtx_unlock(&neglist->nl_lock); 751 if (hot_locked) 752 mtx_unlock(&ncneg_hot.nl_lock); 753 atomic_subtract_rel_long(&numneg, 1); 754 } 755 756 static void 757 cache_negative_shrink_select(int start, struct namecache **ncpp, 758 struct neglist **neglistpp) 759 { 760 struct neglist *neglist; 761 struct namecache *ncp; 762 int i; 763 764 *ncpp = ncp = NULL; 765 neglist = NULL; 766 767 for (i = start; i < numneglists; i++) { 768 neglist = &neglists[i]; 769 if (TAILQ_FIRST(&neglist->nl_list) == NULL) 770 continue; 771 mtx_lock(&neglist->nl_lock); 772 ncp = TAILQ_FIRST(&neglist->nl_list); 773 if (ncp != NULL) 774 break; 775 mtx_unlock(&neglist->nl_lock); 776 } 777 778 *neglistpp = neglist; 779 *ncpp = ncp; 780 } 781 782 static void 783 cache_negative_zap_one(void) 784 { 785 struct namecache *ncp, *ncp2; 786 struct neglist *neglist; 787 struct mtx *dvlp; 788 struct rwlock *blp; 789 790 if (mtx_owner(&ncneg_shrink_lock) != NULL || 791 !mtx_trylock(&ncneg_shrink_lock)) { 792 counter_u64_add(shrinking_skipped, 1); 793 return; 794 } 795 796 mtx_lock(&ncneg_hot.nl_lock); 797 ncp = TAILQ_FIRST(&ncneg_hot.nl_list); 798 if (ncp != NULL) { 799 neglist = NCP2NEGLIST(ncp); 800 mtx_lock(&neglist->nl_lock); 801 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 802 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 803 ncp->nc_flag &= ~NCF_HOTNEGATIVE; 804 numhotneg--; 805 mtx_unlock(&neglist->nl_lock); 806 } 807 mtx_unlock(&ncneg_hot.nl_lock); 808 809 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); 810 shrink_list_turn++; 811 if (shrink_list_turn == numneglists) 812 shrink_list_turn = 0; 813 if (ncp == NULL && shrink_list_turn == 0) 814 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); 815 mtx_unlock(&ncneg_shrink_lock); 816 if (ncp == NULL) 817 return; 818 819 MPASS(ncp->nc_flag & NCF_NEGATIVE); 820 dvlp = VP2VNODELOCK(ncp->nc_dvp); 821 blp = NCP2BUCKETLOCK(ncp); 822 mtx_unlock(&neglist->nl_lock); 823 mtx_lock(dvlp); 824 rw_wlock(blp); 825 mtx_lock(&neglist->nl_lock); 826 ncp2 = TAILQ_FIRST(&neglist->nl_list); 827 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || 828 blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) { 829 ncp = NULL; 830 } else { 831 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp, 832 ncp->nc_name); 833 834 cache_zap_locked(ncp, true); 835 counter_u64_add(numneg_evicted, 1); 836 } 837 mtx_unlock(&neglist->nl_lock); 838 rw_wunlock(blp); 839 mtx_unlock(dvlp); 840 cache_free(ncp); 841 } 842 843 /* 844 * cache_zap_locked(): 845 * 846 * Removes a namecache entry from cache, whether it contains an actual 847 * pointer to a vnode or if it is just a negative cache entry. 848 */ 849 static void 850 cache_zap_locked(struct namecache *ncp, bool neg_locked) 851 { 852 853 if (!(ncp->nc_flag & NCF_NEGATIVE)) 854 cache_assert_vnode_locked(ncp->nc_vp); 855 cache_assert_vnode_locked(ncp->nc_dvp); 856 cache_assert_bucket_locked(ncp, RA_WLOCKED); 857 858 CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, 859 (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp); 860 LIST_REMOVE(ncp, nc_hash); 861 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 862 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 863 ncp->nc_name, ncp->nc_vp); 864 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 865 if (ncp == ncp->nc_vp->v_cache_dd) 866 ncp->nc_vp->v_cache_dd = NULL; 867 } else { 868 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 869 ncp->nc_name); 870 cache_negative_remove(ncp, neg_locked); 871 } 872 if (ncp->nc_flag & NCF_ISDOTDOT) { 873 if (ncp == ncp->nc_dvp->v_cache_dd) 874 ncp->nc_dvp->v_cache_dd = NULL; 875 } else { 876 LIST_REMOVE(ncp, nc_src); 877 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 878 ncp->nc_flag |= NCF_DVDROP; 879 atomic_subtract_rel_long(&numcachehv, 1); 880 } 881 } 882 atomic_subtract_rel_long(&numcache, 1); 883 } 884 885 static void 886 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 887 { 888 struct rwlock *blp; 889 890 MPASS(ncp->nc_dvp == vp); 891 MPASS(ncp->nc_flag & NCF_NEGATIVE); 892 cache_assert_vnode_locked(vp); 893 894 blp = NCP2BUCKETLOCK(ncp); 895 rw_wlock(blp); 896 cache_zap_locked(ncp, false); 897 rw_wunlock(blp); 898 } 899 900 static bool 901 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 902 struct mtx **vlpp) 903 { 904 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 905 struct rwlock *blp; 906 907 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 908 cache_assert_vnode_locked(vp); 909 910 if (ncp->nc_flag & NCF_NEGATIVE) { 911 if (*vlpp != NULL) { 912 mtx_unlock(*vlpp); 913 *vlpp = NULL; 914 } 915 cache_zap_negative_locked_vnode_kl(ncp, vp); 916 return (true); 917 } 918 919 pvlp = VP2VNODELOCK(vp); 920 blp = NCP2BUCKETLOCK(ncp); 921 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 922 vlp2 = VP2VNODELOCK(ncp->nc_vp); 923 924 if (*vlpp == vlp1 || *vlpp == vlp2) { 925 to_unlock = *vlpp; 926 *vlpp = NULL; 927 } else { 928 if (*vlpp != NULL) { 929 mtx_unlock(*vlpp); 930 *vlpp = NULL; 931 } 932 cache_sort_vnodes(&vlp1, &vlp2); 933 if (vlp1 == pvlp) { 934 mtx_lock(vlp2); 935 to_unlock = vlp2; 936 } else { 937 if (!mtx_trylock(vlp1)) 938 goto out_relock; 939 to_unlock = vlp1; 940 } 941 } 942 rw_wlock(blp); 943 cache_zap_locked(ncp, false); 944 rw_wunlock(blp); 945 if (to_unlock != NULL) 946 mtx_unlock(to_unlock); 947 return (true); 948 949 out_relock: 950 mtx_unlock(vlp2); 951 mtx_lock(vlp1); 952 mtx_lock(vlp2); 953 MPASS(*vlpp == NULL); 954 *vlpp = vlp1; 955 return (false); 956 } 957 958 static int __noinline 959 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) 960 { 961 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 962 struct rwlock *blp; 963 int error = 0; 964 965 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 966 cache_assert_vnode_locked(vp); 967 968 pvlp = VP2VNODELOCK(vp); 969 if (ncp->nc_flag & NCF_NEGATIVE) { 970 cache_zap_negative_locked_vnode_kl(ncp, vp); 971 goto out; 972 } 973 974 blp = NCP2BUCKETLOCK(ncp); 975 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 976 vlp2 = VP2VNODELOCK(ncp->nc_vp); 977 cache_sort_vnodes(&vlp1, &vlp2); 978 if (vlp1 == pvlp) { 979 mtx_lock(vlp2); 980 to_unlock = vlp2; 981 } else { 982 if (!mtx_trylock(vlp1)) { 983 error = EAGAIN; 984 goto out; 985 } 986 to_unlock = vlp1; 987 } 988 rw_wlock(blp); 989 cache_zap_locked(ncp, false); 990 rw_wunlock(blp); 991 mtx_unlock(to_unlock); 992 out: 993 mtx_unlock(pvlp); 994 return (error); 995 } 996 997 /* 998 * If trylocking failed we can get here. We know enough to take all needed locks 999 * in the right order and re-lookup the entry. 1000 */ 1001 static int 1002 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1003 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1004 struct rwlock *blp) 1005 { 1006 struct namecache *rncp; 1007 1008 cache_assert_bucket_locked(ncp, RA_UNLOCKED); 1009 1010 cache_sort_vnodes(&dvlp, &vlp); 1011 cache_lock_vnodes(dvlp, vlp); 1012 rw_wlock(blp); 1013 LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1014 if (rncp == ncp && rncp->nc_dvp == dvp && 1015 rncp->nc_nlen == cnp->cn_namelen && 1016 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1017 break; 1018 } 1019 if (rncp != NULL) { 1020 cache_zap_locked(rncp, false); 1021 rw_wunlock(blp); 1022 cache_unlock_vnodes(dvlp, vlp); 1023 counter_u64_add(zap_and_exit_bucket_relock_success, 1); 1024 return (0); 1025 } 1026 1027 rw_wunlock(blp); 1028 cache_unlock_vnodes(dvlp, vlp); 1029 return (EAGAIN); 1030 } 1031 1032 static int __noinline 1033 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1034 uint32_t hash, struct rwlock *blp) 1035 { 1036 struct mtx *dvlp, *vlp; 1037 struct vnode *dvp; 1038 1039 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1040 1041 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1042 vlp = NULL; 1043 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1044 vlp = VP2VNODELOCK(ncp->nc_vp); 1045 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1046 cache_zap_locked(ncp, false); 1047 rw_wunlock(blp); 1048 cache_unlock_vnodes(dvlp, vlp); 1049 return (0); 1050 } 1051 1052 dvp = ncp->nc_dvp; 1053 rw_wunlock(blp); 1054 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1055 } 1056 1057 static int __noinline 1058 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1059 uint32_t hash, struct rwlock *blp) 1060 { 1061 struct mtx *dvlp, *vlp; 1062 struct vnode *dvp; 1063 1064 cache_assert_bucket_locked(ncp, RA_RLOCKED); 1065 1066 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1067 vlp = NULL; 1068 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1069 vlp = VP2VNODELOCK(ncp->nc_vp); 1070 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1071 rw_runlock(blp); 1072 rw_wlock(blp); 1073 cache_zap_locked(ncp, false); 1074 rw_wunlock(blp); 1075 cache_unlock_vnodes(dvlp, vlp); 1076 return (0); 1077 } 1078 1079 dvp = ncp->nc_dvp; 1080 rw_runlock(blp); 1081 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1082 } 1083 1084 static int 1085 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, 1086 struct mtx **vlpp1, struct mtx **vlpp2) 1087 { 1088 struct mtx *dvlp, *vlp; 1089 1090 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1091 1092 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1093 vlp = NULL; 1094 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1095 vlp = VP2VNODELOCK(ncp->nc_vp); 1096 cache_sort_vnodes(&dvlp, &vlp); 1097 1098 if (*vlpp1 == dvlp && *vlpp2 == vlp) { 1099 cache_zap_locked(ncp, false); 1100 cache_unlock_vnodes(dvlp, vlp); 1101 *vlpp1 = NULL; 1102 *vlpp2 = NULL; 1103 return (0); 1104 } 1105 1106 if (*vlpp1 != NULL) 1107 mtx_unlock(*vlpp1); 1108 if (*vlpp2 != NULL) 1109 mtx_unlock(*vlpp2); 1110 *vlpp1 = NULL; 1111 *vlpp2 = NULL; 1112 1113 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1114 cache_zap_locked(ncp, false); 1115 cache_unlock_vnodes(dvlp, vlp); 1116 return (0); 1117 } 1118 1119 rw_wunlock(blp); 1120 *vlpp1 = dvlp; 1121 *vlpp2 = vlp; 1122 if (*vlpp1 != NULL) 1123 mtx_lock(*vlpp1); 1124 mtx_lock(*vlpp2); 1125 rw_wlock(blp); 1126 return (EAGAIN); 1127 } 1128 1129 static void 1130 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) 1131 { 1132 1133 if (blp != NULL) { 1134 rw_runlock(blp); 1135 } else { 1136 mtx_unlock(vlp); 1137 } 1138 } 1139 1140 static int __noinline 1141 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1142 struct timespec *tsp, int *ticksp) 1143 { 1144 int ltype; 1145 1146 *vpp = dvp; 1147 CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", 1148 dvp, cnp->cn_nameptr); 1149 counter_u64_add(dothits, 1); 1150 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1151 if (tsp != NULL) 1152 timespecclear(tsp); 1153 if (ticksp != NULL) 1154 *ticksp = ticks; 1155 vrefact(*vpp); 1156 /* 1157 * When we lookup "." we still can be asked to lock it 1158 * differently... 1159 */ 1160 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1161 if (ltype != VOP_ISLOCKED(*vpp)) { 1162 if (ltype == LK_EXCLUSIVE) { 1163 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1164 if ((*vpp)->v_iflag & VI_DOOMED) { 1165 /* forced unmount */ 1166 vrele(*vpp); 1167 *vpp = NULL; 1168 return (ENOENT); 1169 } 1170 } else 1171 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1172 } 1173 return (-1); 1174 } 1175 1176 static __noinline int 1177 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp, 1178 struct componentname *cnp, struct timespec *tsp, int *ticksp) 1179 { 1180 struct namecache *ncp; 1181 struct rwlock *blp; 1182 struct mtx *dvlp, *dvlp2; 1183 uint32_t hash; 1184 int error; 1185 1186 if (cnp->cn_namelen == 2 && 1187 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1188 counter_u64_add(dotdothits, 1); 1189 dvlp = VP2VNODELOCK(dvp); 1190 dvlp2 = NULL; 1191 mtx_lock(dvlp); 1192 retry_dotdot: 1193 ncp = dvp->v_cache_dd; 1194 if (ncp == NULL) { 1195 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1196 "..", NULL); 1197 mtx_unlock(dvlp); 1198 if (dvlp2 != NULL) 1199 mtx_unlock(dvlp2); 1200 return (0); 1201 } 1202 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1203 if (ncp->nc_dvp != dvp) 1204 panic("dvp %p v_cache_dd %p\n", dvp, ncp); 1205 if (!cache_zap_locked_vnode_kl2(ncp, 1206 dvp, &dvlp2)) 1207 goto retry_dotdot; 1208 MPASS(dvp->v_cache_dd == NULL); 1209 mtx_unlock(dvlp); 1210 if (dvlp2 != NULL) 1211 mtx_unlock(dvlp2); 1212 cache_free(ncp); 1213 } else { 1214 dvp->v_cache_dd = NULL; 1215 mtx_unlock(dvlp); 1216 if (dvlp2 != NULL) 1217 mtx_unlock(dvlp2); 1218 } 1219 return (0); 1220 } 1221 1222 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1223 blp = HASH2BUCKETLOCK(hash); 1224 retry: 1225 if (LIST_EMPTY(NCHHASH(hash))) 1226 goto out_no_entry; 1227 1228 rw_wlock(blp); 1229 1230 LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1231 counter_u64_add(numchecks, 1); 1232 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1233 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1234 break; 1235 } 1236 1237 /* We failed to find an entry */ 1238 if (ncp == NULL) { 1239 rw_wunlock(blp); 1240 goto out_no_entry; 1241 } 1242 1243 error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp); 1244 if (__predict_false(error != 0)) { 1245 zap_and_exit_bucket_fail++; 1246 cache_maybe_yield(); 1247 goto retry; 1248 } 1249 counter_u64_add(numposzaps, 1); 1250 cache_free(ncp); 1251 return (0); 1252 out_no_entry: 1253 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); 1254 counter_u64_add(nummisszap, 1); 1255 return (0); 1256 } 1257 1258 /** 1259 * Lookup a name in the name cache 1260 * 1261 * # Arguments 1262 * 1263 * - dvp: Parent directory in which to search. 1264 * - vpp: Return argument. Will contain desired vnode on cache hit. 1265 * - cnp: Parameters of the name search. The most interesting bits of 1266 * the cn_flags field have the following meanings: 1267 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1268 * it up. 1269 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1270 * - tsp: Return storage for cache timestamp. On a successful (positive 1271 * or negative) lookup, tsp will be filled with any timespec that 1272 * was stored when this cache entry was created. However, it will 1273 * be clear for "." entries. 1274 * - ticks: Return storage for alternate cache timestamp. On a successful 1275 * (positive or negative) lookup, it will contain the ticks value 1276 * that was current when the cache entry was created, unless cnp 1277 * was ".". 1278 * 1279 * # Returns 1280 * 1281 * - -1: A positive cache hit. vpp will contain the desired vnode. 1282 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1283 * to a forced unmount. vpp will not be modified. If the entry 1284 * is a whiteout, then the ISWHITEOUT flag will be set in 1285 * cnp->cn_flags. 1286 * - 0: A cache miss. vpp will not be modified. 1287 * 1288 * # Locking 1289 * 1290 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1291 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1292 * lock is not recursively acquired. 1293 */ 1294 int 1295 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1296 struct timespec *tsp, int *ticksp) 1297 { 1298 struct namecache_ts *ncp_ts; 1299 struct namecache *ncp; 1300 struct rwlock *blp; 1301 struct mtx *dvlp; 1302 uint32_t hash; 1303 enum vgetstate vs; 1304 int error, ltype; 1305 1306 if (__predict_false(!doingcache)) { 1307 cnp->cn_flags &= ~MAKEENTRY; 1308 return (0); 1309 } 1310 1311 counter_u64_add(numcalls, 1); 1312 1313 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) 1314 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1315 1316 if ((cnp->cn_flags & MAKEENTRY) == 0) 1317 return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp)); 1318 1319 retry: 1320 blp = NULL; 1321 dvlp = NULL; 1322 error = 0; 1323 if (cnp->cn_namelen == 2 && 1324 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1325 counter_u64_add(dotdothits, 1); 1326 dvlp = VP2VNODELOCK(dvp); 1327 mtx_lock(dvlp); 1328 ncp = dvp->v_cache_dd; 1329 if (ncp == NULL) { 1330 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1331 "..", NULL); 1332 mtx_unlock(dvlp); 1333 return (0); 1334 } 1335 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1336 if (ncp->nc_flag & NCF_NEGATIVE) 1337 *vpp = NULL; 1338 else 1339 *vpp = ncp->nc_vp; 1340 } else 1341 *vpp = ncp->nc_dvp; 1342 /* Return failure if negative entry was found. */ 1343 if (*vpp == NULL) 1344 goto negative_success; 1345 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", 1346 dvp, cnp->cn_nameptr, *vpp); 1347 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", 1348 *vpp); 1349 cache_out_ts(ncp, tsp, ticksp); 1350 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1351 NCF_DTS && tsp != NULL) { 1352 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1353 *tsp = ncp_ts->nc_dotdottime; 1354 } 1355 goto success; 1356 } 1357 1358 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1359 blp = HASH2BUCKETLOCK(hash); 1360 rw_rlock(blp); 1361 1362 LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1363 counter_u64_add(numchecks, 1); 1364 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1365 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1366 break; 1367 } 1368 1369 /* We failed to find an entry */ 1370 if (__predict_false(ncp == NULL)) { 1371 rw_runlock(blp); 1372 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1373 NULL); 1374 counter_u64_add(nummiss, 1); 1375 return (0); 1376 } 1377 1378 if (ncp->nc_flag & NCF_NEGATIVE) 1379 goto negative_success; 1380 1381 /* We found a "positive" match, return the vnode */ 1382 counter_u64_add(numposhits, 1); 1383 *vpp = ncp->nc_vp; 1384 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", 1385 dvp, cnp->cn_nameptr, *vpp, ncp); 1386 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, 1387 *vpp); 1388 cache_out_ts(ncp, tsp, ticksp); 1389 success: 1390 /* 1391 * On success we return a locked and ref'd vnode as per the lookup 1392 * protocol. 1393 */ 1394 MPASS(dvp != *vpp); 1395 ltype = 0; /* silence gcc warning */ 1396 if (cnp->cn_flags & ISDOTDOT) { 1397 ltype = VOP_ISLOCKED(dvp); 1398 VOP_UNLOCK(dvp, 0); 1399 } 1400 vs = vget_prep(*vpp); 1401 cache_lookup_unlock(blp, dvlp); 1402 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1403 if (cnp->cn_flags & ISDOTDOT) { 1404 vn_lock(dvp, ltype | LK_RETRY); 1405 if (dvp->v_iflag & VI_DOOMED) { 1406 if (error == 0) 1407 vput(*vpp); 1408 *vpp = NULL; 1409 return (ENOENT); 1410 } 1411 } 1412 if (error) { 1413 *vpp = NULL; 1414 goto retry; 1415 } 1416 if ((cnp->cn_flags & ISLASTCN) && 1417 (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { 1418 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); 1419 } 1420 return (-1); 1421 1422 negative_success: 1423 /* We found a negative match, and want to create it, so purge */ 1424 if (cnp->cn_nameiop == CREATE) { 1425 counter_u64_add(numnegzaps, 1); 1426 goto zap_and_exit; 1427 } 1428 1429 counter_u64_add(numneghits, 1); 1430 cache_negative_hit(ncp); 1431 if (ncp->nc_flag & NCF_WHITE) 1432 cnp->cn_flags |= ISWHITEOUT; 1433 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 1434 ncp->nc_name); 1435 cache_out_ts(ncp, tsp, ticksp); 1436 cache_lookup_unlock(blp, dvlp); 1437 return (ENOENT); 1438 1439 zap_and_exit: 1440 if (blp != NULL) 1441 error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp); 1442 else 1443 error = cache_zap_locked_vnode(ncp, dvp); 1444 if (__predict_false(error != 0)) { 1445 zap_and_exit_bucket_fail2++; 1446 cache_maybe_yield(); 1447 goto retry; 1448 } 1449 cache_free(ncp); 1450 return (0); 1451 } 1452 1453 struct celockstate { 1454 struct mtx *vlp[3]; 1455 struct rwlock *blp[2]; 1456 }; 1457 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1458 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1459 1460 static inline void 1461 cache_celockstate_init(struct celockstate *cel) 1462 { 1463 1464 bzero(cel, sizeof(*cel)); 1465 } 1466 1467 static void 1468 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1469 struct vnode *dvp) 1470 { 1471 struct mtx *vlp1, *vlp2; 1472 1473 MPASS(cel->vlp[0] == NULL); 1474 MPASS(cel->vlp[1] == NULL); 1475 MPASS(cel->vlp[2] == NULL); 1476 1477 MPASS(vp != NULL || dvp != NULL); 1478 1479 vlp1 = VP2VNODELOCK(vp); 1480 vlp2 = VP2VNODELOCK(dvp); 1481 cache_sort_vnodes(&vlp1, &vlp2); 1482 1483 if (vlp1 != NULL) { 1484 mtx_lock(vlp1); 1485 cel->vlp[0] = vlp1; 1486 } 1487 mtx_lock(vlp2); 1488 cel->vlp[1] = vlp2; 1489 } 1490 1491 static void 1492 cache_unlock_vnodes_cel(struct celockstate *cel) 1493 { 1494 1495 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1496 1497 if (cel->vlp[0] != NULL) 1498 mtx_unlock(cel->vlp[0]); 1499 if (cel->vlp[1] != NULL) 1500 mtx_unlock(cel->vlp[1]); 1501 if (cel->vlp[2] != NULL) 1502 mtx_unlock(cel->vlp[2]); 1503 } 1504 1505 static bool 1506 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1507 { 1508 struct mtx *vlp; 1509 bool ret; 1510 1511 cache_assert_vlp_locked(cel->vlp[0]); 1512 cache_assert_vlp_locked(cel->vlp[1]); 1513 MPASS(cel->vlp[2] == NULL); 1514 1515 MPASS(vp != NULL); 1516 vlp = VP2VNODELOCK(vp); 1517 1518 ret = true; 1519 if (vlp >= cel->vlp[1]) { 1520 mtx_lock(vlp); 1521 } else { 1522 if (mtx_trylock(vlp)) 1523 goto out; 1524 cache_lock_vnodes_cel_3_failures++; 1525 cache_unlock_vnodes_cel(cel); 1526 if (vlp < cel->vlp[0]) { 1527 mtx_lock(vlp); 1528 mtx_lock(cel->vlp[0]); 1529 mtx_lock(cel->vlp[1]); 1530 } else { 1531 if (cel->vlp[0] != NULL) 1532 mtx_lock(cel->vlp[0]); 1533 mtx_lock(vlp); 1534 mtx_lock(cel->vlp[1]); 1535 } 1536 ret = false; 1537 } 1538 out: 1539 cel->vlp[2] = vlp; 1540 return (ret); 1541 } 1542 1543 static void 1544 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, 1545 struct rwlock *blp2) 1546 { 1547 1548 MPASS(cel->blp[0] == NULL); 1549 MPASS(cel->blp[1] == NULL); 1550 1551 cache_sort_vnodes(&blp1, &blp2); 1552 1553 if (blp1 != NULL) { 1554 rw_wlock(blp1); 1555 cel->blp[0] = blp1; 1556 } 1557 rw_wlock(blp2); 1558 cel->blp[1] = blp2; 1559 } 1560 1561 static void 1562 cache_unlock_buckets_cel(struct celockstate *cel) 1563 { 1564 1565 if (cel->blp[0] != NULL) 1566 rw_wunlock(cel->blp[0]); 1567 rw_wunlock(cel->blp[1]); 1568 } 1569 1570 /* 1571 * Lock part of the cache affected by the insertion. 1572 * 1573 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1574 * However, insertion can result in removal of an old entry. In this 1575 * case we have an additional vnode and bucketlock pair to lock. If the 1576 * entry is negative, ncelock is locked instead of the vnode. 1577 * 1578 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1579 * preserving the locking order (smaller address first). 1580 */ 1581 static void 1582 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1583 uint32_t hash) 1584 { 1585 struct namecache *ncp; 1586 struct rwlock *blps[2]; 1587 1588 blps[0] = HASH2BUCKETLOCK(hash); 1589 for (;;) { 1590 blps[1] = NULL; 1591 cache_lock_vnodes_cel(cel, dvp, vp); 1592 if (vp == NULL || vp->v_type != VDIR) 1593 break; 1594 ncp = vp->v_cache_dd; 1595 if (ncp == NULL) 1596 break; 1597 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1598 break; 1599 MPASS(ncp->nc_dvp == vp); 1600 blps[1] = NCP2BUCKETLOCK(ncp); 1601 if (ncp->nc_flag & NCF_NEGATIVE) 1602 break; 1603 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1604 break; 1605 /* 1606 * All vnodes got re-locked. Re-validate the state and if 1607 * nothing changed we are done. Otherwise restart. 1608 */ 1609 if (ncp == vp->v_cache_dd && 1610 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1611 blps[1] == NCP2BUCKETLOCK(ncp) && 1612 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1613 break; 1614 cache_unlock_vnodes_cel(cel); 1615 cel->vlp[0] = NULL; 1616 cel->vlp[1] = NULL; 1617 cel->vlp[2] = NULL; 1618 } 1619 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1620 } 1621 1622 static void 1623 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1624 uint32_t hash) 1625 { 1626 struct namecache *ncp; 1627 struct rwlock *blps[2]; 1628 1629 blps[0] = HASH2BUCKETLOCK(hash); 1630 for (;;) { 1631 blps[1] = NULL; 1632 cache_lock_vnodes_cel(cel, dvp, vp); 1633 ncp = dvp->v_cache_dd; 1634 if (ncp == NULL) 1635 break; 1636 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1637 break; 1638 MPASS(ncp->nc_dvp == dvp); 1639 blps[1] = NCP2BUCKETLOCK(ncp); 1640 if (ncp->nc_flag & NCF_NEGATIVE) 1641 break; 1642 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1643 break; 1644 if (ncp == dvp->v_cache_dd && 1645 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1646 blps[1] == NCP2BUCKETLOCK(ncp) && 1647 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1648 break; 1649 cache_unlock_vnodes_cel(cel); 1650 cel->vlp[0] = NULL; 1651 cel->vlp[1] = NULL; 1652 cel->vlp[2] = NULL; 1653 } 1654 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1655 } 1656 1657 static void 1658 cache_enter_unlock(struct celockstate *cel) 1659 { 1660 1661 cache_unlock_buckets_cel(cel); 1662 cache_unlock_vnodes_cel(cel); 1663 } 1664 1665 /* 1666 * Add an entry to the cache. 1667 */ 1668 void 1669 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 1670 struct timespec *tsp, struct timespec *dtsp) 1671 { 1672 struct celockstate cel; 1673 struct namecache *ncp, *n2, *ndd; 1674 struct namecache_ts *ncp_ts, *n2_ts; 1675 struct nchashhead *ncpp; 1676 struct neglist *neglist; 1677 uint32_t hash; 1678 int flag; 1679 int len; 1680 bool neg_locked, held_dvp; 1681 u_long lnumcache; 1682 1683 CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); 1684 VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp, 1685 ("cache_enter: Adding a doomed vnode")); 1686 VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp, 1687 ("cache_enter: Doomed vnode used as src")); 1688 1689 if (__predict_false(!doingcache)) 1690 return; 1691 1692 /* 1693 * Avoid blowout in namecache entries. 1694 */ 1695 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1696 if (__predict_false(lnumcache >= ncsize)) { 1697 atomic_add_long(&numcache, -1); 1698 return; 1699 } 1700 1701 cache_celockstate_init(&cel); 1702 ndd = NULL; 1703 ncp_ts = NULL; 1704 flag = 0; 1705 if (cnp->cn_nameptr[0] == '.') { 1706 if (cnp->cn_namelen == 1) 1707 return; 1708 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 1709 len = cnp->cn_namelen; 1710 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1711 cache_enter_lock_dd(&cel, dvp, vp, hash); 1712 /* 1713 * If dotdot entry already exists, just retarget it 1714 * to new parent vnode, otherwise continue with new 1715 * namecache entry allocation. 1716 */ 1717 if ((ncp = dvp->v_cache_dd) != NULL && 1718 ncp->nc_flag & NCF_ISDOTDOT) { 1719 KASSERT(ncp->nc_dvp == dvp, 1720 ("wrong isdotdot parent")); 1721 neg_locked = false; 1722 if (ncp->nc_flag & NCF_NEGATIVE || vp == NULL) { 1723 neglist = NCP2NEGLIST(ncp); 1724 mtx_lock(&ncneg_hot.nl_lock); 1725 mtx_lock(&neglist->nl_lock); 1726 neg_locked = true; 1727 } 1728 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 1729 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, 1730 ncp, nc_dst); 1731 } else { 1732 cache_negative_remove(ncp, true); 1733 } 1734 if (vp != NULL) { 1735 TAILQ_INSERT_HEAD(&vp->v_cache_dst, 1736 ncp, nc_dst); 1737 if (ncp->nc_flag & NCF_HOTNEGATIVE) 1738 numhotneg--; 1739 ncp->nc_flag &= ~(NCF_NEGATIVE|NCF_HOTNEGATIVE); 1740 } else { 1741 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 1742 numhotneg--; 1743 ncp->nc_flag &= ~(NCF_HOTNEGATIVE); 1744 } 1745 ncp->nc_flag |= NCF_NEGATIVE; 1746 cache_negative_insert(ncp, true); 1747 } 1748 if (neg_locked) { 1749 mtx_unlock(&neglist->nl_lock); 1750 mtx_unlock(&ncneg_hot.nl_lock); 1751 } 1752 ncp->nc_vp = vp; 1753 cache_enter_unlock(&cel); 1754 return; 1755 } 1756 dvp->v_cache_dd = NULL; 1757 cache_enter_unlock(&cel); 1758 cache_celockstate_init(&cel); 1759 SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp); 1760 flag = NCF_ISDOTDOT; 1761 } 1762 } 1763 1764 held_dvp = false; 1765 if (LIST_EMPTY(&dvp->v_cache_src) && flag != NCF_ISDOTDOT) { 1766 vhold(dvp); 1767 atomic_add_long(&numcachehv, 1); 1768 held_dvp = true; 1769 } 1770 1771 /* 1772 * Calculate the hash key and setup as much of the new 1773 * namecache entry as possible before acquiring the lock. 1774 */ 1775 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1776 ncp->nc_flag = flag; 1777 ncp->nc_vp = vp; 1778 if (vp == NULL) 1779 ncp->nc_flag |= NCF_NEGATIVE; 1780 ncp->nc_dvp = dvp; 1781 if (tsp != NULL) { 1782 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1783 ncp_ts->nc_time = *tsp; 1784 ncp_ts->nc_ticks = ticks; 1785 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1786 if (dtsp != NULL) { 1787 ncp_ts->nc_dotdottime = *dtsp; 1788 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1789 } 1790 } 1791 len = ncp->nc_nlen = cnp->cn_namelen; 1792 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1793 strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1); 1794 cache_enter_lock(&cel, dvp, vp, hash); 1795 1796 /* 1797 * See if this vnode or negative entry is already in the cache 1798 * with this name. This can happen with concurrent lookups of 1799 * the same path name. 1800 */ 1801 ncpp = NCHHASH(hash); 1802 LIST_FOREACH(n2, ncpp, nc_hash) { 1803 if (n2->nc_dvp == dvp && 1804 n2->nc_nlen == cnp->cn_namelen && 1805 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1806 if (tsp != NULL) { 1807 KASSERT((n2->nc_flag & NCF_TS) != 0, 1808 ("no NCF_TS")); 1809 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1810 n2_ts->nc_time = ncp_ts->nc_time; 1811 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1812 if (dtsp != NULL) { 1813 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1814 if (ncp->nc_flag & NCF_NEGATIVE) 1815 mtx_lock(&ncneg_hot.nl_lock); 1816 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1817 if (ncp->nc_flag & NCF_NEGATIVE) 1818 mtx_unlock(&ncneg_hot.nl_lock); 1819 } 1820 } 1821 goto out_unlock_free; 1822 } 1823 } 1824 1825 if (flag == NCF_ISDOTDOT) { 1826 /* 1827 * See if we are trying to add .. entry, but some other lookup 1828 * has populated v_cache_dd pointer already. 1829 */ 1830 if (dvp->v_cache_dd != NULL) 1831 goto out_unlock_free; 1832 KASSERT(vp == NULL || vp->v_type == VDIR, 1833 ("wrong vnode type %p", vp)); 1834 dvp->v_cache_dd = ncp; 1835 } 1836 1837 if (vp != NULL) { 1838 if (vp->v_type == VDIR) { 1839 if (flag != NCF_ISDOTDOT) { 1840 /* 1841 * For this case, the cache entry maps both the 1842 * directory name in it and the name ".." for the 1843 * directory's parent. 1844 */ 1845 if ((ndd = vp->v_cache_dd) != NULL) { 1846 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 1847 cache_zap_locked(ndd, false); 1848 else 1849 ndd = NULL; 1850 } 1851 vp->v_cache_dd = ncp; 1852 } 1853 } else { 1854 vp->v_cache_dd = NULL; 1855 } 1856 } 1857 1858 if (flag != NCF_ISDOTDOT) { 1859 if (LIST_EMPTY(&dvp->v_cache_src)) { 1860 if (!held_dvp) { 1861 vhold(dvp); 1862 atomic_add_long(&numcachehv, 1); 1863 } 1864 } else { 1865 if (held_dvp) { 1866 /* 1867 * This will not take the interlock as someone 1868 * else already holds the vnode on account of 1869 * the namecache and we hold locks preventing 1870 * this from changing. 1871 */ 1872 vdrop(dvp); 1873 atomic_subtract_long(&numcachehv, 1); 1874 } 1875 } 1876 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 1877 } 1878 1879 /* 1880 * Insert the new namecache entry into the appropriate chain 1881 * within the cache entries table. 1882 */ 1883 LIST_INSERT_HEAD(ncpp, ncp, nc_hash); 1884 1885 /* 1886 * If the entry is "negative", we place it into the 1887 * "negative" cache queue, otherwise, we place it into the 1888 * destination vnode's cache entries queue. 1889 */ 1890 if (vp != NULL) { 1891 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 1892 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 1893 vp); 1894 } else { 1895 if (cnp->cn_flags & ISWHITEOUT) 1896 ncp->nc_flag |= NCF_WHITE; 1897 cache_negative_insert(ncp, false); 1898 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 1899 ncp->nc_name); 1900 } 1901 cache_enter_unlock(&cel); 1902 if (numneg * ncnegfactor > lnumcache) 1903 cache_negative_zap_one(); 1904 cache_free(ndd); 1905 return; 1906 out_unlock_free: 1907 cache_enter_unlock(&cel); 1908 cache_free(ncp); 1909 if (held_dvp) { 1910 vdrop(dvp); 1911 atomic_subtract_long(&numcachehv, 1); 1912 } 1913 return; 1914 } 1915 1916 static u_int 1917 cache_roundup_2(u_int val) 1918 { 1919 u_int res; 1920 1921 for (res = 1; res <= val; res <<= 1) 1922 continue; 1923 1924 return (res); 1925 } 1926 1927 /* 1928 * Name cache initialization, from vfs_init() when we are booting 1929 */ 1930 static void 1931 nchinit(void *dummy __unused) 1932 { 1933 u_int i; 1934 1935 cache_zone_small = uma_zcreate("S VFS Cache", 1936 sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, 1937 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 1938 UMA_ZONE_ZINIT); 1939 cache_zone_small_ts = uma_zcreate("STS VFS Cache", 1940 sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, 1941 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 1942 UMA_ZONE_ZINIT); 1943 cache_zone_large = uma_zcreate("L VFS Cache", 1944 sizeof(struct namecache) + NAME_MAX + 1, 1945 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 1946 UMA_ZONE_ZINIT); 1947 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", 1948 sizeof(struct namecache_ts) + NAME_MAX + 1, 1949 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 1950 UMA_ZONE_ZINIT); 1951 1952 ncsize = desiredvnodes * ncsizefactor; 1953 nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); 1954 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 1955 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 1956 ncbuckethash = 7; 1957 if (ncbuckethash > nchash) 1958 ncbuckethash = nchash; 1959 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 1960 M_WAITOK | M_ZERO); 1961 for (i = 0; i < numbucketlocks; i++) 1962 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 1963 ncvnodehash = ncbuckethash; 1964 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 1965 M_WAITOK | M_ZERO); 1966 for (i = 0; i < numvnodelocks; i++) 1967 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 1968 ncpurgeminvnodes = numbucketlocks * 2; 1969 1970 ncneghash = 3; 1971 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 1972 M_WAITOK | M_ZERO); 1973 for (i = 0; i < numneglists; i++) { 1974 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 1975 TAILQ_INIT(&neglists[i].nl_list); 1976 } 1977 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 1978 TAILQ_INIT(&ncneg_hot.nl_list); 1979 1980 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 1981 1982 numcalls = counter_u64_alloc(M_WAITOK); 1983 dothits = counter_u64_alloc(M_WAITOK); 1984 dotdothits = counter_u64_alloc(M_WAITOK); 1985 numchecks = counter_u64_alloc(M_WAITOK); 1986 nummiss = counter_u64_alloc(M_WAITOK); 1987 nummisszap = counter_u64_alloc(M_WAITOK); 1988 numposzaps = counter_u64_alloc(M_WAITOK); 1989 numposhits = counter_u64_alloc(M_WAITOK); 1990 numnegzaps = counter_u64_alloc(M_WAITOK); 1991 numneghits = counter_u64_alloc(M_WAITOK); 1992 numfullpathcalls = counter_u64_alloc(M_WAITOK); 1993 numfullpathfail1 = counter_u64_alloc(M_WAITOK); 1994 numfullpathfail2 = counter_u64_alloc(M_WAITOK); 1995 numfullpathfail4 = counter_u64_alloc(M_WAITOK); 1996 numfullpathfound = counter_u64_alloc(M_WAITOK); 1997 zap_and_exit_bucket_relock_success = counter_u64_alloc(M_WAITOK); 1998 numneg_evicted = counter_u64_alloc(M_WAITOK); 1999 shrinking_skipped = counter_u64_alloc(M_WAITOK); 2000 } 2001 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2002 2003 void 2004 cache_changesize(int newmaxvnodes) 2005 { 2006 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2007 u_long new_nchash, old_nchash; 2008 struct namecache *ncp; 2009 uint32_t hash; 2010 int newncsize; 2011 int i; 2012 2013 newncsize = newmaxvnodes * ncsizefactor; 2014 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2015 if (newmaxvnodes < numbucketlocks) 2016 newmaxvnodes = numbucketlocks; 2017 2018 new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash); 2019 /* If same hash table size, nothing to do */ 2020 if (nchash == new_nchash) { 2021 free(new_nchashtbl, M_VFSCACHE); 2022 return; 2023 } 2024 /* 2025 * Move everything from the old hash table to the new table. 2026 * None of the namecache entries in the table can be removed 2027 * because to do so, they have to be removed from the hash table. 2028 */ 2029 cache_lock_all_vnodes(); 2030 cache_lock_all_buckets(); 2031 old_nchashtbl = nchashtbl; 2032 old_nchash = nchash; 2033 nchashtbl = new_nchashtbl; 2034 nchash = new_nchash; 2035 for (i = 0; i <= old_nchash; i++) { 2036 while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) { 2037 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2038 ncp->nc_dvp); 2039 LIST_REMOVE(ncp, nc_hash); 2040 LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2041 } 2042 } 2043 ncsize = newncsize; 2044 cache_unlock_all_buckets(); 2045 cache_unlock_all_vnodes(); 2046 free(old_nchashtbl, M_VFSCACHE); 2047 } 2048 2049 /* 2050 * Invalidate all entries from and to a particular vnode. 2051 */ 2052 void 2053 cache_purge(struct vnode *vp) 2054 { 2055 TAILQ_HEAD(, namecache) ncps; 2056 struct namecache *ncp, *nnp; 2057 struct mtx *vlp, *vlp2; 2058 2059 CTR1(KTR_VFS, "cache_purge(%p)", vp); 2060 SDT_PROBE1(vfs, namecache, purge, done, vp); 2061 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2062 vp->v_cache_dd == NULL) 2063 return; 2064 TAILQ_INIT(&ncps); 2065 vlp = VP2VNODELOCK(vp); 2066 vlp2 = NULL; 2067 mtx_lock(vlp); 2068 retry: 2069 while (!LIST_EMPTY(&vp->v_cache_src)) { 2070 ncp = LIST_FIRST(&vp->v_cache_src); 2071 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2072 goto retry; 2073 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2074 } 2075 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2076 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2077 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2078 goto retry; 2079 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2080 } 2081 ncp = vp->v_cache_dd; 2082 if (ncp != NULL) { 2083 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2084 ("lost dotdot link")); 2085 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2086 goto retry; 2087 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2088 } 2089 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2090 mtx_unlock(vlp); 2091 if (vlp2 != NULL) 2092 mtx_unlock(vlp2); 2093 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2094 cache_free(ncp); 2095 } 2096 } 2097 2098 /* 2099 * Invalidate all negative entries for a particular directory vnode. 2100 */ 2101 void 2102 cache_purge_negative(struct vnode *vp) 2103 { 2104 TAILQ_HEAD(, namecache) ncps; 2105 struct namecache *ncp, *nnp; 2106 struct mtx *vlp; 2107 2108 CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); 2109 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2110 if (LIST_EMPTY(&vp->v_cache_src)) 2111 return; 2112 TAILQ_INIT(&ncps); 2113 vlp = VP2VNODELOCK(vp); 2114 mtx_lock(vlp); 2115 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2116 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2117 continue; 2118 cache_zap_negative_locked_vnode_kl(ncp, vp); 2119 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2120 } 2121 mtx_unlock(vlp); 2122 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2123 cache_free(ncp); 2124 } 2125 } 2126 2127 /* 2128 * Flush all entries referencing a particular filesystem. 2129 */ 2130 void 2131 cache_purgevfs(struct mount *mp, bool force) 2132 { 2133 TAILQ_HEAD(, namecache) ncps; 2134 struct mtx *vlp1, *vlp2; 2135 struct rwlock *blp; 2136 struct nchashhead *bucket; 2137 struct namecache *ncp, *nnp; 2138 u_long i, j, n_nchash; 2139 int error; 2140 2141 /* Scan hash tables for applicable entries */ 2142 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2143 if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) 2144 return; 2145 TAILQ_INIT(&ncps); 2146 n_nchash = nchash + 1; 2147 vlp1 = vlp2 = NULL; 2148 for (i = 0; i < numbucketlocks; i++) { 2149 blp = (struct rwlock *)&bucketlocks[i]; 2150 rw_wlock(blp); 2151 for (j = i; j < n_nchash; j += numbucketlocks) { 2152 retry: 2153 bucket = &nchashtbl[j]; 2154 LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { 2155 cache_assert_bucket_locked(ncp, RA_WLOCKED); 2156 if (ncp->nc_dvp->v_mount != mp) 2157 continue; 2158 error = cache_zap_wlocked_bucket_kl(ncp, blp, 2159 &vlp1, &vlp2); 2160 if (error != 0) 2161 goto retry; 2162 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); 2163 } 2164 } 2165 rw_wunlock(blp); 2166 if (vlp1 == NULL && vlp2 == NULL) 2167 cache_maybe_yield(); 2168 } 2169 if (vlp1 != NULL) 2170 mtx_unlock(vlp1); 2171 if (vlp2 != NULL) 2172 mtx_unlock(vlp2); 2173 2174 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2175 cache_free(ncp); 2176 } 2177 } 2178 2179 /* 2180 * Perform canonical checks and cache lookup and pass on to filesystem 2181 * through the vop_cachedlookup only if needed. 2182 */ 2183 2184 int 2185 vfs_cache_lookup(struct vop_lookup_args *ap) 2186 { 2187 struct vnode *dvp; 2188 int error; 2189 struct vnode **vpp = ap->a_vpp; 2190 struct componentname *cnp = ap->a_cnp; 2191 struct ucred *cred = cnp->cn_cred; 2192 int flags = cnp->cn_flags; 2193 struct thread *td = cnp->cn_thread; 2194 2195 *vpp = NULL; 2196 dvp = ap->a_dvp; 2197 2198 if (dvp->v_type != VDIR) 2199 return (ENOTDIR); 2200 2201 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2202 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2203 return (EROFS); 2204 2205 error = VOP_ACCESS(dvp, VEXEC, cred, td); 2206 if (error) 2207 return (error); 2208 2209 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2210 if (error == 0) 2211 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2212 if (error == -1) 2213 return (0); 2214 return (error); 2215 } 2216 2217 /* 2218 * XXX All of these sysctls would probably be more productive dead. 2219 */ 2220 static int __read_mostly disablecwd; 2221 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, 2222 "Disable the getcwd syscall"); 2223 2224 /* Implementation of the getcwd syscall. */ 2225 int 2226 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2227 { 2228 2229 return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen, 2230 MAXPATHLEN)); 2231 } 2232 2233 int 2234 kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, size_t buflen, 2235 size_t path_max) 2236 { 2237 char *bp, *tmpbuf; 2238 struct filedesc *fdp; 2239 struct vnode *cdir, *rdir; 2240 int error; 2241 2242 if (__predict_false(disablecwd)) 2243 return (ENODEV); 2244 if (__predict_false(buflen < 2)) 2245 return (EINVAL); 2246 if (buflen > path_max) 2247 buflen = path_max; 2248 2249 tmpbuf = malloc(buflen, M_TEMP, M_WAITOK); 2250 fdp = td->td_proc->p_fd; 2251 FILEDESC_SLOCK(fdp); 2252 cdir = fdp->fd_cdir; 2253 vrefact(cdir); 2254 rdir = fdp->fd_rdir; 2255 vrefact(rdir); 2256 FILEDESC_SUNLOCK(fdp); 2257 error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen); 2258 vrele(rdir); 2259 vrele(cdir); 2260 2261 if (!error) { 2262 if (bufseg == UIO_SYSSPACE) 2263 bcopy(bp, buf, strlen(bp) + 1); 2264 else 2265 error = copyout(bp, buf, strlen(bp) + 1); 2266 #ifdef KTRACE 2267 if (KTRPOINT(curthread, KTR_NAMEI)) 2268 ktrnamei(bp); 2269 #endif 2270 } 2271 free(tmpbuf, M_TEMP); 2272 return (error); 2273 } 2274 2275 /* 2276 * Thus begins the fullpath magic. 2277 */ 2278 2279 static int __read_mostly disablefullpath; 2280 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0, 2281 "Disable the vn_fullpath function"); 2282 2283 /* 2284 * Retrieve the full filesystem path that correspond to a vnode from the name 2285 * cache (if available) 2286 */ 2287 int 2288 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) 2289 { 2290 char *buf; 2291 struct filedesc *fdp; 2292 struct vnode *rdir; 2293 int error; 2294 2295 if (__predict_false(disablefullpath)) 2296 return (ENODEV); 2297 if (__predict_false(vn == NULL)) 2298 return (EINVAL); 2299 2300 buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); 2301 fdp = td->td_proc->p_fd; 2302 FILEDESC_SLOCK(fdp); 2303 rdir = fdp->fd_rdir; 2304 vrefact(rdir); 2305 FILEDESC_SUNLOCK(fdp); 2306 error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN); 2307 vrele(rdir); 2308 2309 if (!error) 2310 *freebuf = buf; 2311 else 2312 free(buf, M_TEMP); 2313 return (error); 2314 } 2315 2316 /* 2317 * This function is similar to vn_fullpath, but it attempts to lookup the 2318 * pathname relative to the global root mount point. This is required for the 2319 * auditing sub-system, as audited pathnames must be absolute, relative to the 2320 * global root mount point. 2321 */ 2322 int 2323 vn_fullpath_global(struct thread *td, struct vnode *vn, 2324 char **retbuf, char **freebuf) 2325 { 2326 char *buf; 2327 int error; 2328 2329 if (__predict_false(disablefullpath)) 2330 return (ENODEV); 2331 if (__predict_false(vn == NULL)) 2332 return (EINVAL); 2333 buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); 2334 error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN); 2335 if (!error) 2336 *freebuf = buf; 2337 else 2338 free(buf, M_TEMP); 2339 return (error); 2340 } 2341 2342 int 2343 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen) 2344 { 2345 struct vnode *dvp; 2346 struct namecache *ncp; 2347 struct mtx *vlp; 2348 int error; 2349 2350 vlp = VP2VNODELOCK(*vp); 2351 mtx_lock(vlp); 2352 TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { 2353 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2354 break; 2355 } 2356 if (ncp != NULL) { 2357 if (*buflen < ncp->nc_nlen) { 2358 mtx_unlock(vlp); 2359 vrele(*vp); 2360 counter_u64_add(numfullpathfail4, 1); 2361 error = ENOMEM; 2362 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2363 vp, NULL); 2364 return (error); 2365 } 2366 *buflen -= ncp->nc_nlen; 2367 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2368 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2369 ncp->nc_name, vp); 2370 dvp = *vp; 2371 *vp = ncp->nc_dvp; 2372 vref(*vp); 2373 mtx_unlock(vlp); 2374 vrele(dvp); 2375 return (0); 2376 } 2377 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2378 2379 mtx_unlock(vlp); 2380 vn_lock(*vp, LK_SHARED | LK_RETRY); 2381 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2382 vput(*vp); 2383 if (error) { 2384 counter_u64_add(numfullpathfail2, 1); 2385 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2386 return (error); 2387 } 2388 2389 *vp = dvp; 2390 if (dvp->v_iflag & VI_DOOMED) { 2391 /* forced unmount */ 2392 vrele(dvp); 2393 error = ENOENT; 2394 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2395 return (error); 2396 } 2397 /* 2398 * *vp has its use count incremented still. 2399 */ 2400 2401 return (0); 2402 } 2403 2404 /* 2405 * The magic behind kern___getcwd() and vn_fullpath(). 2406 */ 2407 static int 2408 vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, 2409 char *buf, char **retbuf, u_int buflen) 2410 { 2411 int error, slash_prefixed; 2412 #ifdef KDTRACE_HOOKS 2413 struct vnode *startvp = vp; 2414 #endif 2415 struct vnode *vp1; 2416 2417 buflen--; 2418 buf[buflen] = '\0'; 2419 error = 0; 2420 slash_prefixed = 0; 2421 2422 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2423 counter_u64_add(numfullpathcalls, 1); 2424 vref(vp); 2425 if (vp->v_type != VDIR) { 2426 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); 2427 if (error) 2428 return (error); 2429 if (buflen == 0) { 2430 vrele(vp); 2431 return (ENOMEM); 2432 } 2433 buf[--buflen] = '/'; 2434 slash_prefixed = 1; 2435 } 2436 while (vp != rdir && vp != rootvnode) { 2437 /* 2438 * The vp vnode must be already fully constructed, 2439 * since it is either found in namecache or obtained 2440 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2441 * without obtaining the vnode lock. 2442 */ 2443 if ((vp->v_vflag & VV_ROOT) != 0) { 2444 vn_lock(vp, LK_RETRY | LK_SHARED); 2445 2446 /* 2447 * With the vnode locked, check for races with 2448 * unmount, forced or not. Note that we 2449 * already verified that vp is not equal to 2450 * the root vnode, which means that 2451 * mnt_vnodecovered can be NULL only for the 2452 * case of unmount. 2453 */ 2454 if ((vp->v_iflag & VI_DOOMED) != 0 || 2455 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2456 vp1->v_mountedhere != vp->v_mount) { 2457 vput(vp); 2458 error = ENOENT; 2459 SDT_PROBE3(vfs, namecache, fullpath, return, 2460 error, vp, NULL); 2461 break; 2462 } 2463 2464 vref(vp1); 2465 vput(vp); 2466 vp = vp1; 2467 continue; 2468 } 2469 if (vp->v_type != VDIR) { 2470 vrele(vp); 2471 counter_u64_add(numfullpathfail1, 1); 2472 error = ENOTDIR; 2473 SDT_PROBE3(vfs, namecache, fullpath, return, 2474 error, vp, NULL); 2475 break; 2476 } 2477 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); 2478 if (error) 2479 break; 2480 if (buflen == 0) { 2481 vrele(vp); 2482 error = ENOMEM; 2483 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2484 startvp, NULL); 2485 break; 2486 } 2487 buf[--buflen] = '/'; 2488 slash_prefixed = 1; 2489 } 2490 if (error) 2491 return (error); 2492 if (!slash_prefixed) { 2493 if (buflen == 0) { 2494 vrele(vp); 2495 counter_u64_add(numfullpathfail4, 1); 2496 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2497 startvp, NULL); 2498 return (ENOMEM); 2499 } 2500 buf[--buflen] = '/'; 2501 } 2502 counter_u64_add(numfullpathfound, 1); 2503 vrele(vp); 2504 2505 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen); 2506 *retbuf = buf + buflen; 2507 return (0); 2508 } 2509 2510 struct vnode * 2511 vn_dir_dd_ino(struct vnode *vp) 2512 { 2513 struct namecache *ncp; 2514 struct vnode *ddvp; 2515 struct mtx *vlp; 2516 enum vgetstate vs; 2517 2518 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 2519 vlp = VP2VNODELOCK(vp); 2520 mtx_lock(vlp); 2521 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 2522 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 2523 continue; 2524 ddvp = ncp->nc_dvp; 2525 vs = vget_prep(ddvp); 2526 mtx_unlock(vlp); 2527 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 2528 return (NULL); 2529 return (ddvp); 2530 } 2531 mtx_unlock(vlp); 2532 return (NULL); 2533 } 2534 2535 int 2536 vn_commname(struct vnode *vp, char *buf, u_int buflen) 2537 { 2538 struct namecache *ncp; 2539 struct mtx *vlp; 2540 int l; 2541 2542 vlp = VP2VNODELOCK(vp); 2543 mtx_lock(vlp); 2544 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 2545 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2546 break; 2547 if (ncp == NULL) { 2548 mtx_unlock(vlp); 2549 return (ENOENT); 2550 } 2551 l = min(ncp->nc_nlen, buflen - 1); 2552 memcpy(buf, ncp->nc_name, l); 2553 mtx_unlock(vlp); 2554 buf[l] = '\0'; 2555 return (0); 2556 } 2557 2558 /* 2559 * This function updates path string to vnode's full global path 2560 * and checks the size of the new path string against the pathlen argument. 2561 * 2562 * Requires a locked, referenced vnode. 2563 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 2564 * 2565 * If sysctl debug.disablefullpath is set, ENODEV is returned, 2566 * vnode is left locked and path remain untouched. 2567 * 2568 * If vp is a directory, the call to vn_fullpath_global() always succeeds 2569 * because it falls back to the ".." lookup if the namecache lookup fails. 2570 */ 2571 int 2572 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 2573 u_int pathlen) 2574 { 2575 struct nameidata nd; 2576 struct vnode *vp1; 2577 char *rpath, *fbuf; 2578 int error; 2579 2580 ASSERT_VOP_ELOCKED(vp, __func__); 2581 2582 /* Return ENODEV if sysctl debug.disablefullpath==1 */ 2583 if (__predict_false(disablefullpath)) 2584 return (ENODEV); 2585 2586 /* Construct global filesystem path from vp. */ 2587 VOP_UNLOCK(vp, 0); 2588 error = vn_fullpath_global(td, vp, &rpath, &fbuf); 2589 2590 if (error != 0) { 2591 vrele(vp); 2592 return (error); 2593 } 2594 2595 if (strlen(rpath) >= pathlen) { 2596 vrele(vp); 2597 error = ENAMETOOLONG; 2598 goto out; 2599 } 2600 2601 /* 2602 * Re-lookup the vnode by path to detect a possible rename. 2603 * As a side effect, the vnode is relocked. 2604 * If vnode was renamed, return ENOENT. 2605 */ 2606 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 2607 UIO_SYSSPACE, path, td); 2608 error = namei(&nd); 2609 if (error != 0) { 2610 vrele(vp); 2611 goto out; 2612 } 2613 NDFREE(&nd, NDF_ONLY_PNBUF); 2614 vp1 = nd.ni_vp; 2615 vrele(vp); 2616 if (vp1 == vp) 2617 strcpy(path, rpath); 2618 else { 2619 vput(vp1); 2620 error = ENOENT; 2621 } 2622 2623 out: 2624 free(fbuf, M_TEMP); 2625 return (error); 2626 } 2627 2628 #ifdef DDB 2629 static void 2630 db_print_vpath(struct vnode *vp) 2631 { 2632 2633 while (vp != NULL) { 2634 db_printf("%p: ", vp); 2635 if (vp == rootvnode) { 2636 db_printf("/"); 2637 vp = NULL; 2638 } else { 2639 if (vp->v_vflag & VV_ROOT) { 2640 db_printf("<mount point>"); 2641 vp = vp->v_mount->mnt_vnodecovered; 2642 } else { 2643 struct namecache *ncp; 2644 char *ncn; 2645 int i; 2646 2647 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2648 if (ncp != NULL) { 2649 ncn = ncp->nc_name; 2650 for (i = 0; i < ncp->nc_nlen; i++) 2651 db_printf("%c", *ncn++); 2652 vp = ncp->nc_dvp; 2653 } else { 2654 vp = NULL; 2655 } 2656 } 2657 } 2658 db_printf("\n"); 2659 } 2660 2661 return; 2662 } 2663 2664 DB_SHOW_COMMAND(vpath, db_show_vpath) 2665 { 2666 struct vnode *vp; 2667 2668 if (!have_addr) { 2669 db_printf("usage: show vpath <struct vnode *>\n"); 2670 return; 2671 } 2672 2673 vp = (struct vnode *)addr; 2674 db_print_vpath(vp); 2675 } 2676 2677 #endif 2678