1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ddb.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/counter.h> 46 #include <sys/filedesc.h> 47 #include <sys/fnv_hash.h> 48 #include <sys/kernel.h> 49 #include <sys/ktr.h> 50 #include <sys/lock.h> 51 #include <sys/malloc.h> 52 #include <sys/fcntl.h> 53 #include <sys/mount.h> 54 #include <sys/namei.h> 55 #include <sys/proc.h> 56 #include <sys/rwlock.h> 57 #include <sys/sdt.h> 58 #include <sys/smp.h> 59 #include <sys/syscallsubr.h> 60 #include <sys/sysctl.h> 61 #include <sys/sysproto.h> 62 #include <sys/vnode.h> 63 #ifdef KTRACE 64 #include <sys/ktrace.h> 65 #endif 66 67 #ifdef DDB 68 #include <ddb/ddb.h> 69 #endif 70 71 #include <vm/uma.h> 72 73 SDT_PROVIDER_DECLARE(vfs); 74 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 75 "struct vnode *"); 76 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 77 "char *"); 78 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 79 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 80 "char *", "struct vnode *"); 81 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 82 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 83 "struct vnode *", "char *"); 84 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 85 "struct vnode *"); 86 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 87 "struct vnode *", "char *"); 88 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 89 "char *"); 90 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 91 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 92 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 93 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 94 "struct vnode *"); 95 SDT_PROBE_DEFINE3(vfs, namecache, zap_negative, done, "struct vnode *", 96 "char *", "int"); 97 SDT_PROBE_DEFINE3(vfs, namecache, shrink_negative, done, "struct vnode *", 98 "char *", "int"); 99 100 /* 101 * This structure describes the elements in the cache of recent 102 * names looked up by namei. 103 */ 104 105 struct namecache { 106 LIST_ENTRY(namecache) nc_hash; /* hash chain */ 107 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 108 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 109 struct vnode *nc_dvp; /* vnode of parent of name */ 110 union { 111 struct vnode *nu_vp; /* vnode the name refers to */ 112 u_int nu_neghits; /* negative entry hits */ 113 } n_un; 114 u_char nc_flag; /* flag bits */ 115 u_char nc_nlen; /* length of name */ 116 char nc_name[0]; /* segment name + nul */ 117 }; 118 119 /* 120 * struct namecache_ts repeats struct namecache layout up to the 121 * nc_nlen member. 122 * struct namecache_ts is used in place of struct namecache when time(s) need 123 * to be stored. The nc_dotdottime field is used when a cache entry is mapping 124 * both a non-dotdot directory name plus dotdot for the directory's 125 * parent. 
126 */ 127 struct namecache_ts { 128 struct timespec nc_time; /* timespec provided by fs */ 129 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 130 int nc_ticks; /* ticks value when entry was added */ 131 struct namecache nc_nc; 132 }; 133 134 #define nc_vp n_un.nu_vp 135 #define nc_neghits n_un.nu_neghits 136 137 /* 138 * Flags in namecache.nc_flag 139 */ 140 #define NCF_WHITE 0x01 141 #define NCF_ISDOTDOT 0x02 142 #define NCF_TS 0x04 143 #define NCF_DTS 0x08 144 #define NCF_DVDROP 0x10 145 #define NCF_NEGATIVE 0x20 146 #define NCF_HOTNEGATIVE 0x40 147 148 /* 149 * Name caching works as follows: 150 * 151 * Names found by directory scans are retained in a cache 152 * for future reference. It is managed LRU, so frequently 153 * used names will hang around. Cache is indexed by hash value 154 * obtained from (dvp, name) where dvp refers to the directory 155 * containing name. 156 * 157 * If it is a "negative" entry, (i.e. for a name that is known NOT to 158 * exist) the vnode pointer will be NULL. 159 * 160 * Upon reaching the last segment of a path, if the reference 161 * is for DELETE, or NOCACHE is set (rewrite), and the 162 * name is located in the cache, it will be dropped. 163 * 164 * These locks are used (in the order in which they can be taken): 165 * NAME TYPE ROLE 166 * vnodelock mtx vnode lists and v_cache_dd field protection 167 * bucketlock rwlock for access to given set of hash buckets 168 * neglist mtx negative entry LRU management 169 * 170 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread 171 * shrinking the LRU list. 172 * 173 * It is legal to take multiple vnodelock and bucketlock locks. The locking 174 * order is lower address first. Both are recursive. 175 * 176 * "." lookups are lockless. 177 * 178 * ".." and vnode -> name lookups require vnodelock. 179 * 180 * name -> vnode lookup requires the relevant bucketlock to be held for reading. 181 * 182 * Insertions and removals of entries require involved vnodes and bucketlocks 183 * to be write-locked to prevent other threads from seeing the entry. 184 * 185 * Some lookups result in removal of the found entry (e.g. getting rid of a 186 * negative entry with the intent to create a positive one), which poses a 187 * problem when multiple threads reach the state. Similarly, two different 188 * threads can purge two different vnodes and try to remove the same name. 189 * 190 * If the already held vnode lock is lower than the second required lock, we 191 * can just take the other lock. However, in the opposite case, this could 192 * deadlock. As such, this is resolved by trylocking and if that fails unlocking 193 * the first node, locking everything in order and revalidating the state. 194 */ 195 196 /* 197 * Structures associated with name caching. 
198 */ 199 #define NCHHASH(hash) \ 200 (&nchashtbl[(hash) & nchash]) 201 static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 202 static u_long __read_mostly nchash; /* size of hash table */ 203 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 204 "Size of namecache hash table"); 205 static u_long __read_mostly ncnegfactor = 12; /* ratio of negative entries */ 206 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 207 "Ratio of negative namecache entries"); 208 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 209 SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, 210 "Number of negative entries in namecache"); 211 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 212 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, 213 "Number of namecache entries"); 214 static u_long __exclusive_cache_line numcachehv;/* number of cache entries with vnodes held */ 215 SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, 216 "Number of namecache entries with vnodes held"); 217 u_int __read_mostly ncsizefactor = 2; 218 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, 219 "Size factor for namecache"); 220 static u_int __read_mostly ncpurgeminvnodes; 221 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0, 222 "Number of vnodes below which purgevfs ignores the request"); 223 static u_int __read_mostly ncneghitsrequeue = 8; 224 SYSCTL_UINT(_vfs, OID_AUTO, ncneghitsrequeue, CTLFLAG_RW, &ncneghitsrequeue, 0, 225 "Number of hits to requeue a negative entry in the LRU list"); 226 227 struct nchstats nchstats; /* cache effectiveness statistics */ 228 229 static struct mtx __exclusive_cache_line ncneg_shrink_lock; 230 static int shrink_list_turn; 231 232 struct neglist { 233 struct mtx nl_lock; 234 TAILQ_HEAD(, namecache) nl_list; 235 } __aligned(CACHE_LINE_SIZE); 236 237 static struct neglist __read_mostly *neglists; 238 static struct neglist ncneg_hot; 239 static u_long numhotneg; 240 241 #define numneglists (ncneghash + 1) 242 static u_int __read_mostly ncneghash; 243 static inline struct neglist * 244 NCP2NEGLIST(struct namecache *ncp) 245 { 246 247 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 248 } 249 250 #define numbucketlocks (ncbuckethash + 1) 251 static u_int __read_mostly ncbuckethash; 252 static struct rwlock_padalign __read_mostly *bucketlocks; 253 #define HASH2BUCKETLOCK(hash) \ 254 ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)])) 255 256 #define numvnodelocks (ncvnodehash + 1) 257 static u_int __read_mostly ncvnodehash; 258 static struct mtx __read_mostly *vnodelocks; 259 static inline struct mtx * 260 VP2VNODELOCK(struct vnode *vp) 261 { 262 263 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 264 } 265 266 /* 267 * UMA zones for the VFS cache. 268 * 269 * The small cache is used for entries with short names, which are the 270 * most common. The large cache is used for entries which are too big to 271 * fit in the small cache. 
272 */ 273 static uma_zone_t __read_mostly cache_zone_small; 274 static uma_zone_t __read_mostly cache_zone_small_ts; 275 static uma_zone_t __read_mostly cache_zone_large; 276 static uma_zone_t __read_mostly cache_zone_large_ts; 277 278 #define CACHE_PATH_CUTOFF 35 279 280 static struct namecache * 281 cache_alloc(int len, int ts) 282 { 283 struct namecache_ts *ncp_ts; 284 struct namecache *ncp; 285 286 if (__predict_false(ts)) { 287 if (len <= CACHE_PATH_CUTOFF) 288 ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK); 289 else 290 ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK); 291 ncp = &ncp_ts->nc_nc; 292 } else { 293 if (len <= CACHE_PATH_CUTOFF) 294 ncp = uma_zalloc(cache_zone_small, M_WAITOK); 295 else 296 ncp = uma_zalloc(cache_zone_large, M_WAITOK); 297 } 298 return (ncp); 299 } 300 301 static void 302 cache_free(struct namecache *ncp) 303 { 304 struct namecache_ts *ncp_ts; 305 306 if (ncp == NULL) 307 return; 308 if ((ncp->nc_flag & NCF_DVDROP) != 0) 309 vdrop(ncp->nc_dvp); 310 if (__predict_false(ncp->nc_flag & NCF_TS)) { 311 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 312 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 313 uma_zfree(cache_zone_small_ts, ncp_ts); 314 else 315 uma_zfree(cache_zone_large_ts, ncp_ts); 316 } else { 317 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 318 uma_zfree(cache_zone_small, ncp); 319 else 320 uma_zfree(cache_zone_large, ncp); 321 } 322 } 323 324 static void 325 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 326 { 327 struct namecache_ts *ncp_ts; 328 329 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 330 (tsp == NULL && ticksp == NULL), 331 ("No NCF_TS")); 332 333 if (tsp == NULL && ticksp == NULL) 334 return; 335 336 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 337 if (tsp != NULL) 338 *tsp = ncp_ts->nc_time; 339 if (ticksp != NULL) 340 *ticksp = ncp_ts->nc_ticks; 341 } 342 343 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 344 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 345 "VFS namecache enabled"); 346 347 /* Export size information to userland */ 348 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 349 sizeof(struct namecache), "sizeof(struct namecache)"); 350 351 /* 352 * The new name cache statistics 353 */ 354 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, 355 "Name cache statistics"); 356 #define STATNODE_ULONG(name, descr) \ 357 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); 358 #define STATNODE_COUNTER(name, descr) \ 359 static counter_u64_t __read_mostly name; \ 360 SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr); 361 STATNODE_ULONG(numneg, "Number of negative cache entries"); 362 STATNODE_ULONG(numcache, "Number of cache entries"); 363 STATNODE_COUNTER(numcalls, "Number of cache lookups"); 364 STATNODE_COUNTER(dothits, "Number of '.' hits"); 365 STATNODE_COUNTER(dotdothits, "Number of '..' hits"); 366 STATNODE_COUNTER(numchecks, "Number of checks in lookup"); 367 STATNODE_COUNTER(nummiss, "Number of cache misses"); 368 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 369 STATNODE_COUNTER(numposzaps, 370 "Number of cache hits (positive) we do not want to cache"); 371 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 372 STATNODE_COUNTER(numnegzaps, 373 "Number of cache hits (negative) we do not want to cache"); 374 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 375 /* These count for kern___getcwd(), too. 
*/ 376 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 377 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 378 STATNODE_COUNTER(numfullpathfail2, 379 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 380 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 381 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 382 static long numneg_evicted; STATNODE_ULONG(numneg_evicted, 383 "Number of negative entries evicted when adding a new entry"); 384 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 385 "Number of successful removals after relocking"); 386 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 387 "Number of times zap_and_exit failed to lock"); 388 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 389 "Number of times zap_and_exit failed to lock"); 390 static long cache_lock_vnodes_cel_3_failures; 391 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 392 "Number of times 3-way vnode locking failed"); 393 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 394 395 static void cache_zap_locked(struct namecache *ncp, bool neg_locked); 396 static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, 397 char *buf, char **retbuf, u_int buflen); 398 399 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 400 401 static int cache_yield; 402 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, 403 "Number of times cache called yield"); 404 405 static void __noinline 406 cache_maybe_yield(void) 407 { 408 409 if (should_yield()) { 410 cache_yield++; 411 kern_yield(PRI_USER); 412 } 413 } 414 415 static inline void 416 cache_assert_vlp_locked(struct mtx *vlp) 417 { 418 419 if (vlp != NULL) 420 mtx_assert(vlp, MA_OWNED); 421 } 422 423 static inline void 424 cache_assert_vnode_locked(struct vnode *vp) 425 { 426 struct mtx *vlp; 427 428 vlp = VP2VNODELOCK(vp); 429 cache_assert_vlp_locked(vlp); 430 } 431 432 static uint32_t 433 cache_get_hash(char *name, u_char len, struct vnode *dvp) 434 { 435 uint32_t hash; 436 437 hash = fnv_32_buf(name, len, FNV1_32_INIT); 438 hash = fnv_32_buf(&dvp, sizeof(dvp), hash); 439 return (hash); 440 } 441 442 static inline struct rwlock * 443 NCP2BUCKETLOCK(struct namecache *ncp) 444 { 445 uint32_t hash; 446 447 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 448 return (HASH2BUCKETLOCK(hash)); 449 } 450 451 #ifdef INVARIANTS 452 static void 453 cache_assert_bucket_locked(struct namecache *ncp, int mode) 454 { 455 struct rwlock *blp; 456 457 blp = NCP2BUCKETLOCK(ncp); 458 rw_assert(blp, mode); 459 } 460 #else 461 #define cache_assert_bucket_locked(x, y) do { } while (0) 462 #endif 463 464 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 465 static void 466 _cache_sort_vnodes(void **p1, void **p2) 467 { 468 void *tmp; 469 470 MPASS(*p1 != NULL || *p2 != NULL); 471 472 if (*p1 > *p2) { 473 tmp = *p2; 474 *p2 = *p1; 475 *p1 = tmp; 476 } 477 } 478 479 static void 480 cache_lock_all_buckets(void) 481 { 482 u_int i; 483 484 for (i = 0; i < numbucketlocks; i++) 485 rw_wlock(&bucketlocks[i]); 486 } 487 488 static void 489 cache_unlock_all_buckets(void) 490 { 491 u_int i; 492 493 for (i = 0; i < numbucketlocks; i++) 494 rw_wunlock(&bucketlocks[i]); 495 } 496 497 static void 498 cache_lock_all_vnodes(void) 499 { 500 u_int i; 501 502 for (i = 0; i < numvnodelocks; i++) 503 
mtx_lock(&vnodelocks[i]); 504 } 505 506 static void 507 cache_unlock_all_vnodes(void) 508 { 509 u_int i; 510 511 for (i = 0; i < numvnodelocks; i++) 512 mtx_unlock(&vnodelocks[i]); 513 } 514 515 static int 516 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 517 { 518 519 cache_sort_vnodes(&vlp1, &vlp2); 520 521 if (vlp1 != NULL) { 522 if (!mtx_trylock(vlp1)) 523 return (EAGAIN); 524 } 525 if (!mtx_trylock(vlp2)) { 526 if (vlp1 != NULL) 527 mtx_unlock(vlp1); 528 return (EAGAIN); 529 } 530 531 return (0); 532 } 533 534 static void 535 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 536 { 537 538 MPASS(vlp1 != NULL || vlp2 != NULL); 539 MPASS(vlp1 <= vlp2); 540 541 if (vlp1 != NULL) 542 mtx_lock(vlp1); 543 if (vlp2 != NULL) 544 mtx_lock(vlp2); 545 } 546 547 static void 548 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 549 { 550 551 MPASS(vlp1 != NULL || vlp2 != NULL); 552 553 if (vlp1 != NULL) 554 mtx_unlock(vlp1); 555 if (vlp2 != NULL) 556 mtx_unlock(vlp2); 557 } 558 559 static int 560 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 561 { 562 struct nchstats snap; 563 564 if (req->oldptr == NULL) 565 return (SYSCTL_OUT(req, 0, sizeof(snap))); 566 567 snap = nchstats; 568 snap.ncs_goodhits = counter_u64_fetch(numposhits); 569 snap.ncs_neghits = counter_u64_fetch(numneghits); 570 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 571 counter_u64_fetch(numnegzaps); 572 snap.ncs_miss = counter_u64_fetch(nummisszap) + 573 counter_u64_fetch(nummiss); 574 575 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 576 } 577 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 578 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 579 "VFS cache effectiveness statistics"); 580 581 #ifdef DIAGNOSTIC 582 /* 583 * Grab an atomic snapshot of the name cache hash chain lengths 584 */ 585 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, 586 "hash table stats"); 587 588 static int 589 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 590 { 591 struct nchashhead *ncpp; 592 struct namecache *ncp; 593 int i, error, n_nchash, *cntbuf; 594 595 retry: 596 n_nchash = nchash + 1; /* nchash is max index, not count */ 597 if (req->oldptr == NULL) 598 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 599 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 600 cache_lock_all_buckets(); 601 if (n_nchash != nchash + 1) { 602 cache_unlock_all_buckets(); 603 free(cntbuf, M_TEMP); 604 goto retry; 605 } 606 /* Scan hash tables counting entries */ 607 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 608 LIST_FOREACH(ncp, ncpp, nc_hash) 609 cntbuf[i]++; 610 cache_unlock_all_buckets(); 611 for (error = 0, i = 0; i < n_nchash; i++) 612 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 613 break; 614 free(cntbuf, M_TEMP); 615 return (error); 616 } 617 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 618 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 619 "nchash chain lengths"); 620 621 static int 622 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 623 { 624 int error; 625 struct nchashhead *ncpp; 626 struct namecache *ncp; 627 int n_nchash; 628 int count, maxlength, used, pct; 629 630 if (!req->oldptr) 631 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 632 633 cache_lock_all_buckets(); 634 n_nchash = nchash + 1; /* nchash is max index, not count */ 635 used = 0; 636 maxlength = 0; 637 638 /* Scan hash tables for applicable entries */ 639 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 640 count = 0; 641 
LIST_FOREACH(ncp, ncpp, nc_hash) { 642 count++; 643 } 644 if (count) 645 used++; 646 if (maxlength < count) 647 maxlength = count; 648 } 649 n_nchash = nchash + 1; 650 cache_unlock_all_buckets(); 651 pct = (used * 100) / (n_nchash / 100); 652 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 653 if (error) 654 return (error); 655 error = SYSCTL_OUT(req, &used, sizeof(used)); 656 if (error) 657 return (error); 658 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 659 if (error) 660 return (error); 661 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 662 if (error) 663 return (error); 664 return (0); 665 } 666 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 667 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 668 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 669 #endif 670 671 /* 672 * Negative entries management 673 * 674 * A variation of LRU scheme is used. New entries are hashed into one of 675 * numneglists cold lists. Entries get promoted to the hot list on first hit. 676 * Partial LRU for the hot list is maintained by requeueing them every 677 * ncneghitsrequeue hits. 678 * 679 * The shrinker will demote hot list head and evict from the cold list in a 680 * round-robin manner. 681 */ 682 static void 683 cache_negative_hit(struct namecache *ncp) 684 { 685 struct neglist *neglist; 686 u_int hits; 687 688 MPASS(ncp->nc_flag & NCF_NEGATIVE); 689 hits = atomic_fetchadd_int(&ncp->nc_neghits, 1); 690 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 691 if ((hits % ncneghitsrequeue) != 0) 692 return; 693 mtx_lock(&ncneg_hot.nl_lock); 694 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 695 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 696 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 697 mtx_unlock(&ncneg_hot.nl_lock); 698 return; 699 } 700 /* 701 * The shrinker cleared the flag and removed the entry from 702 * the hot list. Put it back. 
703 */ 704 } else { 705 mtx_lock(&ncneg_hot.nl_lock); 706 } 707 neglist = NCP2NEGLIST(ncp); 708 mtx_lock(&neglist->nl_lock); 709 if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { 710 numhotneg++; 711 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 712 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 713 ncp->nc_flag |= NCF_HOTNEGATIVE; 714 } 715 mtx_unlock(&neglist->nl_lock); 716 mtx_unlock(&ncneg_hot.nl_lock); 717 } 718 719 static void 720 cache_negative_insert(struct namecache *ncp, bool neg_locked) 721 { 722 struct neglist *neglist; 723 724 MPASS(ncp->nc_flag & NCF_NEGATIVE); 725 cache_assert_bucket_locked(ncp, RA_WLOCKED); 726 neglist = NCP2NEGLIST(ncp); 727 if (!neg_locked) { 728 mtx_lock(&neglist->nl_lock); 729 } else { 730 mtx_assert(&neglist->nl_lock, MA_OWNED); 731 } 732 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 733 if (!neg_locked) 734 mtx_unlock(&neglist->nl_lock); 735 atomic_add_rel_long(&numneg, 1); 736 } 737 738 static void 739 cache_negative_remove(struct namecache *ncp, bool neg_locked) 740 { 741 struct neglist *neglist; 742 bool hot_locked = false; 743 bool list_locked = false; 744 745 MPASS(ncp->nc_flag & NCF_NEGATIVE); 746 cache_assert_bucket_locked(ncp, RA_WLOCKED); 747 neglist = NCP2NEGLIST(ncp); 748 if (!neg_locked) { 749 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 750 hot_locked = true; 751 mtx_lock(&ncneg_hot.nl_lock); 752 if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { 753 list_locked = true; 754 mtx_lock(&neglist->nl_lock); 755 } 756 } else { 757 list_locked = true; 758 mtx_lock(&neglist->nl_lock); 759 } 760 } 761 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 762 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); 763 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 764 numhotneg--; 765 } else { 766 mtx_assert(&neglist->nl_lock, MA_OWNED); 767 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 768 } 769 if (list_locked) 770 mtx_unlock(&neglist->nl_lock); 771 if (hot_locked) 772 mtx_unlock(&ncneg_hot.nl_lock); 773 atomic_subtract_rel_long(&numneg, 1); 774 } 775 776 static void 777 cache_negative_shrink_select(int start, struct namecache **ncpp, 778 struct neglist **neglistpp) 779 { 780 struct neglist *neglist; 781 struct namecache *ncp; 782 int i; 783 784 *ncpp = ncp = NULL; 785 neglist = NULL; 786 787 for (i = start; i < numneglists; i++) { 788 neglist = &neglists[i]; 789 if (TAILQ_FIRST(&neglist->nl_list) == NULL) 790 continue; 791 mtx_lock(&neglist->nl_lock); 792 ncp = TAILQ_FIRST(&neglist->nl_list); 793 if (ncp != NULL) 794 break; 795 mtx_unlock(&neglist->nl_lock); 796 } 797 798 *neglistpp = neglist; 799 *ncpp = ncp; 800 } 801 802 static void 803 cache_negative_zap_one(void) 804 { 805 struct namecache *ncp, *ncp2; 806 struct neglist *neglist; 807 struct mtx *dvlp; 808 struct rwlock *blp; 809 810 if (mtx_owner(&ncneg_shrink_lock) != NULL || 811 !mtx_trylock(&ncneg_shrink_lock)) 812 return; 813 814 mtx_lock(&ncneg_hot.nl_lock); 815 ncp = TAILQ_FIRST(&ncneg_hot.nl_list); 816 if (ncp != NULL) { 817 neglist = NCP2NEGLIST(ncp); 818 mtx_lock(&neglist->nl_lock); 819 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 820 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 821 ncp->nc_flag &= ~NCF_HOTNEGATIVE; 822 numhotneg--; 823 mtx_unlock(&neglist->nl_lock); 824 } 825 mtx_unlock(&ncneg_hot.nl_lock); 826 827 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); 828 shrink_list_turn++; 829 if (shrink_list_turn == numneglists) 830 shrink_list_turn = 0; 831 if (ncp == NULL && shrink_list_turn == 0) 832 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); 833 if (ncp == NULL) 834 goto out; 835 
836 MPASS(ncp->nc_flag & NCF_NEGATIVE); 837 dvlp = VP2VNODELOCK(ncp->nc_dvp); 838 blp = NCP2BUCKETLOCK(ncp); 839 mtx_unlock(&neglist->nl_lock); 840 mtx_lock(dvlp); 841 rw_wlock(blp); 842 mtx_lock(&neglist->nl_lock); 843 ncp2 = TAILQ_FIRST(&neglist->nl_list); 844 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || 845 blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) { 846 ncp = NULL; 847 goto out_unlock_all; 848 } 849 SDT_PROBE3(vfs, namecache, shrink_negative, done, ncp->nc_dvp, 850 ncp->nc_name, ncp->nc_neghits); 851 852 cache_zap_locked(ncp, true); 853 numneg_evicted++; 854 out_unlock_all: 855 mtx_unlock(&neglist->nl_lock); 856 rw_wunlock(blp); 857 mtx_unlock(dvlp); 858 out: 859 mtx_unlock(&ncneg_shrink_lock); 860 cache_free(ncp); 861 } 862 863 /* 864 * cache_zap_locked(): 865 * 866 * Removes a namecache entry from cache, whether it contains an actual 867 * pointer to a vnode or if it is just a negative cache entry. 868 */ 869 static void 870 cache_zap_locked(struct namecache *ncp, bool neg_locked) 871 { 872 873 if (!(ncp->nc_flag & NCF_NEGATIVE)) 874 cache_assert_vnode_locked(ncp->nc_vp); 875 cache_assert_vnode_locked(ncp->nc_dvp); 876 cache_assert_bucket_locked(ncp, RA_WLOCKED); 877 878 CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, 879 (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp); 880 LIST_REMOVE(ncp, nc_hash); 881 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 882 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 883 ncp->nc_name, ncp->nc_vp); 884 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 885 if (ncp == ncp->nc_vp->v_cache_dd) 886 ncp->nc_vp->v_cache_dd = NULL; 887 } else { 888 SDT_PROBE3(vfs, namecache, zap_negative, done, ncp->nc_dvp, 889 ncp->nc_name, ncp->nc_neghits); 890 cache_negative_remove(ncp, neg_locked); 891 } 892 if (ncp->nc_flag & NCF_ISDOTDOT) { 893 if (ncp == ncp->nc_dvp->v_cache_dd) 894 ncp->nc_dvp->v_cache_dd = NULL; 895 } else { 896 LIST_REMOVE(ncp, nc_src); 897 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 898 ncp->nc_flag |= NCF_DVDROP; 899 atomic_subtract_rel_long(&numcachehv, 1); 900 } 901 } 902 atomic_subtract_rel_long(&numcache, 1); 903 } 904 905 static void 906 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 907 { 908 struct rwlock *blp; 909 910 MPASS(ncp->nc_dvp == vp); 911 MPASS(ncp->nc_flag & NCF_NEGATIVE); 912 cache_assert_vnode_locked(vp); 913 914 blp = NCP2BUCKETLOCK(ncp); 915 rw_wlock(blp); 916 cache_zap_locked(ncp, false); 917 rw_wunlock(blp); 918 } 919 920 static bool 921 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 922 struct mtx **vlpp) 923 { 924 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 925 struct rwlock *blp; 926 927 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 928 cache_assert_vnode_locked(vp); 929 930 if (ncp->nc_flag & NCF_NEGATIVE) { 931 if (*vlpp != NULL) { 932 mtx_unlock(*vlpp); 933 *vlpp = NULL; 934 } 935 cache_zap_negative_locked_vnode_kl(ncp, vp); 936 return (true); 937 } 938 939 pvlp = VP2VNODELOCK(vp); 940 blp = NCP2BUCKETLOCK(ncp); 941 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 942 vlp2 = VP2VNODELOCK(ncp->nc_vp); 943 944 if (*vlpp == vlp1 || *vlpp == vlp2) { 945 to_unlock = *vlpp; 946 *vlpp = NULL; 947 } else { 948 if (*vlpp != NULL) { 949 mtx_unlock(*vlpp); 950 *vlpp = NULL; 951 } 952 cache_sort_vnodes(&vlp1, &vlp2); 953 if (vlp1 == pvlp) { 954 mtx_lock(vlp2); 955 to_unlock = vlp2; 956 } else { 957 if (!mtx_trylock(vlp1)) 958 goto out_relock; 959 to_unlock = vlp1; 960 } 961 } 962 rw_wlock(blp); 963 cache_zap_locked(ncp, false); 964 rw_wunlock(blp); 965 if (to_unlock 
!= NULL) 966 mtx_unlock(to_unlock); 967 return (true); 968 969 out_relock: 970 mtx_unlock(vlp2); 971 mtx_lock(vlp1); 972 mtx_lock(vlp2); 973 MPASS(*vlpp == NULL); 974 *vlpp = vlp1; 975 return (false); 976 } 977 978 static int __noinline 979 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) 980 { 981 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 982 struct rwlock *blp; 983 int error = 0; 984 985 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 986 cache_assert_vnode_locked(vp); 987 988 pvlp = VP2VNODELOCK(vp); 989 if (ncp->nc_flag & NCF_NEGATIVE) { 990 cache_zap_negative_locked_vnode_kl(ncp, vp); 991 goto out; 992 } 993 994 blp = NCP2BUCKETLOCK(ncp); 995 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 996 vlp2 = VP2VNODELOCK(ncp->nc_vp); 997 cache_sort_vnodes(&vlp1, &vlp2); 998 if (vlp1 == pvlp) { 999 mtx_lock(vlp2); 1000 to_unlock = vlp2; 1001 } else { 1002 if (!mtx_trylock(vlp1)) { 1003 error = EAGAIN; 1004 goto out; 1005 } 1006 to_unlock = vlp1; 1007 } 1008 rw_wlock(blp); 1009 cache_zap_locked(ncp, false); 1010 rw_wunlock(blp); 1011 mtx_unlock(to_unlock); 1012 out: 1013 mtx_unlock(pvlp); 1014 return (error); 1015 } 1016 1017 /* 1018 * If trylocking failed we can get here. We know enough to take all needed locks 1019 * in the right order and re-lookup the entry. 1020 */ 1021 static int 1022 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1023 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1024 struct rwlock *blp) 1025 { 1026 struct namecache *rncp; 1027 1028 cache_assert_bucket_locked(ncp, RA_UNLOCKED); 1029 1030 cache_sort_vnodes(&dvlp, &vlp); 1031 cache_lock_vnodes(dvlp, vlp); 1032 rw_wlock(blp); 1033 LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1034 if (rncp == ncp && rncp->nc_dvp == dvp && 1035 rncp->nc_nlen == cnp->cn_namelen && 1036 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1037 break; 1038 } 1039 if (rncp != NULL) { 1040 cache_zap_locked(rncp, false); 1041 rw_wunlock(blp); 1042 cache_unlock_vnodes(dvlp, vlp); 1043 counter_u64_add(zap_and_exit_bucket_relock_success, 1); 1044 return (0); 1045 } 1046 1047 rw_wunlock(blp); 1048 cache_unlock_vnodes(dvlp, vlp); 1049 return (EAGAIN); 1050 } 1051 1052 static int __noinline 1053 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1054 uint32_t hash, struct rwlock *blp) 1055 { 1056 struct mtx *dvlp, *vlp; 1057 struct vnode *dvp; 1058 1059 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1060 1061 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1062 vlp = NULL; 1063 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1064 vlp = VP2VNODELOCK(ncp->nc_vp); 1065 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1066 cache_zap_locked(ncp, false); 1067 rw_wunlock(blp); 1068 cache_unlock_vnodes(dvlp, vlp); 1069 return (0); 1070 } 1071 1072 dvp = ncp->nc_dvp; 1073 rw_wunlock(blp); 1074 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1075 } 1076 1077 static int __noinline 1078 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1079 uint32_t hash, struct rwlock *blp) 1080 { 1081 struct mtx *dvlp, *vlp; 1082 struct vnode *dvp; 1083 1084 cache_assert_bucket_locked(ncp, RA_RLOCKED); 1085 1086 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1087 vlp = NULL; 1088 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1089 vlp = VP2VNODELOCK(ncp->nc_vp); 1090 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1091 rw_runlock(blp); 1092 rw_wlock(blp); 1093 cache_zap_locked(ncp, false); 1094 rw_wunlock(blp); 1095 cache_unlock_vnodes(dvlp, vlp); 1096 return (0); 1097 } 1098 1099 dvp = ncp->nc_dvp; 1100 
rw_runlock(blp); 1101 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1102 } 1103 1104 static int 1105 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, 1106 struct mtx **vlpp1, struct mtx **vlpp2) 1107 { 1108 struct mtx *dvlp, *vlp; 1109 1110 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1111 1112 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1113 vlp = NULL; 1114 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1115 vlp = VP2VNODELOCK(ncp->nc_vp); 1116 cache_sort_vnodes(&dvlp, &vlp); 1117 1118 if (*vlpp1 == dvlp && *vlpp2 == vlp) { 1119 cache_zap_locked(ncp, false); 1120 cache_unlock_vnodes(dvlp, vlp); 1121 *vlpp1 = NULL; 1122 *vlpp2 = NULL; 1123 return (0); 1124 } 1125 1126 if (*vlpp1 != NULL) 1127 mtx_unlock(*vlpp1); 1128 if (*vlpp2 != NULL) 1129 mtx_unlock(*vlpp2); 1130 *vlpp1 = NULL; 1131 *vlpp2 = NULL; 1132 1133 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1134 cache_zap_locked(ncp, false); 1135 cache_unlock_vnodes(dvlp, vlp); 1136 return (0); 1137 } 1138 1139 rw_wunlock(blp); 1140 *vlpp1 = dvlp; 1141 *vlpp2 = vlp; 1142 if (*vlpp1 != NULL) 1143 mtx_lock(*vlpp1); 1144 mtx_lock(*vlpp2); 1145 rw_wlock(blp); 1146 return (EAGAIN); 1147 } 1148 1149 static void 1150 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) 1151 { 1152 1153 if (blp != NULL) { 1154 rw_runlock(blp); 1155 } else { 1156 mtx_unlock(vlp); 1157 } 1158 } 1159 1160 static int __noinline 1161 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1162 struct timespec *tsp, int *ticksp) 1163 { 1164 int ltype; 1165 1166 *vpp = dvp; 1167 CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", 1168 dvp, cnp->cn_nameptr); 1169 counter_u64_add(dothits, 1); 1170 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1171 if (tsp != NULL) 1172 timespecclear(tsp); 1173 if (ticksp != NULL) 1174 *ticksp = ticks; 1175 vrefact(*vpp); 1176 /* 1177 * When we lookup "." we still can be asked to lock it 1178 * differently... 1179 */ 1180 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1181 if (ltype != VOP_ISLOCKED(*vpp)) { 1182 if (ltype == LK_EXCLUSIVE) { 1183 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1184 if ((*vpp)->v_iflag & VI_DOOMED) { 1185 /* forced unmount */ 1186 vrele(*vpp); 1187 *vpp = NULL; 1188 return (ENOENT); 1189 } 1190 } else 1191 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1192 } 1193 return (-1); 1194 } 1195 1196 static __noinline int 1197 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp, 1198 struct componentname *cnp, struct timespec *tsp, int *ticksp) 1199 { 1200 struct namecache *ncp; 1201 struct rwlock *blp; 1202 struct mtx *dvlp, *dvlp2; 1203 uint32_t hash; 1204 int error; 1205 1206 if (cnp->cn_namelen == 2 && 1207 cnp->cn_nameptr[0] == '.' 
&& cnp->cn_nameptr[1] == '.') { 1208 counter_u64_add(dotdothits, 1); 1209 dvlp = VP2VNODELOCK(dvp); 1210 dvlp2 = NULL; 1211 mtx_lock(dvlp); 1212 retry_dotdot: 1213 ncp = dvp->v_cache_dd; 1214 if (ncp == NULL) { 1215 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1216 "..", NULL); 1217 mtx_unlock(dvlp); 1218 if (dvlp2 != NULL) 1219 mtx_unlock(dvlp2); 1220 return (0); 1221 } 1222 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1223 if (ncp->nc_dvp != dvp) 1224 panic("dvp %p v_cache_dd %p\n", dvp, ncp); 1225 if (!cache_zap_locked_vnode_kl2(ncp, 1226 dvp, &dvlp2)) 1227 goto retry_dotdot; 1228 MPASS(dvp->v_cache_dd == NULL); 1229 mtx_unlock(dvlp); 1230 if (dvlp2 != NULL) 1231 mtx_unlock(dvlp2); 1232 cache_free(ncp); 1233 } else { 1234 dvp->v_cache_dd = NULL; 1235 mtx_unlock(dvlp); 1236 if (dvlp2 != NULL) 1237 mtx_unlock(dvlp2); 1238 } 1239 return (0); 1240 } 1241 1242 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1243 blp = HASH2BUCKETLOCK(hash); 1244 retry: 1245 if (LIST_EMPTY(NCHHASH(hash))) 1246 goto out_no_entry; 1247 1248 rw_wlock(blp); 1249 1250 LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1251 counter_u64_add(numchecks, 1); 1252 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1253 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1254 break; 1255 } 1256 1257 /* We failed to find an entry */ 1258 if (ncp == NULL) { 1259 rw_wunlock(blp); 1260 goto out_no_entry; 1261 } 1262 1263 error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp); 1264 if (__predict_false(error != 0)) { 1265 zap_and_exit_bucket_fail++; 1266 cache_maybe_yield(); 1267 goto retry; 1268 } 1269 counter_u64_add(numposzaps, 1); 1270 cache_free(ncp); 1271 return (0); 1272 out_no_entry: 1273 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); 1274 counter_u64_add(nummisszap, 1); 1275 return (0); 1276 } 1277 1278 /** 1279 * Lookup a name in the name cache 1280 * 1281 * # Arguments 1282 * 1283 * - dvp: Parent directory in which to search. 1284 * - vpp: Return argument. Will contain desired vnode on cache hit. 1285 * - cnp: Parameters of the name search. The most interesting bits of 1286 * the cn_flags field have the following meanings: 1287 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1288 * it up. 1289 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1290 * - tsp: Return storage for cache timestamp. On a successful (positive 1291 * or negative) lookup, tsp will be filled with any timespec that 1292 * was stored when this cache entry was created. However, it will 1293 * be clear for "." entries. 1294 * - ticks: Return storage for alternate cache timestamp. On a successful 1295 * (positive or negative) lookup, it will contain the ticks value 1296 * that was current when the cache entry was created, unless cnp 1297 * was ".". 1298 * 1299 * # Returns 1300 * 1301 * - -1: A positive cache hit. vpp will contain the desired vnode. 1302 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1303 * to a forced unmount. vpp will not be modified. If the entry 1304 * is a whiteout, then the ISWHITEOUT flag will be set in 1305 * cnp->cn_flags. 1306 * - 0: A cache miss. vpp will not be modified. 1307 * 1308 * # Locking 1309 * 1310 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1311 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1312 * lock is not recursively acquired. 
1313 */ 1314 int 1315 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1316 struct timespec *tsp, int *ticksp) 1317 { 1318 struct namecache_ts *ncp_ts; 1319 struct namecache *ncp; 1320 struct rwlock *blp; 1321 struct mtx *dvlp; 1322 uint32_t hash; 1323 enum vgetstate vs; 1324 int error, ltype; 1325 1326 if (__predict_false(!doingcache)) { 1327 cnp->cn_flags &= ~MAKEENTRY; 1328 return (0); 1329 } 1330 1331 counter_u64_add(numcalls, 1); 1332 1333 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) 1334 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1335 1336 if ((cnp->cn_flags & MAKEENTRY) == 0) 1337 return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp)); 1338 1339 retry: 1340 blp = NULL; 1341 dvlp = NULL; 1342 error = 0; 1343 if (cnp->cn_namelen == 2 && 1344 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1345 counter_u64_add(dotdothits, 1); 1346 dvlp = VP2VNODELOCK(dvp); 1347 mtx_lock(dvlp); 1348 ncp = dvp->v_cache_dd; 1349 if (ncp == NULL) { 1350 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1351 "..", NULL); 1352 mtx_unlock(dvlp); 1353 return (0); 1354 } 1355 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1356 if (ncp->nc_flag & NCF_NEGATIVE) 1357 *vpp = NULL; 1358 else 1359 *vpp = ncp->nc_vp; 1360 } else 1361 *vpp = ncp->nc_dvp; 1362 /* Return failure if negative entry was found. */ 1363 if (*vpp == NULL) 1364 goto negative_success; 1365 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", 1366 dvp, cnp->cn_nameptr, *vpp); 1367 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", 1368 *vpp); 1369 cache_out_ts(ncp, tsp, ticksp); 1370 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1371 NCF_DTS && tsp != NULL) { 1372 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1373 *tsp = ncp_ts->nc_dotdottime; 1374 } 1375 goto success; 1376 } 1377 1378 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1379 blp = HASH2BUCKETLOCK(hash); 1380 rw_rlock(blp); 1381 1382 LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1383 counter_u64_add(numchecks, 1); 1384 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1385 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1386 break; 1387 } 1388 1389 /* We failed to find an entry */ 1390 if (__predict_false(ncp == NULL)) { 1391 rw_runlock(blp); 1392 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1393 NULL); 1394 counter_u64_add(nummiss, 1); 1395 return (0); 1396 } 1397 1398 if (ncp->nc_flag & NCF_NEGATIVE) 1399 goto negative_success; 1400 1401 /* We found a "positive" match, return the vnode */ 1402 counter_u64_add(numposhits, 1); 1403 *vpp = ncp->nc_vp; 1404 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", 1405 dvp, cnp->cn_nameptr, *vpp, ncp); 1406 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, 1407 *vpp); 1408 cache_out_ts(ncp, tsp, ticksp); 1409 success: 1410 /* 1411 * On success we return a locked and ref'd vnode as per the lookup 1412 * protocol. 
1413 */ 1414 MPASS(dvp != *vpp); 1415 ltype = 0; /* silence gcc warning */ 1416 if (cnp->cn_flags & ISDOTDOT) { 1417 ltype = VOP_ISLOCKED(dvp); 1418 VOP_UNLOCK(dvp, 0); 1419 } 1420 vs = vget_prep(*vpp); 1421 cache_lookup_unlock(blp, dvlp); 1422 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1423 if (cnp->cn_flags & ISDOTDOT) { 1424 vn_lock(dvp, ltype | LK_RETRY); 1425 if (dvp->v_iflag & VI_DOOMED) { 1426 if (error == 0) 1427 vput(*vpp); 1428 *vpp = NULL; 1429 return (ENOENT); 1430 } 1431 } 1432 if (error) { 1433 *vpp = NULL; 1434 goto retry; 1435 } 1436 if ((cnp->cn_flags & ISLASTCN) && 1437 (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { 1438 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); 1439 } 1440 return (-1); 1441 1442 negative_success: 1443 /* We found a negative match, and want to create it, so purge */ 1444 if (cnp->cn_nameiop == CREATE) { 1445 counter_u64_add(numnegzaps, 1); 1446 goto zap_and_exit; 1447 } 1448 1449 counter_u64_add(numneghits, 1); 1450 cache_negative_hit(ncp); 1451 if (ncp->nc_flag & NCF_WHITE) 1452 cnp->cn_flags |= ISWHITEOUT; 1453 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 1454 ncp->nc_name); 1455 cache_out_ts(ncp, tsp, ticksp); 1456 cache_lookup_unlock(blp, dvlp); 1457 return (ENOENT); 1458 1459 zap_and_exit: 1460 if (blp != NULL) 1461 error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp); 1462 else 1463 error = cache_zap_locked_vnode(ncp, dvp); 1464 if (__predict_false(error != 0)) { 1465 zap_and_exit_bucket_fail2++; 1466 cache_maybe_yield(); 1467 goto retry; 1468 } 1469 cache_free(ncp); 1470 return (0); 1471 } 1472 1473 struct celockstate { 1474 struct mtx *vlp[3]; 1475 struct rwlock *blp[2]; 1476 }; 1477 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1478 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1479 1480 static inline void 1481 cache_celockstate_init(struct celockstate *cel) 1482 { 1483 1484 bzero(cel, sizeof(*cel)); 1485 } 1486 1487 static void 1488 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1489 struct vnode *dvp) 1490 { 1491 struct mtx *vlp1, *vlp2; 1492 1493 MPASS(cel->vlp[0] == NULL); 1494 MPASS(cel->vlp[1] == NULL); 1495 MPASS(cel->vlp[2] == NULL); 1496 1497 MPASS(vp != NULL || dvp != NULL); 1498 1499 vlp1 = VP2VNODELOCK(vp); 1500 vlp2 = VP2VNODELOCK(dvp); 1501 cache_sort_vnodes(&vlp1, &vlp2); 1502 1503 if (vlp1 != NULL) { 1504 mtx_lock(vlp1); 1505 cel->vlp[0] = vlp1; 1506 } 1507 mtx_lock(vlp2); 1508 cel->vlp[1] = vlp2; 1509 } 1510 1511 static void 1512 cache_unlock_vnodes_cel(struct celockstate *cel) 1513 { 1514 1515 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1516 1517 if (cel->vlp[0] != NULL) 1518 mtx_unlock(cel->vlp[0]); 1519 if (cel->vlp[1] != NULL) 1520 mtx_unlock(cel->vlp[1]); 1521 if (cel->vlp[2] != NULL) 1522 mtx_unlock(cel->vlp[2]); 1523 } 1524 1525 static bool 1526 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1527 { 1528 struct mtx *vlp; 1529 bool ret; 1530 1531 cache_assert_vlp_locked(cel->vlp[0]); 1532 cache_assert_vlp_locked(cel->vlp[1]); 1533 MPASS(cel->vlp[2] == NULL); 1534 1535 MPASS(vp != NULL); 1536 vlp = VP2VNODELOCK(vp); 1537 1538 ret = true; 1539 if (vlp >= cel->vlp[1]) { 1540 mtx_lock(vlp); 1541 } else { 1542 if (mtx_trylock(vlp)) 1543 goto out; 1544 cache_lock_vnodes_cel_3_failures++; 1545 cache_unlock_vnodes_cel(cel); 1546 if (vlp < cel->vlp[0]) { 1547 mtx_lock(vlp); 1548 mtx_lock(cel->vlp[0]); 1549 mtx_lock(cel->vlp[1]); 1550 } else { 1551 if (cel->vlp[0] != NULL) 1552 mtx_lock(cel->vlp[0]); 1553 mtx_lock(vlp); 1554 mtx_lock(cel->vlp[1]); 
1555 } 1556 ret = false; 1557 } 1558 out: 1559 cel->vlp[2] = vlp; 1560 return (ret); 1561 } 1562 1563 static void 1564 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, 1565 struct rwlock *blp2) 1566 { 1567 1568 MPASS(cel->blp[0] == NULL); 1569 MPASS(cel->blp[1] == NULL); 1570 1571 cache_sort_vnodes(&blp1, &blp2); 1572 1573 if (blp1 != NULL) { 1574 rw_wlock(blp1); 1575 cel->blp[0] = blp1; 1576 } 1577 rw_wlock(blp2); 1578 cel->blp[1] = blp2; 1579 } 1580 1581 static void 1582 cache_unlock_buckets_cel(struct celockstate *cel) 1583 { 1584 1585 if (cel->blp[0] != NULL) 1586 rw_wunlock(cel->blp[0]); 1587 rw_wunlock(cel->blp[1]); 1588 } 1589 1590 /* 1591 * Lock part of the cache affected by the insertion. 1592 * 1593 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1594 * However, insertion can result in removal of an old entry. In this 1595 * case we have an additional vnode and bucketlock pair to lock. If the 1596 * entry is negative, ncelock is locked instead of the vnode. 1597 * 1598 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1599 * preserving the locking order (smaller address first). 1600 */ 1601 static void 1602 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1603 uint32_t hash) 1604 { 1605 struct namecache *ncp; 1606 struct rwlock *blps[2]; 1607 1608 blps[0] = HASH2BUCKETLOCK(hash); 1609 for (;;) { 1610 blps[1] = NULL; 1611 cache_lock_vnodes_cel(cel, dvp, vp); 1612 if (vp == NULL || vp->v_type != VDIR) 1613 break; 1614 ncp = vp->v_cache_dd; 1615 if (ncp == NULL) 1616 break; 1617 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1618 break; 1619 MPASS(ncp->nc_dvp == vp); 1620 blps[1] = NCP2BUCKETLOCK(ncp); 1621 if (ncp->nc_flag & NCF_NEGATIVE) 1622 break; 1623 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1624 break; 1625 /* 1626 * All vnodes got re-locked. Re-validate the state and if 1627 * nothing changed we are done. Otherwise restart. 1628 */ 1629 if (ncp == vp->v_cache_dd && 1630 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1631 blps[1] == NCP2BUCKETLOCK(ncp) && 1632 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1633 break; 1634 cache_unlock_vnodes_cel(cel); 1635 cel->vlp[0] = NULL; 1636 cel->vlp[1] = NULL; 1637 cel->vlp[2] = NULL; 1638 } 1639 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1640 } 1641 1642 static void 1643 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1644 uint32_t hash) 1645 { 1646 struct namecache *ncp; 1647 struct rwlock *blps[2]; 1648 1649 blps[0] = HASH2BUCKETLOCK(hash); 1650 for (;;) { 1651 blps[1] = NULL; 1652 cache_lock_vnodes_cel(cel, dvp, vp); 1653 ncp = dvp->v_cache_dd; 1654 if (ncp == NULL) 1655 break; 1656 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1657 break; 1658 MPASS(ncp->nc_dvp == dvp); 1659 blps[1] = NCP2BUCKETLOCK(ncp); 1660 if (ncp->nc_flag & NCF_NEGATIVE) 1661 break; 1662 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1663 break; 1664 if (ncp == dvp->v_cache_dd && 1665 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1666 blps[1] == NCP2BUCKETLOCK(ncp) && 1667 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1668 break; 1669 cache_unlock_vnodes_cel(cel); 1670 cel->vlp[0] = NULL; 1671 cel->vlp[1] = NULL; 1672 cel->vlp[2] = NULL; 1673 } 1674 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1675 } 1676 1677 static void 1678 cache_enter_unlock(struct celockstate *cel) 1679 { 1680 1681 cache_unlock_buckets_cel(cel); 1682 cache_unlock_vnodes_cel(cel); 1683 } 1684 1685 /* 1686 * Add an entry to the cache. 
1687 */ 1688 void 1689 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 1690 struct timespec *tsp, struct timespec *dtsp) 1691 { 1692 struct celockstate cel; 1693 struct namecache *ncp, *n2, *ndd; 1694 struct namecache_ts *ncp_ts, *n2_ts; 1695 struct nchashhead *ncpp; 1696 struct neglist *neglist; 1697 uint32_t hash; 1698 int flag; 1699 int len; 1700 bool neg_locked, held_dvp; 1701 u_long lnumcache; 1702 1703 CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); 1704 VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp, 1705 ("cache_enter: Adding a doomed vnode")); 1706 VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp, 1707 ("cache_enter: Doomed vnode used as src")); 1708 1709 if (__predict_false(!doingcache)) 1710 return; 1711 1712 /* 1713 * Avoid blowout in namecache entries. 1714 */ 1715 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1716 if (__predict_false(lnumcache >= desiredvnodes * ncsizefactor)) { 1717 atomic_add_long(&numcache, -1); 1718 return; 1719 } 1720 1721 cache_celockstate_init(&cel); 1722 ndd = NULL; 1723 ncp_ts = NULL; 1724 flag = 0; 1725 if (cnp->cn_nameptr[0] == '.') { 1726 if (cnp->cn_namelen == 1) 1727 return; 1728 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 1729 len = cnp->cn_namelen; 1730 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1731 cache_enter_lock_dd(&cel, dvp, vp, hash); 1732 /* 1733 * If dotdot entry already exists, just retarget it 1734 * to new parent vnode, otherwise continue with new 1735 * namecache entry allocation. 1736 */ 1737 if ((ncp = dvp->v_cache_dd) != NULL && 1738 ncp->nc_flag & NCF_ISDOTDOT) { 1739 KASSERT(ncp->nc_dvp == dvp, 1740 ("wrong isdotdot parent")); 1741 neg_locked = false; 1742 if (ncp->nc_flag & NCF_NEGATIVE || vp == NULL) { 1743 neglist = NCP2NEGLIST(ncp); 1744 mtx_lock(&ncneg_hot.nl_lock); 1745 mtx_lock(&neglist->nl_lock); 1746 neg_locked = true; 1747 } 1748 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 1749 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, 1750 ncp, nc_dst); 1751 } else { 1752 cache_negative_remove(ncp, true); 1753 } 1754 if (vp != NULL) { 1755 TAILQ_INSERT_HEAD(&vp->v_cache_dst, 1756 ncp, nc_dst); 1757 if (ncp->nc_flag & NCF_HOTNEGATIVE) 1758 numhotneg--; 1759 ncp->nc_flag &= ~(NCF_NEGATIVE|NCF_HOTNEGATIVE); 1760 } else { 1761 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 1762 numhotneg--; 1763 ncp->nc_flag &= ~(NCF_HOTNEGATIVE); 1764 } 1765 ncp->nc_flag |= NCF_NEGATIVE; 1766 cache_negative_insert(ncp, true); 1767 } 1768 if (neg_locked) { 1769 mtx_unlock(&neglist->nl_lock); 1770 mtx_unlock(&ncneg_hot.nl_lock); 1771 } 1772 ncp->nc_vp = vp; 1773 cache_enter_unlock(&cel); 1774 return; 1775 } 1776 dvp->v_cache_dd = NULL; 1777 cache_enter_unlock(&cel); 1778 cache_celockstate_init(&cel); 1779 SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp); 1780 flag = NCF_ISDOTDOT; 1781 } 1782 } 1783 1784 held_dvp = false; 1785 if (LIST_EMPTY(&dvp->v_cache_src) && flag != NCF_ISDOTDOT) { 1786 vhold(dvp); 1787 atomic_add_long(&numcachehv, 1); 1788 held_dvp = true; 1789 } 1790 1791 /* 1792 * Calculate the hash key and setup as much of the new 1793 * namecache entry as possible before acquiring the lock. 
1794 */ 1795 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1796 ncp->nc_flag = flag; 1797 ncp->nc_vp = vp; 1798 if (vp == NULL) 1799 ncp->nc_flag |= NCF_NEGATIVE; 1800 ncp->nc_dvp = dvp; 1801 if (tsp != NULL) { 1802 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1803 ncp_ts->nc_time = *tsp; 1804 ncp_ts->nc_ticks = ticks; 1805 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1806 if (dtsp != NULL) { 1807 ncp_ts->nc_dotdottime = *dtsp; 1808 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1809 } 1810 } 1811 len = ncp->nc_nlen = cnp->cn_namelen; 1812 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1813 strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1); 1814 cache_enter_lock(&cel, dvp, vp, hash); 1815 1816 /* 1817 * See if this vnode or negative entry is already in the cache 1818 * with this name. This can happen with concurrent lookups of 1819 * the same path name. 1820 */ 1821 ncpp = NCHHASH(hash); 1822 LIST_FOREACH(n2, ncpp, nc_hash) { 1823 if (n2->nc_dvp == dvp && 1824 n2->nc_nlen == cnp->cn_namelen && 1825 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1826 if (tsp != NULL) { 1827 KASSERT((n2->nc_flag & NCF_TS) != 0, 1828 ("no NCF_TS")); 1829 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1830 n2_ts->nc_time = ncp_ts->nc_time; 1831 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1832 if (dtsp != NULL) { 1833 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1834 if (ncp->nc_flag & NCF_NEGATIVE) 1835 mtx_lock(&ncneg_hot.nl_lock); 1836 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1837 if (ncp->nc_flag & NCF_NEGATIVE) 1838 mtx_unlock(&ncneg_hot.nl_lock); 1839 } 1840 } 1841 goto out_unlock_free; 1842 } 1843 } 1844 1845 if (flag == NCF_ISDOTDOT) { 1846 /* 1847 * See if we are trying to add .. entry, but some other lookup 1848 * has populated v_cache_dd pointer already. 1849 */ 1850 if (dvp->v_cache_dd != NULL) 1851 goto out_unlock_free; 1852 KASSERT(vp == NULL || vp->v_type == VDIR, 1853 ("wrong vnode type %p", vp)); 1854 dvp->v_cache_dd = ncp; 1855 } 1856 1857 if (vp != NULL) { 1858 if (vp->v_type == VDIR) { 1859 if (flag != NCF_ISDOTDOT) { 1860 /* 1861 * For this case, the cache entry maps both the 1862 * directory name in it and the name ".." for the 1863 * directory's parent. 1864 */ 1865 if ((ndd = vp->v_cache_dd) != NULL) { 1866 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 1867 cache_zap_locked(ndd, false); 1868 else 1869 ndd = NULL; 1870 } 1871 vp->v_cache_dd = ncp; 1872 } 1873 } else { 1874 vp->v_cache_dd = NULL; 1875 } 1876 } 1877 1878 if (flag != NCF_ISDOTDOT) { 1879 if (LIST_EMPTY(&dvp->v_cache_src)) { 1880 if (!held_dvp) { 1881 vhold(dvp); 1882 atomic_add_long(&numcachehv, 1); 1883 } 1884 } else { 1885 if (held_dvp) { 1886 /* 1887 * This will not take the interlock as someone 1888 * else already holds the vnode on account of 1889 * the namecache and we hold locks preventing 1890 * this from changing. 1891 */ 1892 vdrop(dvp); 1893 atomic_subtract_long(&numcachehv, 1); 1894 } 1895 } 1896 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 1897 } 1898 1899 /* 1900 * Insert the new namecache entry into the appropriate chain 1901 * within the cache entries table. 1902 */ 1903 LIST_INSERT_HEAD(ncpp, ncp, nc_hash); 1904 1905 /* 1906 * If the entry is "negative", we place it into the 1907 * "negative" cache queue, otherwise, we place it into the 1908 * destination vnode's cache entries queue. 
1909 */ 1910 if (vp != NULL) { 1911 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 1912 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 1913 vp); 1914 } else { 1915 if (cnp->cn_flags & ISWHITEOUT) 1916 ncp->nc_flag |= NCF_WHITE; 1917 cache_negative_insert(ncp, false); 1918 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 1919 ncp->nc_name); 1920 } 1921 cache_enter_unlock(&cel); 1922 if (numneg * ncnegfactor > lnumcache) 1923 cache_negative_zap_one(); 1924 cache_free(ndd); 1925 return; 1926 out_unlock_free: 1927 cache_enter_unlock(&cel); 1928 cache_free(ncp); 1929 if (held_dvp) { 1930 vdrop(dvp); 1931 atomic_subtract_long(&numcachehv, 1); 1932 } 1933 return; 1934 } 1935 1936 static u_int 1937 cache_roundup_2(u_int val) 1938 { 1939 u_int res; 1940 1941 for (res = 1; res <= val; res <<= 1) 1942 continue; 1943 1944 return (res); 1945 } 1946 1947 /* 1948 * Name cache initialization, from vfs_init() when we are booting 1949 */ 1950 static void 1951 nchinit(void *dummy __unused) 1952 { 1953 u_int i; 1954 1955 cache_zone_small = uma_zcreate("S VFS Cache", 1956 sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, 1957 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 1958 UMA_ZONE_ZINIT); 1959 cache_zone_small_ts = uma_zcreate("STS VFS Cache", 1960 sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, 1961 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 1962 UMA_ZONE_ZINIT); 1963 cache_zone_large = uma_zcreate("L VFS Cache", 1964 sizeof(struct namecache) + NAME_MAX + 1, 1965 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 1966 UMA_ZONE_ZINIT); 1967 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", 1968 sizeof(struct namecache_ts) + NAME_MAX + 1, 1969 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 1970 UMA_ZONE_ZINIT); 1971 1972 nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); 1973 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 1974 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 1975 ncbuckethash = 7; 1976 if (ncbuckethash > nchash) 1977 ncbuckethash = nchash; 1978 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 1979 M_WAITOK | M_ZERO); 1980 for (i = 0; i < numbucketlocks; i++) 1981 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 1982 ncvnodehash = ncbuckethash; 1983 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 1984 M_WAITOK | M_ZERO); 1985 for (i = 0; i < numvnodelocks; i++) 1986 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 1987 ncpurgeminvnodes = numbucketlocks * 2; 1988 1989 ncneghash = 3; 1990 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 1991 M_WAITOK | M_ZERO); 1992 for (i = 0; i < numneglists; i++) { 1993 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 1994 TAILQ_INIT(&neglists[i].nl_list); 1995 } 1996 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 1997 TAILQ_INIT(&ncneg_hot.nl_list); 1998 1999 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2000 2001 numcalls = counter_u64_alloc(M_WAITOK); 2002 dothits = counter_u64_alloc(M_WAITOK); 2003 dotdothits = counter_u64_alloc(M_WAITOK); 2004 numchecks = counter_u64_alloc(M_WAITOK); 2005 nummiss = counter_u64_alloc(M_WAITOK); 2006 nummisszap = counter_u64_alloc(M_WAITOK); 2007 numposzaps = counter_u64_alloc(M_WAITOK); 2008 numposhits = counter_u64_alloc(M_WAITOK); 2009 numnegzaps = counter_u64_alloc(M_WAITOK); 2010 numneghits = counter_u64_alloc(M_WAITOK); 2011 numfullpathcalls = counter_u64_alloc(M_WAITOK); 2012 numfullpathfail1 = 

void
cache_changesize(int newmaxvnodes)
{
	struct nchashhead *new_nchashtbl, *old_nchashtbl;
	u_long new_nchash, old_nchash;
	struct namecache *ncp;
	uint32_t hash;
	int i;

	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
	if (newmaxvnodes < numbucketlocks)
		newmaxvnodes = numbucketlocks;

	new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
	/* If same hash table size, nothing to do */
	if (nchash == new_nchash) {
		free(new_nchashtbl, M_VFSCACHE);
		return;
	}
	/*
	 * Move everything from the old hash table to the new table.
	 * None of the namecache entries in the table can be removed
	 * because to do so, they have to be removed from the hash table.
	 */
	cache_lock_all_vnodes();
	cache_lock_all_buckets();
	old_nchashtbl = nchashtbl;
	old_nchash = nchash;
	nchashtbl = new_nchashtbl;
	nchash = new_nchash;
	for (i = 0; i <= old_nchash; i++) {
		while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
			    ncp->nc_dvp);
			LIST_REMOVE(ncp, nc_hash);
			LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
		}
	}
	cache_unlock_all_buckets();
	cache_unlock_all_vnodes();
	free(old_nchashtbl, M_VFSCACHE);
}

/*
 * Invalidate all entries from and to a particular vnode.
 */
void
cache_purge(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;
	struct mtx *vlp, *vlp2;

	CTR1(KTR_VFS, "cache_purge(%p)", vp);
	SDT_PROBE1(vfs, namecache, purge, done, vp);
	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
	    vp->v_cache_dd == NULL)
		return;
	TAILQ_INIT(&ncps);
	vlp = VP2VNODELOCK(vp);
	vlp2 = NULL;
	mtx_lock(vlp);
retry:
	while (!LIST_EMPTY(&vp->v_cache_src)) {
		ncp = LIST_FIRST(&vp->v_cache_src);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	ncp = vp->v_cache_dd;
	if (ncp != NULL) {
		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
		    ("lost dotdot link"));
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
	mtx_unlock(vlp);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}
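
/*
 * Illustrative sketch: cache_purge() is the interface a filesystem would
 * use when a vnode's identity goes away (file removal, reclaim), so that
 * stale name cache entries can no longer resolve to it.  The "myfs"
 * fragment below is hypothetical, a minimal VOP_REMOVE-style example.
 */
#if 0
static int
myfs_remove(struct vop_remove_args *ap)
{
	struct vnode *vp = ap->a_vp;
	int error;

	/* myfs_dirremove() is a hypothetical on-disk directory update. */
	error = myfs_dirremove(ap->a_dvp, ap->a_cnp);
	if (error == 0) {
		/*
		 * Drop every cached entry from and to vp, including
		 * negative entries and any ".." entry, now that the
		 * on-disk name is gone.
		 */
		cache_purge(vp);
	}
	return (error);
}
#endif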

/*
 * Invalidate all negative entries for a particular directory vnode.
 */
void
cache_purge_negative(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;
	struct mtx *vlp;

	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
	if (LIST_EMPTY(&vp->v_cache_src))
		return;
	TAILQ_INIT(&ncps);
	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
		if (!(ncp->nc_flag & NCF_NEGATIVE))
			continue;
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	mtx_unlock(vlp);
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

/*
 * Flush all entries referencing a particular filesystem.
 */
void
cache_purgevfs(struct mount *mp, bool force)
{
	TAILQ_HEAD(, namecache) ncps;
	struct mtx *vlp1, *vlp2;
	struct rwlock *blp;
	struct nchashhead *bucket;
	struct namecache *ncp, *nnp;
	u_long i, j, n_nchash;
	int error;

	/* Scan hash tables for applicable entries */
	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
	if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
		return;
	TAILQ_INIT(&ncps);
	n_nchash = nchash + 1;
	vlp1 = vlp2 = NULL;
	for (i = 0; i < numbucketlocks; i++) {
		blp = (struct rwlock *)&bucketlocks[i];
		rw_wlock(blp);
		for (j = i; j < n_nchash; j += numbucketlocks) {
retry:
			bucket = &nchashtbl[j];
			LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
				cache_assert_bucket_locked(ncp, RA_WLOCKED);
				if (ncp->nc_dvp->v_mount != mp)
					continue;
				error = cache_zap_wlocked_bucket_kl(ncp, blp,
				    &vlp1, &vlp2);
				if (error != 0)
					goto retry;
				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
			}
		}
		rw_wunlock(blp);
		if (vlp1 == NULL && vlp2 == NULL)
			cache_maybe_yield();
	}
	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);

	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

/*
 * Perform canonical checks and cache lookup and pass on to filesystem
 * through the vop_cachedlookup only if needed.
 */

int
vfs_cache_lookup(struct vop_lookup_args *ap)
{
	struct vnode *dvp;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	int flags = cnp->cn_flags;
	struct thread *td = cnp->cn_thread;

	*vpp = NULL;
	dvp = ap->a_dvp;

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = VOP_ACCESS(dvp, VEXEC, cred, td);
	if (error)
		return (error);

	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
	if (error == 0)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
	if (error == -1)
		return (0);
	return (error);
}
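
/*
 * Illustrative sketch: the usual way a filesystem consumes the name cache
 * is to point vop_lookup at vfs_cache_lookup() and supply its real
 * directory scan as vop_cachedlookup, so the expensive lookup only runs
 * on a cache miss.  The "myfs" vector and myfs_lookup() below are
 * hypothetical; in-tree filesystems such as UFS follow the same pattern.
 */
#if 0
static struct vop_vector myfs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_lookup =		vfs_cache_lookup,	/* checks + cache */
	.vop_cachedlookup =	myfs_lookup,		/* real lookup */
	/* ... remaining operations ... */
};
#endif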

/*
 * XXX All of these sysctls would probably be more productive dead.
 */
static int __read_mostly disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable the getcwd syscall");

/* Implementation of the getcwd syscall. */
int
sys___getcwd(struct thread *td, struct __getcwd_args *uap)
{

	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen,
	    MAXPATHLEN));
}

int
kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, size_t buflen,
    size_t path_max)
{
	char *bp, *tmpbuf;
	struct filedesc *fdp;
	struct vnode *cdir, *rdir;
	int error;

	if (__predict_false(disablecwd))
		return (ENODEV);
	if (__predict_false(buflen < 2))
		return (EINVAL);
	if (buflen > path_max)
		buflen = path_max;

	tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	cdir = fdp->fd_cdir;
	vrefact(cdir);
	rdir = fdp->fd_rdir;
	vrefact(rdir);
	FILEDESC_SUNLOCK(fdp);
	error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
	vrele(rdir);
	vrele(cdir);

	if (!error) {
		if (bufseg == UIO_SYSSPACE)
			bcopy(bp, buf, strlen(bp) + 1);
		else
			error = copyout(bp, buf, strlen(bp) + 1);
#ifdef KTRACE
		if (KTRPOINT(curthread, KTR_NAMEI))
			ktrnamei(bp);
#endif
	}
	free(tmpbuf, M_TEMP);
	return (error);
}

/*
 * Thus begins the fullpath magic.
 */

static int __read_mostly disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
    "Disable the vn_fullpath function");

/*
 * Retrieve the full filesystem path that corresponds to a vnode from the
 * name cache (if available).
 */
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
	char *buf;
	struct filedesc *fdp;
	struct vnode *rdir;
	int error;

	if (__predict_false(disablefullpath))
		return (ENODEV);
	if (__predict_false(vn == NULL))
		return (EINVAL);

	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	rdir = fdp->fd_rdir;
	vrefact(rdir);
	FILEDESC_SUNLOCK(fdp);
	error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
	vrele(rdir);

	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}
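
/*
 * Illustrative sketch: callers of vn_fullpath() receive the resolved path
 * via retbuf, which points into a buffer returned via freebuf; *freebuf is
 * only set on success and must be freed with M_TEMP.  The "myfs" helper
 * below is hypothetical.
 */
#if 0
static void
myfs_log_vnode_path(struct thread *td, struct vnode *vp)
{
	char *fullpath, *freepath;

	fullpath = freepath = NULL;
	if (vn_fullpath(td, vp, &fullpath, &freepath) == 0)
		printf("vnode %p resolves to %s\n", vp, fullpath);
	else
		printf("vnode %p: path not available\n", vp);
	if (freepath != NULL)
		free(freepath, M_TEMP);
}
#endif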

/*
 * This function is similar to vn_fullpath, but it attempts to look up the
 * pathname relative to the global root mount point.  This is required for
 * the auditing sub-system, as audited pathnames must be absolute, relative
 * to the global root mount point.
 */
int
vn_fullpath_global(struct thread *td, struct vnode *vn,
    char **retbuf, char **freebuf)
{
	char *buf;
	int error;

	if (__predict_false(disablefullpath))
		return (ENODEV);
	if (__predict_false(vn == NULL))
		return (EINVAL);
	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}

int
vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
{
	struct vnode *dvp;
	struct namecache *ncp;
	struct mtx *vlp;
	int error;

	vlp = VP2VNODELOCK(*vp);
	mtx_lock(vlp);
	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
	}
	if (ncp != NULL) {
		if (*buflen < ncp->nc_nlen) {
			mtx_unlock(vlp);
			vrele(*vp);
			counter_u64_add(numfullpathfail4, 1);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    vp, NULL);
			return (error);
		}
		*buflen -= ncp->nc_nlen;
		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
		    ncp->nc_name, vp);
		dvp = *vp;
		*vp = ncp->nc_dvp;
		vref(*vp);
		mtx_unlock(vlp);
		vrele(dvp);
		return (0);
	}
	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);

	mtx_unlock(vlp);
	vn_lock(*vp, LK_SHARED | LK_RETRY);
	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
	vput(*vp);
	if (error) {
		counter_u64_add(numfullpathfail2, 1);
		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
		return (error);
	}

	*vp = dvp;
	if (dvp->v_iflag & VI_DOOMED) {
		/* forced unmount */
		vrele(dvp);
		error = ENOENT;
		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
		return (error);
	}
	/*
	 * *vp has its use count incremented still.
	 */

	return (0);
}
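
/*
 * Illustrative sketch: vn_vptocnp() always consumes the reference held on
 * *vp (the vnode is expected to be unlocked, since it may be locked on a
 * cache miss) and, on success, replaces *vp with a referenced parent
 * directory while writing the name component at the tail of buf and moving
 * *buflen back by its length, exactly as the vn_fullpath1() loop below
 * uses it.  The "myfs" helper below is hypothetical.
 */
#if 0
static int
myfs_name_and_parent(struct vnode *vp, struct ucred *cred, char *buf,
    u_int buflen, struct vnode **dvpp, u_int *nameoff)
{
	struct vnode *tvp;
	int error;

	tvp = vp;
	vref(tvp);		/* consumed by vn_vptocnp() */
	error = vn_vptocnp(&tvp, cred, buf, &buflen);
	if (error != 0)
		return (error);	/* the extra reference is already gone */
	/*
	 * tvp is the referenced parent; the last component of vp starts
	 * at buf + buflen and is not nul-terminated.
	 */
	*dvpp = tvp;
	*nameoff = buflen;
	return (0);
}
#endif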

/*
 * The magic behind kern___getcwd() and vn_fullpath().
 */
static int
vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, u_int buflen)
{
	int error, slash_prefixed;
#ifdef KDTRACE_HOOKS
	struct vnode *startvp = vp;
#endif
	struct vnode *vp1;

	buflen--;
	buf[buflen] = '\0';
	error = 0;
	slash_prefixed = 0;

	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
	counter_u64_add(numfullpathcalls, 1);
	vref(vp);
	if (vp->v_type != VDIR) {
		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
		if (error)
			return (error);
		if (buflen == 0) {
			vrele(vp);
			return (ENOMEM);
		}
		buf[--buflen] = '/';
		slash_prefixed = 1;
	}
	while (vp != rdir && vp != rootvnode) {
		/*
		 * The vp vnode must be already fully constructed,
		 * since it is either found in namecache or obtained
		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
		 * without obtaining the vnode lock.
		 */
		if ((vp->v_vflag & VV_ROOT) != 0) {
			vn_lock(vp, LK_RETRY | LK_SHARED);

			/*
			 * With the vnode locked, check for races with
			 * unmount, forced or not.  Note that we
			 * already verified that vp is not equal to
			 * the root vnode, which means that
			 * mnt_vnodecovered can be NULL only for the
			 * case of unmount.
			 */
			if ((vp->v_iflag & VI_DOOMED) != 0 ||
			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
			    vp1->v_mountedhere != vp->v_mount) {
				vput(vp);
				error = ENOENT;
				SDT_PROBE3(vfs, namecache, fullpath, return,
				    error, vp, NULL);
				break;
			}

			vref(vp1);
			vput(vp);
			vp = vp1;
			continue;
		}
		if (vp->v_type != VDIR) {
			vrele(vp);
			counter_u64_add(numfullpathfail1, 1);
			error = ENOTDIR;
			SDT_PROBE3(vfs, namecache, fullpath, return,
			    error, vp, NULL);
			break;
		}
		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
		if (error)
			break;
		if (buflen == 0) {
			vrele(vp);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    startvp, NULL);
			break;
		}
		buf[--buflen] = '/';
		slash_prefixed = 1;
	}
	if (error)
		return (error);
	if (!slash_prefixed) {
		if (buflen == 0) {
			vrele(vp);
			counter_u64_add(numfullpathfail4, 1);
			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
			    startvp, NULL);
			return (ENOMEM);
		}
		buf[--buflen] = '/';
	}
	counter_u64_add(numfullpathfound, 1);
	vrele(vp);

	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen);
	*retbuf = buf + buflen;
	return (0);
}

struct vnode *
vn_dir_dd_ino(struct vnode *vp)
{
	struct namecache *ncp;
	struct vnode *ddvp;
	struct mtx *vlp;
	enum vgetstate vs;

	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
			continue;
		ddvp = ncp->nc_dvp;
		vs = vget_prep(ddvp);
		mtx_unlock(vlp);
		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
			return (NULL);
		return (ddvp);
	}
	mtx_unlock(vlp);
	return (NULL);
}

int
vn_commname(struct vnode *vp, char *buf, u_int buflen)
{
	struct namecache *ncp;
	struct mtx *vlp;
	int l;

	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
	if (ncp == NULL) {
		mtx_unlock(vlp);
		return (ENOENT);
	}
	l = min(ncp->nc_nlen, buflen - 1);
	memcpy(buf, ncp->nc_name, l);
	mtx_unlock(vlp);
	buf[l] = '\0';
	return (0);
}
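
/*
 * Illustrative sketch: vn_commname() above copies the last cached name
 * component of a vnode into a caller-supplied buffer and nul-terminates
 * it, or returns ENOENT when nothing is cached.  The "myfs" debugging
 * helper below is hypothetical.
 */
#if 0
static void
myfs_warn_busy(struct vnode *vp)
{
	char name[NAME_MAX + 1];

	if (vn_commname(vp, name, sizeof(name)) == 0)
		printf("myfs: vnode %p (\"%s\") is still busy\n", vp, name);
	else
		printf("myfs: vnode %p (name not cached) is still busy\n", vp);
}
#endif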

/*
 * This function updates the path string to the vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
 *
 * Requires a locked, referenced vnode.
 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 *
 * If sysctl debug.disablefullpath is set, ENODEV is returned, the vnode
 * is left locked, and the path remains untouched.
 *
 * If vp is a directory, the call to vn_fullpath_global() always succeeds
 * because it falls back to the ".." lookup if the namecache lookup fails.
 */
int
vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
    u_int pathlen)
{
	struct nameidata nd;
	struct vnode *vp1;
	char *rpath, *fbuf;
	int error;

	ASSERT_VOP_ELOCKED(vp, __func__);

	/* Return ENODEV if sysctl debug.disablefullpath==1 */
	if (__predict_false(disablefullpath))
		return (ENODEV);

	/* Construct global filesystem path from vp. */
	VOP_UNLOCK(vp, 0);
	error = vn_fullpath_global(td, vp, &rpath, &fbuf);

	if (error != 0) {
		vrele(vp);
		return (error);
	}

	if (strlen(rpath) >= pathlen) {
		vrele(vp);
		error = ENAMETOOLONG;
		goto out;
	}

	/*
	 * Re-lookup the vnode by path to detect a possible rename.
	 * As a side effect, the vnode is relocked.
	 * If vnode was renamed, return ENOENT.
	 */
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
	    UIO_SYSSPACE, path, td);
	error = namei(&nd);
	if (error != 0) {
		vrele(vp);
		goto out;
	}
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp1 = nd.ni_vp;
	vrele(vp);
	if (vp1 == vp)
		strcpy(path, rpath);
	else {
		vput(vp1);
		error = ENOENT;
	}

out:
	free(fbuf, M_TEMP);
	return (error);
}

#ifdef DDB
static void
db_print_vpath(struct vnode *vp)
{

	while (vp != NULL) {
		db_printf("%p: ", vp);
		if (vp == rootvnode) {
			db_printf("/");
			vp = NULL;
		} else {
			if (vp->v_vflag & VV_ROOT) {
				db_printf("<mount point>");
				vp = vp->v_mount->mnt_vnodecovered;
			} else {
				struct namecache *ncp;
				char *ncn;
				int i;

				ncp = TAILQ_FIRST(&vp->v_cache_dst);
				if (ncp != NULL) {
					ncn = ncp->nc_name;
					for (i = 0; i < ncp->nc_nlen; i++)
						db_printf("%c", *ncn++);
					vp = ncp->nc_dvp;
				} else {
					vp = NULL;
				}
			}
		}
		db_printf("\n");
	}

	return;
}

DB_SHOW_COMMAND(vpath, db_show_vpath)
{
	struct vnode *vp;

	if (!have_addr) {
		db_printf("usage: show vpath <struct vnode *>\n");
		return;
	}

	vp = (struct vnode *)addr;
	db_print_vpath(vp);
}

#endif
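
/*
 * Illustrative sketch: a typical caller of vn_path_to_global_path()
 * (defined above, before the DDB section) passes a locked, referenced
 * vnode and a MAXPATHLEN buffer that already holds the path used to reach
 * it; the routine rewrites the buffer to the global path.  The "myfs"
 * wrapper and its policy of ignoring ENODEV are hypothetical.
 */
#if 0
static int
myfs_canonicalize_path(struct thread *td, struct vnode *vp, char *path)
{
	int error;

	error = vn_path_to_global_path(td, vp, path, MAXPATHLEN);
	if (error == ENODEV) {
		/* debug.disablefullpath is set; keep the original path. */
		error = 0;
	}
	return (error);
}
#endif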